{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.6002705546370058, "eval_steps": 500, "global_step": 9984, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "auxiliary_loss_clip": 0.05028445, "auxiliary_loss_mlp": 0.02215396, "balance_loss_clip": 2.43573999, "balance_loss_mlp": 1.76983953, "epoch": 6.012325266796934e-05, "flos": 24456507091200.0, "grad_norm": 55.00561300220404, "language_loss": 2.85272503, "learning_rate": 0.0, "loss": 1.94613922, "num_input_tokens_seen": 19155, "step": 1, "time_per_iteration": 18.059409618377686 }, { "auxiliary_loss_clip": 0.03380539, "auxiliary_loss_mlp": 0.01459449, "balance_loss_clip": 1.62786555, "balance_loss_mlp": 1.18936849, "epoch": 0.00012024650533593868, "flos": 20225931246720.0, "grad_norm": 34.93149751452764, "language_loss": 1.82606053, "learning_rate": 4.4628432569317594e-07, "loss": 1.87446034, "num_input_tokens_seen": 36175, "step": 2, "time_per_iteration": 2.6318798065185547 }, { "auxiliary_loss_clip": 0.03320229, "auxiliary_loss_mlp": 0.01440978, "balance_loss_clip": 1.62577581, "balance_loss_mlp": 1.18882656, "epoch": 0.000180369758003908, "flos": 22309935454080.0, "grad_norm": 32.71870482280511, "language_loss": 1.57573509, "learning_rate": 7.073439208833112e-07, "loss": 1.62334716, "num_input_tokens_seen": 54870, "step": 3, "time_per_iteration": 2.6362481117248535 }, { "auxiliary_loss_clip": 0.03361497, "auxiliary_loss_mlp": 0.01451404, "balance_loss_clip": 1.62418985, "balance_loss_mlp": 1.15500188, "epoch": 0.00024049301067187735, "flos": 22414650577920.0, "grad_norm": 51.2387172839747, "language_loss": 1.67362881, "learning_rate": 8.925686513863519e-07, "loss": 1.72175777, "num_input_tokens_seen": 74575, "step": 4, "time_per_iteration": 2.7070822715759277 }, { "auxiliary_loss_clip": 0.03402497, "auxiliary_loss_mlp": 0.01505358, "balance_loss_clip": 1.62493396, "balance_loss_mlp": 1.21715808, "epoch": 0.0003006162633398467, "flos": 21396978449280.0, "grad_norm": 56.088721215944275, "language_loss": 1.91627169, "learning_rate": 1.0362401141348472e-06, "loss": 1.96535027, "num_input_tokens_seen": 92580, "step": 5, "time_per_iteration": 2.91436767578125 }, { "auxiliary_loss_clip": 0.03370454, "auxiliary_loss_mlp": 0.01515599, "balance_loss_clip": 1.61556244, "balance_loss_mlp": 1.22110426, "epoch": 0.000360739516007816, "flos": 21652375127040.0, "grad_norm": 33.397169652317885, "language_loss": 1.60591149, "learning_rate": 1.153628246576487e-06, "loss": 1.65477204, "num_input_tokens_seen": 109705, "step": 6, "time_per_iteration": 2.994969367980957 }, { "auxiliary_loss_clip": 0.03354239, "auxiliary_loss_mlp": 0.01486417, "balance_loss_clip": 1.61577415, "balance_loss_mlp": 1.20336628, "epoch": 0.0004208627686757854, "flos": 27159742897920.0, "grad_norm": 24.6270766983672, "language_loss": 1.53276002, "learning_rate": 1.2528784983718962e-06, "loss": 1.58116663, "num_input_tokens_seen": 129425, "step": 7, "time_per_iteration": 3.0675876140594482 }, { "auxiliary_loss_clip": 0.03321216, "auxiliary_loss_mlp": 0.0144328, "balance_loss_clip": 1.61205018, "balance_loss_mlp": 1.16499734, "epoch": 0.0004809860213437547, "flos": 31319096135040.0, "grad_norm": 31.71613063643349, "language_loss": 1.43881059, "learning_rate": 1.338852977079528e-06, "loss": 1.48645568, "num_input_tokens_seen": 149210, "step": 8, "time_per_iteration": 3.172358751296997 }, { "auxiliary_loss_clip": 0.03368839, "auxiliary_loss_mlp": 0.01496105, "balance_loss_clip": 1.6120348, "balance_loss_mlp": 1.21229148, "epoch": 0.000541109274011724, "flos": 32160411463680.0, "grad_norm": 28.204490849684397, "language_loss": 1.4969244, "learning_rate": 1.4146878417666224e-06, "loss": 1.54557395, "num_input_tokens_seen": 169055, "step": 9, "time_per_iteration": 3.112215280532837 }, { "auxiliary_loss_clip": 0.03308365, "auxiliary_loss_mlp": 0.01475035, "balance_loss_clip": 1.61541438, "balance_loss_mlp": 1.20647991, "epoch": 0.0006012325266796934, "flos": 18916808163840.0, "grad_norm": 23.420774723604698, "language_loss": 1.44714785, "learning_rate": 1.4825244398280232e-06, "loss": 1.49498188, "num_input_tokens_seen": 188045, "step": 10, "time_per_iteration": 2.9495606422424316 }, { "auxiliary_loss_clip": 0.03364194, "auxiliary_loss_mlp": 0.01494262, "balance_loss_clip": 1.62042511, "balance_loss_mlp": 1.22036684, "epoch": 0.0006613557793476627, "flos": 20774861867520.0, "grad_norm": 18.353281468858004, "language_loss": 1.4520936, "learning_rate": 1.5438901072051983e-06, "loss": 1.50067806, "num_input_tokens_seen": 207035, "step": 11, "time_per_iteration": 3.0797431468963623 }, { "auxiliary_loss_clip": 0.03292683, "auxiliary_loss_mlp": 0.0145154, "balance_loss_clip": 1.60771322, "balance_loss_mlp": 1.17554641, "epoch": 0.000721479032015632, "flos": 16581680997120.0, "grad_norm": 16.61869254675767, "language_loss": 1.45121813, "learning_rate": 1.5999125722696629e-06, "loss": 1.49866033, "num_input_tokens_seen": 223225, "step": 12, "time_per_iteration": 2.9887659549713135 }, { "auxiliary_loss_clip": 0.03321669, "auxiliary_loss_mlp": 0.01405912, "balance_loss_clip": 1.61740756, "balance_loss_mlp": 1.14765704, "epoch": 0.0007816022846836014, "flos": 23805471144960.0, "grad_norm": 14.02187318243825, "language_loss": 1.23759985, "learning_rate": 1.6514482443788434e-06, "loss": 1.28487587, "num_input_tokens_seen": 242570, "step": 13, "time_per_iteration": 3.032742977142334 }, { "auxiliary_loss_clip": 0.03287474, "auxiliary_loss_mlp": 0.01470749, "balance_loss_clip": 1.61299658, "balance_loss_mlp": 1.20257616, "epoch": 0.0008417255373515708, "flos": 19172204841600.0, "grad_norm": 5.790568956401358, "language_loss": 1.20684385, "learning_rate": 1.6991628240650723e-06, "loss": 1.254426, "num_input_tokens_seen": 261215, "step": 14, "time_per_iteration": 3.002887487411499 }, { "auxiliary_loss_clip": 0.03272826, "auxiliary_loss_mlp": 0.01431255, "balance_loss_clip": 1.6181426, "balance_loss_mlp": 1.16804111, "epoch": 0.00090184879001954, "flos": 26395564026240.0, "grad_norm": 6.353887091300461, "language_loss": 1.12925518, "learning_rate": 1.7435840350181584e-06, "loss": 1.176296, "num_input_tokens_seen": 280035, "step": 15, "time_per_iteration": 3.0238780975341797 }, { "auxiliary_loss_clip": 0.03238489, "auxiliary_loss_mlp": 0.01411651, "balance_loss_clip": 1.60288334, "balance_loss_mlp": 1.16197944, "epoch": 0.0009619720426875094, "flos": 24679500785280.0, "grad_norm": 4.670310144758637, "language_loss": 1.11125767, "learning_rate": 1.7851373027727038e-06, "loss": 1.15775907, "num_input_tokens_seen": 300265, "step": 16, "time_per_iteration": 4.605847120285034 }, { "auxiliary_loss_clip": 0.03223993, "auxiliary_loss_mlp": 0.01417304, "balance_loss_clip": 1.60910368, "balance_loss_mlp": 1.17774093, "epoch": 0.0010220952953554788, "flos": 18624531196800.0, "grad_norm": 8.838429022323517, "language_loss": 1.12645221, "learning_rate": 1.8241705979033208e-06, "loss": 1.17286515, "num_input_tokens_seen": 317375, "step": 17, "time_per_iteration": 4.579033851623535 }, { "auxiliary_loss_clip": 0.03161492, "auxiliary_loss_mlp": 0.01379312, "balance_loss_clip": 1.60685277, "balance_loss_mlp": 1.1475693, "epoch": 0.001082218548023448, "flos": 26142537646080.0, "grad_norm": 3.823557061532633, "language_loss": 1.08069181, "learning_rate": 1.860972167459798e-06, "loss": 1.12609982, "num_input_tokens_seen": 337975, "step": 18, "time_per_iteration": 3.0132579803466797 }, { "auxiliary_loss_clip": 0.0318761, "auxiliary_loss_mlp": 0.01403306, "balance_loss_clip": 1.60585093, "balance_loss_mlp": 1.13799417, "epoch": 0.0011423418006914173, "flos": 19609776322560.0, "grad_norm": 4.403621106373983, "language_loss": 1.02445412, "learning_rate": 1.89578346593066e-06, "loss": 1.07036328, "num_input_tokens_seen": 356635, "step": 19, "time_per_iteration": 3.016176462173462 }, { "auxiliary_loss_clip": 0.0313029, "auxiliary_loss_mlp": 0.01342049, "balance_loss_clip": 1.60759044, "balance_loss_mlp": 1.12155962, "epoch": 0.0012024650533593868, "flos": 17895365107200.0, "grad_norm": 3.958333686933058, "language_loss": 1.16706228, "learning_rate": 1.928808765521199e-06, "loss": 1.21178555, "num_input_tokens_seen": 375625, "step": 20, "time_per_iteration": 3.0274486541748047 }, { "auxiliary_loss_clip": 0.03118109, "auxiliary_loss_mlp": 0.01378536, "balance_loss_clip": 1.58886433, "balance_loss_mlp": 1.1298182, "epoch": 0.001262588306027356, "flos": 21252043071360.0, "grad_norm": 4.333519066420982, "language_loss": 1.06129968, "learning_rate": 1.9602224192552076e-06, "loss": 1.10626626, "num_input_tokens_seen": 394350, "step": 21, "time_per_iteration": 2.9418578147888184 }, { "auxiliary_loss_clip": 0.03013912, "auxiliary_loss_mlp": 0.0137937, "balance_loss_clip": 1.57028937, "balance_loss_mlp": 1.14552903, "epoch": 0.0013227115586953253, "flos": 26104077158400.0, "grad_norm": 3.63841390311849, "language_loss": 1.05861485, "learning_rate": 1.9901744328983746e-06, "loss": 1.10254765, "num_input_tokens_seen": 413255, "step": 22, "time_per_iteration": 2.9651288986206055 }, { "auxiliary_loss_clip": 0.02966296, "auxiliary_loss_mlp": 0.01334065, "balance_loss_clip": 1.57175612, "balance_loss_mlp": 1.12377954, "epoch": 0.0013828348113632948, "flos": 23951376190080.0, "grad_norm": 2.8746130742538347, "language_loss": 0.9177655, "learning_rate": 2.018794797290208e-06, "loss": 0.96076906, "num_input_tokens_seen": 433065, "step": 23, "time_per_iteration": 3.049853563308716 }, { "auxiliary_loss_clip": 0.02932793, "auxiliary_loss_mlp": 0.01362183, "balance_loss_clip": 1.56404662, "balance_loss_mlp": 1.14236116, "epoch": 0.001442958064031264, "flos": 15959851724160.0, "grad_norm": 3.0897201135857735, "language_loss": 1.08192635, "learning_rate": 2.046196897962839e-06, "loss": 1.12487614, "num_input_tokens_seen": 451175, "step": 24, "time_per_iteration": 3.0543172359466553 }, { "auxiliary_loss_clip": 0.02823838, "auxiliary_loss_mlp": 0.01329007, "balance_loss_clip": 1.55692792, "balance_loss_mlp": 1.11853111, "epoch": 0.0015030813166992333, "flos": 18108350801280.0, "grad_norm": 4.111246686692462, "language_loss": 1.01367807, "learning_rate": 2.0724802282696944e-06, "loss": 1.05520654, "num_input_tokens_seen": 468775, "step": 25, "time_per_iteration": 3.0059614181518555 }, { "auxiliary_loss_clip": 0.02818207, "auxiliary_loss_mlp": 0.01309454, "balance_loss_clip": 1.55974329, "balance_loss_mlp": 1.10012197, "epoch": 0.0015632045693672028, "flos": 22234558763520.0, "grad_norm": 2.7163042439620018, "language_loss": 1.0669204, "learning_rate": 2.0977325700720194e-06, "loss": 1.10819697, "num_input_tokens_seen": 488530, "step": 26, "time_per_iteration": 3.1159534454345703 }, { "auxiliary_loss_clip": 0.0276047, "auxiliary_loss_mlp": 0.01325034, "balance_loss_clip": 1.54973662, "balance_loss_mlp": 1.12533486, "epoch": 0.001623327822035172, "flos": 23991955580160.0, "grad_norm": 2.562596284241794, "language_loss": 0.95537072, "learning_rate": 2.122031762649933e-06, "loss": 0.99622583, "num_input_tokens_seen": 510495, "step": 27, "time_per_iteration": 3.018643617630005 }, { "auxiliary_loss_clip": 0.02736222, "auxiliary_loss_mlp": 0.01311707, "balance_loss_clip": 1.55399776, "balance_loss_mlp": 1.13089037, "epoch": 0.0016834510747031415, "flos": 19677647070720.0, "grad_norm": 2.42975125432869, "language_loss": 1.06393945, "learning_rate": 2.1454471497582483e-06, "loss": 1.10441875, "num_input_tokens_seen": 528605, "step": 28, "time_per_iteration": 2.9263083934783936 }, { "auxiliary_loss_clip": 0.0270011, "auxiliary_loss_mlp": 0.0131913, "balance_loss_clip": 1.53841436, "balance_loss_mlp": 1.13297284, "epoch": 0.0017435743273711108, "flos": 20923819568640.0, "grad_norm": 4.42090805909513, "language_loss": 1.02493238, "learning_rate": 2.1680407726407727e-06, "loss": 1.06512475, "num_input_tokens_seen": 548515, "step": 29, "time_per_iteration": 3.0062997341156006 }, { "auxiliary_loss_clip": 0.0269246, "auxiliary_loss_mlp": 0.01312758, "balance_loss_clip": 1.53459728, "balance_loss_mlp": 1.12631428, "epoch": 0.00180369758003908, "flos": 19528976678400.0, "grad_norm": 3.1534114534186446, "language_loss": 1.19265521, "learning_rate": 2.189868360711334e-06, "loss": 1.23270726, "num_input_tokens_seen": 564025, "step": 30, "time_per_iteration": 2.931145429611206 }, { "auxiliary_loss_clip": 0.02610377, "auxiliary_loss_mlp": 0.01337183, "balance_loss_clip": 1.52116311, "balance_loss_mlp": 1.15665221, "epoch": 0.0018638208327070496, "flos": 27453169100160.0, "grad_norm": 2.735994596991484, "language_loss": 1.02616811, "learning_rate": 2.2109801597326265e-06, "loss": 1.06564379, "num_input_tokens_seen": 583345, "step": 31, "time_per_iteration": 2.993251085281372 }, { "auxiliary_loss_clip": 0.02582044, "auxiliary_loss_mlp": 0.01331305, "balance_loss_clip": 1.522609, "balance_loss_mlp": 1.15163302, "epoch": 0.0019239440853750188, "flos": 13589460380160.0, "grad_norm": 3.9056907796043654, "language_loss": 0.95266509, "learning_rate": 2.2314216284658796e-06, "loss": 0.99179864, "num_input_tokens_seen": 600010, "step": 32, "time_per_iteration": 2.9459571838378906 }, { "auxiliary_loss_clip": 0.02564836, "auxiliary_loss_mlp": 0.01302659, "balance_loss_clip": 1.51811624, "balance_loss_mlp": 1.13586164, "epoch": 0.001984067338042988, "flos": 11253866336640.0, "grad_norm": 3.226486022987097, "language_loss": 0.95143497, "learning_rate": 2.2512340280885094e-06, "loss": 0.99010992, "num_input_tokens_seen": 616295, "step": 33, "time_per_iteration": 2.9855570793151855 }, { "auxiliary_loss_clip": 0.02421202, "auxiliary_loss_mlp": 0.01304214, "balance_loss_clip": 1.48474145, "balance_loss_mlp": 1.14676213, "epoch": 0.0020441905907109576, "flos": 22386245898240.0, "grad_norm": 2.1714659525821247, "language_loss": 0.91547924, "learning_rate": 2.270454923596497e-06, "loss": 0.9527334, "num_input_tokens_seen": 637640, "step": 34, "time_per_iteration": 2.981541872024536 }, { "auxiliary_loss_clip": 0.02375249, "auxiliary_loss_mlp": 0.01271963, "balance_loss_clip": 1.45095515, "balance_loss_mlp": 1.11689591, "epoch": 0.0021043138433789266, "flos": 49778580337920.0, "grad_norm": 2.2635429103650386, "language_loss": 0.76603377, "learning_rate": 2.2891186125067434e-06, "loss": 0.80250585, "num_input_tokens_seen": 659710, "step": 35, "time_per_iteration": 3.2267208099365234 }, { "auxiliary_loss_clip": 0.02347187, "auxiliary_loss_mlp": 0.01276388, "balance_loss_clip": 1.46356034, "balance_loss_mlp": 1.13238275, "epoch": 0.002164437096046896, "flos": 20557961591040.0, "grad_norm": 2.3605884715298506, "language_loss": 0.88713098, "learning_rate": 2.307256493152974e-06, "loss": 0.92336679, "num_input_tokens_seen": 679670, "step": 36, "time_per_iteration": 2.948162078857422 }, { "auxiliary_loss_clip": 0.02289192, "auxiliary_loss_mlp": 0.01338204, "balance_loss_clip": 1.45043015, "balance_loss_mlp": 1.19105196, "epoch": 0.0022245603487148656, "flos": 26542295084160.0, "grad_norm": 2.4929063351918166, "language_loss": 0.93038809, "learning_rate": 2.3248973825097614e-06, "loss": 0.96666199, "num_input_tokens_seen": 700170, "step": 37, "time_per_iteration": 2.9556422233581543 }, { "auxiliary_loss_clip": 0.02249098, "auxiliary_loss_mlp": 0.01276785, "balance_loss_clip": 1.44485605, "balance_loss_mlp": 1.15500069, "epoch": 0.0022846836013828346, "flos": 20338188226560.0, "grad_norm": 2.177909778954084, "language_loss": 1.03952074, "learning_rate": 2.3420677916238357e-06, "loss": 1.07477951, "num_input_tokens_seen": 718545, "step": 38, "time_per_iteration": 2.9959065914154053 }, { "auxiliary_loss_clip": 0.02216028, "auxiliary_loss_mlp": 0.01260768, "balance_loss_clip": 1.43807542, "balance_loss_mlp": 1.13726676, "epoch": 0.002344806854050804, "flos": 26247575992320.0, "grad_norm": 2.22652515093943, "language_loss": 0.85297108, "learning_rate": 2.358792165262154e-06, "loss": 0.887739, "num_input_tokens_seen": 739865, "step": 39, "time_per_iteration": 3.035399913787842 }, { "auxiliary_loss_clip": 0.02192275, "auxiliary_loss_mlp": 0.01250434, "balance_loss_clip": 1.4289664, "balance_loss_mlp": 1.12216496, "epoch": 0.0024049301067187736, "flos": 11801539981440.0, "grad_norm": 3.258228308703562, "language_loss": 0.90279335, "learning_rate": 2.3750930912143747e-06, "loss": 0.93722045, "num_input_tokens_seen": 755770, "step": 40, "time_per_iteration": 3.060368299484253 }, { "auxiliary_loss_clip": 0.02142113, "auxiliary_loss_mlp": 0.01273783, "balance_loss_clip": 1.41895449, "balance_loss_mlp": 1.16086745, "epoch": 0.0024650533593867426, "flos": 20631506688000.0, "grad_norm": 3.245861029799582, "language_loss": 0.93271625, "learning_rate": 2.3909914837471044e-06, "loss": 0.9668752, "num_input_tokens_seen": 773440, "step": 41, "time_per_iteration": 2.9518353939056396 }, { "auxiliary_loss_clip": 0.02105753, "auxiliary_loss_mlp": 0.01254821, "balance_loss_clip": 1.41097844, "balance_loss_mlp": 1.15168142, "epoch": 0.002525176612054712, "flos": 18406122549120.0, "grad_norm": 3.3039479788253536, "language_loss": 0.97533798, "learning_rate": 2.4065067449483835e-06, "loss": 1.0089438, "num_input_tokens_seen": 790455, "step": 42, "time_per_iteration": 2.933177947998047 }, { "auxiliary_loss_clip": 0.020675, "auxiliary_loss_mlp": 0.01298422, "balance_loss_clip": 1.41198874, "balance_loss_mlp": 1.19189644, "epoch": 0.0025852998647226816, "flos": 28184023128960.0, "grad_norm": 3.15071165872949, "language_loss": 0.97562659, "learning_rate": 2.4216569070848724e-06, "loss": 1.00928593, "num_input_tokens_seen": 810645, "step": 43, "time_per_iteration": 2.9760589599609375 }, { "auxiliary_loss_clip": 0.02086351, "auxiliary_loss_mlp": 0.01314601, "balance_loss_clip": 1.41042757, "balance_loss_mlp": 1.20283043, "epoch": 0.0026454231173906506, "flos": 14283110897280.0, "grad_norm": 2.3612650137146574, "language_loss": 0.93435001, "learning_rate": 2.4364587585915504e-06, "loss": 0.96835947, "num_input_tokens_seen": 827470, "step": 44, "time_per_iteration": 2.9239895343780518 }, { "auxiliary_loss_clip": 0.02043996, "auxiliary_loss_mlp": 0.01272131, "balance_loss_clip": 1.40557313, "balance_loss_mlp": 1.17399764, "epoch": 0.00270554637005862, "flos": 22419211605120.0, "grad_norm": 2.1476860292916644, "language_loss": 0.98677421, "learning_rate": 2.450927955901469e-06, "loss": 1.01993537, "num_input_tokens_seen": 847285, "step": 45, "time_per_iteration": 2.9626305103302 }, { "auxiliary_loss_clip": 0.02018804, "auxiliary_loss_mlp": 0.01228873, "balance_loss_clip": 1.39126372, "balance_loss_mlp": 1.14208817, "epoch": 0.0027656696227265896, "flos": 23985778440960.0, "grad_norm": 1.8862192248435494, "language_loss": 1.02800822, "learning_rate": 2.465079122983384e-06, "loss": 1.06048501, "num_input_tokens_seen": 867545, "step": 46, "time_per_iteration": 2.9913573265075684 }, { "auxiliary_loss_clip": 0.0198766, "auxiliary_loss_mlp": 0.01272862, "balance_loss_clip": 1.38388658, "balance_loss_mlp": 1.182549, "epoch": 0.0028257928753945586, "flos": 37669503087360.0, "grad_norm": 2.1076645953887696, "language_loss": 0.87839413, "learning_rate": 2.4789259401737868e-06, "loss": 0.9109993, "num_input_tokens_seen": 889915, "step": 47, "time_per_iteration": 3.0189881324768066 }, { "auxiliary_loss_clip": 0.01949271, "auxiliary_loss_mlp": 0.01255947, "balance_loss_clip": 1.37360096, "balance_loss_mlp": 1.16963911, "epoch": 0.002885916128062528, "flos": 22454547609600.0, "grad_norm": 4.4561049138068, "language_loss": 0.87809587, "learning_rate": 2.492481223656015e-06, "loss": 0.91014802, "num_input_tokens_seen": 908975, "step": 48, "time_per_iteration": 2.863565444946289 }, { "auxiliary_loss_clip": 0.01949016, "auxiliary_loss_mlp": 0.0124182, "balance_loss_clip": 1.36337733, "balance_loss_mlp": 1.15069616, "epoch": 0.0029460393807304976, "flos": 27012796358400.0, "grad_norm": 2.9451035624229855, "language_loss": 0.89691317, "learning_rate": 2.5057569967437924e-06, "loss": 0.9288215, "num_input_tokens_seen": 929810, "step": 49, "time_per_iteration": 2.9967453479766846 }, { "auxiliary_loss_clip": 0.0194038, "auxiliary_loss_mlp": 0.01234077, "balance_loss_clip": 1.35742152, "balance_loss_mlp": 1.14996314, "epoch": 0.0030061626333984666, "flos": 15851832549120.0, "grad_norm": 3.162716210197168, "language_loss": 0.90914285, "learning_rate": 2.51876455396287e-06, "loss": 0.94088745, "num_input_tokens_seen": 948650, "step": 50, "time_per_iteration": 2.8832523822784424 }, { "auxiliary_loss_clip": 0.01938537, "auxiliary_loss_mlp": 0.01199505, "balance_loss_clip": 1.36240602, "balance_loss_mlp": 1.11844242, "epoch": 0.003066285886066436, "flos": 31827052316160.0, "grad_norm": 6.098010360158733, "language_loss": 0.86977792, "learning_rate": 2.5315145187866316e-06, "loss": 0.90115827, "num_input_tokens_seen": 966455, "step": 51, "time_per_iteration": 2.9061717987060547 }, { "auxiliary_loss_clip": 0.01895637, "auxiliary_loss_mlp": 0.01206588, "balance_loss_clip": 1.35252357, "balance_loss_mlp": 1.12829173, "epoch": 0.0031264091387344056, "flos": 41427482774400.0, "grad_norm": 2.043292881862276, "language_loss": 0.95171362, "learning_rate": 2.5440168957651953e-06, "loss": 0.98273587, "num_input_tokens_seen": 988110, "step": 52, "time_per_iteration": 3.0266616344451904 }, { "auxiliary_loss_clip": 0.01893195, "auxiliary_loss_mlp": 0.01241159, "balance_loss_clip": 1.34894896, "balance_loss_mlp": 1.16162264, "epoch": 0.0031865323914023747, "flos": 23440941970560.0, "grad_norm": 4.2358840345824635, "language_loss": 0.92323011, "learning_rate": 2.5562811176888872e-06, "loss": 0.95457363, "num_input_tokens_seen": 1008550, "step": 53, "time_per_iteration": 2.8850226402282715 }, { "auxiliary_loss_clip": 0.01882736, "auxiliary_loss_mlp": 0.01197045, "balance_loss_clip": 1.35264134, "balance_loss_mlp": 1.11669779, "epoch": 0.003246655644070344, "flos": 14429195510400.0, "grad_norm": 2.290226623360683, "language_loss": 0.8260113, "learning_rate": 2.5683160883431093e-06, "loss": 0.85680908, "num_input_tokens_seen": 1026840, "step": 54, "time_per_iteration": 2.9433553218841553 }, { "auxiliary_loss_clip": 0.01880073, "auxiliary_loss_mlp": 0.01210775, "balance_loss_clip": 1.34162152, "balance_loss_mlp": 1.13233542, "epoch": 0.0033067788967383136, "flos": 35918247496320.0, "grad_norm": 2.911577423572303, "language_loss": 0.81303245, "learning_rate": 2.580130221340046e-06, "loss": 0.84394085, "num_input_tokens_seen": 1048875, "step": 55, "time_per_iteration": 3.0040643215179443 }, { "auxiliary_loss_clip": 0.01870075, "auxiliary_loss_mlp": 0.0120375, "balance_loss_clip": 1.33644819, "balance_loss_mlp": 1.12521541, "epoch": 0.003366902149406283, "flos": 22958732862720.0, "grad_norm": 2.639118679342801, "language_loss": 0.87089968, "learning_rate": 2.5917314754514246e-06, "loss": 0.90163803, "num_input_tokens_seen": 1066435, "step": 56, "time_per_iteration": 2.830453395843506 }, { "auxiliary_loss_clip": 0.01869912, "auxiliary_loss_mlp": 0.01161425, "balance_loss_clip": 1.32921791, "balance_loss_mlp": 1.08851671, "epoch": 0.003427025402074252, "flos": 26582838560640.0, "grad_norm": 2.101574700040827, "language_loss": 0.92890096, "learning_rate": 2.6031273868139713e-06, "loss": 0.95921433, "num_input_tokens_seen": 1090330, "step": 57, "time_per_iteration": 7.0071024894714355 }, { "auxiliary_loss_clip": 0.01833802, "auxiliary_loss_mlp": 0.0121675, "balance_loss_clip": 1.33333457, "balance_loss_mlp": 1.14493799, "epoch": 0.0034871486547422216, "flos": 23951196622080.0, "grad_norm": 14.610065921505914, "language_loss": 0.9972856, "learning_rate": 2.614325098333948e-06, "loss": 1.02779114, "num_input_tokens_seen": 1109840, "step": 58, "time_per_iteration": 2.830960273742676 }, { "auxiliary_loss_clip": 0.0181804, "auxiliary_loss_mlp": 0.01199311, "balance_loss_clip": 1.32073379, "balance_loss_mlp": 1.12835753, "epoch": 0.003547271907410191, "flos": 21214983214080.0, "grad_norm": 2.120622270947527, "language_loss": 0.88172519, "learning_rate": 2.625331386578098e-06, "loss": 0.91189873, "num_input_tokens_seen": 1128415, "step": 59, "time_per_iteration": 2.8507089614868164 }, { "auxiliary_loss_clip": 0.01839573, "auxiliary_loss_mlp": 0.01163328, "balance_loss_clip": 1.32924581, "balance_loss_mlp": 1.09075332, "epoch": 0.00360739516007816, "flos": 16504903676160.0, "grad_norm": 2.021991994360373, "language_loss": 0.93542433, "learning_rate": 2.63615268640451e-06, "loss": 0.96545339, "num_input_tokens_seen": 1146515, "step": 60, "time_per_iteration": 2.8517534732818604 }, { "auxiliary_loss_clip": 0.0181893, "auxiliary_loss_mlp": 0.01176948, "balance_loss_clip": 1.31414318, "balance_loss_mlp": 1.10923755, "epoch": 0.0036675184127461296, "flos": 19464805031040.0, "grad_norm": 2.908283338489548, "language_loss": 0.90021706, "learning_rate": 2.6467951135575943e-06, "loss": 0.9301759, "num_input_tokens_seen": 1166330, "step": 61, "time_per_iteration": 2.8853390216827393 }, { "auxiliary_loss_clip": 0.01803943, "auxiliary_loss_mlp": 0.01142904, "balance_loss_clip": 1.31131864, "balance_loss_mlp": 1.07581341, "epoch": 0.003727641665414099, "flos": 20957323979520.0, "grad_norm": 1.8428161811646855, "language_loss": 0.88479733, "learning_rate": 2.657264485425803e-06, "loss": 0.91426575, "num_input_tokens_seen": 1186010, "step": 62, "time_per_iteration": 2.8860812187194824 }, { "auxiliary_loss_clip": 0.01785338, "auxiliary_loss_mlp": 0.0116457, "balance_loss_clip": 1.30233741, "balance_loss_mlp": 1.09504724, "epoch": 0.003787764918082068, "flos": 18406050721920.0, "grad_norm": 2.4385306002926512, "language_loss": 0.96280968, "learning_rate": 2.6675663401385186e-06, "loss": 0.99230874, "num_input_tokens_seen": 1204985, "step": 63, "time_per_iteration": 2.9081404209136963 }, { "auxiliary_loss_clip": 0.01795068, "auxiliary_loss_mlp": 0.01171321, "balance_loss_clip": 1.31071985, "balance_loss_mlp": 1.10499322, "epoch": 0.0038478881707500376, "flos": 12459243962880.0, "grad_norm": 3.0781639748926697, "language_loss": 0.98840165, "learning_rate": 2.677705954159056e-06, "loss": 1.01806557, "num_input_tokens_seen": 1223545, "step": 64, "time_per_iteration": 2.893603801727295 }, { "auxiliary_loss_clip": 0.01801311, "auxiliary_loss_mlp": 0.01151112, "balance_loss_clip": 1.30960393, "balance_loss_mlp": 1.08368695, "epoch": 0.003908011423418007, "flos": 13553334276480.0, "grad_norm": 2.4813676281781554, "language_loss": 0.85397774, "learning_rate": 2.6876883585136904e-06, "loss": 0.88350195, "num_input_tokens_seen": 1241175, "step": 65, "time_per_iteration": 2.8768796920776367 }, { "auxiliary_loss_clip": 0.01777474, "auxiliary_loss_mlp": 0.01155217, "balance_loss_clip": 1.29563761, "balance_loss_mlp": 1.087888, "epoch": 0.003968134676085976, "flos": 18333475292160.0, "grad_norm": 1.8550079005121831, "language_loss": 0.85281348, "learning_rate": 2.697518353781685e-06, "loss": 0.88214046, "num_input_tokens_seen": 1259315, "step": 66, "time_per_iteration": 2.769274950027466 }, { "auxiliary_loss_clip": 0.01779987, "auxiliary_loss_mlp": 0.01151372, "balance_loss_clip": 1.29312515, "balance_loss_mlp": 1.07650828, "epoch": 0.004028257928753946, "flos": 20485242506880.0, "grad_norm": 2.74895944689593, "language_loss": 0.96567476, "learning_rate": 2.7072005239581103e-06, "loss": 0.99498826, "num_input_tokens_seen": 1277055, "step": 67, "time_per_iteration": 2.889369249343872 }, { "auxiliary_loss_clip": 0.01752442, "auxiliary_loss_mlp": 0.01152779, "balance_loss_clip": 1.28765118, "balance_loss_mlp": 1.08120584, "epoch": 0.004088381181421915, "flos": 18843837684480.0, "grad_norm": 2.109359538419204, "language_loss": 0.94516367, "learning_rate": 2.7167392492896727e-06, "loss": 0.97421581, "num_input_tokens_seen": 1294355, "step": 68, "time_per_iteration": 2.8107409477233887 }, { "auxiliary_loss_clip": 0.01747204, "auxiliary_loss_mlp": 0.0115424, "balance_loss_clip": 1.28511512, "balance_loss_mlp": 1.08476448, "epoch": 0.004148504434089885, "flos": 19427817000960.0, "grad_norm": 2.2931216646069092, "language_loss": 0.96014255, "learning_rate": 2.7261387181735195e-06, "loss": 0.98915702, "num_input_tokens_seen": 1313525, "step": 69, "time_per_iteration": 2.8138387203216553 }, { "auxiliary_loss_clip": 0.01741342, "auxiliary_loss_mlp": 0.01160375, "balance_loss_clip": 1.28807163, "balance_loss_mlp": 1.09581161, "epoch": 0.004208627686757853, "flos": 20811023884800.0, "grad_norm": 2.1764096137707494, "language_loss": 0.98070192, "learning_rate": 2.7354029381999196e-06, "loss": 1.00971913, "num_input_tokens_seen": 1330505, "step": 70, "time_per_iteration": 2.8319084644317627 }, { "auxiliary_loss_clip": 0.0174721, "auxiliary_loss_mlp": 0.01145619, "balance_loss_clip": 1.27791202, "balance_loss_mlp": 1.07685876, "epoch": 0.004268750939425823, "flos": 19098623831040.0, "grad_norm": 2.9300158782571324, "language_loss": 0.94016141, "learning_rate": 2.7445357464116983e-06, "loss": 0.96908975, "num_input_tokens_seen": 1349615, "step": 71, "time_per_iteration": 2.8469433784484863 }, { "auxiliary_loss_clip": 0.01815227, "auxiliary_loss_mlp": 0.01294388, "balance_loss_clip": 1.43495834, "balance_loss_mlp": 1.25490558, "epoch": 0.004328874192093792, "flos": 52439635514880.0, "grad_norm": 2.409331683106634, "language_loss": 0.65682542, "learning_rate": 2.75354081884615e-06, "loss": 0.68792164, "num_input_tokens_seen": 1410275, "step": 72, "time_per_iteration": 3.2019593715667725 }, { "auxiliary_loss_clip": 0.01799527, "auxiliary_loss_mlp": 0.01271558, "balance_loss_clip": 1.43197393, "balance_loss_mlp": 1.2316941, "epoch": 0.004388997444761762, "flos": 66473239564800.0, "grad_norm": 2.25068040880696, "language_loss": 0.63694263, "learning_rate": 2.7624216794188286e-06, "loss": 0.66765356, "num_input_tokens_seen": 1473020, "step": 73, "time_per_iteration": 3.3545596599578857 }, { "auxiliary_loss_clip": 0.01720805, "auxiliary_loss_mlp": 0.01140553, "balance_loss_clip": 1.26912856, "balance_loss_mlp": 1.07279444, "epoch": 0.004449120697429731, "flos": 18952970181120.0, "grad_norm": 2.554977860093902, "language_loss": 0.86212188, "learning_rate": 2.771181708202938e-06, "loss": 0.89073551, "num_input_tokens_seen": 1490385, "step": 74, "time_per_iteration": 2.823498487472534 }, { "auxiliary_loss_clip": 0.0172287, "auxiliary_loss_mlp": 0.01162493, "balance_loss_clip": 1.26811171, "balance_loss_mlp": 1.09344697, "epoch": 0.004509243950097701, "flos": 21105491581440.0, "grad_norm": 3.0087618017840105, "language_loss": 0.97196102, "learning_rate": 2.779824149153005e-06, "loss": 1.00081468, "num_input_tokens_seen": 1509725, "step": 75, "time_per_iteration": 2.888415575027466 }, { "auxiliary_loss_clip": 0.0170198, "auxiliary_loss_mlp": 0.01142315, "balance_loss_clip": 1.26420689, "balance_loss_mlp": 1.07608271, "epoch": 0.004569367202765669, "flos": 20698730991360.0, "grad_norm": 2.6610382542709043, "language_loss": 0.87740695, "learning_rate": 2.788352117317012e-06, "loss": 0.90584993, "num_input_tokens_seen": 1527245, "step": 76, "time_per_iteration": 2.9226863384246826 }, { "auxiliary_loss_clip": 0.01702512, "auxiliary_loss_mlp": 0.01145374, "balance_loss_clip": 1.26239479, "balance_loss_mlp": 1.07656646, "epoch": 0.004629490455433639, "flos": 28658474899200.0, "grad_norm": 2.4272090643104574, "language_loss": 0.91791159, "learning_rate": 2.796768605577095e-06, "loss": 0.94639051, "num_input_tokens_seen": 1548930, "step": 77, "time_per_iteration": 2.8720929622650146 }, { "auxiliary_loss_clip": 0.01693018, "auxiliary_loss_mlp": 0.01165978, "balance_loss_clip": 1.26398146, "balance_loss_mlp": 1.09569168, "epoch": 0.004689613708101608, "flos": 11072409805440.0, "grad_norm": 2.2822185142383034, "language_loss": 0.9211635, "learning_rate": 2.80507649095533e-06, "loss": 0.94975346, "num_input_tokens_seen": 1565695, "step": 78, "time_per_iteration": 2.7832391262054443 }, { "auxiliary_loss_clip": 0.01689271, "auxiliary_loss_mlp": 0.01153255, "balance_loss_clip": 1.25836253, "balance_loss_mlp": 1.08482933, "epoch": 0.004749736960769578, "flos": 21799106184960.0, "grad_norm": 2.263191265943929, "language_loss": 0.82771945, "learning_rate": 2.813278540517843e-06, "loss": 0.85614467, "num_input_tokens_seen": 1582625, "step": 79, "time_per_iteration": 2.7723355293273926 }, { "auxiliary_loss_clip": 0.01702468, "auxiliary_loss_mlp": 0.01130708, "balance_loss_clip": 1.26147008, "balance_loss_mlp": 1.0609467, "epoch": 0.004809860213437547, "flos": 19792597570560.0, "grad_norm": 1.9992491725405546, "language_loss": 0.91272199, "learning_rate": 2.8213774169075505e-06, "loss": 0.94105375, "num_input_tokens_seen": 1601725, "step": 80, "time_per_iteration": 2.742046356201172 }, { "auxiliary_loss_clip": 0.01671156, "auxiliary_loss_mlp": 0.01144048, "balance_loss_clip": 1.25365841, "balance_loss_mlp": 1.07371473, "epoch": 0.004869983466105517, "flos": 26574327037440.0, "grad_norm": 2.0371265012476742, "language_loss": 0.95241439, "learning_rate": 2.829375683533245e-06, "loss": 0.9805665, "num_input_tokens_seen": 1622420, "step": 81, "time_per_iteration": 2.8996386528015137 }, { "auxiliary_loss_clip": 0.01686092, "auxiliary_loss_mlp": 0.01147828, "balance_loss_clip": 1.25779653, "balance_loss_mlp": 1.08149946, "epoch": 0.004930106718773485, "flos": 12823378087680.0, "grad_norm": 2.9441337112970296, "language_loss": 0.96288472, "learning_rate": 2.8372758094402803e-06, "loss": 0.99122393, "num_input_tokens_seen": 1640715, "step": 82, "time_per_iteration": 2.819120407104492 }, { "auxiliary_loss_clip": 0.01668255, "auxiliary_loss_mlp": 0.01156428, "balance_loss_clip": 1.2461338, "balance_loss_mlp": 1.08709574, "epoch": 0.004990229971441455, "flos": 25774919902080.0, "grad_norm": 2.6601797838877856, "language_loss": 0.86762071, "learning_rate": 2.84508017388607e-06, "loss": 0.89586747, "num_input_tokens_seen": 1662210, "step": 83, "time_per_iteration": 2.7959344387054443 }, { "auxiliary_loss_clip": 0.01662665, "auxiliary_loss_mlp": 0.01154043, "balance_loss_clip": 1.24844718, "balance_loss_mlp": 1.084234, "epoch": 0.005050353224109424, "flos": 17457254922240.0, "grad_norm": 2.5416281292503986, "language_loss": 0.92081314, "learning_rate": 2.852791070641559e-06, "loss": 0.94898021, "num_input_tokens_seen": 1681070, "step": 84, "time_per_iteration": 2.7176246643066406 }, { "auxiliary_loss_clip": 0.01647627, "auxiliary_loss_mlp": 0.01154949, "balance_loss_clip": 1.36429358, "balance_loss_mlp": 1.11527622, "epoch": 0.005110476476777394, "flos": 69805460367360.0, "grad_norm": 1.4023430227621099, "language_loss": 0.6252538, "learning_rate": 2.8604107120381682e-06, "loss": 0.65327954, "num_input_tokens_seen": 1747140, "step": 85, "time_per_iteration": 3.296835422515869 }, { "auxiliary_loss_clip": 0.01649469, "auxiliary_loss_mlp": 0.0112642, "balance_loss_clip": 1.23797417, "balance_loss_mlp": 1.05642033, "epoch": 0.005170599729445363, "flos": 24790105739520.0, "grad_norm": 1.805253124779358, "language_loss": 0.90709531, "learning_rate": 2.8679412327780482e-06, "loss": 0.93485421, "num_input_tokens_seen": 1767475, "step": 86, "time_per_iteration": 2.761484146118164 }, { "auxiliary_loss_clip": 0.01653351, "auxiliary_loss_mlp": 0.01158608, "balance_loss_clip": 1.24437881, "balance_loss_mlp": 1.08741617, "epoch": 0.005230722982113333, "flos": 23258048895360.0, "grad_norm": 2.3398213465495776, "language_loss": 0.81961077, "learning_rate": 2.8753846935240833e-06, "loss": 0.8477304, "num_input_tokens_seen": 1784980, "step": 87, "time_per_iteration": 2.763185739517212 }, { "auxiliary_loss_clip": 0.01641581, "auxiliary_loss_mlp": 0.01152623, "balance_loss_clip": 1.24129367, "balance_loss_mlp": 1.08457828, "epoch": 0.005290846234781301, "flos": 16727909264640.0, "grad_norm": 3.1951080427559857, "language_loss": 0.95790672, "learning_rate": 2.8827430842847267e-06, "loss": 0.98584872, "num_input_tokens_seen": 1803030, "step": 88, "time_per_iteration": 2.7855517864227295 }, { "auxiliary_loss_clip": 0.01658657, "auxiliary_loss_mlp": 0.01147064, "balance_loss_clip": 1.24130976, "balance_loss_mlp": 1.07978201, "epoch": 0.005350969487449271, "flos": 20886077352960.0, "grad_norm": 3.405407923072192, "language_loss": 0.86023164, "learning_rate": 2.8900183276075957e-06, "loss": 0.88828892, "num_input_tokens_seen": 1822865, "step": 89, "time_per_iteration": 2.7517924308776855 }, { "auxiliary_loss_clip": 0.01647446, "auxiliary_loss_mlp": 0.01133456, "balance_loss_clip": 1.23541856, "balance_loss_mlp": 1.06727123, "epoch": 0.00541109274011724, "flos": 26209977431040.0, "grad_norm": 2.130771496386599, "language_loss": 0.9150058, "learning_rate": 2.8972122815946455e-06, "loss": 0.94281483, "num_input_tokens_seen": 1842435, "step": 90, "time_per_iteration": 2.7526872158050537 }, { "auxiliary_loss_clip": 0.01629409, "auxiliary_loss_mlp": 0.01133822, "balance_loss_clip": 1.23219132, "balance_loss_mlp": 1.06582534, "epoch": 0.00547121599278521, "flos": 21178569801600.0, "grad_norm": 2.6928798867856796, "language_loss": 0.86073506, "learning_rate": 2.90432674275074e-06, "loss": 0.88836741, "num_input_tokens_seen": 1860065, "step": 91, "time_per_iteration": 2.7995588779449463 }, { "auxiliary_loss_clip": 0.01628638, "auxiliary_loss_mlp": 0.01138916, "balance_loss_clip": 1.22774827, "balance_loss_mlp": 1.07335091, "epoch": 0.005531339245453179, "flos": 19718801078400.0, "grad_norm": 5.062847798051961, "language_loss": 0.87041199, "learning_rate": 2.91136344867656e-06, "loss": 0.8980875, "num_input_tokens_seen": 1878135, "step": 92, "time_per_iteration": 2.7813079357147217 }, { "auxiliary_loss_clip": 0.01620799, "auxiliary_loss_mlp": 0.01174163, "balance_loss_clip": 1.21933174, "balance_loss_mlp": 1.10650027, "epoch": 0.005591462498121149, "flos": 17636089760640.0, "grad_norm": 4.340668874696889, "language_loss": 0.9210887, "learning_rate": 2.918324080615938e-06, "loss": 0.94903833, "num_input_tokens_seen": 1894895, "step": 93, "time_per_iteration": 2.7582218647003174 }, { "auxiliary_loss_clip": 0.0163427, "auxiliary_loss_mlp": 0.01153574, "balance_loss_clip": 1.22659743, "balance_loss_mlp": 1.08238208, "epoch": 0.005651585750789117, "flos": 20011221699840.0, "grad_norm": 4.327341326162078, "language_loss": 0.87578797, "learning_rate": 2.925210265866963e-06, "loss": 0.90366644, "num_input_tokens_seen": 1913220, "step": 94, "time_per_iteration": 2.783581256866455 }, { "auxiliary_loss_clip": 0.01570285, "auxiliary_loss_mlp": 0.01051726, "balance_loss_clip": 1.31970167, "balance_loss_mlp": 1.01376939, "epoch": 0.005711709003457087, "flos": 59812957981440.0, "grad_norm": 1.3608185384271176, "language_loss": 0.68098927, "learning_rate": 2.932023580065507e-06, "loss": 0.70720935, "num_input_tokens_seen": 1970970, "step": 95, "time_per_iteration": 3.1328847408294678 }, { "auxiliary_loss_clip": 0.01612519, "auxiliary_loss_mlp": 0.01150182, "balance_loss_clip": 1.21488237, "balance_loss_mlp": 1.08318627, "epoch": 0.005771832256125056, "flos": 15559591495680.0, "grad_norm": 6.736145376327001, "language_loss": 0.90221369, "learning_rate": 2.9387655493491906e-06, "loss": 0.92984068, "num_input_tokens_seen": 1988930, "step": 96, "time_per_iteration": 2.8015241622924805 }, { "auxiliary_loss_clip": 0.01605814, "auxiliary_loss_mlp": 0.01142022, "balance_loss_clip": 1.21851277, "balance_loss_mlp": 1.08003318, "epoch": 0.005831955508793026, "flos": 22528380015360.0, "grad_norm": 3.8307865500968044, "language_loss": 0.89869905, "learning_rate": 2.9454376524092147e-06, "loss": 0.92617744, "num_input_tokens_seen": 2006285, "step": 97, "time_per_iteration": 4.387299060821533 }, { "auxiliary_loss_clip": 0.01593214, "auxiliary_loss_mlp": 0.01140673, "balance_loss_clip": 1.2102325, "balance_loss_mlp": 1.07200789, "epoch": 0.005892078761460995, "flos": 22049834094720.0, "grad_norm": 2.291581893082518, "language_loss": 0.76274347, "learning_rate": 2.952041322436969e-06, "loss": 0.79008234, "num_input_tokens_seen": 2024905, "step": 98, "time_per_iteration": 2.751507043838501 }, { "auxiliary_loss_clip": 0.01533926, "auxiliary_loss_mlp": 0.01036775, "balance_loss_clip": 1.29271698, "balance_loss_mlp": 1.00129879, "epoch": 0.005952202014128965, "flos": 68539143317760.0, "grad_norm": 1.0388395506080574, "language_loss": 0.65518898, "learning_rate": 2.9585779489718204e-06, "loss": 0.68089598, "num_input_tokens_seen": 2086220, "step": 99, "time_per_iteration": 3.3125040531158447 }, { "auxiliary_loss_clip": 0.01595694, "auxiliary_loss_mlp": 0.01142556, "balance_loss_clip": 1.21028757, "balance_loss_mlp": 1.07217503, "epoch": 0.006012325266796933, "flos": 22960887678720.0, "grad_norm": 2.051483688350497, "language_loss": 0.90885437, "learning_rate": 2.9650488796560464e-06, "loss": 0.93623686, "num_input_tokens_seen": 2103365, "step": 100, "time_per_iteration": 2.7632548809051514 }, { "auxiliary_loss_clip": 0.01607235, "auxiliary_loss_mlp": 0.01150276, "balance_loss_clip": 1.21294045, "balance_loss_mlp": 1.08394814, "epoch": 0.006072448519464903, "flos": 17347942857600.0, "grad_norm": 2.0181737234491566, "language_loss": 0.91081136, "learning_rate": 2.971455421902446e-06, "loss": 0.9383865, "num_input_tokens_seen": 2121995, "step": 101, "time_per_iteration": 2.7214279174804688 }, { "auxiliary_loss_clip": 0.015938, "auxiliary_loss_mlp": 0.01152009, "balance_loss_clip": 1.21248627, "balance_loss_mlp": 1.08124638, "epoch": 0.006132571772132872, "flos": 24681116897280.0, "grad_norm": 2.076276442041171, "language_loss": 0.90774924, "learning_rate": 2.9777988444798075e-06, "loss": 0.93520737, "num_input_tokens_seen": 2141815, "step": 102, "time_per_iteration": 2.8389108180999756 }, { "auxiliary_loss_clip": 0.01588155, "auxiliary_loss_mlp": 0.01133785, "balance_loss_clip": 1.20914173, "balance_loss_mlp": 1.06912589, "epoch": 0.006192695024800842, "flos": 21465675210240.0, "grad_norm": 2.3272829989328456, "language_loss": 0.88006896, "learning_rate": 2.9840803790210285e-06, "loss": 0.90728837, "num_input_tokens_seen": 2161125, "step": 103, "time_per_iteration": 2.768784761428833 }, { "auxiliary_loss_clip": 0.01588751, "auxiliary_loss_mlp": 0.01136216, "balance_loss_clip": 1.21138883, "balance_loss_mlp": 1.06998372, "epoch": 0.006252818277468811, "flos": 17420410546560.0, "grad_norm": 1.9182889224259552, "language_loss": 0.93644351, "learning_rate": 2.990301221458371e-06, "loss": 0.96369314, "num_input_tokens_seen": 2179510, "step": 104, "time_per_iteration": 2.7109038829803467 }, { "auxiliary_loss_clip": 0.01579421, "auxiliary_loss_mlp": 0.01146524, "balance_loss_clip": 1.20086741, "balance_loss_mlp": 1.08258009, "epoch": 0.006312941530136781, "flos": 19099557584640.0, "grad_norm": 3.0437899698059367, "language_loss": 0.96655375, "learning_rate": 2.9964625333900544e-06, "loss": 0.99381316, "num_input_tokens_seen": 2197870, "step": 105, "time_per_iteration": 2.7254133224487305 }, { "auxiliary_loss_clip": 0.01578331, "auxiliary_loss_mlp": 0.01158544, "balance_loss_clip": 1.20144236, "balance_loss_mlp": 1.08768642, "epoch": 0.006373064782804749, "flos": 24060831909120.0, "grad_norm": 3.1837681777002302, "language_loss": 0.87119448, "learning_rate": 3.002565443382063e-06, "loss": 0.89856327, "num_input_tokens_seen": 2217495, "step": 106, "time_per_iteration": 2.7705447673797607 }, { "auxiliary_loss_clip": 0.01561845, "auxiliary_loss_mlp": 0.01143018, "balance_loss_clip": 1.18746924, "balance_loss_mlp": 1.0751636, "epoch": 0.006433188035472719, "flos": 18332433797760.0, "grad_norm": 2.228856706842439, "language_loss": 0.83398581, "learning_rate": 3.008611048208843e-06, "loss": 0.86103439, "num_input_tokens_seen": 2236520, "step": 107, "time_per_iteration": 2.6885263919830322 }, { "auxiliary_loss_clip": 0.01469631, "auxiliary_loss_mlp": 0.0103327, "balance_loss_clip": 1.25210869, "balance_loss_mlp": 1.00179863, "epoch": 0.006493311288140688, "flos": 62562387594240.0, "grad_norm": 0.9900995959758047, "language_loss": 0.64796811, "learning_rate": 3.014600414036285e-06, "loss": 0.67299712, "num_input_tokens_seen": 2300140, "step": 108, "time_per_iteration": 3.278621196746826 }, { "auxiliary_loss_clip": 0.01552898, "auxiliary_loss_mlp": 0.01132858, "balance_loss_clip": 1.18960094, "balance_loss_mlp": 1.06424141, "epoch": 0.006553434540808658, "flos": 19500141035520.0, "grad_norm": 2.019247660217844, "language_loss": 0.97709465, "learning_rate": 3.0205345775501937e-06, "loss": 1.00395215, "num_input_tokens_seen": 2317320, "step": 109, "time_per_iteration": 2.750502347946167 }, { "auxiliary_loss_clip": 0.01550996, "auxiliary_loss_mlp": 0.01140204, "balance_loss_clip": 1.19136214, "balance_loss_mlp": 1.07430482, "epoch": 0.006613557793476627, "flos": 21105132445440.0, "grad_norm": 1.9540987754213832, "language_loss": 0.84243041, "learning_rate": 3.0264145470332218e-06, "loss": 0.86934245, "num_input_tokens_seen": 2337820, "step": 110, "time_per_iteration": 2.82443904876709 }, { "auxiliary_loss_clip": 0.01544634, "auxiliary_loss_mlp": 0.01151549, "balance_loss_clip": 1.18396342, "balance_loss_mlp": 1.08493507, "epoch": 0.006673681046144597, "flos": 26030747543040.0, "grad_norm": 2.4580319150483563, "language_loss": 0.82940048, "learning_rate": 3.032241303393073e-06, "loss": 0.85636234, "num_input_tokens_seen": 2358560, "step": 111, "time_per_iteration": 2.8308968544006348 }, { "auxiliary_loss_clip": 0.0154596, "auxiliary_loss_mlp": 0.01133366, "balance_loss_clip": 1.18776846, "balance_loss_mlp": 1.06970847, "epoch": 0.006733804298812566, "flos": 23147767163520.0, "grad_norm": 2.356589096997363, "language_loss": 0.93989801, "learning_rate": 3.0380158011446e-06, "loss": 0.9666912, "num_input_tokens_seen": 2379005, "step": 112, "time_per_iteration": 2.8007922172546387 }, { "auxiliary_loss_clip": 0.01549647, "auxiliary_loss_mlp": 0.01136979, "balance_loss_clip": 1.18394601, "balance_loss_mlp": 1.07322621, "epoch": 0.006793927551480535, "flos": 11764444210560.0, "grad_norm": 2.521639841990545, "language_loss": 0.79509294, "learning_rate": 3.0437389693482466e-06, "loss": 0.82195914, "num_input_tokens_seen": 2395610, "step": 113, "time_per_iteration": 2.7599966526031494 }, { "auxiliary_loss_clip": 0.0153736, "auxiliary_loss_mlp": 0.01131524, "balance_loss_clip": 1.18028498, "balance_loss_mlp": 1.06562555, "epoch": 0.006854050804148504, "flos": 19171953446400.0, "grad_norm": 2.343117351168218, "language_loss": 0.93439317, "learning_rate": 3.0494117125071475e-06, "loss": 0.96108204, "num_input_tokens_seen": 2415005, "step": 114, "time_per_iteration": 2.723540782928467 }, { "auxiliary_loss_clip": 0.01544971, "auxiliary_loss_mlp": 0.01138932, "balance_loss_clip": 1.17997146, "balance_loss_mlp": 1.07918465, "epoch": 0.006914174056816474, "flos": 21981891519360.0, "grad_norm": 1.9509019191057126, "language_loss": 0.9463321, "learning_rate": 3.055034911425055e-06, "loss": 0.97317111, "num_input_tokens_seen": 2433965, "step": 115, "time_per_iteration": 2.7077698707580566 }, { "auxiliary_loss_clip": 0.01537699, "auxiliary_loss_mlp": 0.01118178, "balance_loss_clip": 1.17675614, "balance_loss_mlp": 1.05151677, "epoch": 0.006974297309484443, "flos": 16289152634880.0, "grad_norm": 10.363795807176915, "language_loss": 0.82148951, "learning_rate": 3.0606094240271244e-06, "loss": 0.84804827, "num_input_tokens_seen": 2451605, "step": 116, "time_per_iteration": 2.681190013885498 }, { "auxiliary_loss_clip": 0.01528803, "auxiliary_loss_mlp": 0.01126189, "balance_loss_clip": 1.17677391, "balance_loss_mlp": 1.06219721, "epoch": 0.007034420562152413, "flos": 26104005331200.0, "grad_norm": 2.4150591879391627, "language_loss": 0.88368428, "learning_rate": 3.0661360861454656e-06, "loss": 0.91023421, "num_input_tokens_seen": 2472035, "step": 117, "time_per_iteration": 2.776143789291382 }, { "auxiliary_loss_clip": 0.01527909, "auxiliary_loss_mlp": 0.01146127, "balance_loss_clip": 1.17495561, "balance_loss_mlp": 1.08041906, "epoch": 0.007094543814820382, "flos": 14204609723520.0, "grad_norm": 2.3639764059040265, "language_loss": 0.8454417, "learning_rate": 3.071615712271274e-06, "loss": 0.87218207, "num_input_tokens_seen": 2489285, "step": 118, "time_per_iteration": 2.7110469341278076 }, { "auxiliary_loss_clip": 0.01538161, "auxiliary_loss_mlp": 0.01163868, "balance_loss_clip": 1.1759789, "balance_loss_mlp": 1.0984937, "epoch": 0.007154667067488351, "flos": 14976007228800.0, "grad_norm": 2.231843342078736, "language_loss": 0.99319011, "learning_rate": 3.0770490962752172e-06, "loss": 1.02021039, "num_input_tokens_seen": 2506460, "step": 119, "time_per_iteration": 2.674121856689453 }, { "auxiliary_loss_clip": 0.01540018, "auxiliary_loss_mlp": 0.01120611, "balance_loss_clip": 1.17242217, "balance_loss_mlp": 1.05738258, "epoch": 0.00721479032015632, "flos": 20193288762240.0, "grad_norm": 2.7981733983226764, "language_loss": 0.8963809, "learning_rate": 3.082437012097686e-06, "loss": 0.92298722, "num_input_tokens_seen": 2525565, "step": 120, "time_per_iteration": 2.745962381362915 }, { "auxiliary_loss_clip": 0.01524916, "auxiliary_loss_mlp": 0.01129465, "balance_loss_clip": 1.1734432, "balance_loss_mlp": 1.06513989, "epoch": 0.00727491357282429, "flos": 23147228459520.0, "grad_norm": 1.797716104424251, "language_loss": 0.93491542, "learning_rate": 3.0877802144103967e-06, "loss": 0.96145928, "num_input_tokens_seen": 2546605, "step": 121, "time_per_iteration": 2.7924466133117676 }, { "auxiliary_loss_clip": 0.01526294, "auxiliary_loss_mlp": 0.0114832, "balance_loss_clip": 1.17395604, "balance_loss_mlp": 1.08490098, "epoch": 0.007335036825492259, "flos": 15521669712000.0, "grad_norm": 2.3704869501778285, "language_loss": 0.90462255, "learning_rate": 3.09307943925077e-06, "loss": 0.93136871, "num_input_tokens_seen": 2560730, "step": 122, "time_per_iteration": 2.930413246154785 }, { "auxiliary_loss_clip": 0.01521826, "auxiliary_loss_mlp": 0.01146566, "balance_loss_clip": 1.1681807, "balance_loss_mlp": 1.07861674, "epoch": 0.007395160078160229, "flos": 24243365848320.0, "grad_norm": 2.4867163179710037, "language_loss": 0.92660481, "learning_rate": 3.0983354046304154e-06, "loss": 0.95328873, "num_input_tokens_seen": 2579550, "step": 123, "time_per_iteration": 2.7484309673309326 }, { "auxiliary_loss_clip": 0.01519363, "auxiliary_loss_mlp": 0.01127611, "balance_loss_clip": 1.16324139, "balance_loss_mlp": 1.0651449, "epoch": 0.007455283330828198, "flos": 31759792099200.0, "grad_norm": 2.366639004459226, "language_loss": 0.71187961, "learning_rate": 3.103548811118979e-06, "loss": 0.73834932, "num_input_tokens_seen": 2600390, "step": 124, "time_per_iteration": 2.8419976234436035 }, { "auxiliary_loss_clip": 0.01506936, "auxiliary_loss_mlp": 0.01125571, "balance_loss_clip": 1.16464007, "balance_loss_mlp": 1.06167519, "epoch": 0.007515406583496167, "flos": 26615157822720.0, "grad_norm": 2.1632751269766106, "language_loss": 0.88450015, "learning_rate": 3.108720342404542e-06, "loss": 0.91082525, "num_input_tokens_seen": 2620770, "step": 125, "time_per_iteration": 2.823296308517456 }, { "auxiliary_loss_clip": 0.01522239, "auxiliary_loss_mlp": 0.01142214, "balance_loss_clip": 1.16456664, "balance_loss_mlp": 1.07912827, "epoch": 0.007575529836164136, "flos": 18223696350720.0, "grad_norm": 2.6632616920164067, "language_loss": 0.82381976, "learning_rate": 3.1138506658316945e-06, "loss": 0.85046428, "num_input_tokens_seen": 2639900, "step": 126, "time_per_iteration": 2.7325809001922607 }, { "auxiliary_loss_clip": 0.015153, "auxiliary_loss_mlp": 0.01142869, "balance_loss_clip": 1.16330886, "balance_loss_mlp": 1.08088017, "epoch": 0.007635653088832106, "flos": 21580410228480.0, "grad_norm": 3.925284628341409, "language_loss": 0.6743899, "learning_rate": 3.1189404329183404e-06, "loss": 0.7009716, "num_input_tokens_seen": 2657450, "step": 127, "time_per_iteration": 2.709821939468384 }, { "auxiliary_loss_clip": 0.01503057, "auxiliary_loss_mlp": 0.01132416, "balance_loss_clip": 1.165169, "balance_loss_mlp": 1.06861567, "epoch": 0.007695776341500075, "flos": 25375054723200.0, "grad_norm": 2.0535131533503734, "language_loss": 0.8819322, "learning_rate": 3.1239902798522317e-06, "loss": 0.90828693, "num_input_tokens_seen": 2678150, "step": 128, "time_per_iteration": 2.764707565307617 }, { "auxiliary_loss_clip": 0.01505955, "auxiliary_loss_mlp": 0.01144223, "balance_loss_clip": 1.16043079, "balance_loss_mlp": 1.08042252, "epoch": 0.007755899594168045, "flos": 22343906741760.0, "grad_norm": 2.6427711693827005, "language_loss": 0.84719259, "learning_rate": 3.129000827968184e-06, "loss": 0.87369436, "num_input_tokens_seen": 2698290, "step": 129, "time_per_iteration": 2.7472774982452393 }, { "auxiliary_loss_clip": 0.01497871, "auxiliary_loss_mlp": 0.01130211, "balance_loss_clip": 1.15871263, "balance_loss_mlp": 1.06655347, "epoch": 0.007816022846836013, "flos": 22638230784000.0, "grad_norm": 2.366492959329914, "language_loss": 0.97564614, "learning_rate": 3.133972684206866e-06, "loss": 1.00192702, "num_input_tokens_seen": 2717630, "step": 130, "time_per_iteration": 2.6955018043518066 }, { "auxiliary_loss_clip": 0.01492272, "auxiliary_loss_mlp": 0.01134965, "balance_loss_clip": 1.15630865, "balance_loss_mlp": 1.06987715, "epoch": 0.007876146099503984, "flos": 18182901479040.0, "grad_norm": 2.2164470079204572, "language_loss": 0.82658112, "learning_rate": 3.138906441556014e-06, "loss": 0.85285342, "num_input_tokens_seen": 2735835, "step": 131, "time_per_iteration": 2.722247362136841 }, { "auxiliary_loss_clip": 0.01500937, "auxiliary_loss_mlp": 0.01128359, "balance_loss_clip": 1.15885806, "balance_loss_mlp": 1.06694245, "epoch": 0.007936269352171952, "flos": 27119486730240.0, "grad_norm": 2.7663180664822193, "language_loss": 0.82781422, "learning_rate": 3.143802679474861e-06, "loss": 0.85410714, "num_input_tokens_seen": 2756335, "step": 132, "time_per_iteration": 2.7937612533569336 }, { "auxiliary_loss_clip": 0.01491919, "auxiliary_loss_mlp": 0.01128624, "balance_loss_clip": 1.15346444, "balance_loss_mlp": 1.0664922, "epoch": 0.007996392604839923, "flos": 19026335710080.0, "grad_norm": 2.182366740159355, "language_loss": 0.95499313, "learning_rate": 3.1486619643025565e-06, "loss": 0.98119843, "num_input_tokens_seen": 2775090, "step": 133, "time_per_iteration": 2.7380354404449463 }, { "auxiliary_loss_clip": 0.01487746, "auxiliary_loss_mlp": 0.0112871, "balance_loss_clip": 1.16170454, "balance_loss_mlp": 1.06843781, "epoch": 0.008056515857507891, "flos": 25484151306240.0, "grad_norm": 1.8164116645967854, "language_loss": 0.73478442, "learning_rate": 3.153484849651286e-06, "loss": 0.76094896, "num_input_tokens_seen": 2795320, "step": 134, "time_per_iteration": 2.7483408451080322 }, { "auxiliary_loss_clip": 0.01484621, "auxiliary_loss_mlp": 0.01132134, "balance_loss_clip": 1.15115011, "balance_loss_mlp": 1.06695068, "epoch": 0.00811663911017586, "flos": 20557566541440.0, "grad_norm": 5.027018494085059, "language_loss": 0.88792509, "learning_rate": 3.1582718767847806e-06, "loss": 0.91409266, "num_input_tokens_seen": 2812815, "step": 135, "time_per_iteration": 2.6838128566741943 }, { "auxiliary_loss_clip": 0.01487119, "auxiliary_loss_mlp": 0.0113257, "balance_loss_clip": 1.15490174, "balance_loss_mlp": 1.06714821, "epoch": 0.00817676236284383, "flos": 18799738761600.0, "grad_norm": 1.9282722528396903, "language_loss": 0.89138198, "learning_rate": 3.1630235749828485e-06, "loss": 0.91757882, "num_input_tokens_seen": 2830445, "step": 136, "time_per_iteration": 2.726475238800049 }, { "auxiliary_loss_clip": 0.01483417, "auxiliary_loss_mlp": 0.01110724, "balance_loss_clip": 1.1494019, "balance_loss_mlp": 1.05078554, "epoch": 0.008236885615511799, "flos": 23873593288320.0, "grad_norm": 2.2984339413846078, "language_loss": 0.84091324, "learning_rate": 3.1677404618925676e-06, "loss": 0.86685467, "num_input_tokens_seen": 2846965, "step": 137, "time_per_iteration": 7.4708640575408936 }, { "auxiliary_loss_clip": 0.01481848, "auxiliary_loss_mlp": 0.01118837, "balance_loss_clip": 1.1500535, "balance_loss_mlp": 1.05894589, "epoch": 0.00829700886817977, "flos": 24643626076800.0, "grad_norm": 1.69378413504035, "language_loss": 0.9018681, "learning_rate": 3.1724230438666953e-06, "loss": 0.92787492, "num_input_tokens_seen": 2867520, "step": 138, "time_per_iteration": 4.311830520629883 }, { "auxiliary_loss_clip": 0.01469655, "auxiliary_loss_mlp": 0.01123604, "balance_loss_clip": 1.14824438, "balance_loss_mlp": 1.05904007, "epoch": 0.008357132120847738, "flos": 25262007644160.0, "grad_norm": 2.1515203004813785, "language_loss": 0.91478992, "learning_rate": 3.177071816289865e-06, "loss": 0.94072247, "num_input_tokens_seen": 2885675, "step": 139, "time_per_iteration": 2.7678122520446777 }, { "auxiliary_loss_clip": 0.01486799, "auxiliary_loss_mlp": 0.01124947, "balance_loss_clip": 1.15521085, "balance_loss_mlp": 1.06195688, "epoch": 0.008417255373515706, "flos": 27344898529920.0, "grad_norm": 2.305315677890536, "language_loss": 0.85667789, "learning_rate": 3.181687263893095e-06, "loss": 0.88279533, "num_input_tokens_seen": 2905960, "step": 140, "time_per_iteration": 2.8557639122009277 }, { "auxiliary_loss_clip": 0.01473538, "auxiliary_loss_mlp": 0.01122701, "balance_loss_clip": 1.14923954, "balance_loss_mlp": 1.06166625, "epoch": 0.008477378626183677, "flos": 17639070589440.0, "grad_norm": 2.3443620963590455, "language_loss": 0.84346074, "learning_rate": 3.186269861057098e-06, "loss": 0.86942315, "num_input_tokens_seen": 2922780, "step": 141, "time_per_iteration": 2.7656807899475098 }, { "auxiliary_loss_clip": 0.01477141, "auxiliary_loss_mlp": 0.01135217, "balance_loss_clip": 1.14718878, "balance_loss_mlp": 1.07360983, "epoch": 0.008537501878851645, "flos": 13881342297600.0, "grad_norm": 2.29020652115343, "language_loss": 0.8105557, "learning_rate": 3.1908200721048745e-06, "loss": 0.83667928, "num_input_tokens_seen": 2938765, "step": 142, "time_per_iteration": 2.747598171234131 }, { "auxiliary_loss_clip": 0.01378886, "auxiliary_loss_mlp": 0.01060004, "balance_loss_clip": 1.19240355, "balance_loss_mlp": 1.03406358, "epoch": 0.008597625131519616, "flos": 71248101281280.0, "grad_norm": 1.056887207538052, "language_loss": 0.66899812, "learning_rate": 3.195338351584042e-06, "loss": 0.69338703, "num_input_tokens_seen": 3006665, "step": 143, "time_per_iteration": 3.346982002258301 }, { "auxiliary_loss_clip": 0.01467707, "auxiliary_loss_mlp": 0.01123721, "balance_loss_clip": 1.14666772, "balance_loss_mlp": 1.06273365, "epoch": 0.008657748384187584, "flos": 17602836744960.0, "grad_norm": 2.6467048454978523, "language_loss": 0.84356761, "learning_rate": 3.1998251445393258e-06, "loss": 0.86948192, "num_input_tokens_seen": 3024335, "step": 144, "time_per_iteration": 2.762087345123291 }, { "auxiliary_loss_clip": 0.01455701, "auxiliary_loss_mlp": 0.01114511, "balance_loss_clip": 1.14058816, "balance_loss_mlp": 1.05085373, "epoch": 0.008717871636855555, "flos": 19715317459200.0, "grad_norm": 1.8692883316747366, "language_loss": 0.88353741, "learning_rate": 3.204280886775619e-06, "loss": 0.90923953, "num_input_tokens_seen": 3043300, "step": 145, "time_per_iteration": 2.7050039768218994 }, { "auxiliary_loss_clip": 0.01470385, "auxiliary_loss_mlp": 0.01121817, "balance_loss_clip": 1.14247775, "balance_loss_mlp": 1.05873132, "epoch": 0.008777994889523523, "flos": 24717422568960.0, "grad_norm": 1.860830881508538, "language_loss": 0.86182559, "learning_rate": 3.208706005112005e-06, "loss": 0.88774765, "num_input_tokens_seen": 3064610, "step": 146, "time_per_iteration": 2.741013288497925 }, { "auxiliary_loss_clip": 0.01356998, "auxiliary_loss_mlp": 0.01029681, "balance_loss_clip": 1.18072379, "balance_loss_mlp": 1.00431335, "epoch": 0.008838118142191492, "flos": 70132067758080.0, "grad_norm": 0.8598047517885464, "language_loss": 0.60122073, "learning_rate": 3.213100917627104e-06, "loss": 0.6250875, "num_input_tokens_seen": 3130385, "step": 147, "time_per_iteration": 3.27382230758667 }, { "auxiliary_loss_clip": 0.01463009, "auxiliary_loss_mlp": 0.01123472, "balance_loss_clip": 1.14658976, "balance_loss_mlp": 1.06548882, "epoch": 0.008898241394859462, "flos": 20044797937920.0, "grad_norm": 1.8116070485228748, "language_loss": 0.84620225, "learning_rate": 3.2174660338961135e-06, "loss": 0.87206709, "num_input_tokens_seen": 3149760, "step": 148, "time_per_iteration": 2.72910475730896 }, { "auxiliary_loss_clip": 0.01466623, "auxiliary_loss_mlp": 0.01144944, "balance_loss_clip": 1.14777792, "balance_loss_mlp": 1.07985532, "epoch": 0.008958364647527431, "flos": 10743611685120.0, "grad_norm": 2.5530775415688205, "language_loss": 0.88680327, "learning_rate": 3.2218017552198588e-06, "loss": 0.91291893, "num_input_tokens_seen": 3164500, "step": 149, "time_per_iteration": 2.688528537750244 }, { "auxiliary_loss_clip": 0.01463954, "auxiliary_loss_mlp": 0.01114885, "balance_loss_clip": 1.14290714, "balance_loss_mlp": 1.05728304, "epoch": 0.009018487900195401, "flos": 29127467802240.0, "grad_norm": 2.1996557200804823, "language_loss": 0.93269086, "learning_rate": 3.226108474846181e-06, "loss": 0.95847929, "num_input_tokens_seen": 3182455, "step": 150, "time_per_iteration": 2.7901580333709717 }, { "auxiliary_loss_clip": 0.01450819, "auxiliary_loss_mlp": 0.01114571, "balance_loss_clip": 1.13812149, "balance_loss_mlp": 1.05839944, "epoch": 0.00907861115286337, "flos": 32963661354240.0, "grad_norm": 4.690239135210318, "language_loss": 0.7421813, "learning_rate": 3.2303865781839817e-06, "loss": 0.7678352, "num_input_tokens_seen": 3203995, "step": 151, "time_per_iteration": 2.79590106010437 }, { "auxiliary_loss_clip": 0.01463077, "auxiliary_loss_mlp": 0.01128244, "balance_loss_clip": 1.14311624, "balance_loss_mlp": 1.06954527, "epoch": 0.009138734405531338, "flos": 21762441377280.0, "grad_norm": 4.291097242497492, "language_loss": 0.88460332, "learning_rate": 3.234636443010188e-06, "loss": 0.9105165, "num_input_tokens_seen": 3222575, "step": 152, "time_per_iteration": 2.701775550842285 }, { "auxiliary_loss_clip": 0.01462099, "auxiliary_loss_mlp": 0.01122264, "balance_loss_clip": 1.14743185, "balance_loss_mlp": 1.06275451, "epoch": 0.009198857658199309, "flos": 20842517134080.0, "grad_norm": 3.861411936226758, "language_loss": 0.83918798, "learning_rate": 3.238858439669943e-06, "loss": 0.8650316, "num_input_tokens_seen": 3240180, "step": 153, "time_per_iteration": 2.730654716491699 }, { "auxiliary_loss_clip": 0.01453756, "auxiliary_loss_mlp": 0.01136244, "balance_loss_clip": 1.14024806, "balance_loss_mlp": 1.07554269, "epoch": 0.009258980910867277, "flos": 24827381078400.0, "grad_norm": 1.8788427995178905, "language_loss": 0.89924759, "learning_rate": 3.2430529312702712e-06, "loss": 0.92514759, "num_input_tokens_seen": 3259800, "step": 154, "time_per_iteration": 2.8150386810302734 }, { "auxiliary_loss_clip": 0.01457041, "auxiliary_loss_mlp": 0.01148182, "balance_loss_clip": 1.1422174, "balance_loss_mlp": 1.08934021, "epoch": 0.009319104163535248, "flos": 28767786963840.0, "grad_norm": 2.155148564981828, "language_loss": 0.89730597, "learning_rate": 3.2472202738674737e-06, "loss": 0.9233582, "num_input_tokens_seen": 3280400, "step": 155, "time_per_iteration": 2.7780215740203857 }, { "auxiliary_loss_clip": 0.01462257, "auxiliary_loss_mlp": 0.01115972, "balance_loss_clip": 1.14140153, "balance_loss_mlp": 1.0580368, "epoch": 0.009379227416203216, "flos": 16582004219520.0, "grad_norm": 2.6722626388977986, "language_loss": 0.86758631, "learning_rate": 3.2513608166485063e-06, "loss": 0.8933686, "num_input_tokens_seen": 3297600, "step": 156, "time_per_iteration": 2.7195818424224854 }, { "auxiliary_loss_clip": 0.01460326, "auxiliary_loss_mlp": 0.01116019, "balance_loss_clip": 1.14530039, "balance_loss_mlp": 1.05770147, "epoch": 0.009439350668871187, "flos": 18329919845760.0, "grad_norm": 2.3212743339319926, "language_loss": 0.99652225, "learning_rate": 3.2554749021065498e-06, "loss": 1.0222857, "num_input_tokens_seen": 3313635, "step": 157, "time_per_iteration": 2.7530624866485596 }, { "auxiliary_loss_clip": 0.01445494, "auxiliary_loss_mlp": 0.01139991, "balance_loss_clip": 1.14011836, "balance_loss_mlp": 1.08162606, "epoch": 0.009499473921539155, "flos": 24349912565760.0, "grad_norm": 2.2650385025378834, "language_loss": 0.88388717, "learning_rate": 3.2595628662110186e-06, "loss": 0.90974212, "num_input_tokens_seen": 3333735, "step": 158, "time_per_iteration": 2.744640588760376 }, { "auxiliary_loss_clip": 0.01451838, "auxiliary_loss_mlp": 0.01122147, "balance_loss_clip": 1.13977575, "balance_loss_mlp": 1.0630666, "epoch": 0.009559597174207124, "flos": 16399326625920.0, "grad_norm": 2.1807440045696165, "language_loss": 0.86407602, "learning_rate": 3.2636250385721982e-06, "loss": 0.88981581, "num_input_tokens_seen": 3348800, "step": 159, "time_per_iteration": 2.7330005168914795 }, { "auxiliary_loss_clip": 0.01441743, "auxiliary_loss_mlp": 0.01137796, "balance_loss_clip": 1.13474953, "balance_loss_mlp": 1.07752383, "epoch": 0.009619720426875094, "flos": 22856890826880.0, "grad_norm": 1.7296815250329798, "language_loss": 0.86756837, "learning_rate": 3.2676617426007263e-06, "loss": 0.89336377, "num_input_tokens_seen": 3368595, "step": 160, "time_per_iteration": 2.844817876815796 }, { "auxiliary_loss_clip": 0.01447614, "auxiliary_loss_mlp": 0.0112266, "balance_loss_clip": 1.13978457, "balance_loss_mlp": 1.06725168, "epoch": 0.009679843679543063, "flos": 19135001329920.0, "grad_norm": 2.462408333273543, "language_loss": 0.91543746, "learning_rate": 3.2716732956621042e-06, "loss": 0.94114017, "num_input_tokens_seen": 3384975, "step": 161, "time_per_iteration": 2.667666435241699 }, { "auxiliary_loss_clip": 0.01453392, "auxiliary_loss_mlp": 0.01111804, "balance_loss_clip": 1.14104879, "balance_loss_mlp": 1.05610919, "epoch": 0.009739966932211033, "flos": 20302995876480.0, "grad_norm": 1.7914334411859298, "language_loss": 0.91582954, "learning_rate": 3.2756600092264203e-06, "loss": 0.94148147, "num_input_tokens_seen": 3404755, "step": 162, "time_per_iteration": 2.6779961585998535 }, { "auxiliary_loss_clip": 0.0131522, "auxiliary_loss_mlp": 0.01056953, "balance_loss_clip": 1.15019548, "balance_loss_mlp": 1.03358769, "epoch": 0.009800090184879002, "flos": 67034234177280.0, "grad_norm": 1.183297200633083, "language_loss": 0.72292268, "learning_rate": 3.279622189013474e-06, "loss": 0.74664438, "num_input_tokens_seen": 3467210, "step": 163, "time_per_iteration": 3.226755142211914 }, { "auxiliary_loss_clip": 0.01439788, "auxiliary_loss_mlp": 0.01116102, "balance_loss_clip": 1.13873029, "balance_loss_mlp": 1.05921507, "epoch": 0.00986021343754697, "flos": 17164690646400.0, "grad_norm": 3.3372881081540937, "language_loss": 0.84684807, "learning_rate": 3.283560135133457e-06, "loss": 0.87240696, "num_input_tokens_seen": 3483220, "step": 164, "time_per_iteration": 2.768935203552246 }, { "auxiliary_loss_clip": 0.01430933, "auxiliary_loss_mlp": 0.0110117, "balance_loss_clip": 1.13048434, "balance_loss_mlp": 1.04533219, "epoch": 0.00992033669021494, "flos": 17749424148480.0, "grad_norm": 4.079659732294038, "language_loss": 0.89080763, "learning_rate": 3.2874741422233565e-06, "loss": 0.91612864, "num_input_tokens_seen": 3501465, "step": 165, "time_per_iteration": 2.673292875289917 }, { "auxiliary_loss_clip": 0.01433192, "auxiliary_loss_mlp": 0.01128138, "balance_loss_clip": 1.13111067, "balance_loss_mlp": 1.06819916, "epoch": 0.00998045994288291, "flos": 25297164080640.0, "grad_norm": 1.7359539169577796, "language_loss": 0.79931343, "learning_rate": 3.2913644995792465e-06, "loss": 0.82492673, "num_input_tokens_seen": 3520480, "step": 166, "time_per_iteration": 2.762742757797241 }, { "auxiliary_loss_clip": 0.01438026, "auxiliary_loss_mlp": 0.01129718, "balance_loss_clip": 1.13488948, "balance_loss_mlp": 1.07066131, "epoch": 0.01004058319555088, "flos": 32298954220800.0, "grad_norm": 2.3252666324684585, "language_loss": 0.92125285, "learning_rate": 3.2952314912845914e-06, "loss": 0.94693023, "num_input_tokens_seen": 3539570, "step": 167, "time_per_iteration": 2.970964193344116 }, { "auxiliary_loss_clip": 0.01429698, "auxiliary_loss_mlp": 0.01133324, "balance_loss_clip": 1.13294363, "balance_loss_mlp": 1.07734346, "epoch": 0.010100706448218848, "flos": 11319941404800.0, "grad_norm": 13.512238716069085, "language_loss": 0.90781063, "learning_rate": 3.299075396334735e-06, "loss": 0.93344086, "num_input_tokens_seen": 3555465, "step": 168, "time_per_iteration": 2.8039841651916504 }, { "auxiliary_loss_clip": 0.01424367, "auxiliary_loss_mlp": 0.01104795, "balance_loss_clip": 1.12848639, "balance_loss_mlp": 1.04700291, "epoch": 0.010160829700886819, "flos": 29719491765120.0, "grad_norm": 1.6705351130563955, "language_loss": 0.87173021, "learning_rate": 3.3028964887576868e-06, "loss": 0.89702177, "num_input_tokens_seen": 3578970, "step": 169, "time_per_iteration": 2.8215444087982178 }, { "auxiliary_loss_clip": 0.01425902, "auxiliary_loss_mlp": 0.01110538, "balance_loss_clip": 1.13139379, "balance_loss_mlp": 1.05317438, "epoch": 0.010220952953554787, "flos": 20412343854720.0, "grad_norm": 1.7404257397879006, "language_loss": 0.84622329, "learning_rate": 3.306695037731344e-06, "loss": 0.87158769, "num_input_tokens_seen": 3597275, "step": 170, "time_per_iteration": 2.6759181022644043 }, { "auxiliary_loss_clip": 0.0143612, "auxiliary_loss_mlp": 0.01137162, "balance_loss_clip": 1.13149834, "balance_loss_mlp": 1.07874942, "epoch": 0.010281076206222756, "flos": 31285124847360.0, "grad_norm": 2.174517661608974, "language_loss": 0.89936447, "learning_rate": 3.3104713076972827e-06, "loss": 0.92509729, "num_input_tokens_seen": 3618905, "step": 171, "time_per_iteration": 2.800394058227539 }, { "auxiliary_loss_clip": 0.01430673, "auxiliary_loss_mlp": 0.01108779, "balance_loss_clip": 1.1347487, "balance_loss_mlp": 1.05382347, "epoch": 0.010341199458890726, "flos": 21982286568960.0, "grad_norm": 1.938241860949196, "language_loss": 0.88895655, "learning_rate": 3.314225558471224e-06, "loss": 0.91435111, "num_input_tokens_seen": 3639610, "step": 172, "time_per_iteration": 2.755190849304199 }, { "auxiliary_loss_clip": 0.01418638, "auxiliary_loss_mlp": 0.01118471, "balance_loss_clip": 1.12744904, "balance_loss_mlp": 1.06270456, "epoch": 0.010401322711558695, "flos": 30810529422720.0, "grad_norm": 1.7925778946034159, "language_loss": 0.80943549, "learning_rate": 3.317958045350308e-06, "loss": 0.83480656, "num_input_tokens_seen": 3664030, "step": 173, "time_per_iteration": 2.751945734024048 }, { "auxiliary_loss_clip": 0.01429615, "auxiliary_loss_mlp": 0.01107965, "balance_loss_clip": 1.13108575, "balance_loss_mlp": 1.05534625, "epoch": 0.010461445964226665, "flos": 24715124098560.0, "grad_norm": 2.1644843911099216, "language_loss": 0.82763064, "learning_rate": 3.3216690192172596e-06, "loss": 0.85300648, "num_input_tokens_seen": 3683615, "step": 174, "time_per_iteration": 2.676630735397339 }, { "auxiliary_loss_clip": 0.01423443, "auxiliary_loss_mlp": 0.01120976, "balance_loss_clip": 1.12816644, "balance_loss_mlp": 1.06523335, "epoch": 0.010521569216894634, "flos": 27710361457920.0, "grad_norm": 2.331494685324117, "language_loss": 0.72837007, "learning_rate": 3.325358726641591e-06, "loss": 0.75381434, "num_input_tokens_seen": 3704540, "step": 175, "time_per_iteration": 2.6876866817474365 }, { "auxiliary_loss_clip": 0.01425333, "auxiliary_loss_mlp": 0.01127215, "balance_loss_clip": 1.12866652, "balance_loss_mlp": 1.06980324, "epoch": 0.010581692469562603, "flos": 12458346122880.0, "grad_norm": 4.811985773634618, "language_loss": 0.97983754, "learning_rate": 3.329027409977902e-06, "loss": 1.00536299, "num_input_tokens_seen": 3721320, "step": 176, "time_per_iteration": 2.8159937858581543 }, { "auxiliary_loss_clip": 0.0141033, "auxiliary_loss_mlp": 0.01130651, "balance_loss_clip": 1.12546706, "balance_loss_mlp": 1.07738805, "epoch": 0.010641815722230573, "flos": 19427601519360.0, "grad_norm": 2.8326118759658585, "language_loss": 0.76926064, "learning_rate": 3.3326753074614087e-06, "loss": 0.7946704, "num_input_tokens_seen": 3739385, "step": 177, "time_per_iteration": 5.7707555294036865 }, { "auxiliary_loss_clip": 0.01421858, "auxiliary_loss_mlp": 0.01104718, "balance_loss_clip": 1.12455702, "balance_loss_mlp": 1.05002475, "epoch": 0.010701938974898541, "flos": 18332577452160.0, "grad_norm": 2.6517911185675014, "language_loss": 0.76942402, "learning_rate": 3.3363026533007716e-06, "loss": 0.79468977, "num_input_tokens_seen": 3756360, "step": 178, "time_per_iteration": 4.337082386016846 }, { "auxiliary_loss_clip": 0.01430293, "auxiliary_loss_mlp": 0.01109414, "balance_loss_clip": 1.1303575, "balance_loss_mlp": 1.05252683, "epoch": 0.010762062227566512, "flos": 19203985399680.0, "grad_norm": 2.6843360372821925, "language_loss": 0.84022826, "learning_rate": 3.3399096777683303e-06, "loss": 0.86562538, "num_input_tokens_seen": 3773930, "step": 179, "time_per_iteration": 2.6826629638671875 }, { "auxiliary_loss_clip": 0.01418094, "auxiliary_loss_mlp": 0.01108667, "balance_loss_clip": 1.12202275, "balance_loss_mlp": 1.05158973, "epoch": 0.01082218548023448, "flos": 31425427370880.0, "grad_norm": 2.0256655839140083, "language_loss": 0.83674574, "learning_rate": 3.3434966072878213e-06, "loss": 0.86201334, "num_input_tokens_seen": 3793630, "step": 180, "time_per_iteration": 2.7483785152435303 }, { "auxiliary_loss_clip": 0.01421326, "auxiliary_loss_mlp": 0.01120347, "balance_loss_clip": 1.12740374, "balance_loss_mlp": 1.0646286, "epoch": 0.01088230873290245, "flos": 25046436170880.0, "grad_norm": 3.253139118534122, "language_loss": 0.77958715, "learning_rate": 3.3470636645196674e-06, "loss": 0.80500388, "num_input_tokens_seen": 3813610, "step": 181, "time_per_iteration": 2.698941469192505 }, { "auxiliary_loss_clip": 0.01414948, "auxiliary_loss_mlp": 0.01130231, "balance_loss_clip": 1.12188053, "balance_loss_mlp": 1.07577634, "epoch": 0.01094243198557042, "flos": 22893411980160.0, "grad_norm": 2.56637338396407, "language_loss": 0.76438594, "learning_rate": 3.3506110684439156e-06, "loss": 0.78983772, "num_input_tokens_seen": 3831390, "step": 182, "time_per_iteration": 2.6951375007629395 }, { "auxiliary_loss_clip": 0.01412526, "auxiliary_loss_mlp": 0.01126665, "balance_loss_clip": 1.12167537, "balance_loss_mlp": 1.0702554, "epoch": 0.011002555238238388, "flos": 17165049782400.0, "grad_norm": 2.083158831639218, "language_loss": 0.87484097, "learning_rate": 3.3541390344409054e-06, "loss": 0.90023291, "num_input_tokens_seen": 3849705, "step": 183, "time_per_iteration": 2.733753204345703 }, { "auxiliary_loss_clip": 0.01415922, "auxiliary_loss_mlp": 0.01110585, "balance_loss_clip": 1.12529624, "balance_loss_mlp": 1.05922985, "epoch": 0.011062678490906358, "flos": 22310150935680.0, "grad_norm": 3.105080129831269, "language_loss": 0.86911464, "learning_rate": 3.357647774369736e-06, "loss": 0.89437973, "num_input_tokens_seen": 3869230, "step": 184, "time_per_iteration": 2.6783828735351562 }, { "auxiliary_loss_clip": 0.01410648, "auxiliary_loss_mlp": 0.01108321, "balance_loss_clip": 1.12499499, "balance_loss_mlp": 1.05203021, "epoch": 0.011122801743574327, "flos": 24388373053440.0, "grad_norm": 1.8650514063709744, "language_loss": 0.83885491, "learning_rate": 3.3611374966446085e-06, "loss": 0.86404455, "num_input_tokens_seen": 3889735, "step": 185, "time_per_iteration": 2.6863327026367188 }, { "auxiliary_loss_clip": 0.01419384, "auxiliary_loss_mlp": 0.01107812, "balance_loss_clip": 1.12355363, "balance_loss_mlp": 1.04999495, "epoch": 0.011182924996242297, "flos": 18150258994560.0, "grad_norm": 2.8933407749520743, "language_loss": 0.71027243, "learning_rate": 3.3646084063091142e-06, "loss": 0.73554444, "num_input_tokens_seen": 3908855, "step": 186, "time_per_iteration": 2.819805383682251 }, { "auxiliary_loss_clip": 0.01415699, "auxiliary_loss_mlp": 0.01108312, "balance_loss_clip": 1.12262082, "balance_loss_mlp": 1.05574071, "epoch": 0.011243048248910266, "flos": 15486800584320.0, "grad_norm": 2.4244794785226733, "language_loss": 1.01999915, "learning_rate": 3.3680607051085194e-06, "loss": 1.04523933, "num_input_tokens_seen": 3923865, "step": 187, "time_per_iteration": 2.65875506401062 }, { "auxiliary_loss_clip": 0.01404987, "auxiliary_loss_mlp": 0.01107995, "balance_loss_clip": 1.12269068, "balance_loss_mlp": 1.05253887, "epoch": 0.011303171501578235, "flos": 40916868986880.0, "grad_norm": 2.0089158406542524, "language_loss": 0.74998611, "learning_rate": 3.371494591560139e-06, "loss": 0.77511597, "num_input_tokens_seen": 3946870, "step": 188, "time_per_iteration": 2.8631174564361572 }, { "auxiliary_loss_clip": 0.01298557, "auxiliary_loss_mlp": 0.01067058, "balance_loss_clip": 1.14124644, "balance_loss_mlp": 1.04474187, "epoch": 0.011363294754246205, "flos": 66302697790080.0, "grad_norm": 0.7620731385906954, "language_loss": 0.56192517, "learning_rate": 3.3749102610218297e-06, "loss": 0.5855813, "num_input_tokens_seen": 4010005, "step": 189, "time_per_iteration": 3.2704074382781982 }, { "auxiliary_loss_clip": 0.01402206, "auxiliary_loss_mlp": 0.011217, "balance_loss_clip": 1.11730003, "balance_loss_mlp": 1.06662548, "epoch": 0.011423418006914174, "flos": 24900279730560.0, "grad_norm": 2.640219984380571, "language_loss": 0.95085573, "learning_rate": 3.3783079057586833e-06, "loss": 0.97609472, "num_input_tokens_seen": 4029035, "step": 190, "time_per_iteration": 2.6898255348205566 }, { "auxiliary_loss_clip": 0.01405088, "auxiliary_loss_mlp": 0.01103893, "balance_loss_clip": 1.11979234, "balance_loss_mlp": 1.05167961, "epoch": 0.011483541259582144, "flos": 19791879298560.0, "grad_norm": 4.133813113517846, "language_loss": 0.8463847, "learning_rate": 3.3816877150079665e-06, "loss": 0.8714745, "num_input_tokens_seen": 4046995, "step": 191, "time_per_iteration": 2.71589994430542 }, { "auxiliary_loss_clip": 0.01403196, "auxiliary_loss_mlp": 0.01118385, "balance_loss_clip": 1.11570346, "balance_loss_mlp": 1.06624269, "epoch": 0.011543664512250112, "flos": 26176939896960.0, "grad_norm": 2.0065119945705887, "language_loss": 0.91894913, "learning_rate": 3.385049875042367e-06, "loss": 0.94416493, "num_input_tokens_seen": 4065865, "step": 192, "time_per_iteration": 2.775974988937378 }, { "auxiliary_loss_clip": 0.01398496, "auxiliary_loss_mlp": 0.01118924, "balance_loss_clip": 1.11665678, "balance_loss_mlp": 1.06117916, "epoch": 0.011603787764918083, "flos": 23768985905280.0, "grad_norm": 2.10033302347605, "language_loss": 0.86923265, "learning_rate": 3.3883945692315938e-06, "loss": 0.89440691, "num_input_tokens_seen": 4085305, "step": 193, "time_per_iteration": 2.792947292327881 }, { "auxiliary_loss_clip": 0.01402535, "auxiliary_loss_mlp": 0.01102276, "balance_loss_clip": 1.11514282, "balance_loss_mlp": 1.05061066, "epoch": 0.011663911017586051, "flos": 25954688494080.0, "grad_norm": 2.2253165290939076, "language_loss": 0.92296255, "learning_rate": 3.3917219781023906e-06, "loss": 0.94801068, "num_input_tokens_seen": 4105185, "step": 194, "time_per_iteration": 2.6886558532714844 }, { "auxiliary_loss_clip": 0.01407209, "auxiliary_loss_mlp": 0.01108641, "balance_loss_clip": 1.11930478, "balance_loss_mlp": 1.05630851, "epoch": 0.01172403427025402, "flos": 17895149625600.0, "grad_norm": 2.4241235245311503, "language_loss": 0.89768875, "learning_rate": 3.3950322793970014e-06, "loss": 0.92284721, "num_input_tokens_seen": 4123160, "step": 195, "time_per_iteration": 2.654517889022827 }, { "auxiliary_loss_clip": 0.01400339, "auxiliary_loss_mlp": 0.01114485, "balance_loss_clip": 1.11779022, "balance_loss_mlp": 1.05981565, "epoch": 0.01178415752292199, "flos": 17894539094400.0, "grad_norm": 3.1130999341447385, "language_loss": 0.86019921, "learning_rate": 3.3983256481301445e-06, "loss": 0.88534749, "num_input_tokens_seen": 4140425, "step": 196, "time_per_iteration": 2.643598794937134 }, { "auxiliary_loss_clip": 0.01398067, "auxiliary_loss_mlp": 0.01107082, "balance_loss_clip": 1.11464977, "balance_loss_mlp": 1.05308056, "epoch": 0.011844280775589959, "flos": 22893555634560.0, "grad_norm": 3.666533247373141, "language_loss": 0.93052697, "learning_rate": 3.4016022566445335e-06, "loss": 0.95557845, "num_input_tokens_seen": 4159555, "step": 197, "time_per_iteration": 2.7120354175567627 }, { "auxiliary_loss_clip": 0.01396424, "auxiliary_loss_mlp": 0.01112388, "balance_loss_clip": 1.11625624, "balance_loss_mlp": 1.05943501, "epoch": 0.01190440402825793, "flos": 26980333441920.0, "grad_norm": 1.9614954763997827, "language_loss": 0.79043806, "learning_rate": 3.4048622746649966e-06, "loss": 0.81552619, "num_input_tokens_seen": 4180480, "step": 198, "time_per_iteration": 2.774059772491455 }, { "auxiliary_loss_clip": 0.0139305, "auxiliary_loss_mlp": 0.01120527, "balance_loss_clip": 1.11708748, "balance_loss_mlp": 1.06821764, "epoch": 0.011964527280925898, "flos": 20521584092160.0, "grad_norm": 1.8823459083646328, "language_loss": 0.88239717, "learning_rate": 3.4081058693512278e-06, "loss": 0.90753293, "num_input_tokens_seen": 4198835, "step": 199, "time_per_iteration": 2.6808881759643555 }, { "auxiliary_loss_clip": 0.01403709, "auxiliary_loss_mlp": 0.0112899, "balance_loss_clip": 1.11951399, "balance_loss_mlp": 1.07200766, "epoch": 0.012024650533593867, "flos": 27745984771200.0, "grad_norm": 2.0663906916258497, "language_loss": 0.81151628, "learning_rate": 3.411333205349222e-06, "loss": 0.83684325, "num_input_tokens_seen": 4219335, "step": 200, "time_per_iteration": 2.625380516052246 }, { "auxiliary_loss_clip": 0.0140201, "auxiliary_loss_mlp": 0.01104413, "balance_loss_clip": 1.11633158, "balance_loss_mlp": 1.05048287, "epoch": 0.012084773786261837, "flos": 10452017076480.0, "grad_norm": 2.253120238884594, "language_loss": 0.87696433, "learning_rate": 3.4145444448414217e-06, "loss": 0.90202856, "num_input_tokens_seen": 4236940, "step": 201, "time_per_iteration": 2.6062326431274414 }, { "auxiliary_loss_clip": 0.01399494, "auxiliary_loss_mlp": 0.01115643, "balance_loss_clip": 1.11764228, "balance_loss_mlp": 1.0614028, "epoch": 0.012144897038929806, "flos": 23105751229440.0, "grad_norm": 2.088192664231089, "language_loss": 0.84052485, "learning_rate": 3.4177397475956223e-06, "loss": 0.86567622, "num_input_tokens_seen": 4256755, "step": 202, "time_per_iteration": 2.6981592178344727 }, { "auxiliary_loss_clip": 0.01388741, "auxiliary_loss_mlp": 0.0111019, "balance_loss_clip": 1.11006808, "balance_loss_mlp": 1.05771446, "epoch": 0.012205020291597776, "flos": 21033203460480.0, "grad_norm": 1.7861279575653157, "language_loss": 0.89964712, "learning_rate": 3.4209192710126685e-06, "loss": 0.92463642, "num_input_tokens_seen": 4276505, "step": 203, "time_per_iteration": 2.668757438659668 }, { "auxiliary_loss_clip": 0.01276289, "auxiliary_loss_mlp": 0.01095021, "balance_loss_clip": 1.12578154, "balance_loss_mlp": 1.07470798, "epoch": 0.012265143544265745, "flos": 68447785075200.0, "grad_norm": 1.0265297625980543, "language_loss": 0.61255801, "learning_rate": 3.4240831701729837e-06, "loss": 0.63627112, "num_input_tokens_seen": 4330965, "step": 204, "time_per_iteration": 3.161599636077881 }, { "auxiliary_loss_clip": 0.01396271, "auxiliary_loss_mlp": 0.01111806, "balance_loss_clip": 1.11291122, "balance_loss_mlp": 1.05930579, "epoch": 0.012325266796933715, "flos": 17019252478080.0, "grad_norm": 2.3248674300118184, "language_loss": 0.91324663, "learning_rate": 3.4272315978819516e-06, "loss": 0.93832743, "num_input_tokens_seen": 4348200, "step": 205, "time_per_iteration": 2.6764047145843506 }, { "auxiliary_loss_clip": 0.01404558, "auxiliary_loss_mlp": 0.0112167, "balance_loss_clip": 1.11773109, "balance_loss_mlp": 1.06773925, "epoch": 0.012385390049601683, "flos": 20190056538240.0, "grad_norm": 2.1088315130515207, "language_loss": 0.89305568, "learning_rate": 3.4303647047142043e-06, "loss": 0.91831797, "num_input_tokens_seen": 4365460, "step": 206, "time_per_iteration": 2.7157227993011475 }, { "auxiliary_loss_clip": 0.0139534, "auxiliary_loss_mlp": 0.01100957, "balance_loss_clip": 1.11176991, "balance_loss_mlp": 1.04888678, "epoch": 0.012445513302269652, "flos": 16253134272000.0, "grad_norm": 2.399816031687551, "language_loss": 0.95542914, "learning_rate": 3.43348263905683e-06, "loss": 0.9803921, "num_input_tokens_seen": 4383650, "step": 207, "time_per_iteration": 2.611348867416382 }, { "auxiliary_loss_clip": 0.01393005, "auxiliary_loss_mlp": 0.01117764, "balance_loss_clip": 1.11658561, "balance_loss_mlp": 1.06497812, "epoch": 0.012505636554937622, "flos": 23769380954880.0, "grad_norm": 1.8144323603981871, "language_loss": 0.75985783, "learning_rate": 3.436585547151547e-06, "loss": 0.78496552, "num_input_tokens_seen": 4403765, "step": 208, "time_per_iteration": 2.7184154987335205 }, { "auxiliary_loss_clip": 0.0138146, "auxiliary_loss_mlp": 0.01108623, "balance_loss_clip": 1.11071992, "balance_loss_mlp": 1.05576587, "epoch": 0.012565759807605591, "flos": 30591546157440.0, "grad_norm": 2.2326965650696855, "language_loss": 0.98386943, "learning_rate": 3.4396735731358586e-06, "loss": 1.00877023, "num_input_tokens_seen": 4421935, "step": 209, "time_per_iteration": 2.7354249954223633 }, { "auxiliary_loss_clip": 0.01387012, "auxiliary_loss_mlp": 0.0111836, "balance_loss_clip": 1.11136842, "balance_loss_mlp": 1.06490695, "epoch": 0.012625883060273561, "flos": 40113511355520.0, "grad_norm": 9.084733304650118, "language_loss": 0.85514843, "learning_rate": 3.4427468590832302e-06, "loss": 0.88020217, "num_input_tokens_seen": 4441470, "step": 210, "time_per_iteration": 2.888749122619629 }, { "auxiliary_loss_clip": 0.01384384, "auxiliary_loss_mlp": 0.01121559, "balance_loss_clip": 1.11018038, "balance_loss_mlp": 1.07115781, "epoch": 0.01268600631294153, "flos": 27089178629760.0, "grad_norm": 3.431917100192063, "language_loss": 0.97194636, "learning_rate": 3.445805545042314e-06, "loss": 0.99700582, "num_input_tokens_seen": 4459950, "step": 211, "time_per_iteration": 2.7465193271636963 }, { "auxiliary_loss_clip": 0.01393556, "auxiliary_loss_mlp": 0.01123542, "balance_loss_clip": 1.11511767, "balance_loss_mlp": 1.06999326, "epoch": 0.012746129565609499, "flos": 16982767238400.0, "grad_norm": 2.3992368053115163, "language_loss": 0.9508543, "learning_rate": 3.448849769075239e-06, "loss": 0.97602528, "num_input_tokens_seen": 4478390, "step": 212, "time_per_iteration": 2.6340651512145996 }, { "auxiliary_loss_clip": 0.01381697, "auxiliary_loss_mlp": 0.01116386, "balance_loss_clip": 1.112149, "balance_loss_mlp": 1.06381512, "epoch": 0.012806252818277469, "flos": 46533476995200.0, "grad_norm": 1.701444843398511, "language_loss": 0.76078421, "learning_rate": 3.4518796672950093e-06, "loss": 0.78576505, "num_input_tokens_seen": 4501665, "step": 213, "time_per_iteration": 2.9250640869140625 }, { "auxiliary_loss_clip": 0.01385821, "auxiliary_loss_mlp": 0.01111776, "balance_loss_clip": 1.11002433, "balance_loss_mlp": 1.06056333, "epoch": 0.012866376070945438, "flos": 14388616120320.0, "grad_norm": 3.5300370267625922, "language_loss": 0.86698866, "learning_rate": 3.4548953739020187e-06, "loss": 0.89196461, "num_input_tokens_seen": 4519055, "step": 214, "time_per_iteration": 2.645289659500122 }, { "auxiliary_loss_clip": 0.01383455, "auxiliary_loss_mlp": 0.01128262, "balance_loss_clip": 1.1159339, "balance_loss_mlp": 1.07359219, "epoch": 0.012926499323613408, "flos": 26140813793280.0, "grad_norm": 2.14433888305053, "language_loss": 0.77582061, "learning_rate": 3.4578970212197196e-06, "loss": 0.80093777, "num_input_tokens_seen": 4540870, "step": 215, "time_per_iteration": 2.7315175533294678 }, { "auxiliary_loss_clip": 0.01391951, "auxiliary_loss_mlp": 0.01115104, "balance_loss_clip": 1.11440635, "balance_loss_mlp": 1.0638206, "epoch": 0.012986622576281377, "flos": 30117202128000.0, "grad_norm": 2.2964706747038233, "language_loss": 0.90423942, "learning_rate": 3.460884739729461e-06, "loss": 0.92930996, "num_input_tokens_seen": 4560395, "step": 216, "time_per_iteration": 2.724698781967163 }, { "auxiliary_loss_clip": 0.01384729, "auxiliary_loss_mlp": 0.01113374, "balance_loss_clip": 1.10847259, "balance_loss_mlp": 1.06096959, "epoch": 0.013046745828949347, "flos": 13954025468160.0, "grad_norm": 3.60062834696173, "language_loss": 0.93473232, "learning_rate": 3.463858658104523e-06, "loss": 0.95971346, "num_input_tokens_seen": 4575785, "step": 217, "time_per_iteration": 5.762276649475098 }, { "auxiliary_loss_clip": 0.01377712, "auxiliary_loss_mlp": 0.0110874, "balance_loss_clip": 1.10726643, "balance_loss_mlp": 1.05433273, "epoch": 0.013106869081617315, "flos": 17347835116800.0, "grad_norm": 1.943339896357513, "language_loss": 0.93811166, "learning_rate": 3.4668189032433696e-06, "loss": 0.96297616, "num_input_tokens_seen": 4594985, "step": 218, "time_per_iteration": 5.832701206207275 }, { "auxiliary_loss_clip": 0.01372884, "auxiliary_loss_mlp": 0.01106717, "balance_loss_clip": 1.10647273, "balance_loss_mlp": 1.05552888, "epoch": 0.013166992334285284, "flos": 25884914325120.0, "grad_norm": 2.252873600345955, "language_loss": 0.86196327, "learning_rate": 3.46976560030214e-06, "loss": 0.88675928, "num_input_tokens_seen": 4616125, "step": 219, "time_per_iteration": 2.794581651687622 }, { "auxiliary_loss_clip": 0.0137885, "auxiliary_loss_mlp": 0.01102953, "balance_loss_clip": 1.10957599, "balance_loss_mlp": 1.05188394, "epoch": 0.013227115586953254, "flos": 31175956437120.0, "grad_norm": 1.897987121161891, "language_loss": 0.8748548, "learning_rate": 3.4726988727263976e-06, "loss": 0.89967287, "num_input_tokens_seen": 4637795, "step": 220, "time_per_iteration": 2.799927234649658 }, { "auxiliary_loss_clip": 0.01370688, "auxiliary_loss_mlp": 0.01115596, "balance_loss_clip": 1.10440111, "balance_loss_mlp": 1.0679127, "epoch": 0.013287238839621223, "flos": 20409470766720.0, "grad_norm": 3.2557072980071795, "language_loss": 0.86437249, "learning_rate": 3.475618842282164e-06, "loss": 0.88923532, "num_input_tokens_seen": 4656835, "step": 221, "time_per_iteration": 2.7040672302246094 }, { "auxiliary_loss_clip": 0.01376134, "auxiliary_loss_mlp": 0.01116397, "balance_loss_clip": 1.10384834, "balance_loss_mlp": 1.0637064, "epoch": 0.013347362092289193, "flos": 14137134024960.0, "grad_norm": 2.585706849100757, "language_loss": 0.92369294, "learning_rate": 3.4785256290862486e-06, "loss": 0.94861829, "num_input_tokens_seen": 4673015, "step": 222, "time_per_iteration": 2.6648194789886475 }, { "auxiliary_loss_clip": 0.01373283, "auxiliary_loss_mlp": 0.01106423, "balance_loss_clip": 1.10636806, "balance_loss_mlp": 1.05156267, "epoch": 0.013407485344957162, "flos": 21797705554560.0, "grad_norm": 7.739608779999776, "language_loss": 0.95708215, "learning_rate": 3.481419351635897e-06, "loss": 0.98187923, "num_input_tokens_seen": 4692355, "step": 223, "time_per_iteration": 2.7261807918548584 }, { "auxiliary_loss_clip": 0.01374555, "auxiliary_loss_mlp": 0.0110963, "balance_loss_clip": 1.10768425, "balance_loss_mlp": 1.05870414, "epoch": 0.013467608597625132, "flos": 18621622195200.0, "grad_norm": 2.673591615227502, "language_loss": 0.88031876, "learning_rate": 3.484300126837776e-06, "loss": 0.90516055, "num_input_tokens_seen": 4710080, "step": 224, "time_per_iteration": 2.601686477661133 }, { "auxiliary_loss_clip": 0.01374533, "auxiliary_loss_mlp": 0.01103, "balance_loss_clip": 1.10679817, "balance_loss_mlp": 1.04804444, "epoch": 0.013527731850293101, "flos": 18552314903040.0, "grad_norm": 3.0722216996453535, "language_loss": 0.89625597, "learning_rate": 3.487168070036317e-06, "loss": 0.9210313, "num_input_tokens_seen": 4728980, "step": 225, "time_per_iteration": 2.6677513122558594 }, { "auxiliary_loss_clip": 0.01369955, "auxiliary_loss_mlp": 0.0112021, "balance_loss_clip": 1.10561275, "balance_loss_mlp": 1.06675696, "epoch": 0.01358785510296107, "flos": 19165381257600.0, "grad_norm": 1.9576206039109396, "language_loss": 0.98980033, "learning_rate": 3.4900232950414224e-06, "loss": 1.01470196, "num_input_tokens_seen": 4747020, "step": 226, "time_per_iteration": 2.8320930004119873 }, { "auxiliary_loss_clip": 0.01375268, "auxiliary_loss_mlp": 0.01110039, "balance_loss_clip": 1.10837173, "balance_loss_mlp": 1.05572701, "epoch": 0.01364797835562904, "flos": 23329941966720.0, "grad_norm": 2.3303410550109245, "language_loss": 0.90965348, "learning_rate": 3.4928659141555727e-06, "loss": 0.93450654, "num_input_tokens_seen": 4765000, "step": 227, "time_per_iteration": 2.648606061935425 }, { "auxiliary_loss_clip": 0.01255161, "auxiliary_loss_mlp": 0.01079249, "balance_loss_clip": 1.11229861, "balance_loss_mlp": 1.06017554, "epoch": 0.013708101608297009, "flos": 70993746097920.0, "grad_norm": 0.9472069433514878, "language_loss": 0.57650995, "learning_rate": 3.4956960382003234e-06, "loss": 0.59985405, "num_input_tokens_seen": 4833210, "step": 228, "time_per_iteration": 3.246328592300415 }, { "auxiliary_loss_clip": 0.01366835, "auxiliary_loss_mlp": 0.01117377, "balance_loss_clip": 1.10507822, "balance_loss_mlp": 1.06711841, "epoch": 0.013768224860964979, "flos": 16325170997760.0, "grad_norm": 2.957038430634678, "language_loss": 0.87773621, "learning_rate": 3.4985137765422354e-06, "loss": 0.90257835, "num_input_tokens_seen": 4850120, "step": 229, "time_per_iteration": 2.6319024562835693 }, { "auxiliary_loss_clip": 0.01375278, "auxiliary_loss_mlp": 0.01098609, "balance_loss_clip": 1.10567176, "balance_loss_mlp": 1.04873204, "epoch": 0.013828348113632948, "flos": 20193037367040.0, "grad_norm": 4.72663824849547, "language_loss": 0.83937395, "learning_rate": 3.501319237118231e-06, "loss": 0.86411285, "num_input_tokens_seen": 4866215, "step": 230, "time_per_iteration": 2.7026398181915283 }, { "auxiliary_loss_clip": 0.01373544, "auxiliary_loss_mlp": 0.01113683, "balance_loss_clip": 1.10701275, "balance_loss_mlp": 1.06361556, "epoch": 0.013888471366300916, "flos": 20741070147840.0, "grad_norm": 2.2562202151287867, "language_loss": 0.904212, "learning_rate": 3.5041125264604056e-06, "loss": 0.9290843, "num_input_tokens_seen": 4885630, "step": 231, "time_per_iteration": 2.6424474716186523 }, { "auxiliary_loss_clip": 0.01377759, "auxiliary_loss_mlp": 0.01110232, "balance_loss_clip": 1.11118639, "balance_loss_mlp": 1.06030726, "epoch": 0.013948594618968886, "flos": 22090628966400.0, "grad_norm": 2.0229562700819215, "language_loss": 0.83624899, "learning_rate": 3.5068937497203002e-06, "loss": 0.86112887, "num_input_tokens_seen": 4905570, "step": 232, "time_per_iteration": 2.621704339981079 }, { "auxiliary_loss_clip": 0.01377798, "auxiliary_loss_mlp": 0.01094369, "balance_loss_clip": 1.10229027, "balance_loss_mlp": 1.04253721, "epoch": 0.014008717871636855, "flos": 19063108258560.0, "grad_norm": 5.516695444379509, "language_loss": 0.74727643, "learning_rate": 3.509663010692652e-06, "loss": 0.77199805, "num_input_tokens_seen": 4923535, "step": 233, "time_per_iteration": 2.659188747406006 }, { "auxiliary_loss_clip": 0.01382744, "auxiliary_loss_mlp": 0.01125121, "balance_loss_clip": 1.1099937, "balance_loss_mlp": 1.0723356, "epoch": 0.014068841124304825, "flos": 14530822064640.0, "grad_norm": 2.5763093382937483, "language_loss": 0.85633421, "learning_rate": 3.512420411838642e-06, "loss": 0.88141286, "num_input_tokens_seen": 4939200, "step": 234, "time_per_iteration": 2.610635757446289 }, { "auxiliary_loss_clip": 0.01374562, "auxiliary_loss_mlp": 0.01114672, "balance_loss_clip": 1.10890436, "balance_loss_mlp": 1.06467605, "epoch": 0.014128964376972794, "flos": 18077396256000.0, "grad_norm": 2.467487286445388, "language_loss": 0.89192498, "learning_rate": 3.515166054308634e-06, "loss": 0.91681731, "num_input_tokens_seen": 4956620, "step": 235, "time_per_iteration": 2.668769359588623 }, { "auxiliary_loss_clip": 0.01373018, "auxiliary_loss_mlp": 0.01131641, "balance_loss_clip": 1.11011076, "balance_loss_mlp": 1.08073914, "epoch": 0.014189087629640764, "flos": 25334331678720.0, "grad_norm": 2.143165146200321, "language_loss": 0.85535377, "learning_rate": 3.5179000379644498e-06, "loss": 0.88040036, "num_input_tokens_seen": 4975650, "step": 236, "time_per_iteration": 2.7570323944091797 }, { "auxiliary_loss_clip": 0.01369632, "auxiliary_loss_mlp": 0.01100269, "balance_loss_clip": 1.10296702, "balance_loss_mlp": 1.04905629, "epoch": 0.014249210882308733, "flos": 36139744713600.0, "grad_norm": 2.1351980688483136, "language_loss": 0.82550979, "learning_rate": 3.520622461401154e-06, "loss": 0.85020876, "num_input_tokens_seen": 4997415, "step": 237, "time_per_iteration": 2.811617374420166 }, { "auxiliary_loss_clip": 0.01369728, "auxiliary_loss_mlp": 0.01124352, "balance_loss_clip": 1.10659075, "balance_loss_mlp": 1.07085085, "epoch": 0.014309334134976702, "flos": 12932977461120.0, "grad_norm": 2.0241581748099313, "language_loss": 0.77096599, "learning_rate": 3.5233334219683935e-06, "loss": 0.79590684, "num_input_tokens_seen": 5013905, "step": 238, "time_per_iteration": 2.8044662475585938 }, { "auxiliary_loss_clip": 0.01367496, "auxiliary_loss_mlp": 0.01111406, "balance_loss_clip": 1.10897434, "balance_loss_mlp": 1.06343579, "epoch": 0.014369457387644672, "flos": 20777519473920.0, "grad_norm": 1.8300428555870456, "language_loss": 0.8707583, "learning_rate": 3.526033015791284e-06, "loss": 0.89554727, "num_input_tokens_seen": 5033645, "step": 239, "time_per_iteration": 2.681452751159668 }, { "auxiliary_loss_clip": 0.01353036, "auxiliary_loss_mlp": 0.01103184, "balance_loss_clip": 1.10036874, "balance_loss_mlp": 1.05516672, "epoch": 0.01442958064031264, "flos": 25848536826240.0, "grad_norm": 2.109315431148974, "language_loss": 0.93055749, "learning_rate": 3.528721337790862e-06, "loss": 0.95511973, "num_input_tokens_seen": 5052875, "step": 240, "time_per_iteration": 2.679826021194458 }, { "auxiliary_loss_clip": 0.01360794, "auxiliary_loss_mlp": 0.01103084, "balance_loss_clip": 1.10475957, "balance_loss_mlp": 1.05611515, "epoch": 0.014489703892980611, "flos": 28219718269440.0, "grad_norm": 3.7136133710916575, "language_loss": 0.8482846, "learning_rate": 3.531398481704111e-06, "loss": 0.87292337, "num_input_tokens_seen": 5075005, "step": 241, "time_per_iteration": 2.679126262664795 }, { "auxiliary_loss_clip": 0.01359518, "auxiliary_loss_mlp": 0.01119602, "balance_loss_clip": 1.11010456, "balance_loss_mlp": 1.06931913, "epoch": 0.01454982714564858, "flos": 22490925108480.0, "grad_norm": 1.8502491938168453, "language_loss": 0.88590866, "learning_rate": 3.534064540103573e-06, "loss": 0.9106999, "num_input_tokens_seen": 5091875, "step": 242, "time_per_iteration": 2.7366583347320557 }, { "auxiliary_loss_clip": 0.01359534, "auxiliary_loss_mlp": 0.01104713, "balance_loss_clip": 1.10356677, "balance_loss_mlp": 1.05342889, "epoch": 0.014609950398316548, "flos": 21653201139840.0, "grad_norm": 2.261458758817042, "language_loss": 0.86688942, "learning_rate": 3.536719604416555e-06, "loss": 0.89153194, "num_input_tokens_seen": 5111290, "step": 243, "time_per_iteration": 2.764378070831299 }, { "auxiliary_loss_clip": 0.01364897, "auxiliary_loss_mlp": 0.01106776, "balance_loss_clip": 1.10636568, "balance_loss_mlp": 1.05656552, "epoch": 0.014670073650984519, "flos": 21869993675520.0, "grad_norm": 1.6964959858678799, "language_loss": 0.84256208, "learning_rate": 3.5393637649439464e-06, "loss": 0.86727887, "num_input_tokens_seen": 5132265, "step": 244, "time_per_iteration": 2.630441188812256 }, { "auxiliary_loss_clip": 0.01372266, "auxiliary_loss_mlp": 0.01115072, "balance_loss_clip": 1.10771632, "balance_loss_mlp": 1.06328762, "epoch": 0.014730196903652487, "flos": 23183713699200.0, "grad_norm": 8.49550264430495, "language_loss": 0.78613877, "learning_rate": 3.54199711087864e-06, "loss": 0.81101215, "num_input_tokens_seen": 5148575, "step": 245, "time_per_iteration": 2.6991443634033203 }, { "auxiliary_loss_clip": 0.01371598, "auxiliary_loss_mlp": 0.0110404, "balance_loss_clip": 1.10405719, "balance_loss_mlp": 1.05008554, "epoch": 0.014790320156320457, "flos": 23222605150080.0, "grad_norm": 2.2582939339926305, "language_loss": 0.84165329, "learning_rate": 3.5446197303235913e-06, "loss": 0.86640966, "num_input_tokens_seen": 5170415, "step": 246, "time_per_iteration": 2.726743221282959 }, { "auxiliary_loss_clip": 0.01365538, "auxiliary_loss_mlp": 0.01101456, "balance_loss_clip": 1.10242295, "balance_loss_mlp": 1.05062532, "epoch": 0.014850443408988426, "flos": 15815490963840.0, "grad_norm": 1.9870849133800452, "language_loss": 0.89958012, "learning_rate": 3.5472317103095034e-06, "loss": 0.92425001, "num_input_tokens_seen": 5188565, "step": 247, "time_per_iteration": 2.5998406410217285 }, { "auxiliary_loss_clip": 0.01364581, "auxiliary_loss_mlp": 0.01098108, "balance_loss_clip": 1.09896278, "balance_loss_mlp": 1.0489223, "epoch": 0.014910566661656396, "flos": 22781657790720.0, "grad_norm": 2.0527635487774343, "language_loss": 0.783005, "learning_rate": 3.549833136812155e-06, "loss": 0.80763197, "num_input_tokens_seen": 5207810, "step": 248, "time_per_iteration": 2.689784049987793 }, { "auxiliary_loss_clip": 0.01365896, "auxiliary_loss_mlp": 0.01110511, "balance_loss_clip": 1.10732806, "balance_loss_mlp": 1.06044269, "epoch": 0.014970689914324365, "flos": 26865023806080.0, "grad_norm": 1.9405946352322343, "language_loss": 0.83855766, "learning_rate": 3.552424094769381e-06, "loss": 0.86332172, "num_input_tokens_seen": 5226210, "step": 249, "time_per_iteration": 2.8210339546203613 }, { "auxiliary_loss_clip": 0.01358179, "auxiliary_loss_mlp": 0.01106801, "balance_loss_clip": 1.10089588, "balance_loss_mlp": 1.05802023, "epoch": 0.015030813166992334, "flos": 13985662371840.0, "grad_norm": 2.0689026358419786, "language_loss": 0.93631709, "learning_rate": 3.5550046680977174e-06, "loss": 0.96096689, "num_input_tokens_seen": 5241660, "step": 250, "time_per_iteration": 2.7074570655822754 }, { "auxiliary_loss_clip": 0.01368183, "auxiliary_loss_mlp": 0.01115393, "balance_loss_clip": 1.1065619, "balance_loss_mlp": 1.06415713, "epoch": 0.015090936419660304, "flos": 24717817618560.0, "grad_norm": 2.6509740932573127, "language_loss": 0.9678722, "learning_rate": 3.5575749397087034e-06, "loss": 0.99270797, "num_input_tokens_seen": 5261090, "step": 251, "time_per_iteration": 2.6740176677703857 }, { "auxiliary_loss_clip": 0.01361249, "auxiliary_loss_mlp": 0.01108489, "balance_loss_clip": 1.10063529, "balance_loss_mlp": 1.0597558, "epoch": 0.015151059672328273, "flos": 25738793798400.0, "grad_norm": 1.996044018630987, "language_loss": 0.84516245, "learning_rate": 3.5601349915248707e-06, "loss": 0.86985981, "num_input_tokens_seen": 5279175, "step": 252, "time_per_iteration": 2.7198123931884766 }, { "auxiliary_loss_clip": 0.01356789, "auxiliary_loss_mlp": 0.0111346, "balance_loss_clip": 1.1023767, "balance_loss_mlp": 1.06346345, "epoch": 0.015211182924996243, "flos": 21871214737920.0, "grad_norm": 2.3132428526475275, "language_loss": 0.98516917, "learning_rate": 3.5626849044954064e-06, "loss": 1.0098716, "num_input_tokens_seen": 5296975, "step": 253, "time_per_iteration": 2.6751561164855957 }, { "auxiliary_loss_clip": 0.01244193, "auxiliary_loss_mlp": 0.01100072, "balance_loss_clip": 1.1058414, "balance_loss_mlp": 1.08338308, "epoch": 0.015271306177664212, "flos": 66895080888960.0, "grad_norm": 0.8719135194962525, "language_loss": 0.55628473, "learning_rate": 3.5652247586115167e-06, "loss": 0.57972741, "num_input_tokens_seen": 5358375, "step": 254, "time_per_iteration": 3.2305996417999268 }, { "auxiliary_loss_clip": 0.0136146, "auxiliary_loss_mlp": 0.01119692, "balance_loss_clip": 1.0985806, "balance_loss_mlp": 1.06952846, "epoch": 0.01533142943033218, "flos": 26834069260800.0, "grad_norm": 2.113472843461701, "language_loss": 0.90234184, "learning_rate": 3.567754632921479e-06, "loss": 0.92715329, "num_input_tokens_seen": 5377255, "step": 255, "time_per_iteration": 2.7138473987579346 }, { "auxiliary_loss_clip": 0.01357311, "auxiliary_loss_mlp": 0.01137867, "balance_loss_clip": 1.1001389, "balance_loss_mlp": 1.08803785, "epoch": 0.01539155268300015, "flos": 20813753318400.0, "grad_norm": 2.320838285045027, "language_loss": 0.85392761, "learning_rate": 3.5702746055454075e-06, "loss": 0.87887937, "num_input_tokens_seen": 5395320, "step": 256, "time_per_iteration": 2.7135775089263916 }, { "auxiliary_loss_clip": 0.01363873, "auxiliary_loss_mlp": 0.0112257, "balance_loss_clip": 1.10053098, "balance_loss_mlp": 1.07281172, "epoch": 0.01545167593566812, "flos": 15961862885760.0, "grad_norm": 4.480294478847577, "language_loss": 0.71472508, "learning_rate": 3.5727847536897254e-06, "loss": 0.73958945, "num_input_tokens_seen": 5411970, "step": 257, "time_per_iteration": 6.340675592422485 }, { "auxiliary_loss_clip": 0.01355912, "auxiliary_loss_mlp": 0.01112611, "balance_loss_clip": 1.10014856, "balance_loss_mlp": 1.06280565, "epoch": 0.01551179918833609, "flos": 22601745544320.0, "grad_norm": 2.0292888191897673, "language_loss": 0.94713151, "learning_rate": 3.5752851536613596e-06, "loss": 0.97181678, "num_input_tokens_seen": 5430245, "step": 258, "time_per_iteration": 5.674164772033691 }, { "auxiliary_loss_clip": 0.01356656, "auxiliary_loss_mlp": 0.01113313, "balance_loss_clip": 1.09867072, "balance_loss_mlp": 1.0645566, "epoch": 0.015571922441004058, "flos": 22816706486400.0, "grad_norm": 2.3215886633849236, "language_loss": 0.93037683, "learning_rate": 3.577775880881658e-06, "loss": 0.95507646, "num_input_tokens_seen": 5448905, "step": 259, "time_per_iteration": 2.6286497116088867 }, { "auxiliary_loss_clip": 0.01348977, "auxiliary_loss_mlp": 0.01102171, "balance_loss_clip": 1.10076857, "balance_loss_mlp": 1.05625176, "epoch": 0.015632045693672027, "flos": 18947439486720.0, "grad_norm": 1.9575053933526474, "language_loss": 0.97368109, "learning_rate": 3.5802570099000424e-06, "loss": 0.99819261, "num_input_tokens_seen": 5466405, "step": 260, "time_per_iteration": 2.625072717666626 }, { "auxiliary_loss_clip": 0.01362999, "auxiliary_loss_mlp": 0.01127943, "balance_loss_clip": 1.1010474, "balance_loss_mlp": 1.07940137, "epoch": 0.015692168946339995, "flos": 29971728046080.0, "grad_norm": 2.2828802632863305, "language_loss": 0.87807435, "learning_rate": 3.5827286144073947e-06, "loss": 0.90298378, "num_input_tokens_seen": 5487055, "step": 261, "time_per_iteration": 2.6737279891967773 }, { "auxiliary_loss_clip": 0.01357008, "auxiliary_loss_mlp": 0.01125312, "balance_loss_clip": 1.09822345, "balance_loss_mlp": 1.07665133, "epoch": 0.015752292199007967, "flos": 19392085946880.0, "grad_norm": 5.057676675675106, "language_loss": 0.67100549, "learning_rate": 3.5851907672491904e-06, "loss": 0.69582868, "num_input_tokens_seen": 5506600, "step": 262, "time_per_iteration": 2.651690721511841 }, { "auxiliary_loss_clip": 0.01353953, "auxiliary_loss_mlp": 0.01135541, "balance_loss_clip": 1.09924924, "balance_loss_mlp": 1.08499634, "epoch": 0.015812415451675936, "flos": 20339804338560.0, "grad_norm": 3.0820356667611337, "language_loss": 0.68077701, "learning_rate": 3.587643540438383e-06, "loss": 0.70567191, "num_input_tokens_seen": 5524350, "step": 263, "time_per_iteration": 2.6885130405426025 }, { "auxiliary_loss_clip": 0.01355592, "auxiliary_loss_mlp": 0.01116799, "balance_loss_clip": 1.09620881, "balance_loss_mlp": 1.06766081, "epoch": 0.015872538704343905, "flos": 17525412979200.0, "grad_norm": 3.9089218881424674, "language_loss": 0.85002583, "learning_rate": 3.590087005168037e-06, "loss": 0.87474978, "num_input_tokens_seen": 5542145, "step": 264, "time_per_iteration": 2.6557912826538086 }, { "auxiliary_loss_clip": 0.01360388, "auxiliary_loss_mlp": 0.01102763, "balance_loss_clip": 1.10088885, "balance_loss_mlp": 1.056319, "epoch": 0.015932661957011873, "flos": 15260490944640.0, "grad_norm": 2.7020928553211476, "language_loss": 1.04234743, "learning_rate": 3.5925212318237344e-06, "loss": 1.06697881, "num_input_tokens_seen": 5557920, "step": 265, "time_per_iteration": 2.6262216567993164 }, { "auxiliary_loss_clip": 0.01364512, "auxiliary_loss_mlp": 0.01120309, "balance_loss_clip": 1.1033864, "balance_loss_mlp": 1.06835794, "epoch": 0.015992785209679845, "flos": 20302528999680.0, "grad_norm": 3.1220748516520134, "language_loss": 0.74914098, "learning_rate": 3.5949462899957323e-06, "loss": 0.7739892, "num_input_tokens_seen": 5576290, "step": 266, "time_per_iteration": 2.6244583129882812 }, { "auxiliary_loss_clip": 0.01349738, "auxiliary_loss_mlp": 0.0111189, "balance_loss_clip": 1.1000762, "balance_loss_mlp": 1.06206095, "epoch": 0.016052908462347814, "flos": 23362368969600.0, "grad_norm": 1.8166776194063956, "language_loss": 0.90909529, "learning_rate": 3.5973622484909068e-06, "loss": 0.93371153, "num_input_tokens_seen": 5595205, "step": 267, "time_per_iteration": 2.6753580570220947 }, { "auxiliary_loss_clip": 0.01359091, "auxiliary_loss_mlp": 0.01115968, "balance_loss_clip": 1.10122573, "balance_loss_mlp": 1.06797481, "epoch": 0.016113031715015783, "flos": 21286588976640.0, "grad_norm": 2.450608875877181, "language_loss": 0.85636413, "learning_rate": 3.599769175344462e-06, "loss": 0.88111478, "num_input_tokens_seen": 5612645, "step": 268, "time_per_iteration": 2.7161567211151123 }, { "auxiliary_loss_clip": 0.01351132, "auxiliary_loss_mlp": 0.01102276, "balance_loss_clip": 1.10226274, "balance_loss_mlp": 1.05475891, "epoch": 0.01617315496768375, "flos": 18914689261440.0, "grad_norm": 2.1714201716772457, "language_loss": 0.88080788, "learning_rate": 3.602167137831432e-06, "loss": 0.90534198, "num_input_tokens_seen": 5628345, "step": 269, "time_per_iteration": 2.6403756141662598 }, { "auxiliary_loss_clip": 0.01357907, "auxiliary_loss_mlp": 0.01111574, "balance_loss_clip": 1.10001528, "balance_loss_mlp": 1.06021833, "epoch": 0.01623327822035172, "flos": 16546488647040.0, "grad_norm": 2.5848702107942803, "language_loss": 0.97077739, "learning_rate": 3.6045562024779565e-06, "loss": 0.99547219, "num_input_tokens_seen": 5645940, "step": 270, "time_per_iteration": 2.635546922683716 }, { "auxiliary_loss_clip": 0.01356007, "auxiliary_loss_mlp": 0.01118132, "balance_loss_clip": 1.10402, "balance_loss_mlp": 1.06918478, "epoch": 0.016293401473019692, "flos": 23513481486720.0, "grad_norm": 2.1115750591463223, "language_loss": 0.86112005, "learning_rate": 3.606936435072361e-06, "loss": 0.8858614, "num_input_tokens_seen": 5665690, "step": 271, "time_per_iteration": 2.6877286434173584 }, { "auxiliary_loss_clip": 0.013537, "auxiliary_loss_mlp": 0.01105687, "balance_loss_clip": 1.0962286, "balance_loss_mlp": 1.057693, "epoch": 0.01635352472568766, "flos": 29016072748800.0, "grad_norm": 2.5391912683658413, "language_loss": 0.81550127, "learning_rate": 3.609307900676025e-06, "loss": 0.84009504, "num_input_tokens_seen": 5683190, "step": 272, "time_per_iteration": 2.6728365421295166 }, { "auxiliary_loss_clip": 0.01348527, "auxiliary_loss_mlp": 0.01120864, "balance_loss_clip": 1.09806561, "balance_loss_mlp": 1.07368064, "epoch": 0.01641364797835563, "flos": 13370513028480.0, "grad_norm": 2.3613573538590487, "language_loss": 0.81075382, "learning_rate": 3.611670663634051e-06, "loss": 0.83544779, "num_input_tokens_seen": 5699780, "step": 273, "time_per_iteration": 2.595008134841919 }, { "auxiliary_loss_clip": 0.01346135, "auxiliary_loss_mlp": 0.01105539, "balance_loss_clip": 1.09398317, "balance_loss_mlp": 1.05749762, "epoch": 0.016473771231023598, "flos": 18878239935360.0, "grad_norm": 2.1979313648400547, "language_loss": 0.9131726, "learning_rate": 3.614024787585744e-06, "loss": 0.9376893, "num_input_tokens_seen": 5716980, "step": 274, "time_per_iteration": 2.684718132019043 }, { "auxiliary_loss_clip": 0.013432, "auxiliary_loss_mlp": 0.01108715, "balance_loss_clip": 1.09515727, "balance_loss_mlp": 1.06062579, "epoch": 0.016533894483691566, "flos": 22601637803520.0, "grad_norm": 1.9719932168994616, "language_loss": 0.88054645, "learning_rate": 3.6163703354748927e-06, "loss": 0.90506566, "num_input_tokens_seen": 5737780, "step": 275, "time_per_iteration": 2.7204532623291016 }, { "auxiliary_loss_clip": 0.01346726, "auxiliary_loss_mlp": 0.01102856, "balance_loss_clip": 1.09623361, "balance_loss_mlp": 1.05312169, "epoch": 0.01659401773635954, "flos": 21507188353920.0, "grad_norm": 1.7930545784536995, "language_loss": 0.80726624, "learning_rate": 3.6187073695598707e-06, "loss": 0.83176208, "num_input_tokens_seen": 5758330, "step": 276, "time_per_iteration": 3.04716157913208 }, { "auxiliary_loss_clip": 0.0133817, "auxiliary_loss_mlp": 0.01096103, "balance_loss_clip": 1.09588337, "balance_loss_mlp": 1.05220985, "epoch": 0.016654140989027507, "flos": 32850973411200.0, "grad_norm": 1.9196343116615175, "language_loss": 0.80707026, "learning_rate": 3.621035951423551e-06, "loss": 0.83141291, "num_input_tokens_seen": 5778340, "step": 277, "time_per_iteration": 2.809645652770996 }, { "auxiliary_loss_clip": 0.01337061, "auxiliary_loss_mlp": 0.0109637, "balance_loss_clip": 1.08979487, "balance_loss_mlp": 1.04923487, "epoch": 0.016714264241695476, "flos": 12306228024960.0, "grad_norm": 2.3224792061881185, "language_loss": 0.80508065, "learning_rate": 3.623356141983041e-06, "loss": 0.82941496, "num_input_tokens_seen": 5794295, "step": 278, "time_per_iteration": 2.604830741882324 }, { "auxiliary_loss_clip": 0.01341116, "auxiliary_loss_mlp": 0.01101968, "balance_loss_clip": 1.09395671, "balance_loss_mlp": 1.05585837, "epoch": 0.016774387494363444, "flos": 27123796362240.0, "grad_norm": 2.0021377353660057, "language_loss": 0.90582991, "learning_rate": 3.6256680014992486e-06, "loss": 0.93026078, "num_input_tokens_seen": 5814405, "step": 279, "time_per_iteration": 2.7193243503570557 }, { "auxiliary_loss_clip": 0.01346095, "auxiliary_loss_mlp": 0.01112065, "balance_loss_clip": 1.09383631, "balance_loss_mlp": 1.06450009, "epoch": 0.016834510747031413, "flos": 20191493082240.0, "grad_norm": 2.9314445951013988, "language_loss": 0.94049025, "learning_rate": 3.6279715895862713e-06, "loss": 0.96507192, "num_input_tokens_seen": 5832795, "step": 280, "time_per_iteration": 2.680924654006958 }, { "auxiliary_loss_clip": 0.01346658, "auxiliary_loss_mlp": 0.01109166, "balance_loss_clip": 1.09285879, "balance_loss_mlp": 1.06060064, "epoch": 0.016894633999699385, "flos": 27274262434560.0, "grad_norm": 2.6758913403282483, "language_loss": 0.74425459, "learning_rate": 3.6302669652206183e-06, "loss": 0.76881289, "num_input_tokens_seen": 5855750, "step": 281, "time_per_iteration": 2.691152811050415 }, { "auxiliary_loss_clip": 0.01343371, "auxiliary_loss_mlp": 0.01117708, "balance_loss_clip": 1.09609079, "balance_loss_mlp": 1.0724318, "epoch": 0.016954757252367354, "flos": 14902964922240.0, "grad_norm": 3.4878028680462005, "language_loss": 0.80255079, "learning_rate": 3.632554186750274e-06, "loss": 0.82716167, "num_input_tokens_seen": 5872610, "step": 282, "time_per_iteration": 2.592664957046509 }, { "auxiliary_loss_clip": 0.01348082, "auxiliary_loss_mlp": 0.01118449, "balance_loss_clip": 1.09700727, "balance_loss_mlp": 1.07114697, "epoch": 0.017014880505035322, "flos": 21358805270400.0, "grad_norm": 2.296781711700251, "language_loss": 0.77719986, "learning_rate": 3.6348333119035937e-06, "loss": 0.80186516, "num_input_tokens_seen": 5892985, "step": 283, "time_per_iteration": 2.6502227783203125 }, { "auxiliary_loss_clip": 0.01347311, "auxiliary_loss_mlp": 0.01092934, "balance_loss_clip": 1.0977478, "balance_loss_mlp": 1.04804015, "epoch": 0.01707500375770329, "flos": 35333154858240.0, "grad_norm": 2.3467060832193414, "language_loss": 0.84246969, "learning_rate": 3.6371043977980503e-06, "loss": 0.86687213, "num_input_tokens_seen": 5914060, "step": 284, "time_per_iteration": 2.8534958362579346 }, { "auxiliary_loss_clip": 0.01337962, "auxiliary_loss_mlp": 0.01100399, "balance_loss_clip": 1.09212708, "balance_loss_mlp": 1.05297756, "epoch": 0.01713512701037126, "flos": 23582070506880.0, "grad_norm": 2.7335752956200388, "language_loss": 0.96998906, "learning_rate": 3.639367500948819e-06, "loss": 0.99437273, "num_input_tokens_seen": 5932860, "step": 285, "time_per_iteration": 2.6338655948638916 }, { "auxiliary_loss_clip": 0.01341319, "auxiliary_loss_mlp": 0.01095606, "balance_loss_clip": 1.09538078, "balance_loss_mlp": 1.05123687, "epoch": 0.01719525026303923, "flos": 27634661544960.0, "grad_norm": 2.294843469150046, "language_loss": 0.94079655, "learning_rate": 3.6416226772772178e-06, "loss": 0.96516573, "num_input_tokens_seen": 5952725, "step": 286, "time_per_iteration": 2.711087942123413 }, { "auxiliary_loss_clip": 0.01332862, "auxiliary_loss_mlp": 0.0109035, "balance_loss_clip": 1.08986938, "balance_loss_mlp": 1.04409683, "epoch": 0.0172553735157072, "flos": 26979722910720.0, "grad_norm": 1.9277896882465477, "language_loss": 0.92464817, "learning_rate": 3.643869982119001e-06, "loss": 0.94888031, "num_input_tokens_seen": 5970560, "step": 287, "time_per_iteration": 2.640267848968506 }, { "auxiliary_loss_clip": 0.01338192, "auxiliary_loss_mlp": 0.01092315, "balance_loss_clip": 1.09039164, "balance_loss_mlp": 1.04651475, "epoch": 0.01731549676837517, "flos": 14056621689600.0, "grad_norm": 2.7883535936791035, "language_loss": 1.01873291, "learning_rate": 3.646109470232502e-06, "loss": 1.04303789, "num_input_tokens_seen": 5982980, "step": 288, "time_per_iteration": 2.558312177658081 }, { "auxiliary_loss_clip": 0.01225082, "auxiliary_loss_mlp": 0.01188305, "balance_loss_clip": 1.09194219, "balance_loss_mlp": 1.17228377, "epoch": 0.017375620021043137, "flos": 66510694471680.0, "grad_norm": 0.9289960013542303, "language_loss": 0.63867617, "learning_rate": 3.6483411958066417e-06, "loss": 0.66281009, "num_input_tokens_seen": 6049445, "step": 289, "time_per_iteration": 3.386254072189331 }, { "auxiliary_loss_clip": 0.01341215, "auxiliary_loss_mlp": 0.01107788, "balance_loss_clip": 1.09622383, "balance_loss_mlp": 1.06482446, "epoch": 0.01743574327371111, "flos": 15225154940160.0, "grad_norm": 2.368974734045724, "language_loss": 0.88156199, "learning_rate": 3.6505652124687957e-06, "loss": 0.90605205, "num_input_tokens_seen": 6064150, "step": 290, "time_per_iteration": 2.5670948028564453 }, { "auxiliary_loss_clip": 0.0133848, "auxiliary_loss_mlp": 0.010946, "balance_loss_clip": 1.09388971, "balance_loss_mlp": 1.04965782, "epoch": 0.017495866526379078, "flos": 25373869574400.0, "grad_norm": 2.2011772664145504, "language_loss": 0.84472585, "learning_rate": 3.6527815732925258e-06, "loss": 0.8690567, "num_input_tokens_seen": 6083920, "step": 291, "time_per_iteration": 2.648452043533325 }, { "auxiliary_loss_clip": 0.01343563, "auxiliary_loss_mlp": 0.01115116, "balance_loss_clip": 1.10129941, "balance_loss_mlp": 1.06607366, "epoch": 0.017555989779047047, "flos": 26359473836160.0, "grad_norm": 1.7675259544479762, "language_loss": 0.72679955, "learning_rate": 3.6549903308051806e-06, "loss": 0.75138628, "num_input_tokens_seen": 6105460, "step": 292, "time_per_iteration": 2.7239537239074707 }, { "auxiliary_loss_clip": 0.01334066, "auxiliary_loss_mlp": 0.01107289, "balance_loss_clip": 1.09397244, "balance_loss_mlp": 1.06170392, "epoch": 0.017616113031715015, "flos": 22338807010560.0, "grad_norm": 2.419616990787406, "language_loss": 0.86866581, "learning_rate": 3.6571915369953646e-06, "loss": 0.89307928, "num_input_tokens_seen": 6122890, "step": 293, "time_per_iteration": 2.642854690551758 }, { "auxiliary_loss_clip": 0.01333726, "auxiliary_loss_mlp": 0.0110557, "balance_loss_clip": 1.09271646, "balance_loss_mlp": 1.06086659, "epoch": 0.017676236284382984, "flos": 20156911263360.0, "grad_norm": 2.112624444766753, "language_loss": 0.80896151, "learning_rate": 3.6593852433202797e-06, "loss": 0.83335447, "num_input_tokens_seen": 6142890, "step": 294, "time_per_iteration": 2.598176956176758 }, { "auxiliary_loss_clip": 0.01334179, "auxiliary_loss_mlp": 0.01113433, "balance_loss_clip": 1.09030747, "balance_loss_mlp": 1.06892014, "epoch": 0.017736359537050956, "flos": 25223331674880.0, "grad_norm": 2.8289841764142416, "language_loss": 0.83806521, "learning_rate": 3.6615715007129453e-06, "loss": 0.86254132, "num_input_tokens_seen": 6162030, "step": 295, "time_per_iteration": 2.750103712081909 }, { "auxiliary_loss_clip": 0.01339845, "auxiliary_loss_mlp": 0.01121984, "balance_loss_clip": 1.09978509, "balance_loss_mlp": 1.0772326, "epoch": 0.017796482789718925, "flos": 20338798757760.0, "grad_norm": 1.8804378237246864, "language_loss": 0.84576106, "learning_rate": 3.6637503595892897e-06, "loss": 0.87037927, "num_input_tokens_seen": 6180540, "step": 296, "time_per_iteration": 4.154251337051392 }, { "auxiliary_loss_clip": 0.01337678, "auxiliary_loss_mlp": 0.01105295, "balance_loss_clip": 1.09463406, "balance_loss_mlp": 1.06154561, "epoch": 0.017856606042386893, "flos": 22379206832640.0, "grad_norm": 2.055710812588959, "language_loss": 0.87810111, "learning_rate": 3.665921869855132e-06, "loss": 0.90253091, "num_input_tokens_seen": 6199425, "step": 297, "time_per_iteration": 4.379676103591919 }, { "auxiliary_loss_clip": 0.0133717, "auxiliary_loss_mlp": 0.01103766, "balance_loss_clip": 1.09343684, "balance_loss_mlp": 1.06004047, "epoch": 0.017916729295054862, "flos": 20230061310720.0, "grad_norm": 2.689351030321763, "language_loss": 0.88947791, "learning_rate": 3.6680860809130346e-06, "loss": 0.91388726, "num_input_tokens_seen": 6219170, "step": 298, "time_per_iteration": 4.1055779457092285 }, { "auxiliary_loss_clip": 0.01333843, "auxiliary_loss_mlp": 0.01121179, "balance_loss_clip": 1.09470236, "balance_loss_mlp": 1.07499719, "epoch": 0.01797685254772283, "flos": 19390972625280.0, "grad_norm": 1.8935027270905305, "language_loss": 0.88550889, "learning_rate": 3.6702430416690516e-06, "loss": 0.91005915, "num_input_tokens_seen": 6237930, "step": 299, "time_per_iteration": 2.611168622970581 }, { "auxiliary_loss_clip": 0.0133938, "auxiliary_loss_mlp": 0.0110718, "balance_loss_clip": 1.09468794, "balance_loss_mlp": 1.06130886, "epoch": 0.018036975800390802, "flos": 24426007528320.0, "grad_norm": 4.075580609786654, "language_loss": 0.64664406, "learning_rate": 3.672392800539357e-06, "loss": 0.67110968, "num_input_tokens_seen": 6257170, "step": 300, "time_per_iteration": 2.645603656768799 }, { "auxiliary_loss_clip": 0.01338559, "auxiliary_loss_mlp": 0.01111665, "balance_loss_clip": 1.09775913, "balance_loss_mlp": 1.06636548, "epoch": 0.01809709905305877, "flos": 15778933896960.0, "grad_norm": 2.5071418214687515, "language_loss": 0.87940675, "learning_rate": 3.6745354054567686e-06, "loss": 0.90390897, "num_input_tokens_seen": 6274780, "step": 301, "time_per_iteration": 2.6035923957824707 }, { "auxiliary_loss_clip": 0.01238361, "auxiliary_loss_mlp": 0.01073699, "balance_loss_clip": 1.1100142, "balance_loss_mlp": 1.05901265, "epoch": 0.01815722230572674, "flos": 67348382526720.0, "grad_norm": 0.8350739260664176, "language_loss": 0.62219667, "learning_rate": 3.676670903877158e-06, "loss": 0.64531732, "num_input_tokens_seen": 6340435, "step": 302, "time_per_iteration": 3.3307297229766846 }, { "auxiliary_loss_clip": 0.0132981, "auxiliary_loss_mlp": 0.01110918, "balance_loss_clip": 1.0910126, "balance_loss_mlp": 1.06507051, "epoch": 0.01821734555839471, "flos": 15485615435520.0, "grad_norm": 2.115144575016314, "language_loss": 0.89737153, "learning_rate": 3.6787993427857567e-06, "loss": 0.9217788, "num_input_tokens_seen": 6358160, "step": 303, "time_per_iteration": 2.6773293018341064 }, { "auxiliary_loss_clip": 0.01335628, "auxiliary_loss_mlp": 0.01118481, "balance_loss_clip": 1.09579217, "balance_loss_mlp": 1.07237101, "epoch": 0.018277468811062677, "flos": 24097424889600.0, "grad_norm": 1.8670669350935472, "language_loss": 0.80417514, "learning_rate": 3.680920768703364e-06, "loss": 0.82871628, "num_input_tokens_seen": 6378485, "step": 304, "time_per_iteration": 2.691347360610962 }, { "auxiliary_loss_clip": 0.01330802, "auxiliary_loss_mlp": 0.01091671, "balance_loss_clip": 1.09832263, "balance_loss_mlp": 1.04858923, "epoch": 0.01833759206373065, "flos": 20959335141120.0, "grad_norm": 1.6863564291935742, "language_loss": 0.82761526, "learning_rate": 3.6830352276924415e-06, "loss": 0.85184002, "num_input_tokens_seen": 6397845, "step": 305, "time_per_iteration": 2.6883981227874756 }, { "auxiliary_loss_clip": 0.01330759, "auxiliary_loss_mlp": 0.01093908, "balance_loss_clip": 1.09012437, "balance_loss_mlp": 1.05115986, "epoch": 0.018397715316398618, "flos": 19390757143680.0, "grad_norm": 2.1780708917523297, "language_loss": 0.91148543, "learning_rate": 3.685142765363119e-06, "loss": 0.93573213, "num_input_tokens_seen": 6416475, "step": 306, "time_per_iteration": 2.6465187072753906 }, { "auxiliary_loss_clip": 0.01324743, "auxiliary_loss_mlp": 0.01091696, "balance_loss_clip": 1.08900762, "balance_loss_mlp": 1.04882836, "epoch": 0.018457838569066586, "flos": 29132531619840.0, "grad_norm": 3.4680205003751072, "language_loss": 0.86581063, "learning_rate": 3.687243426879095e-06, "loss": 0.88997507, "num_input_tokens_seen": 6437520, "step": 307, "time_per_iteration": 2.7787318229675293 }, { "auxiliary_loss_clip": 0.01326572, "auxiliary_loss_mlp": 0.01110018, "balance_loss_clip": 1.09346747, "balance_loss_mlp": 1.06247783, "epoch": 0.018517961821734555, "flos": 19208654167680.0, "grad_norm": 2.413130156754219, "language_loss": 0.71650648, "learning_rate": 3.6893372569634466e-06, "loss": 0.74087244, "num_input_tokens_seen": 6455680, "step": 308, "time_per_iteration": 2.652973175048828 }, { "auxiliary_loss_clip": 0.01331912, "auxiliary_loss_mlp": 0.01102766, "balance_loss_clip": 1.09061241, "balance_loss_mlp": 1.05911207, "epoch": 0.018578085074402523, "flos": 19863018184320.0, "grad_norm": 2.1869498369051077, "language_loss": 0.91841364, "learning_rate": 3.6914242999043395e-06, "loss": 0.94276047, "num_input_tokens_seen": 6474880, "step": 309, "time_per_iteration": 2.6613030433654785 }, { "auxiliary_loss_clip": 0.01339178, "auxiliary_loss_mlp": 0.01096668, "balance_loss_clip": 1.09145641, "balance_loss_mlp": 1.05084395, "epoch": 0.018638208327070496, "flos": 29606947476480.0, "grad_norm": 2.0400456475786353, "language_loss": 0.72784412, "learning_rate": 3.69350459956065e-06, "loss": 0.75220263, "num_input_tokens_seen": 6495945, "step": 310, "time_per_iteration": 2.705345392227173 }, { "auxiliary_loss_clip": 0.01331019, "auxiliary_loss_mlp": 0.01113021, "balance_loss_clip": 1.09560525, "balance_loss_mlp": 1.06922317, "epoch": 0.018698331579738464, "flos": 45731555907840.0, "grad_norm": 2.1345597100799645, "language_loss": 0.74162471, "learning_rate": 3.695578199367497e-06, "loss": 0.76606506, "num_input_tokens_seen": 6519930, "step": 311, "time_per_iteration": 2.846503496170044 }, { "auxiliary_loss_clip": 0.01338389, "auxiliary_loss_mlp": 0.01104203, "balance_loss_clip": 1.09206033, "balance_loss_mlp": 1.0609777, "epoch": 0.018758454832406433, "flos": 20483662308480.0, "grad_norm": 3.713635021153945, "language_loss": 0.91668129, "learning_rate": 3.6976451423416825e-06, "loss": 0.94110715, "num_input_tokens_seen": 6535070, "step": 312, "time_per_iteration": 2.598400592803955 }, { "auxiliary_loss_clip": 0.01339145, "auxiliary_loss_mlp": 0.01116197, "balance_loss_clip": 1.09512305, "balance_loss_mlp": 1.07034922, "epoch": 0.0188185780850744, "flos": 15777784661760.0, "grad_norm": 4.5530066286460045, "language_loss": 0.89634913, "learning_rate": 3.699705471087043e-06, "loss": 0.92090249, "num_input_tokens_seen": 6554135, "step": 313, "time_per_iteration": 2.6944596767425537 }, { "auxiliary_loss_clip": 0.01340962, "auxiliary_loss_mlp": 0.0109941, "balance_loss_clip": 1.09381938, "balance_loss_mlp": 1.05430174, "epoch": 0.018878701337742373, "flos": 22455732758400.0, "grad_norm": 2.3990870717118455, "language_loss": 0.7335974, "learning_rate": 3.7017592277997256e-06, "loss": 0.75800109, "num_input_tokens_seen": 6572275, "step": 314, "time_per_iteration": 2.6550133228302 }, { "auxiliary_loss_clip": 0.01329658, "auxiliary_loss_mlp": 0.01105546, "balance_loss_clip": 1.09075165, "balance_loss_mlp": 1.06246412, "epoch": 0.018938824590410342, "flos": 30993530238720.0, "grad_norm": 5.81191681220521, "language_loss": 0.89890182, "learning_rate": 3.7038064542733654e-06, "loss": 0.92325383, "num_input_tokens_seen": 6594520, "step": 315, "time_per_iteration": 2.7121222019195557 }, { "auxiliary_loss_clip": 0.0133262, "auxiliary_loss_mlp": 0.01096177, "balance_loss_clip": 1.09287357, "balance_loss_mlp": 1.05209303, "epoch": 0.01899894784307831, "flos": 23258910821760.0, "grad_norm": 2.446494284682687, "language_loss": 0.80517328, "learning_rate": 3.7058471919041945e-06, "loss": 0.82946122, "num_input_tokens_seen": 6614245, "step": 316, "time_per_iteration": 2.640573501586914 }, { "auxiliary_loss_clip": 0.01326654, "auxiliary_loss_mlp": 0.01094904, "balance_loss_clip": 1.09036672, "balance_loss_mlp": 1.05046248, "epoch": 0.01905907109574628, "flos": 17457901367040.0, "grad_norm": 2.3705495670370524, "language_loss": 0.90161496, "learning_rate": 3.7078814816960605e-06, "loss": 0.92583054, "num_input_tokens_seen": 6632015, "step": 317, "time_per_iteration": 2.594388246536255 }, { "auxiliary_loss_clip": 0.01324014, "auxiliary_loss_mlp": 0.01097498, "balance_loss_clip": 1.08944559, "balance_loss_mlp": 1.05281842, "epoch": 0.019119194348414248, "flos": 14970225139200.0, "grad_norm": 7.443622240044352, "language_loss": 0.90836811, "learning_rate": 3.709909364265374e-06, "loss": 0.93258321, "num_input_tokens_seen": 6649015, "step": 318, "time_per_iteration": 2.6647114753723145 }, { "auxiliary_loss_clip": 0.01326579, "auxiliary_loss_mlp": 0.01092817, "balance_loss_clip": 1.0886786, "balance_loss_mlp": 1.05102181, "epoch": 0.01917931760108222, "flos": 25482822503040.0, "grad_norm": 2.232217614618188, "language_loss": 0.93955356, "learning_rate": 3.7119308798459706e-06, "loss": 0.9637475, "num_input_tokens_seen": 6669225, "step": 319, "time_per_iteration": 2.6901800632476807 }, { "auxiliary_loss_clip": 0.01209258, "auxiliary_loss_mlp": 0.01057567, "balance_loss_clip": 1.08611965, "balance_loss_mlp": 1.04288089, "epoch": 0.01923944085375019, "flos": 71556967353600.0, "grad_norm": 1.0009907084180605, "language_loss": 0.59817195, "learning_rate": 3.7139460682939026e-06, "loss": 0.62084019, "num_input_tokens_seen": 6725775, "step": 320, "time_per_iteration": 3.1044812202453613 }, { "auxiliary_loss_clip": 0.01323701, "auxiliary_loss_mlp": 0.01105882, "balance_loss_clip": 1.08827436, "balance_loss_mlp": 1.06291938, "epoch": 0.019299564106418157, "flos": 19682495406720.0, "grad_norm": 3.6735645336458163, "language_loss": 0.89620435, "learning_rate": 3.715954969092154e-06, "loss": 0.92050016, "num_input_tokens_seen": 6744170, "step": 321, "time_per_iteration": 2.650325298309326 }, { "auxiliary_loss_clip": 0.01333523, "auxiliary_loss_mlp": 0.01118534, "balance_loss_clip": 1.09200621, "balance_loss_mlp": 1.07440257, "epoch": 0.019359687359086126, "flos": 24387151991040.0, "grad_norm": 2.289334718991835, "language_loss": 0.82897186, "learning_rate": 3.7179576213552805e-06, "loss": 0.85349244, "num_input_tokens_seen": 6764565, "step": 322, "time_per_iteration": 2.65793514251709 }, { "auxiliary_loss_clip": 0.01332983, "auxiliary_loss_mlp": 0.01092262, "balance_loss_clip": 1.09035325, "balance_loss_mlp": 1.05061018, "epoch": 0.019419810611754094, "flos": 23951376190080.0, "grad_norm": 2.3678949255052912, "language_loss": 0.72983897, "learning_rate": 3.719954063833981e-06, "loss": 0.75409144, "num_input_tokens_seen": 6785310, "step": 323, "time_per_iteration": 2.6827828884124756 }, { "auxiliary_loss_clip": 0.01321298, "auxiliary_loss_mlp": 0.01092254, "balance_loss_clip": 1.08474624, "balance_loss_mlp": 1.04974401, "epoch": 0.019479933864422067, "flos": 22160223567360.0, "grad_norm": 1.9971507164977458, "language_loss": 0.92358303, "learning_rate": 3.721944334919596e-06, "loss": 0.9477185, "num_input_tokens_seen": 6803290, "step": 324, "time_per_iteration": 2.667363405227661 }, { "auxiliary_loss_clip": 0.0133014, "auxiliary_loss_mlp": 0.01089098, "balance_loss_clip": 1.09217644, "balance_loss_mlp": 1.04878139, "epoch": 0.019540057117090035, "flos": 22236821320320.0, "grad_norm": 6.407507213214319, "language_loss": 0.65127969, "learning_rate": 3.7239284726485375e-06, "loss": 0.67547202, "num_input_tokens_seen": 6822570, "step": 325, "time_per_iteration": 2.658700466156006 }, { "auxiliary_loss_clip": 0.01328385, "auxiliary_loss_mlp": 0.01109788, "balance_loss_clip": 1.09598839, "balance_loss_mlp": 1.06675363, "epoch": 0.019600180369758004, "flos": 23076771932160.0, "grad_norm": 1.7177375017641943, "language_loss": 0.76394802, "learning_rate": 3.72590651470665e-06, "loss": 0.78832972, "num_input_tokens_seen": 6841910, "step": 326, "time_per_iteration": 2.6326630115509033 }, { "auxiliary_loss_clip": 0.01322824, "auxiliary_loss_mlp": 0.01103487, "balance_loss_clip": 1.09083152, "balance_loss_mlp": 1.06040514, "epoch": 0.019660303622425972, "flos": 25410857604480.0, "grad_norm": 2.041100065316132, "language_loss": 0.79262185, "learning_rate": 3.727878498433505e-06, "loss": 0.81688493, "num_input_tokens_seen": 6862480, "step": 327, "time_per_iteration": 2.7195518016815186 }, { "auxiliary_loss_clip": 0.0132945, "auxiliary_loss_mlp": 0.01099712, "balance_loss_clip": 1.09292865, "balance_loss_mlp": 1.05832207, "epoch": 0.01972042687509394, "flos": 23657519024640.0, "grad_norm": 2.852301933148325, "language_loss": 0.80569315, "learning_rate": 3.7298444608266328e-06, "loss": 0.82998472, "num_input_tokens_seen": 6882015, "step": 328, "time_per_iteration": 2.6789369583129883 }, { "auxiliary_loss_clip": 0.01327544, "auxiliary_loss_mlp": 0.01094059, "balance_loss_clip": 1.08719349, "balance_loss_mlp": 1.05045235, "epoch": 0.019780550127761913, "flos": 18223480869120.0, "grad_norm": 2.280823996815513, "language_loss": 0.93599927, "learning_rate": 3.731804438545683e-06, "loss": 0.96021533, "num_input_tokens_seen": 6899785, "step": 329, "time_per_iteration": 2.6043548583984375 }, { "auxiliary_loss_clip": 0.0133329, "auxiliary_loss_mlp": 0.0110952, "balance_loss_clip": 1.09211767, "balance_loss_mlp": 1.06629419, "epoch": 0.01984067338042988, "flos": 22418780641920.0, "grad_norm": 2.788704520584699, "language_loss": 0.7476396, "learning_rate": 3.7337584679165324e-06, "loss": 0.77206767, "num_input_tokens_seen": 6918575, "step": 330, "time_per_iteration": 2.706001043319702 }, { "auxiliary_loss_clip": 0.0133006, "auxiliary_loss_mlp": 0.01115344, "balance_loss_clip": 1.09077096, "balance_loss_mlp": 1.07280993, "epoch": 0.01990079663309785, "flos": 17055199013760.0, "grad_norm": 4.201650057157668, "language_loss": 0.93435889, "learning_rate": 3.7357065849353186e-06, "loss": 0.95881295, "num_input_tokens_seen": 6936965, "step": 331, "time_per_iteration": 2.6499180793762207 }, { "auxiliary_loss_clip": 0.01316843, "auxiliary_loss_mlp": 0.01085812, "balance_loss_clip": 1.08825564, "balance_loss_mlp": 1.04563856, "epoch": 0.01996091988576582, "flos": 15961791058560.0, "grad_norm": 2.5475056489813968, "language_loss": 0.9293468, "learning_rate": 3.737648825272422e-06, "loss": 0.95337331, "num_input_tokens_seen": 6953475, "step": 332, "time_per_iteration": 2.5990231037139893 }, { "auxiliary_loss_clip": 0.01325701, "auxiliary_loss_mlp": 0.01091941, "balance_loss_clip": 1.09376514, "balance_loss_mlp": 1.04902601, "epoch": 0.02002104313843379, "flos": 23586451966080.0, "grad_norm": 2.7319388202061106, "language_loss": 0.75380504, "learning_rate": 3.739585224276384e-06, "loss": 0.77798152, "num_input_tokens_seen": 6971630, "step": 333, "time_per_iteration": 2.6225569248199463 }, { "auxiliary_loss_clip": 0.01323488, "auxiliary_loss_mlp": 0.01083816, "balance_loss_clip": 1.08822608, "balance_loss_mlp": 1.04249835, "epoch": 0.02008116639110176, "flos": 34094883352320.0, "grad_norm": 3.3732742696494924, "language_loss": 0.78797042, "learning_rate": 3.7415158169777673e-06, "loss": 0.81204355, "num_input_tokens_seen": 6992775, "step": 334, "time_per_iteration": 2.725562572479248 }, { "auxiliary_loss_clip": 0.01325152, "auxiliary_loss_mlp": 0.01093257, "balance_loss_clip": 1.08535278, "balance_loss_mlp": 1.04867256, "epoch": 0.020141289643769728, "flos": 19683716469120.0, "grad_norm": 1.945115565921162, "language_loss": 0.83465719, "learning_rate": 3.7434406380929575e-06, "loss": 0.8588413, "num_input_tokens_seen": 7011425, "step": 335, "time_per_iteration": 2.638871192932129 }, { "auxiliary_loss_clip": 0.01322365, "auxiliary_loss_mlp": 0.01085854, "balance_loss_clip": 1.08842373, "balance_loss_mlp": 1.04405963, "epoch": 0.020201412896437697, "flos": 20740567357440.0, "grad_norm": 2.3527147371949058, "language_loss": 0.92432821, "learning_rate": 3.745359722027911e-06, "loss": 0.94841033, "num_input_tokens_seen": 7029450, "step": 336, "time_per_iteration": 2.6654980182647705 }, { "auxiliary_loss_clip": 0.01321531, "auxiliary_loss_mlp": 0.01079695, "balance_loss_clip": 1.08577883, "balance_loss_mlp": 1.03818631, "epoch": 0.020261536149105665, "flos": 20266510636800.0, "grad_norm": 1.7223490941555537, "language_loss": 0.88663971, "learning_rate": 3.7472731028818428e-06, "loss": 0.91065204, "num_input_tokens_seen": 7047555, "step": 337, "time_per_iteration": 4.246743440628052 }, { "auxiliary_loss_clip": 0.01312441, "auxiliary_loss_mlp": 0.01102336, "balance_loss_clip": 1.08320296, "balance_loss_mlp": 1.05841899, "epoch": 0.020321659401773638, "flos": 25848752307840.0, "grad_norm": 1.6493597356962735, "language_loss": 0.89869279, "learning_rate": 3.7491808144508626e-06, "loss": 0.92284054, "num_input_tokens_seen": 7068185, "step": 338, "time_per_iteration": 5.869866609573364 }, { "auxiliary_loss_clip": 0.01321566, "auxiliary_loss_mlp": 0.0109858, "balance_loss_clip": 1.08546185, "balance_loss_mlp": 1.05554605, "epoch": 0.020381782654441606, "flos": 17495033051520.0, "grad_norm": 2.1603069065052694, "language_loss": 0.85168982, "learning_rate": 3.7510828902315576e-06, "loss": 0.87589133, "num_input_tokens_seen": 7085955, "step": 339, "time_per_iteration": 2.603130340576172 }, { "auxiliary_loss_clip": 0.01328225, "auxiliary_loss_mlp": 0.01099064, "balance_loss_clip": 1.0902226, "balance_loss_mlp": 1.05524242, "epoch": 0.020441905907109575, "flos": 24243940465920.0, "grad_norm": 2.1746002196087817, "language_loss": 0.88821882, "learning_rate": 3.75297936342452e-06, "loss": 0.91249174, "num_input_tokens_seen": 7106345, "step": 340, "time_per_iteration": 2.7247626781463623 }, { "auxiliary_loss_clip": 0.01322505, "auxiliary_loss_mlp": 0.01085559, "balance_loss_clip": 1.08594203, "balance_loss_mlp": 1.04004502, "epoch": 0.020502029159777543, "flos": 22233301787520.0, "grad_norm": 2.004763613818719, "language_loss": 0.88489276, "learning_rate": 3.7548702669378253e-06, "loss": 0.9089734, "num_input_tokens_seen": 7125070, "step": 341, "time_per_iteration": 2.731411933898926 }, { "auxiliary_loss_clip": 0.01324734, "auxiliary_loss_mlp": 0.01098572, "balance_loss_clip": 1.08451748, "balance_loss_mlp": 1.05479813, "epoch": 0.020562152412445512, "flos": 23987861429760.0, "grad_norm": 2.3638593093640736, "language_loss": 0.80611861, "learning_rate": 3.756755633390458e-06, "loss": 0.83035159, "num_input_tokens_seen": 7144675, "step": 342, "time_per_iteration": 2.6085095405578613 }, { "auxiliary_loss_clip": 0.01313805, "auxiliary_loss_mlp": 0.01098164, "balance_loss_clip": 1.08411694, "balance_loss_mlp": 1.05138612, "epoch": 0.020622275665113484, "flos": 26975305537920.0, "grad_norm": 1.727276092160433, "language_loss": 0.89612651, "learning_rate": 3.7586354951156886e-06, "loss": 0.92024612, "num_input_tokens_seen": 7165505, "step": 343, "time_per_iteration": 2.739912509918213 }, { "auxiliary_loss_clip": 0.01324722, "auxiliary_loss_mlp": 0.01096954, "balance_loss_clip": 1.09109879, "balance_loss_mlp": 1.05518293, "epoch": 0.020682398917781453, "flos": 22600704049920.0, "grad_norm": 2.6902665590614663, "language_loss": 0.78381217, "learning_rate": 3.7605098841644e-06, "loss": 0.80802888, "num_input_tokens_seen": 7184605, "step": 344, "time_per_iteration": 2.638439655303955 }, { "auxiliary_loss_clip": 0.01310552, "auxiliary_loss_mlp": 0.01103983, "balance_loss_clip": 1.08375537, "balance_loss_mlp": 1.05982804, "epoch": 0.02074252217044942, "flos": 15013605790080.0, "grad_norm": 2.2675296623639114, "language_loss": 0.75051636, "learning_rate": 3.7623788323083666e-06, "loss": 0.77466166, "num_input_tokens_seen": 7203065, "step": 345, "time_per_iteration": 2.581258773803711 }, { "auxiliary_loss_clip": 0.01316305, "auxiliary_loss_mlp": 0.01107937, "balance_loss_clip": 1.08855689, "balance_loss_mlp": 1.06447339, "epoch": 0.02080264542311739, "flos": 25337958952320.0, "grad_norm": 2.2144688897761395, "language_loss": 0.90414572, "learning_rate": 3.7642423710434837e-06, "loss": 0.92838824, "num_input_tokens_seen": 7222995, "step": 346, "time_per_iteration": 2.6281676292419434 }, { "auxiliary_loss_clip": 0.01312286, "auxiliary_loss_mlp": 0.01096576, "balance_loss_clip": 1.08357453, "balance_loss_mlp": 1.05621195, "epoch": 0.02086276867578536, "flos": 24388804016640.0, "grad_norm": 3.1106741063140366, "language_loss": 0.79133296, "learning_rate": 3.7661005315929563e-06, "loss": 0.81542158, "num_input_tokens_seen": 7244625, "step": 347, "time_per_iteration": 2.6477038860321045 }, { "auxiliary_loss_clip": 0.01317665, "auxiliary_loss_mlp": 0.01097416, "balance_loss_clip": 1.08921003, "balance_loss_mlp": 1.05328524, "epoch": 0.02092289192845333, "flos": 24462205459200.0, "grad_norm": 3.7065871267995893, "language_loss": 0.71211165, "learning_rate": 3.7679533449104354e-06, "loss": 0.73626244, "num_input_tokens_seen": 7263255, "step": 348, "time_per_iteration": 2.6215686798095703 }, { "auxiliary_loss_clip": 0.01319168, "auxiliary_loss_mlp": 0.01104109, "balance_loss_clip": 1.0859139, "balance_loss_mlp": 1.06066906, "epoch": 0.0209830151811213, "flos": 17451185523840.0, "grad_norm": 2.3976328225512495, "language_loss": 0.77118891, "learning_rate": 3.7698008416831116e-06, "loss": 0.79542166, "num_input_tokens_seen": 7279275, "step": 349, "time_per_iteration": 2.60102915763855 }, { "auxiliary_loss_clip": 0.01304146, "auxiliary_loss_mlp": 0.01101496, "balance_loss_clip": 1.08412242, "balance_loss_mlp": 1.06017756, "epoch": 0.021043138433789268, "flos": 24573995562240.0, "grad_norm": 1.7599420553547571, "language_loss": 0.85191035, "learning_rate": 3.7716430523347664e-06, "loss": 0.87596673, "num_input_tokens_seen": 7300180, "step": 350, "time_per_iteration": 2.7636313438415527 }, { "auxiliary_loss_clip": 0.01310639, "auxiliary_loss_mlp": 0.01090182, "balance_loss_clip": 1.08742464, "balance_loss_mlp": 1.05015147, "epoch": 0.021103261686457236, "flos": 24454053072000.0, "grad_norm": 2.2188224040826956, "language_loss": 0.7998929, "learning_rate": 3.773480007028776e-06, "loss": 0.82390112, "num_input_tokens_seen": 7317430, "step": 351, "time_per_iteration": 2.651803493499756 }, { "auxiliary_loss_clip": 0.01318922, "auxiliary_loss_mlp": 0.01104903, "balance_loss_clip": 1.08851838, "balance_loss_mlp": 1.06093884, "epoch": 0.021163384939125205, "flos": 14683083816960.0, "grad_norm": 2.30399977815629, "language_loss": 0.8746841, "learning_rate": 3.775311735671078e-06, "loss": 0.89892232, "num_input_tokens_seen": 7334875, "step": 352, "time_per_iteration": 2.687080144882202 }, { "auxiliary_loss_clip": 0.01311303, "auxiliary_loss_mlp": 0.01101912, "balance_loss_clip": 1.0859803, "balance_loss_mlp": 1.05861485, "epoch": 0.021223508191793177, "flos": 24493195918080.0, "grad_norm": 2.574621592267882, "language_loss": 0.8247534, "learning_rate": 3.7771382679130878e-06, "loss": 0.84888554, "num_input_tokens_seen": 7355185, "step": 353, "time_per_iteration": 2.7096078395843506 }, { "auxiliary_loss_clip": 0.01308698, "auxiliary_loss_mlp": 0.01092448, "balance_loss_clip": 1.08573294, "balance_loss_mlp": 1.05160654, "epoch": 0.021283631444461146, "flos": 24126978804480.0, "grad_norm": 1.9591973719581535, "language_loss": 0.8089481, "learning_rate": 3.7789596331545845e-06, "loss": 0.83295953, "num_input_tokens_seen": 7374425, "step": 354, "time_per_iteration": 2.658649444580078 }, { "auxiliary_loss_clip": 0.01314249, "auxiliary_loss_mlp": 0.01095812, "balance_loss_clip": 1.08369493, "balance_loss_mlp": 1.05218124, "epoch": 0.021343754697129114, "flos": 25192233475200.0, "grad_norm": 2.22170783568627, "language_loss": 0.81311834, "learning_rate": 3.780775860546545e-06, "loss": 0.837219, "num_input_tokens_seen": 7394175, "step": 355, "time_per_iteration": 2.619551420211792 }, { "auxiliary_loss_clip": 0.01310207, "auxiliary_loss_mlp": 0.01090401, "balance_loss_clip": 1.08222032, "balance_loss_mlp": 1.04851055, "epoch": 0.021403877949797083, "flos": 17274182279040.0, "grad_norm": 2.212340256471132, "language_loss": 0.89746779, "learning_rate": 3.7825869789939474e-06, "loss": 0.92147392, "num_input_tokens_seen": 7412645, "step": 356, "time_per_iteration": 2.5877137184143066 }, { "auxiliary_loss_clip": 0.01308298, "auxiliary_loss_mlp": 0.0108474, "balance_loss_clip": 1.08573771, "balance_loss_mlp": 1.04191971, "epoch": 0.021464001202465055, "flos": 30917435276160.0, "grad_norm": 1.9878508054592678, "language_loss": 0.79956681, "learning_rate": 3.784393017158528e-06, "loss": 0.82349718, "num_input_tokens_seen": 7432275, "step": 357, "time_per_iteration": 2.781755208969116 }, { "auxiliary_loss_clip": 0.0130988, "auxiliary_loss_mlp": 0.01083565, "balance_loss_clip": 1.08250284, "balance_loss_mlp": 1.04417801, "epoch": 0.021524124455133024, "flos": 18186385098240.0, "grad_norm": 2.6679617624252137, "language_loss": 0.76516652, "learning_rate": 3.786194003461506e-06, "loss": 0.78910094, "num_input_tokens_seen": 7450245, "step": 358, "time_per_iteration": 2.63144850730896 }, { "auxiliary_loss_clip": 0.01307251, "auxiliary_loss_mlp": 0.01092013, "balance_loss_clip": 1.08083165, "balance_loss_mlp": 1.04842997, "epoch": 0.021584247707800992, "flos": 13805786039040.0, "grad_norm": 2.344744226979962, "language_loss": 0.88770491, "learning_rate": 3.787989966086264e-06, "loss": 0.91169769, "num_input_tokens_seen": 7466845, "step": 359, "time_per_iteration": 2.641932964324951 }, { "auxiliary_loss_clip": 0.01315087, "auxiliary_loss_mlp": 0.01090441, "balance_loss_clip": 1.08486438, "balance_loss_mlp": 1.05088758, "epoch": 0.02164437096046896, "flos": 23294713703040.0, "grad_norm": 3.6505103877164804, "language_loss": 0.75853801, "learning_rate": 3.789780932980997e-06, "loss": 0.78259325, "num_input_tokens_seen": 7485450, "step": 360, "time_per_iteration": 2.5901477336883545 }, { "auxiliary_loss_clip": 0.01203506, "auxiliary_loss_mlp": 0.0103078, "balance_loss_clip": 1.07682121, "balance_loss_mlp": 1.01781011, "epoch": 0.02170449421313693, "flos": 68899578341760.0, "grad_norm": 0.8439708743577624, "language_loss": 0.64861441, "learning_rate": 3.79156693186132e-06, "loss": 0.67095727, "num_input_tokens_seen": 7553780, "step": 361, "time_per_iteration": 3.278409957885742 }, { "auxiliary_loss_clip": 0.01306068, "auxiliary_loss_mlp": 0.01086116, "balance_loss_clip": 1.0792098, "balance_loss_mlp": 1.04501224, "epoch": 0.0217646174658049, "flos": 25228539146880.0, "grad_norm": 3.144635825096315, "language_loss": 0.78844237, "learning_rate": 3.7933479902128433e-06, "loss": 0.81236422, "num_input_tokens_seen": 7574155, "step": 362, "time_per_iteration": 2.6302051544189453 }, { "auxiliary_loss_clip": 0.01309585, "auxiliary_loss_mlp": 0.01093258, "balance_loss_clip": 1.08188891, "balance_loss_mlp": 1.05244076, "epoch": 0.02182474071847287, "flos": 22893124671360.0, "grad_norm": 2.019833715135914, "language_loss": 0.92474592, "learning_rate": 3.7951241352937077e-06, "loss": 0.94877434, "num_input_tokens_seen": 7592320, "step": 363, "time_per_iteration": 2.6566081047058105 }, { "auxiliary_loss_clip": 0.01305173, "auxiliary_loss_mlp": 0.01096467, "balance_loss_clip": 1.0816617, "balance_loss_mlp": 1.05693769, "epoch": 0.02188486397114084, "flos": 23658991482240.0, "grad_norm": 2.282586403147275, "language_loss": 0.89844346, "learning_rate": 3.7968953941370915e-06, "loss": 0.92245984, "num_input_tokens_seen": 7611185, "step": 364, "time_per_iteration": 2.711911201477051 }, { "auxiliary_loss_clip": 0.01311963, "auxiliary_loss_mlp": 0.0109247, "balance_loss_clip": 1.08607888, "balance_loss_mlp": 1.04955506, "epoch": 0.021944987223808807, "flos": 21543637680000.0, "grad_norm": 1.948927065488749, "language_loss": 0.79460645, "learning_rate": 3.798661793553676e-06, "loss": 0.81865084, "num_input_tokens_seen": 7631970, "step": 365, "time_per_iteration": 2.6396052837371826 }, { "auxiliary_loss_clip": 0.01306043, "auxiliary_loss_mlp": 0.01100405, "balance_loss_clip": 1.08267248, "balance_loss_mlp": 1.05658317, "epoch": 0.022005110476476776, "flos": 16070887641600.0, "grad_norm": 1.85181498507666, "language_loss": 0.84341359, "learning_rate": 3.8004233601340808e-06, "loss": 0.86747801, "num_input_tokens_seen": 7649745, "step": 366, "time_per_iteration": 2.6278867721557617 }, { "auxiliary_loss_clip": 0.01312113, "auxiliary_loss_mlp": 0.01087574, "balance_loss_clip": 1.08304918, "balance_loss_mlp": 1.04859269, "epoch": 0.022065233729144748, "flos": 21433715084160.0, "grad_norm": 1.9326288300300676, "language_loss": 0.87040466, "learning_rate": 3.8021801202512694e-06, "loss": 0.89440155, "num_input_tokens_seen": 7668830, "step": 367, "time_per_iteration": 2.6410560607910156 }, { "auxiliary_loss_clip": 0.01312217, "auxiliary_loss_mlp": 0.01096053, "balance_loss_clip": 1.08074582, "balance_loss_mlp": 1.05335259, "epoch": 0.022125356981812717, "flos": 21543709507200.0, "grad_norm": 2.7247329926128976, "language_loss": 0.8487373, "learning_rate": 3.803932100062912e-06, "loss": 0.87282002, "num_input_tokens_seen": 7687240, "step": 368, "time_per_iteration": 2.652012825012207 }, { "auxiliary_loss_clip": 0.01312089, "auxiliary_loss_mlp": 0.01079926, "balance_loss_clip": 1.0801568, "balance_loss_mlp": 1.04027653, "epoch": 0.022185480234480685, "flos": 20704153944960.0, "grad_norm": 2.4839328990540794, "language_loss": 0.75997221, "learning_rate": 3.8056793255137264e-06, "loss": 0.78389233, "num_input_tokens_seen": 7704440, "step": 369, "time_per_iteration": 2.601384401321411 }, { "auxiliary_loss_clip": 0.01306737, "auxiliary_loss_mlp": 0.01099274, "balance_loss_clip": 1.08232927, "balance_loss_mlp": 1.05836105, "epoch": 0.022245603487148654, "flos": 25193203142400.0, "grad_norm": 2.189428421230448, "language_loss": 0.82977992, "learning_rate": 3.8074218223377844e-06, "loss": 0.85383999, "num_input_tokens_seen": 7727160, "step": 370, "time_per_iteration": 2.6538548469543457 }, { "auxiliary_loss_clip": 0.01306327, "auxiliary_loss_mlp": 0.01099594, "balance_loss_clip": 1.08127654, "balance_loss_mlp": 1.05713177, "epoch": 0.022305726739816623, "flos": 21395936954880.0, "grad_norm": 1.8569755368340455, "language_loss": 0.81588483, "learning_rate": 3.8091596160607834e-06, "loss": 0.83994406, "num_input_tokens_seen": 7747730, "step": 371, "time_per_iteration": 2.6779489517211914 }, { "auxiliary_loss_clip": 0.01311283, "auxiliary_loss_mlp": 0.01093653, "balance_loss_clip": 1.08593988, "balance_loss_mlp": 1.05169153, "epoch": 0.022365849992484595, "flos": 22492146170880.0, "grad_norm": 2.0622769904034817, "language_loss": 0.83493644, "learning_rate": 3.8108927320022896e-06, "loss": 0.85898578, "num_input_tokens_seen": 7766765, "step": 372, "time_per_iteration": 2.676797866821289 }, { "auxiliary_loss_clip": 0.01303906, "auxiliary_loss_mlp": 0.01091688, "balance_loss_clip": 1.08125615, "balance_loss_mlp": 1.05022752, "epoch": 0.022425973245152563, "flos": 17856581397120.0, "grad_norm": 2.8569846697004424, "language_loss": 0.79004842, "learning_rate": 3.8126211952779548e-06, "loss": 0.81400436, "num_input_tokens_seen": 7784010, "step": 373, "time_per_iteration": 2.593186616897583 }, { "auxiliary_loss_clip": 0.01309731, "auxiliary_loss_mlp": 0.01087409, "balance_loss_clip": 1.08431911, "balance_loss_mlp": 1.0448271, "epoch": 0.022486096497820532, "flos": 15483029656320.0, "grad_norm": 2.5442660874947385, "language_loss": 0.77622557, "learning_rate": 3.8143450308016952e-06, "loss": 0.80019701, "num_input_tokens_seen": 7801305, "step": 374, "time_per_iteration": 2.628392457962036 }, { "auxiliary_loss_clip": 0.0129871, "auxiliary_loss_mlp": 0.01076131, "balance_loss_clip": 1.07404125, "balance_loss_mlp": 1.03395462, "epoch": 0.0225462197504885, "flos": 27784157950080.0, "grad_norm": 1.574507922341891, "language_loss": 0.86032569, "learning_rate": 3.8160642632878525e-06, "loss": 0.88407415, "num_input_tokens_seen": 7823965, "step": 375, "time_per_iteration": 2.6783435344696045 }, { "auxiliary_loss_clip": 0.01307026, "auxiliary_loss_mlp": 0.01102393, "balance_loss_clip": 1.08340597, "balance_loss_mlp": 1.0590483, "epoch": 0.02260634300315647, "flos": 19975490645760.0, "grad_norm": 2.1279260859120286, "language_loss": 0.8901403, "learning_rate": 3.817778917253314e-06, "loss": 0.91423446, "num_input_tokens_seen": 7842115, "step": 376, "time_per_iteration": 2.621629476547241 }, { "auxiliary_loss_clip": 0.01306872, "auxiliary_loss_mlp": 0.01087647, "balance_loss_clip": 1.07870364, "balance_loss_mlp": 1.04868913, "epoch": 0.02266646625582444, "flos": 16028189349120.0, "grad_norm": 3.0367767906095917, "language_loss": 0.75437558, "learning_rate": 3.8194890170196155e-06, "loss": 0.77832079, "num_input_tokens_seen": 7857830, "step": 377, "time_per_iteration": 2.5465245246887207 }, { "auxiliary_loss_clip": 0.01298987, "auxiliary_loss_mlp": 0.01093623, "balance_loss_clip": 1.08128345, "balance_loss_mlp": 1.0517087, "epoch": 0.02272658950849241, "flos": 20404622430720.0, "grad_norm": 2.1955644054597374, "language_loss": 0.99231368, "learning_rate": 3.8211945867150055e-06, "loss": 1.01623976, "num_input_tokens_seen": 7875840, "step": 378, "time_per_iteration": 7.184643983840942 }, { "auxiliary_loss_clip": 0.01202133, "auxiliary_loss_mlp": 0.01040839, "balance_loss_clip": 1.0828104, "balance_loss_mlp": 1.0283463, "epoch": 0.02278671276116038, "flos": 69847332647040.0, "grad_norm": 0.9608118941287621, "language_loss": 0.75395739, "learning_rate": 3.822895650276492e-06, "loss": 0.7763871, "num_input_tokens_seen": 7940190, "step": 379, "time_per_iteration": 4.961140394210815 }, { "auxiliary_loss_clip": 0.01308523, "auxiliary_loss_mlp": 0.01087195, "balance_loss_clip": 1.07820678, "balance_loss_mlp": 1.04792738, "epoch": 0.022846836013828347, "flos": 38508771340800.0, "grad_norm": 3.7276648293904375, "language_loss": 0.78197825, "learning_rate": 3.824592231451859e-06, "loss": 0.8059355, "num_input_tokens_seen": 7960840, "step": 380, "time_per_iteration": 2.7892863750457764 }, { "auxiliary_loss_clip": 0.01301718, "auxiliary_loss_mlp": 0.01088822, "balance_loss_clip": 1.07955217, "balance_loss_mlp": 1.04945946, "epoch": 0.02290695926649632, "flos": 20959478795520.0, "grad_norm": 2.0941800643649855, "language_loss": 0.96743369, "learning_rate": 3.826284353801652e-06, "loss": 0.99133915, "num_input_tokens_seen": 7975500, "step": 381, "time_per_iteration": 2.619854688644409 }, { "auxiliary_loss_clip": 0.01311313, "auxiliary_loss_mlp": 0.01093973, "balance_loss_clip": 1.08192921, "balance_loss_mlp": 1.0539186, "epoch": 0.022967082519164288, "flos": 24022407335040.0, "grad_norm": 2.122042453210184, "language_loss": 0.87664795, "learning_rate": 3.827972040701142e-06, "loss": 0.90070075, "num_input_tokens_seen": 7993880, "step": 382, "time_per_iteration": 2.617398500442505 }, { "auxiliary_loss_clip": 0.01304042, "auxiliary_loss_mlp": 0.01096828, "balance_loss_clip": 1.0821979, "balance_loss_mlp": 1.05760849, "epoch": 0.023027205771832256, "flos": 20997149184000.0, "grad_norm": 1.978420170714987, "language_loss": 0.84990942, "learning_rate": 3.829655315342268e-06, "loss": 0.87391812, "num_input_tokens_seen": 8012730, "step": 383, "time_per_iteration": 2.6345314979553223 }, { "auxiliary_loss_clip": 0.01300873, "auxiliary_loss_mlp": 0.0111136, "balance_loss_clip": 1.08199024, "balance_loss_mlp": 1.0716393, "epoch": 0.023087329024500225, "flos": 21360816432000.0, "grad_norm": 2.0575071112917778, "language_loss": 0.83349717, "learning_rate": 3.831334200735543e-06, "loss": 0.8576194, "num_input_tokens_seen": 8031275, "step": 384, "time_per_iteration": 2.6339902877807617 }, { "auxiliary_loss_clip": 0.0129979, "auxiliary_loss_mlp": 0.010893, "balance_loss_clip": 1.08362782, "balance_loss_mlp": 1.05255938, "epoch": 0.023147452277168194, "flos": 21872435800320.0, "grad_norm": 1.7828777740185773, "language_loss": 0.89289594, "learning_rate": 3.8330087197119426e-06, "loss": 0.91678685, "num_input_tokens_seen": 8051600, "step": 385, "time_per_iteration": 2.690460205078125 }, { "auxiliary_loss_clip": 0.01305297, "auxiliary_loss_mlp": 0.01118129, "balance_loss_clip": 1.08288455, "balance_loss_mlp": 1.07926655, "epoch": 0.023207575529836166, "flos": 18916700423040.0, "grad_norm": 1.9487706588237765, "language_loss": 0.70157433, "learning_rate": 3.83467889492477e-06, "loss": 0.72580856, "num_input_tokens_seen": 8070600, "step": 386, "time_per_iteration": 2.681957721710205 }, { "auxiliary_loss_clip": 0.01305989, "auxiliary_loss_mlp": 0.0109088, "balance_loss_clip": 1.08441973, "balance_loss_mlp": 1.05309081, "epoch": 0.023267698782504134, "flos": 25046005207680.0, "grad_norm": 2.354342660334866, "language_loss": 0.87840039, "learning_rate": 3.836344748851495e-06, "loss": 0.90236908, "num_input_tokens_seen": 8090680, "step": 387, "time_per_iteration": 2.6511123180389404 }, { "auxiliary_loss_clip": 0.01304298, "auxiliary_loss_mlp": 0.01075541, "balance_loss_clip": 1.08178413, "balance_loss_mlp": 1.03658366, "epoch": 0.023327822035172103, "flos": 28879217930880.0, "grad_norm": 2.2068948332198643, "language_loss": 0.8341614, "learning_rate": 3.838006303795566e-06, "loss": 0.85795981, "num_input_tokens_seen": 8114610, "step": 388, "time_per_iteration": 2.7062034606933594 }, { "auxiliary_loss_clip": 0.01301997, "auxiliary_loss_mlp": 0.01089724, "balance_loss_clip": 1.08110905, "balance_loss_mlp": 1.05284107, "epoch": 0.02338794528784007, "flos": 27121533805440.0, "grad_norm": 2.1887236217853863, "language_loss": 0.93710232, "learning_rate": 3.839663581888206e-06, "loss": 0.96101958, "num_input_tokens_seen": 8133975, "step": 389, "time_per_iteration": 2.680280923843384 }, { "auxiliary_loss_clip": 0.01296082, "auxiliary_loss_mlp": 0.01083127, "balance_loss_clip": 1.0818491, "balance_loss_mlp": 1.04397893, "epoch": 0.02344806854050804, "flos": 21322355944320.0, "grad_norm": 1.981860280002506, "language_loss": 0.87747037, "learning_rate": 3.841316605090178e-06, "loss": 0.9012624, "num_input_tokens_seen": 8153570, "step": 390, "time_per_iteration": 2.65970516204834 }, { "auxiliary_loss_clip": 0.01301203, "auxiliary_loss_mlp": 0.01092853, "balance_loss_clip": 1.08357048, "balance_loss_mlp": 1.0568521, "epoch": 0.023508191793176012, "flos": 24789997998720.0, "grad_norm": 2.134782100250632, "language_loss": 0.89370871, "learning_rate": 3.842965395193529e-06, "loss": 0.91764927, "num_input_tokens_seen": 8170075, "step": 391, "time_per_iteration": 2.620009660720825 }, { "auxiliary_loss_clip": 0.01296395, "auxiliary_loss_mlp": 0.01072264, "balance_loss_clip": 1.07956719, "balance_loss_mlp": 1.03521371, "epoch": 0.02356831504584398, "flos": 25995375624960.0, "grad_norm": 2.366558958564603, "language_loss": 0.86076117, "learning_rate": 3.84460997382332e-06, "loss": 0.88444775, "num_input_tokens_seen": 8190420, "step": 392, "time_per_iteration": 2.7171695232391357 }, { "auxiliary_loss_clip": 0.01293283, "auxiliary_loss_mlp": 0.01084283, "balance_loss_clip": 1.07891107, "balance_loss_mlp": 1.04763794, "epoch": 0.02362843829851195, "flos": 19062461813760.0, "grad_norm": 2.038818686720474, "language_loss": 0.89096916, "learning_rate": 3.8462503624393256e-06, "loss": 0.91474473, "num_input_tokens_seen": 8208790, "step": 393, "time_per_iteration": 2.632129669189453 }, { "auxiliary_loss_clip": 0.01304158, "auxiliary_loss_mlp": 0.01102255, "balance_loss_clip": 1.08471596, "balance_loss_mlp": 1.06279635, "epoch": 0.023688561551179918, "flos": 16071031296000.0, "grad_norm": 1.7920692319020195, "language_loss": 0.8156364, "learning_rate": 3.84788658233771e-06, "loss": 0.83970058, "num_input_tokens_seen": 8226885, "step": 394, "time_per_iteration": 2.5932936668395996 }, { "auxiliary_loss_clip": 0.01296851, "auxiliary_loss_mlp": 0.01088191, "balance_loss_clip": 1.07939875, "balance_loss_mlp": 1.04920936, "epoch": 0.023748684803847887, "flos": 21724375939200.0, "grad_norm": 4.539737106404062, "language_loss": 0.85808635, "learning_rate": 3.84951865465269e-06, "loss": 0.88193679, "num_input_tokens_seen": 8246825, "step": 395, "time_per_iteration": 2.6112868785858154 }, { "auxiliary_loss_clip": 0.01194704, "auxiliary_loss_mlp": 0.01034684, "balance_loss_clip": 1.07210529, "balance_loss_mlp": 1.02319229, "epoch": 0.02380880805651586, "flos": 61926192881280.0, "grad_norm": 0.9258089920958834, "language_loss": 0.6380353, "learning_rate": 3.851146600358172e-06, "loss": 0.66032922, "num_input_tokens_seen": 8302835, "step": 396, "time_per_iteration": 3.031489133834839 }, { "auxiliary_loss_clip": 0.0129188, "auxiliary_loss_mlp": 0.01071022, "balance_loss_clip": 1.07806754, "balance_loss_mlp": 1.03447223, "epoch": 0.023868931309183827, "flos": 20266331068800.0, "grad_norm": 2.3741099598177624, "language_loss": 0.83878696, "learning_rate": 3.852770440269372e-06, "loss": 0.86241591, "num_input_tokens_seen": 8320745, "step": 397, "time_per_iteration": 2.6049532890319824 }, { "auxiliary_loss_clip": 0.01297108, "auxiliary_loss_mlp": 0.01087341, "balance_loss_clip": 1.08104038, "balance_loss_mlp": 1.04890823, "epoch": 0.023929054561851796, "flos": 21139103733120.0, "grad_norm": 4.6847154905409205, "language_loss": 0.84066498, "learning_rate": 3.854390195044404e-06, "loss": 0.86450952, "num_input_tokens_seen": 8339540, "step": 398, "time_per_iteration": 2.6516692638397217 }, { "auxiliary_loss_clip": 0.01295876, "auxiliary_loss_mlp": 0.01078722, "balance_loss_clip": 1.07671928, "balance_loss_mlp": 1.04007471, "epoch": 0.023989177814519765, "flos": 13698521049600.0, "grad_norm": 2.80358563189936, "language_loss": 0.86029691, "learning_rate": 3.856005885185868e-06, "loss": 0.88404286, "num_input_tokens_seen": 8354890, "step": 399, "time_per_iteration": 2.5452589988708496 }, { "auxiliary_loss_clip": 0.01292698, "auxiliary_loss_mlp": 0.01090822, "balance_loss_clip": 1.08074594, "balance_loss_mlp": 1.05308056, "epoch": 0.024049301067187733, "flos": 26322018929280.0, "grad_norm": 2.021318687641168, "language_loss": 0.86254489, "learning_rate": 3.857617531042398e-06, "loss": 0.88638014, "num_input_tokens_seen": 8375845, "step": 400, "time_per_iteration": 2.6626927852630615 }, { "auxiliary_loss_clip": 0.01299822, "auxiliary_loss_mlp": 0.01083301, "balance_loss_clip": 1.08346462, "balance_loss_mlp": 1.04687035, "epoch": 0.024109424319855705, "flos": 24425432910720.0, "grad_norm": 1.735822397657743, "language_loss": 0.79276752, "learning_rate": 3.8592251528102065e-06, "loss": 0.81659877, "num_input_tokens_seen": 8395240, "step": 401, "time_per_iteration": 2.68418025970459 }, { "auxiliary_loss_clip": 0.0129275, "auxiliary_loss_mlp": 0.01091389, "balance_loss_clip": 1.07852793, "balance_loss_mlp": 1.05493474, "epoch": 0.024169547572523674, "flos": 29604397610880.0, "grad_norm": 3.889755427752258, "language_loss": 0.78890866, "learning_rate": 3.8608287705345976e-06, "loss": 0.81274998, "num_input_tokens_seen": 8416950, "step": 402, "time_per_iteration": 2.7509379386901855 }, { "auxiliary_loss_clip": 0.01296434, "auxiliary_loss_mlp": 0.01082712, "balance_loss_clip": 1.07797897, "balance_loss_mlp": 1.04399323, "epoch": 0.024229670825191642, "flos": 22601458235520.0, "grad_norm": 2.49356632429363, "language_loss": 0.94936156, "learning_rate": 3.86242840411147e-06, "loss": 0.97315305, "num_input_tokens_seen": 8433660, "step": 403, "time_per_iteration": 2.5760560035705566 }, { "auxiliary_loss_clip": 0.0129994, "auxiliary_loss_mlp": 0.01091893, "balance_loss_clip": 1.07754242, "balance_loss_mlp": 1.05315053, "epoch": 0.02428979407785961, "flos": 18150258994560.0, "grad_norm": 2.361656575803209, "language_loss": 0.99877387, "learning_rate": 3.864024073288798e-06, "loss": 1.0226922, "num_input_tokens_seen": 8450180, "step": 404, "time_per_iteration": 2.5966458320617676 }, { "auxiliary_loss_clip": 0.01298911, "auxiliary_loss_mlp": 0.01100127, "balance_loss_clip": 1.08096266, "balance_loss_mlp": 1.06312442, "epoch": 0.024349917330527583, "flos": 15304984917120.0, "grad_norm": 2.3162348618509276, "language_loss": 0.8802169, "learning_rate": 3.865615797668091e-06, "loss": 0.90420723, "num_input_tokens_seen": 8467775, "step": 405, "time_per_iteration": 2.5728275775909424 }, { "auxiliary_loss_clip": 0.01306827, "auxiliary_loss_mlp": 0.01097881, "balance_loss_clip": 1.084512, "balance_loss_mlp": 1.06004393, "epoch": 0.024410040583195552, "flos": 20773892200320.0, "grad_norm": 2.7399607903318275, "language_loss": 0.93386561, "learning_rate": 3.867203596705844e-06, "loss": 0.95791268, "num_input_tokens_seen": 8486765, "step": 406, "time_per_iteration": 2.612668991088867 }, { "auxiliary_loss_clip": 0.01299426, "auxiliary_loss_mlp": 0.01088378, "balance_loss_clip": 1.08213782, "balance_loss_mlp": 1.0500164, "epoch": 0.02447016383586352, "flos": 21798854789760.0, "grad_norm": 2.1742012769968526, "language_loss": 0.87128031, "learning_rate": 3.86878748971496e-06, "loss": 0.89515841, "num_input_tokens_seen": 8506515, "step": 407, "time_per_iteration": 2.5982017517089844 }, { "auxiliary_loss_clip": 0.01298266, "auxiliary_loss_mlp": 0.01083858, "balance_loss_clip": 1.08472157, "balance_loss_mlp": 1.04630709, "epoch": 0.02453028708853149, "flos": 33948116380800.0, "grad_norm": 2.1458430439144234, "language_loss": 0.74102569, "learning_rate": 3.8703674958661596e-06, "loss": 0.76484692, "num_input_tokens_seen": 8528035, "step": 408, "time_per_iteration": 2.708670139312744 }, { "auxiliary_loss_clip": 0.01300128, "auxiliary_loss_mlp": 0.01089985, "balance_loss_clip": 1.08222318, "balance_loss_mlp": 1.05233896, "epoch": 0.024590410341199458, "flos": 21793000872960.0, "grad_norm": 2.4878473813549675, "language_loss": 0.92509401, "learning_rate": 3.871943634189376e-06, "loss": 0.94899511, "num_input_tokens_seen": 8546455, "step": 409, "time_per_iteration": 2.665321111679077 }, { "auxiliary_loss_clip": 0.01296394, "auxiliary_loss_mlp": 0.01077538, "balance_loss_clip": 1.08126342, "balance_loss_mlp": 1.04291987, "epoch": 0.02465053359386743, "flos": 35114782124160.0, "grad_norm": 2.2521095969191722, "language_loss": 0.82792604, "learning_rate": 3.873515923575128e-06, "loss": 0.85166532, "num_input_tokens_seen": 8568450, "step": 410, "time_per_iteration": 2.848928213119507 }, { "auxiliary_loss_clip": 0.01299459, "auxiliary_loss_mlp": 0.01089133, "balance_loss_clip": 1.08187068, "balance_loss_mlp": 1.05284572, "epoch": 0.0247106568465354, "flos": 27451409333760.0, "grad_norm": 2.1393760271628595, "language_loss": 0.77577484, "learning_rate": 3.875084382775879e-06, "loss": 0.79966074, "num_input_tokens_seen": 8589340, "step": 411, "time_per_iteration": 2.6645278930664062 }, { "auxiliary_loss_clip": 0.01298341, "auxiliary_loss_mlp": 0.0110154, "balance_loss_clip": 1.07977521, "balance_loss_mlp": 1.06289268, "epoch": 0.024770780099203367, "flos": 20703794808960.0, "grad_norm": 2.2974658872162665, "language_loss": 0.86379063, "learning_rate": 3.87664903040738e-06, "loss": 0.88778943, "num_input_tokens_seen": 8607150, "step": 412, "time_per_iteration": 2.6091151237487793 }, { "auxiliary_loss_clip": 0.01187014, "auxiliary_loss_mlp": 0.01031436, "balance_loss_clip": 1.07387948, "balance_loss_mlp": 1.02089787, "epoch": 0.024830903351871336, "flos": 69551859369600.0, "grad_norm": 0.8687159185244209, "language_loss": 0.5852263, "learning_rate": 3.878209884949994e-06, "loss": 0.60741079, "num_input_tokens_seen": 8669865, "step": 413, "time_per_iteration": 3.2269625663757324 }, { "auxiliary_loss_clip": 0.0129043, "auxiliary_loss_mlp": 0.01091958, "balance_loss_clip": 1.07709181, "balance_loss_mlp": 1.05249953, "epoch": 0.024891026604539304, "flos": 32270477713920.0, "grad_norm": 1.8280666153990437, "language_loss": 0.80517173, "learning_rate": 3.879766964750006e-06, "loss": 0.82899559, "num_input_tokens_seen": 8690235, "step": 414, "time_per_iteration": 2.720341444015503 }, { "auxiliary_loss_clip": 0.01287097, "auxiliary_loss_mlp": 0.0109242, "balance_loss_clip": 1.0756042, "balance_loss_mlp": 1.0556556, "epoch": 0.024951149857207276, "flos": 18840282238080.0, "grad_norm": 2.1921003994701302, "language_loss": 0.80227423, "learning_rate": 3.881320288020917e-06, "loss": 0.82606936, "num_input_tokens_seen": 8706295, "step": 415, "time_per_iteration": 2.6473400592803955 }, { "auxiliary_loss_clip": 0.01302694, "auxiliary_loss_mlp": 0.01082455, "balance_loss_clip": 1.08156919, "balance_loss_mlp": 1.04497528, "epoch": 0.025011273109875245, "flos": 15377201210880.0, "grad_norm": 2.9318871737289776, "language_loss": 0.96236515, "learning_rate": 3.882869872844723e-06, "loss": 0.9862166, "num_input_tokens_seen": 8724200, "step": 416, "time_per_iteration": 2.596189260482788 }, { "auxiliary_loss_clip": 0.01291636, "auxiliary_loss_mlp": 0.01074465, "balance_loss_clip": 1.07628798, "balance_loss_mlp": 1.0355792, "epoch": 0.025071396362543213, "flos": 18915515274240.0, "grad_norm": 1.741746736079687, "language_loss": 0.77381694, "learning_rate": 3.884415737173176e-06, "loss": 0.79747796, "num_input_tokens_seen": 8744170, "step": 417, "time_per_iteration": 5.610344171524048 }, { "auxiliary_loss_clip": 0.01290746, "auxiliary_loss_mlp": 0.0109022, "balance_loss_clip": 1.08072221, "balance_loss_mlp": 1.05264485, "epoch": 0.025131519615211182, "flos": 25337958952320.0, "grad_norm": 1.554385639735456, "language_loss": 0.77076226, "learning_rate": 3.8859578988290344e-06, "loss": 0.79457194, "num_input_tokens_seen": 8765120, "step": 418, "time_per_iteration": 5.837290525436401 }, { "auxiliary_loss_clip": 0.01297026, "auxiliary_loss_mlp": 0.01071197, "balance_loss_clip": 1.08019948, "balance_loss_mlp": 1.03550553, "epoch": 0.02519164286787915, "flos": 18953149749120.0, "grad_norm": 2.4603268634516207, "language_loss": 0.81445098, "learning_rate": 3.887496375507294e-06, "loss": 0.83813322, "num_input_tokens_seen": 8783500, "step": 419, "time_per_iteration": 2.582590341567993 }, { "auxiliary_loss_clip": 0.01291114, "auxiliary_loss_mlp": 0.01086736, "balance_loss_clip": 1.07929599, "balance_loss_mlp": 1.04708743, "epoch": 0.025251766120547123, "flos": 17421092904960.0, "grad_norm": 1.8078532084212713, "language_loss": 0.73618573, "learning_rate": 3.8890311847764065e-06, "loss": 0.75996423, "num_input_tokens_seen": 8801175, "step": 420, "time_per_iteration": 2.6739418506622314 }, { "auxiliary_loss_clip": 0.01290485, "auxiliary_loss_mlp": 0.01096292, "balance_loss_clip": 1.07605243, "balance_loss_mlp": 1.05924153, "epoch": 0.02531188937321509, "flos": 25045430590080.0, "grad_norm": 1.77336014903074, "language_loss": 0.79040134, "learning_rate": 3.890562344079484e-06, "loss": 0.81426907, "num_input_tokens_seen": 8820215, "step": 421, "time_per_iteration": 2.6928632259368896 }, { "auxiliary_loss_clip": 0.01290689, "auxiliary_loss_mlp": 0.01088863, "balance_loss_clip": 1.07922924, "balance_loss_mlp": 1.04983425, "epoch": 0.02537201262588306, "flos": 30592228515840.0, "grad_norm": 2.2139016136437104, "language_loss": 0.8203755, "learning_rate": 3.89208987073549e-06, "loss": 0.84417105, "num_input_tokens_seen": 8839660, "step": 422, "time_per_iteration": 2.714707851409912 }, { "auxiliary_loss_clip": 0.01293659, "auxiliary_loss_mlp": 0.01078975, "balance_loss_clip": 1.07677865, "balance_loss_mlp": 1.04430926, "epoch": 0.02543213587855103, "flos": 26065365275520.0, "grad_norm": 2.1259138778576356, "language_loss": 0.83458018, "learning_rate": 3.893613781940409e-06, "loss": 0.85830647, "num_input_tokens_seen": 8859280, "step": 423, "time_per_iteration": 2.652757167816162 }, { "auxiliary_loss_clip": 0.01287497, "auxiliary_loss_mlp": 0.01078335, "balance_loss_clip": 1.0742569, "balance_loss_mlp": 1.04221487, "epoch": 0.025492259131218997, "flos": 36022818965760.0, "grad_norm": 2.012741083661608, "language_loss": 0.74129444, "learning_rate": 3.895134094768415e-06, "loss": 0.76495278, "num_input_tokens_seen": 8880560, "step": 424, "time_per_iteration": 2.7724521160125732 }, { "auxiliary_loss_clip": 0.01296446, "auxiliary_loss_mlp": 0.01093799, "balance_loss_clip": 1.07987142, "balance_loss_mlp": 1.05782199, "epoch": 0.02555238238388697, "flos": 18588045957120.0, "grad_norm": 4.623670538116741, "language_loss": 0.83193713, "learning_rate": 3.896650826173015e-06, "loss": 0.85583955, "num_input_tokens_seen": 8899155, "step": 425, "time_per_iteration": 2.608029842376709 }, { "auxiliary_loss_clip": 0.01292462, "auxiliary_loss_mlp": 0.01092376, "balance_loss_clip": 1.07259536, "balance_loss_mlp": 1.0544672, "epoch": 0.025612505636554938, "flos": 24243186280320.0, "grad_norm": 2.5075767706443566, "language_loss": 0.853073, "learning_rate": 3.898163992988186e-06, "loss": 0.87692136, "num_input_tokens_seen": 8917890, "step": 426, "time_per_iteration": 2.6445271968841553 }, { "auxiliary_loss_clip": 0.01175923, "auxiliary_loss_mlp": 0.01017688, "balance_loss_clip": 1.06532824, "balance_loss_mlp": 1.00781715, "epoch": 0.025672628889222907, "flos": 60586941265920.0, "grad_norm": 0.8949637292547264, "language_loss": 0.57219732, "learning_rate": 3.899673611929491e-06, "loss": 0.5941335, "num_input_tokens_seen": 8978260, "step": 427, "time_per_iteration": 3.2690517902374268 }, { "auxiliary_loss_clip": 0.01291989, "auxiliary_loss_mlp": 0.01092649, "balance_loss_clip": 1.08155811, "balance_loss_mlp": 1.05674267, "epoch": 0.025732752141890875, "flos": 19573255169280.0, "grad_norm": 2.4869215225306673, "language_loss": 0.88130605, "learning_rate": 3.901179699595194e-06, "loss": 0.90515244, "num_input_tokens_seen": 8994460, "step": 428, "time_per_iteration": 2.6143813133239746 }, { "auxiliary_loss_clip": 0.01283603, "auxiliary_loss_mlp": 0.0107531, "balance_loss_clip": 1.07418942, "balance_loss_mlp": 1.03735399, "epoch": 0.025792875394558847, "flos": 31284262920960.0, "grad_norm": 2.067247304638145, "language_loss": 0.85790849, "learning_rate": 3.902682272467353e-06, "loss": 0.88149762, "num_input_tokens_seen": 9016670, "step": 429, "time_per_iteration": 2.749328374862671 }, { "auxiliary_loss_clip": 0.01288943, "auxiliary_loss_mlp": 0.01083888, "balance_loss_clip": 1.07337689, "balance_loss_mlp": 1.04590786, "epoch": 0.025852998647226816, "flos": 32379610210560.0, "grad_norm": 2.4411876712444034, "language_loss": 0.8815223, "learning_rate": 3.904181346912895e-06, "loss": 0.90525061, "num_input_tokens_seen": 9039720, "step": 430, "time_per_iteration": 2.7483572959899902 }, { "auxiliary_loss_clip": 0.01290726, "auxiliary_loss_mlp": 0.01080495, "balance_loss_clip": 1.0803287, "balance_loss_mlp": 1.04573333, "epoch": 0.025913121899894784, "flos": 20193288762240.0, "grad_norm": 2.086180078538185, "language_loss": 0.84249514, "learning_rate": 3.905676939184698e-06, "loss": 0.8662073, "num_input_tokens_seen": 9059850, "step": 431, "time_per_iteration": 2.6531126499176025 }, { "auxiliary_loss_clip": 0.01286945, "auxiliary_loss_mlp": 0.01073345, "balance_loss_clip": 1.07570636, "balance_loss_mlp": 1.03951311, "epoch": 0.025973245152562753, "flos": 14720430983040.0, "grad_norm": 2.681931959502968, "language_loss": 0.86511916, "learning_rate": 3.907169065422638e-06, "loss": 0.88872206, "num_input_tokens_seen": 9077590, "step": 432, "time_per_iteration": 2.7582762241363525 }, { "auxiliary_loss_clip": 0.01287429, "auxiliary_loss_mlp": 0.01072961, "balance_loss_clip": 1.07632601, "balance_loss_mlp": 1.03891492, "epoch": 0.02603336840523072, "flos": 30992991534720.0, "grad_norm": 1.95596969308187, "language_loss": 0.76036298, "learning_rate": 3.908657741654636e-06, "loss": 0.7839669, "num_input_tokens_seen": 9099880, "step": 433, "time_per_iteration": 2.707771062850952 }, { "auxiliary_loss_clip": 0.01289436, "auxiliary_loss_mlp": 0.01088504, "balance_loss_clip": 1.07470191, "balance_loss_mlp": 1.04973757, "epoch": 0.026093491657898694, "flos": 17674262939520.0, "grad_norm": 2.157056093147959, "language_loss": 0.8979522, "learning_rate": 3.910142983797699e-06, "loss": 0.92173159, "num_input_tokens_seen": 9118620, "step": 434, "time_per_iteration": 2.5665409564971924 }, { "auxiliary_loss_clip": 0.01289617, "auxiliary_loss_mlp": 0.01096405, "balance_loss_clip": 1.07960439, "balance_loss_mlp": 1.05904448, "epoch": 0.026153614910566662, "flos": 17857874286720.0, "grad_norm": 2.306071945033866, "language_loss": 0.80187833, "learning_rate": 3.9116248076589305e-06, "loss": 0.82573849, "num_input_tokens_seen": 9135655, "step": 435, "time_per_iteration": 2.614440679550171 }, { "auxiliary_loss_clip": 0.01285396, "auxiliary_loss_mlp": 0.01092207, "balance_loss_clip": 1.07367229, "balance_loss_mlp": 1.05503798, "epoch": 0.02621373816323463, "flos": 20011113959040.0, "grad_norm": 3.0257040949539356, "language_loss": 0.86361396, "learning_rate": 3.913103228936546e-06, "loss": 0.88739002, "num_input_tokens_seen": 9153520, "step": 436, "time_per_iteration": 2.635033130645752 }, { "auxiliary_loss_clip": 0.01289558, "auxiliary_loss_mlp": 0.01096903, "balance_loss_clip": 1.07716811, "balance_loss_mlp": 1.06080687, "epoch": 0.0262738614159026, "flos": 19281193683840.0, "grad_norm": 2.4233286399217993, "language_loss": 0.74725163, "learning_rate": 3.914578263220868e-06, "loss": 0.77111626, "num_input_tokens_seen": 9170750, "step": 437, "time_per_iteration": 2.6614880561828613 }, { "auxiliary_loss_clip": 0.01286403, "auxiliary_loss_mlp": 0.01100399, "balance_loss_clip": 1.07628679, "balance_loss_mlp": 1.06220388, "epoch": 0.026333984668570568, "flos": 18807208790400.0, "grad_norm": 2.79370908187484, "language_loss": 0.9131338, "learning_rate": 3.916049925995316e-06, "loss": 0.93700182, "num_input_tokens_seen": 9188430, "step": 438, "time_per_iteration": 2.674877166748047 }, { "auxiliary_loss_clip": 0.01169678, "auxiliary_loss_mlp": 0.01072518, "balance_loss_clip": 1.0602653, "balance_loss_mlp": 1.06250465, "epoch": 0.02639410792123854, "flos": 64572020691840.0, "grad_norm": 0.8871275810137318, "language_loss": 0.62631273, "learning_rate": 3.917518232637377e-06, "loss": 0.64873469, "num_input_tokens_seen": 9255835, "step": 439, "time_per_iteration": 3.2527849674224854 }, { "auxiliary_loss_clip": 0.01296492, "auxiliary_loss_mlp": 0.01095184, "balance_loss_clip": 1.08175814, "balance_loss_mlp": 1.05758572, "epoch": 0.02645423117390651, "flos": 28473462921600.0, "grad_norm": 3.31985956061953, "language_loss": 0.75982475, "learning_rate": 3.918983198419573e-06, "loss": 0.78374153, "num_input_tokens_seen": 9276835, "step": 440, "time_per_iteration": 2.6770262718200684 }, { "auxiliary_loss_clip": 0.01286342, "auxiliary_loss_mlp": 0.01076505, "balance_loss_clip": 1.07652593, "balance_loss_mlp": 1.04048026, "epoch": 0.026514354426574478, "flos": 18551237495040.0, "grad_norm": 3.0236705091068283, "language_loss": 0.83197021, "learning_rate": 3.920444838510415e-06, "loss": 0.85559869, "num_input_tokens_seen": 9295075, "step": 441, "time_per_iteration": 2.591306209564209 }, { "auxiliary_loss_clip": 0.01291817, "auxiliary_loss_mlp": 0.01086154, "balance_loss_clip": 1.07703269, "balance_loss_mlp": 1.04829359, "epoch": 0.026574477679242446, "flos": 20667812359680.0, "grad_norm": 2.202684635319811, "language_loss": 0.78490162, "learning_rate": 3.92190316797534e-06, "loss": 0.80868137, "num_input_tokens_seen": 9314205, "step": 442, "time_per_iteration": 2.633054733276367 }, { "auxiliary_loss_clip": 0.0116251, "auxiliary_loss_mlp": 0.01015158, "balance_loss_clip": 1.05336332, "balance_loss_mlp": 1.0054301, "epoch": 0.026634600931910415, "flos": 57956125340160.0, "grad_norm": 0.9609264438471399, "language_loss": 0.64459753, "learning_rate": 3.92335820177765e-06, "loss": 0.66637421, "num_input_tokens_seen": 9367395, "step": 443, "time_per_iteration": 3.1241400241851807 }, { "auxiliary_loss_clip": 0.01291897, "auxiliary_loss_mlp": 0.01085882, "balance_loss_clip": 1.08147204, "balance_loss_mlp": 1.04906964, "epoch": 0.026694724184578387, "flos": 15815131827840.0, "grad_norm": 2.121488874389134, "language_loss": 0.82093638, "learning_rate": 3.924809954779425e-06, "loss": 0.84471416, "num_input_tokens_seen": 9385185, "step": 444, "time_per_iteration": 2.6202428340911865 }, { "auxiliary_loss_clip": 0.0129406, "auxiliary_loss_mlp": 0.01082041, "balance_loss_clip": 1.07940578, "balance_loss_mlp": 1.04263067, "epoch": 0.026754847437246355, "flos": 23440259612160.0, "grad_norm": 2.2213674770888607, "language_loss": 0.95689106, "learning_rate": 3.9262584417424425e-06, "loss": 0.98065209, "num_input_tokens_seen": 9403225, "step": 445, "time_per_iteration": 2.6071228981018066 }, { "auxiliary_loss_clip": 0.01289866, "auxiliary_loss_mlp": 0.01094053, "balance_loss_clip": 1.07953668, "balance_loss_mlp": 1.05492878, "epoch": 0.026814970689914324, "flos": 17341801632000.0, "grad_norm": 2.775359545549618, "language_loss": 0.91932094, "learning_rate": 3.9277036773290725e-06, "loss": 0.94316012, "num_input_tokens_seen": 9420540, "step": 446, "time_per_iteration": 2.5791916847229004 }, { "auxiliary_loss_clip": 0.01289847, "auxiliary_loss_mlp": 0.01088114, "balance_loss_clip": 1.08072042, "balance_loss_mlp": 1.05092025, "epoch": 0.026875093942582293, "flos": 17894718662400.0, "grad_norm": 2.0562763127679204, "language_loss": 0.79831308, "learning_rate": 3.92914567610317e-06, "loss": 0.82209271, "num_input_tokens_seen": 9438840, "step": 447, "time_per_iteration": 2.6420843601226807 }, { "auxiliary_loss_clip": 0.01289397, "auxiliary_loss_mlp": 0.01079607, "balance_loss_clip": 1.07901013, "balance_loss_mlp": 1.04446411, "epoch": 0.026935217195250265, "flos": 21723980889600.0, "grad_norm": 2.231264914467203, "language_loss": 0.86402845, "learning_rate": 3.930584452530952e-06, "loss": 0.8877185, "num_input_tokens_seen": 9457215, "step": 448, "time_per_iteration": 2.590277910232544 }, { "auxiliary_loss_clip": 0.01282455, "auxiliary_loss_mlp": 0.01091099, "balance_loss_clip": 1.07706833, "balance_loss_mlp": 1.05662322, "epoch": 0.026995340447918233, "flos": 23622685810560.0, "grad_norm": 1.941778256808524, "language_loss": 0.88581634, "learning_rate": 3.9320200209818755e-06, "loss": 0.90955186, "num_input_tokens_seen": 9475615, "step": 449, "time_per_iteration": 2.610065460205078 }, { "auxiliary_loss_clip": 0.01293472, "auxiliary_loss_mlp": 0.01085576, "balance_loss_clip": 1.07856452, "balance_loss_mlp": 1.04814398, "epoch": 0.027055463700586202, "flos": 17931275729280.0, "grad_norm": 2.199007921978797, "language_loss": 0.80395782, "learning_rate": 3.933452395729493e-06, "loss": 0.8277483, "num_input_tokens_seen": 9493975, "step": 450, "time_per_iteration": 2.637465238571167 }, { "auxiliary_loss_clip": 0.01284612, "auxiliary_loss_mlp": 0.0108001, "balance_loss_clip": 1.08025336, "balance_loss_mlp": 1.04384232, "epoch": 0.02711558695325417, "flos": 25118903859840.0, "grad_norm": 1.599374223212879, "language_loss": 0.81562543, "learning_rate": 3.934881590952304e-06, "loss": 0.83927161, "num_input_tokens_seen": 9514810, "step": 451, "time_per_iteration": 2.6506927013397217 }, { "auxiliary_loss_clip": 0.0128567, "auxiliary_loss_mlp": 0.01090719, "balance_loss_clip": 1.08126068, "balance_loss_mlp": 1.0533824, "epoch": 0.02717571020592214, "flos": 24239559006720.0, "grad_norm": 1.9677929562692107, "language_loss": 0.77019048, "learning_rate": 3.936307620734599e-06, "loss": 0.79395437, "num_input_tokens_seen": 9533635, "step": 452, "time_per_iteration": 2.5751442909240723 }, { "auxiliary_loss_clip": 0.01286865, "auxiliary_loss_mlp": 0.01088287, "balance_loss_clip": 1.08011293, "balance_loss_mlp": 1.05135596, "epoch": 0.02723583345859011, "flos": 25118939773440.0, "grad_norm": 1.7205362750177517, "language_loss": 0.72874546, "learning_rate": 3.937730499067294e-06, "loss": 0.75249696, "num_input_tokens_seen": 9555420, "step": 453, "time_per_iteration": 2.668083667755127 }, { "auxiliary_loss_clip": 0.01281405, "auxiliary_loss_mlp": 0.01083223, "balance_loss_clip": 1.07714963, "balance_loss_mlp": 1.04748416, "epoch": 0.02729595671125808, "flos": 42741597847680.0, "grad_norm": 1.8353680194819204, "language_loss": 0.82419729, "learning_rate": 3.939150239848748e-06, "loss": 0.84784359, "num_input_tokens_seen": 9578950, "step": 454, "time_per_iteration": 2.8580126762390137 }, { "auxiliary_loss_clip": 0.01285525, "auxiliary_loss_mlp": 0.01077241, "balance_loss_clip": 1.07935429, "balance_loss_mlp": 1.043648, "epoch": 0.02735607996392605, "flos": 21430985650560.0, "grad_norm": 1.985829769195046, "language_loss": 0.75404847, "learning_rate": 3.9405668568855866e-06, "loss": 0.77767611, "num_input_tokens_seen": 9598160, "step": 455, "time_per_iteration": 2.6593477725982666 }, { "auxiliary_loss_clip": 0.01282853, "auxiliary_loss_mlp": 0.01094959, "balance_loss_clip": 1.07477236, "balance_loss_mlp": 1.0597918, "epoch": 0.027416203216594017, "flos": 20851280052480.0, "grad_norm": 1.92483069519606, "language_loss": 0.80670613, "learning_rate": 3.941980363893499e-06, "loss": 0.83048427, "num_input_tokens_seen": 9616010, "step": 456, "time_per_iteration": 2.6798384189605713 }, { "auxiliary_loss_clip": 0.01280135, "auxiliary_loss_mlp": 0.01080319, "balance_loss_clip": 1.07714963, "balance_loss_mlp": 1.0435549, "epoch": 0.027476326469261986, "flos": 13224500242560.0, "grad_norm": 2.171481572134165, "language_loss": 0.81587321, "learning_rate": 3.9433907744980384e-06, "loss": 0.83947778, "num_input_tokens_seen": 9634000, "step": 457, "time_per_iteration": 5.62308406829834 }, { "auxiliary_loss_clip": 0.01283922, "auxiliary_loss_mlp": 0.01084055, "balance_loss_clip": 1.07603848, "balance_loss_mlp": 1.04891229, "epoch": 0.027536449721929958, "flos": 24024526237440.0, "grad_norm": 2.024184269172234, "language_loss": 0.94030929, "learning_rate": 3.944798102235412e-06, "loss": 0.96398914, "num_input_tokens_seen": 9653455, "step": 458, "time_per_iteration": 5.694372653961182 }, { "auxiliary_loss_clip": 0.01280807, "auxiliary_loss_mlp": 0.01091426, "balance_loss_clip": 1.07479525, "balance_loss_mlp": 1.05666471, "epoch": 0.027596572974597926, "flos": 13006055681280.0, "grad_norm": 2.356061876390436, "language_loss": 0.79279089, "learning_rate": 3.9462023605532545e-06, "loss": 0.81651318, "num_input_tokens_seen": 9669650, "step": 459, "time_per_iteration": 2.626948595046997 }, { "auxiliary_loss_clip": 0.01286253, "auxiliary_loss_mlp": 0.01081623, "balance_loss_clip": 1.08119941, "balance_loss_mlp": 1.04278445, "epoch": 0.027656696227265895, "flos": 26143076350080.0, "grad_norm": 2.0583603779546404, "language_loss": 0.83362132, "learning_rate": 3.947603562811407e-06, "loss": 0.85730016, "num_input_tokens_seen": 9691415, "step": 460, "time_per_iteration": 2.7191598415374756 }, { "auxiliary_loss_clip": 0.01158037, "auxiliary_loss_mlp": 0.01054463, "balance_loss_clip": 1.05032754, "balance_loss_mlp": 1.044402, "epoch": 0.027716819479933864, "flos": 60697222997760.0, "grad_norm": 1.612511499168885, "language_loss": 0.7351321, "learning_rate": 3.949001722282675e-06, "loss": 0.7572571, "num_input_tokens_seen": 9755605, "step": 461, "time_per_iteration": 3.210820436477661 }, { "auxiliary_loss_clip": 0.01284234, "auxiliary_loss_mlp": 0.01079832, "balance_loss_clip": 1.08432341, "balance_loss_mlp": 1.04700136, "epoch": 0.027776942732601832, "flos": 31211938886400.0, "grad_norm": 2.4500038571081073, "language_loss": 0.81596625, "learning_rate": 3.950396852153582e-06, "loss": 0.839607, "num_input_tokens_seen": 9776270, "step": 462, "time_per_iteration": 2.683197021484375 }, { "auxiliary_loss_clip": 0.01280414, "auxiliary_loss_mlp": 0.0107864, "balance_loss_clip": 1.07752454, "balance_loss_mlp": 1.0454762, "epoch": 0.027837065985269804, "flos": 22674644196480.0, "grad_norm": 2.258526594266715, "language_loss": 0.90062451, "learning_rate": 3.951788965525118e-06, "loss": 0.92421508, "num_input_tokens_seen": 9794465, "step": 463, "time_per_iteration": 2.641674757003784 }, { "auxiliary_loss_clip": 0.01151842, "auxiliary_loss_mlp": 0.01010002, "balance_loss_clip": 1.04755902, "balance_loss_mlp": 1.00027454, "epoch": 0.027897189237937773, "flos": 62182487399040.0, "grad_norm": 0.8962796480673014, "language_loss": 0.59058654, "learning_rate": 3.953178075413476e-06, "loss": 0.61220491, "num_input_tokens_seen": 9849685, "step": 464, "time_per_iteration": 3.1129612922668457 }, { "auxiliary_loss_clip": 0.01292933, "auxiliary_loss_mlp": 0.01100533, "balance_loss_clip": 1.08296049, "balance_loss_mlp": 1.06412649, "epoch": 0.02795731249060574, "flos": 24493160004480.0, "grad_norm": 2.3712654859298055, "language_loss": 0.81454253, "learning_rate": 3.954564194750784e-06, "loss": 0.83847719, "num_input_tokens_seen": 9869505, "step": 465, "time_per_iteration": 2.723144769668579 }, { "auxiliary_loss_clip": 0.01279938, "auxiliary_loss_mlp": 0.01092668, "balance_loss_clip": 1.07546401, "balance_loss_mlp": 1.05630863, "epoch": 0.02801743574327371, "flos": 23733003456000.0, "grad_norm": 1.9968224423519798, "language_loss": 0.78396618, "learning_rate": 3.955947336385828e-06, "loss": 0.80769229, "num_input_tokens_seen": 9890950, "step": 466, "time_per_iteration": 2.6278555393218994 }, { "auxiliary_loss_clip": 0.0127853, "auxiliary_loss_mlp": 0.01091802, "balance_loss_clip": 1.07703936, "balance_loss_mlp": 1.05661178, "epoch": 0.02807755899594168, "flos": 20629100476800.0, "grad_norm": 2.010021605622182, "language_loss": 0.87699366, "learning_rate": 3.957327513084761e-06, "loss": 0.90069699, "num_input_tokens_seen": 9911265, "step": 467, "time_per_iteration": 2.6687490940093994 }, { "auxiliary_loss_clip": 0.01285129, "auxiliary_loss_mlp": 0.01112935, "balance_loss_clip": 1.07874036, "balance_loss_mlp": 1.07576585, "epoch": 0.02813768224860965, "flos": 19244564789760.0, "grad_norm": 2.2302958424490416, "language_loss": 0.86091757, "learning_rate": 3.958704737531818e-06, "loss": 0.88489819, "num_input_tokens_seen": 9929025, "step": 468, "time_per_iteration": 2.5745644569396973 }, { "auxiliary_loss_clip": 0.01281128, "auxiliary_loss_mlp": 0.01085455, "balance_loss_clip": 1.07529211, "balance_loss_mlp": 1.04857147, "epoch": 0.02819780550127762, "flos": 20813968800000.0, "grad_norm": 2.1866562002509875, "language_loss": 0.91690558, "learning_rate": 3.9600790223300065e-06, "loss": 0.94057143, "num_input_tokens_seen": 9945190, "step": 469, "time_per_iteration": 2.610821008682251 }, { "auxiliary_loss_clip": 0.0127909, "auxiliary_loss_mlp": 0.0110095, "balance_loss_clip": 1.07675052, "balance_loss_mlp": 1.06482995, "epoch": 0.028257928753945588, "flos": 19974125928960.0, "grad_norm": 2.674428223667968, "language_loss": 0.81758964, "learning_rate": 3.96145038000181e-06, "loss": 0.84139001, "num_input_tokens_seen": 9962820, "step": 470, "time_per_iteration": 2.6004326343536377 }, { "auxiliary_loss_clip": 0.0128074, "auxiliary_loss_mlp": 0.01086643, "balance_loss_clip": 1.07482624, "balance_loss_mlp": 1.04947352, "epoch": 0.028318052006613557, "flos": 20484488321280.0, "grad_norm": 1.788793606991614, "language_loss": 0.93071401, "learning_rate": 3.962818822989861e-06, "loss": 0.95438784, "num_input_tokens_seen": 9982595, "step": 471, "time_per_iteration": 2.556288719177246 }, { "auxiliary_loss_clip": 0.01273697, "auxiliary_loss_mlp": 0.0110454, "balance_loss_clip": 1.07223165, "balance_loss_mlp": 1.06884849, "epoch": 0.02837817525928153, "flos": 28514832410880.0, "grad_norm": 1.8550872135639116, "language_loss": 0.7613501, "learning_rate": 3.964184363657625e-06, "loss": 0.78513247, "num_input_tokens_seen": 10004645, "step": 472, "time_per_iteration": 2.667804002761841 }, { "auxiliary_loss_clip": 0.01280341, "auxiliary_loss_mlp": 0.01090649, "balance_loss_clip": 1.07279634, "balance_loss_mlp": 1.05624473, "epoch": 0.028438298511949497, "flos": 18551668458240.0, "grad_norm": 1.9914661475951314, "language_loss": 0.93097353, "learning_rate": 3.965547014290071e-06, "loss": 0.95468336, "num_input_tokens_seen": 10022555, "step": 473, "time_per_iteration": 2.6402342319488525 }, { "auxiliary_loss_clip": 0.01287339, "auxiliary_loss_mlp": 0.01124194, "balance_loss_clip": 1.07773685, "balance_loss_mlp": 1.08979011, "epoch": 0.028498421764617466, "flos": 16910227722240.0, "grad_norm": 3.2560638787193237, "language_loss": 0.88488632, "learning_rate": 3.96690678709433e-06, "loss": 0.90900171, "num_input_tokens_seen": 10041025, "step": 474, "time_per_iteration": 2.5853888988494873 }, { "auxiliary_loss_clip": 0.0127783, "auxiliary_loss_mlp": 0.01093132, "balance_loss_clip": 1.07535374, "balance_loss_mlp": 1.05620146, "epoch": 0.028558545017285435, "flos": 27778699082880.0, "grad_norm": 3.1427023167402006, "language_loss": 0.78901398, "learning_rate": 3.968263694200355e-06, "loss": 0.81272364, "num_input_tokens_seen": 10060775, "step": 475, "time_per_iteration": 2.654519557952881 }, { "auxiliary_loss_clip": 0.01148107, "auxiliary_loss_mlp": 0.01095224, "balance_loss_clip": 1.04505777, "balance_loss_mlp": 1.08583021, "epoch": 0.028618668269953403, "flos": 65654367258240.0, "grad_norm": 0.9280065830162254, "language_loss": 0.66926932, "learning_rate": 3.969617747661569e-06, "loss": 0.6917026, "num_input_tokens_seen": 10120225, "step": 476, "time_per_iteration": 3.1292569637298584 }, { "auxiliary_loss_clip": 0.01279748, "auxiliary_loss_mlp": 0.01088794, "balance_loss_clip": 1.07638311, "balance_loss_mlp": 1.05188656, "epoch": 0.028678791522621375, "flos": 21937074324480.0, "grad_norm": 2.985672001195028, "language_loss": 0.83807188, "learning_rate": 3.970968959455509e-06, "loss": 0.86175728, "num_input_tokens_seen": 10137880, "step": 477, "time_per_iteration": 2.651493549346924 }, { "auxiliary_loss_clip": 0.01284956, "auxiliary_loss_mlp": 0.0108711, "balance_loss_clip": 1.07924342, "balance_loss_mlp": 1.05089426, "epoch": 0.028738914775289344, "flos": 24572128055040.0, "grad_norm": 2.1929055744411943, "language_loss": 0.8233152, "learning_rate": 3.97231734148446e-06, "loss": 0.84703588, "num_input_tokens_seen": 10156930, "step": 478, "time_per_iteration": 2.6986753940582275 }, { "auxiliary_loss_clip": 0.01277687, "auxiliary_loss_mlp": 0.01080644, "balance_loss_clip": 1.07448888, "balance_loss_mlp": 1.04500043, "epoch": 0.028799038027957313, "flos": 23257977068160.0, "grad_norm": 4.057107988717453, "language_loss": 0.81195259, "learning_rate": 3.973662905576082e-06, "loss": 0.83553594, "num_input_tokens_seen": 10176295, "step": 479, "time_per_iteration": 2.6321041584014893 }, { "auxiliary_loss_clip": 0.01273765, "auxiliary_loss_mlp": 0.01083313, "balance_loss_clip": 1.07335579, "balance_loss_mlp": 1.04552341, "epoch": 0.02885916128062528, "flos": 22164102236160.0, "grad_norm": 2.352225573775279, "language_loss": 0.7335608, "learning_rate": 3.975005663484038e-06, "loss": 0.75713164, "num_input_tokens_seen": 10195790, "step": 480, "time_per_iteration": 2.650696277618408 }, { "auxiliary_loss_clip": 0.01273107, "auxiliary_loss_mlp": 0.01075586, "balance_loss_clip": 1.07424879, "balance_loss_mlp": 1.04277968, "epoch": 0.02891928453329325, "flos": 22932842135040.0, "grad_norm": 1.867890428108999, "language_loss": 0.87560165, "learning_rate": 3.976345626888605e-06, "loss": 0.89908862, "num_input_tokens_seen": 10218405, "step": 481, "time_per_iteration": 2.6585533618927 }, { "auxiliary_loss_clip": 0.01142103, "auxiliary_loss_mlp": 0.01017301, "balance_loss_clip": 1.04286921, "balance_loss_mlp": 1.00895679, "epoch": 0.028979407785961222, "flos": 57432941792640.0, "grad_norm": 0.8486437303263991, "language_loss": 0.66030192, "learning_rate": 3.9776828073972864e-06, "loss": 0.68189597, "num_input_tokens_seen": 10271005, "step": 482, "time_per_iteration": 2.9788918495178223 }, { "auxiliary_loss_clip": 0.01287904, "auxiliary_loss_mlp": 0.01082416, "balance_loss_clip": 1.07739437, "balance_loss_mlp": 1.04868007, "epoch": 0.02903953103862919, "flos": 16722737706240.0, "grad_norm": 2.6473263724689873, "language_loss": 0.7899214, "learning_rate": 3.979017216545415e-06, "loss": 0.81362462, "num_input_tokens_seen": 10288405, "step": 483, "time_per_iteration": 2.5642752647399902 }, { "auxiliary_loss_clip": 0.01283775, "auxiliary_loss_mlp": 0.01097438, "balance_loss_clip": 1.07794189, "balance_loss_mlp": 1.06155562, "epoch": 0.02909965429129716, "flos": 16763640318720.0, "grad_norm": 2.6777328906555766, "language_loss": 0.75510043, "learning_rate": 3.980348865796749e-06, "loss": 0.77891254, "num_input_tokens_seen": 10306875, "step": 484, "time_per_iteration": 2.608337640762329 }, { "auxiliary_loss_clip": 0.0127962, "auxiliary_loss_mlp": 0.01081582, "balance_loss_clip": 1.07543373, "balance_loss_mlp": 1.04760778, "epoch": 0.029159777543965128, "flos": 19785343023360.0, "grad_norm": 2.3457282915841113, "language_loss": 0.8378315, "learning_rate": 3.9816777665440615e-06, "loss": 0.86144352, "num_input_tokens_seen": 10323965, "step": 485, "time_per_iteration": 2.591409921646118 }, { "auxiliary_loss_clip": 0.01282377, "auxiliary_loss_mlp": 0.01084922, "balance_loss_clip": 1.08029485, "balance_loss_mlp": 1.04956484, "epoch": 0.029219900796633096, "flos": 19642670202240.0, "grad_norm": 2.044831141886674, "language_loss": 0.84432101, "learning_rate": 3.983003930109732e-06, "loss": 0.86799401, "num_input_tokens_seen": 10342620, "step": 486, "time_per_iteration": 2.7101452350616455 }, { "auxiliary_loss_clip": 0.01276806, "auxiliary_loss_mlp": 0.01090739, "balance_loss_clip": 1.07363296, "balance_loss_mlp": 1.05476189, "epoch": 0.02928002404930107, "flos": 25885704424320.0, "grad_norm": 12.432525192672303, "language_loss": 0.88968349, "learning_rate": 3.984327367746315e-06, "loss": 0.91335887, "num_input_tokens_seen": 10364610, "step": 487, "time_per_iteration": 2.637910842895508 }, { "auxiliary_loss_clip": 0.01283084, "auxiliary_loss_mlp": 0.01069223, "balance_loss_clip": 1.07921362, "balance_loss_mlp": 1.03677416, "epoch": 0.029340147301969037, "flos": 20660234590080.0, "grad_norm": 2.566388301054309, "language_loss": 0.88581878, "learning_rate": 3.985648090637122e-06, "loss": 0.90934181, "num_input_tokens_seen": 10380910, "step": 488, "time_per_iteration": 2.6569244861602783 }, { "auxiliary_loss_clip": 0.01275613, "auxiliary_loss_mlp": 0.01081415, "balance_loss_clip": 1.07419777, "balance_loss_mlp": 1.04667735, "epoch": 0.029400270554637006, "flos": 24428018689920.0, "grad_norm": 2.0135021623582503, "language_loss": 0.88869834, "learning_rate": 3.986966109896785e-06, "loss": 0.91226858, "num_input_tokens_seen": 10400665, "step": 489, "time_per_iteration": 2.805555582046509 }, { "auxiliary_loss_clip": 0.01271096, "auxiliary_loss_mlp": 0.01077182, "balance_loss_clip": 1.0704807, "balance_loss_mlp": 1.04168141, "epoch": 0.029460393807304974, "flos": 20120892900480.0, "grad_norm": 2.807428314395572, "language_loss": 0.88554472, "learning_rate": 3.988281436571815e-06, "loss": 0.90902752, "num_input_tokens_seen": 10420150, "step": 490, "time_per_iteration": 2.612993001937866 }, { "auxiliary_loss_clip": 0.01276687, "auxiliary_loss_mlp": 0.01088031, "balance_loss_clip": 1.0729506, "balance_loss_mlp": 1.0536747, "epoch": 0.029520517059972943, "flos": 17675914965120.0, "grad_norm": 2.430337539839543, "language_loss": 0.91496718, "learning_rate": 3.989594081641164e-06, "loss": 0.93861437, "num_input_tokens_seen": 10438210, "step": 491, "time_per_iteration": 2.6203627586364746 }, { "auxiliary_loss_clip": 0.01266864, "auxiliary_loss_mlp": 0.01072939, "balance_loss_clip": 1.07131863, "balance_loss_mlp": 1.03984618, "epoch": 0.029580640312640915, "flos": 18953185662720.0, "grad_norm": 1.9753258841331502, "language_loss": 0.85654163, "learning_rate": 3.9909040560167675e-06, "loss": 0.87993968, "num_input_tokens_seen": 10455125, "step": 492, "time_per_iteration": 2.636378288269043 }, { "auxiliary_loss_clip": 0.01279009, "auxiliary_loss_mlp": 0.01100381, "balance_loss_clip": 1.07765996, "balance_loss_mlp": 1.06471384, "epoch": 0.029640763565308884, "flos": 18726121837440.0, "grad_norm": 4.076790847855052, "language_loss": 0.84615922, "learning_rate": 3.992211370544093e-06, "loss": 0.86995316, "num_input_tokens_seen": 10470990, "step": 493, "time_per_iteration": 2.6144914627075195 }, { "auxiliary_loss_clip": 0.01272514, "auxiliary_loss_mlp": 0.01074657, "balance_loss_clip": 1.07140934, "balance_loss_mlp": 1.04042029, "epoch": 0.029700886817976852, "flos": 20595308757120.0, "grad_norm": 1.8084917907335818, "language_loss": 0.8658669, "learning_rate": 3.99351603600268e-06, "loss": 0.88933873, "num_input_tokens_seen": 10490685, "step": 494, "time_per_iteration": 2.7063095569610596 }, { "auxiliary_loss_clip": 0.01281688, "auxiliary_loss_mlp": 0.01084428, "balance_loss_clip": 1.07739305, "balance_loss_mlp": 1.05279028, "epoch": 0.02976101007064482, "flos": 22236857233920.0, "grad_norm": 7.125038043922513, "language_loss": 0.86841047, "learning_rate": 3.994818063106668e-06, "loss": 0.8920716, "num_input_tokens_seen": 10509435, "step": 495, "time_per_iteration": 2.641700267791748 }, { "auxiliary_loss_clip": 0.01268945, "auxiliary_loss_mlp": 0.01078198, "balance_loss_clip": 1.07384837, "balance_loss_mlp": 1.04508162, "epoch": 0.029821133323312793, "flos": 23732644320000.0, "grad_norm": 2.201071528053665, "language_loss": 0.61988759, "learning_rate": 3.99611746250533e-06, "loss": 0.64335901, "num_input_tokens_seen": 10530050, "step": 496, "time_per_iteration": 2.6524407863616943 }, { "auxiliary_loss_clip": 0.01270994, "auxiliary_loss_mlp": 0.01089922, "balance_loss_clip": 1.07575428, "balance_loss_mlp": 1.05680561, "epoch": 0.02988125657598076, "flos": 22419498913920.0, "grad_norm": 1.7538974268426115, "language_loss": 0.88820887, "learning_rate": 3.997414244783595e-06, "loss": 0.91181797, "num_input_tokens_seen": 10551370, "step": 497, "time_per_iteration": 5.648245811462402 }, { "auxiliary_loss_clip": 0.01277289, "auxiliary_loss_mlp": 0.01079642, "balance_loss_clip": 1.07670021, "balance_loss_mlp": 1.04604888, "epoch": 0.02994137982864873, "flos": 13845108453120.0, "grad_norm": 2.8395997319333204, "language_loss": 0.85091698, "learning_rate": 3.998708420462557e-06, "loss": 0.87448633, "num_input_tokens_seen": 10569225, "step": 498, "time_per_iteration": 4.362173080444336 }, { "auxiliary_loss_clip": 0.0127249, "auxiliary_loss_mlp": 0.01078673, "balance_loss_clip": 1.07436109, "balance_loss_mlp": 1.04691589, "epoch": 0.0300015030813167, "flos": 23908354675200.0, "grad_norm": 3.2275044857926605, "language_loss": 0.77883017, "learning_rate": 4e-06, "loss": 0.80234182, "num_input_tokens_seen": 10586170, "step": 499, "time_per_iteration": 2.6029655933380127 }, { "auxiliary_loss_clip": 0.01272525, "auxiliary_loss_mlp": 0.01082339, "balance_loss_clip": 1.07433248, "balance_loss_mlp": 1.04905546, "epoch": 0.030061626333984667, "flos": 22016796560640.0, "grad_norm": 2.244229511477372, "language_loss": 0.82687509, "learning_rate": 3.9999999620799e-06, "loss": 0.85042375, "num_input_tokens_seen": 10606205, "step": 500, "time_per_iteration": 2.6293113231658936 }, { "auxiliary_loss_clip": 0.01266453, "auxiliary_loss_mlp": 0.0108458, "balance_loss_clip": 1.07100737, "balance_loss_mlp": 1.04922247, "epoch": 0.03012174958665264, "flos": 23039747988480.0, "grad_norm": 3.2569274145363356, "language_loss": 0.88086087, "learning_rate": 3.9999998483196e-06, "loss": 0.90437114, "num_input_tokens_seen": 10625995, "step": 501, "time_per_iteration": 2.601081132888794 }, { "auxiliary_loss_clip": 0.01273997, "auxiliary_loss_mlp": 0.01071746, "balance_loss_clip": 1.07361674, "balance_loss_mlp": 1.04025102, "epoch": 0.030181872839320608, "flos": 18953257489920.0, "grad_norm": 3.3627001763511855, "language_loss": 0.86654103, "learning_rate": 3.9999996587191065e-06, "loss": 0.88999844, "num_input_tokens_seen": 10644105, "step": 502, "time_per_iteration": 2.5507659912109375 }, { "auxiliary_loss_clip": 0.01270542, "auxiliary_loss_mlp": 0.01081534, "balance_loss_clip": 1.07475543, "balance_loss_mlp": 1.04827452, "epoch": 0.030241996091988577, "flos": 16728017005440.0, "grad_norm": 2.4572357458963876, "language_loss": 0.84281206, "learning_rate": 3.999999393278425e-06, "loss": 0.86633277, "num_input_tokens_seen": 10661090, "step": 503, "time_per_iteration": 2.618587017059326 }, { "auxiliary_loss_clip": 0.01262547, "auxiliary_loss_mlp": 0.01091143, "balance_loss_clip": 1.0710721, "balance_loss_mlp": 1.05781209, "epoch": 0.030302119344656545, "flos": 28621271387520.0, "grad_norm": 1.6994359255159197, "language_loss": 0.88137805, "learning_rate": 3.999999051997567e-06, "loss": 0.90491492, "num_input_tokens_seen": 10682380, "step": 504, "time_per_iteration": 2.6794183254241943 }, { "auxiliary_loss_clip": 0.01264601, "auxiliary_loss_mlp": 0.01086749, "balance_loss_clip": 1.07040262, "balance_loss_mlp": 1.0541091, "epoch": 0.030362242597324514, "flos": 15669334523520.0, "grad_norm": 2.074855698516145, "language_loss": 0.786093, "learning_rate": 3.9999986348765425e-06, "loss": 0.80960649, "num_input_tokens_seen": 10699925, "step": 505, "time_per_iteration": 2.564960479736328 }, { "auxiliary_loss_clip": 0.01134686, "auxiliary_loss_mlp": 0.010147, "balance_loss_clip": 1.03763247, "balance_loss_mlp": 1.00692737, "epoch": 0.030422365849992486, "flos": 72125973676800.0, "grad_norm": 0.9565689962416369, "language_loss": 0.54981297, "learning_rate": 3.999998141915371e-06, "loss": 0.57130682, "num_input_tokens_seen": 10766525, "step": 506, "time_per_iteration": 3.3345654010772705 }, { "auxiliary_loss_clip": 0.01266577, "auxiliary_loss_mlp": 0.01090299, "balance_loss_clip": 1.07119894, "balance_loss_mlp": 1.05687308, "epoch": 0.030482489102660455, "flos": 19427817000960.0, "grad_norm": 2.2738865373146684, "language_loss": 0.83377159, "learning_rate": 3.999997573114069e-06, "loss": 0.8573404, "num_input_tokens_seen": 10786725, "step": 507, "time_per_iteration": 2.645613670349121 }, { "auxiliary_loss_clip": 0.01269938, "auxiliary_loss_mlp": 0.01076205, "balance_loss_clip": 1.07151937, "balance_loss_mlp": 1.04344678, "epoch": 0.030542612355328423, "flos": 20375822701440.0, "grad_norm": 2.375369924968869, "language_loss": 0.88842839, "learning_rate": 3.999996928472659e-06, "loss": 0.91188985, "num_input_tokens_seen": 10805390, "step": 508, "time_per_iteration": 2.617283344268799 }, { "auxiliary_loss_clip": 0.01272148, "auxiliary_loss_mlp": 0.01067206, "balance_loss_clip": 1.07232118, "balance_loss_mlp": 1.03394616, "epoch": 0.030602735607996392, "flos": 34677354297600.0, "grad_norm": 6.964954749829821, "language_loss": 0.71807706, "learning_rate": 3.999996207991165e-06, "loss": 0.74147063, "num_input_tokens_seen": 10828030, "step": 509, "time_per_iteration": 2.7723498344421387 }, { "auxiliary_loss_clip": 0.01264594, "auxiliary_loss_mlp": 0.01074377, "balance_loss_clip": 1.07241154, "balance_loss_mlp": 1.04333544, "epoch": 0.03066285886066436, "flos": 23658668259840.0, "grad_norm": 1.9285974370038053, "language_loss": 0.82031929, "learning_rate": 3.999995411669614e-06, "loss": 0.84370899, "num_input_tokens_seen": 10845240, "step": 510, "time_per_iteration": 2.6254217624664307 }, { "auxiliary_loss_clip": 0.01268793, "auxiliary_loss_mlp": 0.01075379, "balance_loss_clip": 1.07532823, "balance_loss_mlp": 1.04252458, "epoch": 0.030722982113332332, "flos": 23002975440000.0, "grad_norm": 5.706057095430757, "language_loss": 0.83572316, "learning_rate": 3.999994539508036e-06, "loss": 0.85916495, "num_input_tokens_seen": 10864325, "step": 511, "time_per_iteration": 2.613457441329956 }, { "auxiliary_loss_clip": 0.01269742, "auxiliary_loss_mlp": 0.01081314, "balance_loss_clip": 1.07207167, "balance_loss_mlp": 1.0496521, "epoch": 0.0307831053660003, "flos": 24750855152640.0, "grad_norm": 2.025270681093948, "language_loss": 0.82109964, "learning_rate": 3.9999935915064655e-06, "loss": 0.84461015, "num_input_tokens_seen": 10883860, "step": 512, "time_per_iteration": 2.630404233932495 }, { "auxiliary_loss_clip": 0.01266054, "auxiliary_loss_mlp": 0.01084436, "balance_loss_clip": 1.07086158, "balance_loss_mlp": 1.05070007, "epoch": 0.03084322861866827, "flos": 26140885620480.0, "grad_norm": 2.500363981205655, "language_loss": 0.86933553, "learning_rate": 3.9999925676649374e-06, "loss": 0.89284045, "num_input_tokens_seen": 10904555, "step": 513, "time_per_iteration": 2.671926259994507 }, { "auxiliary_loss_clip": 0.01272542, "auxiliary_loss_mlp": 0.01080065, "balance_loss_clip": 1.07461214, "balance_loss_mlp": 1.04744935, "epoch": 0.03090335187133624, "flos": 18771298168320.0, "grad_norm": 1.704575426690477, "language_loss": 0.79124331, "learning_rate": 3.999991467983491e-06, "loss": 0.81476939, "num_input_tokens_seen": 10923700, "step": 514, "time_per_iteration": 2.6158573627471924 }, { "auxiliary_loss_clip": 0.01265821, "auxiliary_loss_mlp": 0.01067844, "balance_loss_clip": 1.07397485, "balance_loss_mlp": 1.03711247, "epoch": 0.030963475124004207, "flos": 23221886878080.0, "grad_norm": 2.729063628201222, "language_loss": 0.77758944, "learning_rate": 3.999990292462167e-06, "loss": 0.80092615, "num_input_tokens_seen": 10942730, "step": 515, "time_per_iteration": 2.636294364929199 }, { "auxiliary_loss_clip": 0.0126398, "auxiliary_loss_mlp": 0.01072575, "balance_loss_clip": 1.06835747, "balance_loss_mlp": 1.03874326, "epoch": 0.03102359837667218, "flos": 42525595411200.0, "grad_norm": 2.1228851207681503, "language_loss": 0.82452714, "learning_rate": 3.999989041101011e-06, "loss": 0.84789264, "num_input_tokens_seen": 10967120, "step": 516, "time_per_iteration": 2.8078057765960693 }, { "auxiliary_loss_clip": 0.01263726, "auxiliary_loss_mlp": 0.01073859, "balance_loss_clip": 1.0712111, "balance_loss_mlp": 1.04090929, "epoch": 0.031083721629340148, "flos": 21176953689600.0, "grad_norm": 1.9016724574566626, "language_loss": 0.79088318, "learning_rate": 3.999987713900071e-06, "loss": 0.81425899, "num_input_tokens_seen": 10986775, "step": 517, "time_per_iteration": 2.5935981273651123 }, { "auxiliary_loss_clip": 0.0125895, "auxiliary_loss_mlp": 0.0107836, "balance_loss_clip": 1.07049131, "balance_loss_mlp": 1.04629326, "epoch": 0.031143844882008116, "flos": 29716187713920.0, "grad_norm": 1.6829619528007147, "language_loss": 0.90798068, "learning_rate": 3.999986310859396e-06, "loss": 0.93135381, "num_input_tokens_seen": 11011360, "step": 518, "time_per_iteration": 2.6855509281158447 }, { "auxiliary_loss_clip": 0.01272237, "auxiliary_loss_mlp": 0.01097567, "balance_loss_clip": 1.07848859, "balance_loss_mlp": 1.06230497, "epoch": 0.031203968134676085, "flos": 23112467072640.0, "grad_norm": 1.8835331125391583, "language_loss": 0.86759162, "learning_rate": 3.999984831979039e-06, "loss": 0.89128959, "num_input_tokens_seen": 11030150, "step": 519, "time_per_iteration": 2.628380060195923 }, { "auxiliary_loss_clip": 0.01265864, "auxiliary_loss_mlp": 0.01086943, "balance_loss_clip": 1.06901193, "balance_loss_mlp": 1.05578136, "epoch": 0.03126409138734405, "flos": 20954379064320.0, "grad_norm": 3.8823628482318164, "language_loss": 0.87246573, "learning_rate": 3.999983277259057e-06, "loss": 0.89599377, "num_input_tokens_seen": 11049145, "step": 520, "time_per_iteration": 2.5850255489349365 }, { "auxiliary_loss_clip": 0.01269157, "auxiliary_loss_mlp": 0.01086266, "balance_loss_clip": 1.07231963, "balance_loss_mlp": 1.0528394, "epoch": 0.031324214640012026, "flos": 21650112570240.0, "grad_norm": 1.7050130216714323, "language_loss": 0.89274424, "learning_rate": 3.999981646699509e-06, "loss": 0.91629851, "num_input_tokens_seen": 11068835, "step": 521, "time_per_iteration": 2.6412506103515625 }, { "auxiliary_loss_clip": 0.01263772, "auxiliary_loss_mlp": 0.01082584, "balance_loss_clip": 1.0717473, "balance_loss_mlp": 1.04827595, "epoch": 0.03138433789267999, "flos": 23441337020160.0, "grad_norm": 2.085624200373119, "language_loss": 0.71452564, "learning_rate": 3.999979940300456e-06, "loss": 0.73798925, "num_input_tokens_seen": 11088980, "step": 522, "time_per_iteration": 2.6561174392700195 }, { "auxiliary_loss_clip": 0.01265725, "auxiliary_loss_mlp": 0.01082552, "balance_loss_clip": 1.06871116, "balance_loss_mlp": 1.05079484, "epoch": 0.03144446114534796, "flos": 18982164960000.0, "grad_norm": 4.223323698032832, "language_loss": 0.84758592, "learning_rate": 3.999978158061963e-06, "loss": 0.87106872, "num_input_tokens_seen": 11104300, "step": 523, "time_per_iteration": 2.608565330505371 }, { "auxiliary_loss_clip": 0.01271589, "auxiliary_loss_mlp": 0.01076253, "balance_loss_clip": 1.07193565, "balance_loss_mlp": 1.04296994, "epoch": 0.031504584398015935, "flos": 22637692080000.0, "grad_norm": 2.324094801199308, "language_loss": 0.89989722, "learning_rate": 3.999976299984099e-06, "loss": 0.92337573, "num_input_tokens_seen": 11123335, "step": 524, "time_per_iteration": 2.68269944190979 }, { "auxiliary_loss_clip": 0.01273471, "auxiliary_loss_mlp": 0.0108318, "balance_loss_clip": 1.07427168, "balance_loss_mlp": 1.04944324, "epoch": 0.0315647076506839, "flos": 25297056339840.0, "grad_norm": 2.4635323942475766, "language_loss": 0.80114233, "learning_rate": 3.999974366066933e-06, "loss": 0.82470882, "num_input_tokens_seen": 11140880, "step": 525, "time_per_iteration": 2.6396324634552 }, { "auxiliary_loss_clip": 0.01264716, "auxiliary_loss_mlp": 0.01080959, "balance_loss_clip": 1.0681529, "balance_loss_mlp": 1.04798603, "epoch": 0.03162483090335187, "flos": 16982839065600.0, "grad_norm": 2.3553733144031948, "language_loss": 0.81162, "learning_rate": 3.999972356310538e-06, "loss": 0.83507675, "num_input_tokens_seen": 11158710, "step": 526, "time_per_iteration": 2.6167168617248535 }, { "auxiliary_loss_clip": 0.01273987, "auxiliary_loss_mlp": 0.01072725, "balance_loss_clip": 1.07507181, "balance_loss_mlp": 1.03736734, "epoch": 0.03168495415601984, "flos": 18734489706240.0, "grad_norm": 1.9666844995001491, "language_loss": 0.81491739, "learning_rate": 3.999970270714991e-06, "loss": 0.83838451, "num_input_tokens_seen": 11177550, "step": 527, "time_per_iteration": 2.580310821533203 }, { "auxiliary_loss_clip": 0.01261155, "auxiliary_loss_mlp": 0.01080842, "balance_loss_clip": 1.06786597, "balance_loss_mlp": 1.04717755, "epoch": 0.03174507740868781, "flos": 21214875473280.0, "grad_norm": 1.9105688869262756, "language_loss": 0.93801636, "learning_rate": 3.999968109280371e-06, "loss": 0.96143627, "num_input_tokens_seen": 11196230, "step": 528, "time_per_iteration": 2.5901002883911133 }, { "auxiliary_loss_clip": 0.01263275, "auxiliary_loss_mlp": 0.01071724, "balance_loss_clip": 1.06776333, "balance_loss_mlp": 1.0387274, "epoch": 0.03180520066135578, "flos": 24787663614720.0, "grad_norm": 1.8924176613796981, "language_loss": 0.84130204, "learning_rate": 3.99996587200676e-06, "loss": 0.86465204, "num_input_tokens_seen": 11214935, "step": 529, "time_per_iteration": 2.593867063522339 }, { "auxiliary_loss_clip": 0.01266309, "auxiliary_loss_mlp": 0.01088988, "balance_loss_clip": 1.07501197, "balance_loss_mlp": 1.0563724, "epoch": 0.03186532391402375, "flos": 24864261367680.0, "grad_norm": 2.316883777672742, "language_loss": 0.90458709, "learning_rate": 3.999963558894243e-06, "loss": 0.92814004, "num_input_tokens_seen": 11235310, "step": 530, "time_per_iteration": 2.5994982719421387 }, { "auxiliary_loss_clip": 0.01261024, "auxiliary_loss_mlp": 0.0107627, "balance_loss_clip": 1.06481552, "balance_loss_mlp": 1.04188991, "epoch": 0.03192544716669172, "flos": 21215055041280.0, "grad_norm": 2.2744046769674324, "language_loss": 0.76334512, "learning_rate": 3.999961169942907e-06, "loss": 0.78671807, "num_input_tokens_seen": 11254425, "step": 531, "time_per_iteration": 2.618149757385254 }, { "auxiliary_loss_clip": 0.01260981, "auxiliary_loss_mlp": 0.01064937, "balance_loss_clip": 1.0669558, "balance_loss_mlp": 1.03143883, "epoch": 0.03198557041935969, "flos": 24353216616960.0, "grad_norm": 2.467757262816931, "language_loss": 0.90483695, "learning_rate": 3.999958705152843e-06, "loss": 0.92809618, "num_input_tokens_seen": 11274595, "step": 532, "time_per_iteration": 2.647947072982788 }, { "auxiliary_loss_clip": 0.01146464, "auxiliary_loss_mlp": 0.01012028, "balance_loss_clip": 1.04988623, "balance_loss_mlp": 1.00325394, "epoch": 0.032045693672027656, "flos": 61827367587840.0, "grad_norm": 1.9655071928838626, "language_loss": 0.57953775, "learning_rate": 3.9999561645241445e-06, "loss": 0.60112268, "num_input_tokens_seen": 11336705, "step": 533, "time_per_iteration": 3.2502808570861816 }, { "auxiliary_loss_clip": 0.01260941, "auxiliary_loss_mlp": 0.01084263, "balance_loss_clip": 1.06724441, "balance_loss_mlp": 1.0516715, "epoch": 0.03210581692469563, "flos": 28401174800640.0, "grad_norm": 1.7138682169725878, "language_loss": 0.86666048, "learning_rate": 3.999953548056907e-06, "loss": 0.89011252, "num_input_tokens_seen": 11356820, "step": 534, "time_per_iteration": 2.678739070892334 }, { "auxiliary_loss_clip": 0.01259554, "auxiliary_loss_mlp": 0.01066669, "balance_loss_clip": 1.06782031, "balance_loss_mlp": 1.03407741, "epoch": 0.03216594017736359, "flos": 24717709877760.0, "grad_norm": 2.12774196295415, "language_loss": 0.77627808, "learning_rate": 3.999950855751232e-06, "loss": 0.79954034, "num_input_tokens_seen": 11376645, "step": 535, "time_per_iteration": 2.7128217220306396 }, { "auxiliary_loss_clip": 0.01261708, "auxiliary_loss_mlp": 0.01081378, "balance_loss_clip": 1.06843078, "balance_loss_mlp": 1.0485003, "epoch": 0.032226063430031565, "flos": 31175453646720.0, "grad_norm": 3.9913279940153585, "language_loss": 0.80939913, "learning_rate": 3.999948087607219e-06, "loss": 0.83283001, "num_input_tokens_seen": 11397310, "step": 536, "time_per_iteration": 2.7490127086639404 }, { "auxiliary_loss_clip": 0.01262237, "auxiliary_loss_mlp": 0.01075987, "balance_loss_clip": 1.06839073, "balance_loss_mlp": 1.04167831, "epoch": 0.03228618668269954, "flos": 32198225506560.0, "grad_norm": 1.6888601787189168, "language_loss": 0.7009111, "learning_rate": 3.999945243624975e-06, "loss": 0.72429335, "num_input_tokens_seen": 11418475, "step": 537, "time_per_iteration": 5.5609166622161865 }, { "auxiliary_loss_clip": 0.0126357, "auxiliary_loss_mlp": 0.01084205, "balance_loss_clip": 1.07331729, "balance_loss_mlp": 1.05161297, "epoch": 0.0323463099353675, "flos": 22670154996480.0, "grad_norm": 2.146306428033486, "language_loss": 0.82684958, "learning_rate": 3.999942323804607e-06, "loss": 0.85032725, "num_input_tokens_seen": 11436630, "step": 538, "time_per_iteration": 2.5465030670166016 }, { "auxiliary_loss_clip": 0.01269537, "auxiliary_loss_mlp": 0.01078099, "balance_loss_clip": 1.06987572, "balance_loss_mlp": 1.04536414, "epoch": 0.032406433188035474, "flos": 26905172232960.0, "grad_norm": 1.8709064214989917, "language_loss": 0.79146457, "learning_rate": 3.999939328146225e-06, "loss": 0.81494099, "num_input_tokens_seen": 11457275, "step": 539, "time_per_iteration": 4.172123432159424 }, { "auxiliary_loss_clip": 0.0126143, "auxiliary_loss_mlp": 0.01069528, "balance_loss_clip": 1.06830835, "balance_loss_mlp": 1.03567231, "epoch": 0.03246655644070344, "flos": 31503928544640.0, "grad_norm": 35.59051030008172, "language_loss": 0.77379727, "learning_rate": 3.999936256649943e-06, "loss": 0.79710686, "num_input_tokens_seen": 11476925, "step": 540, "time_per_iteration": 2.5633046627044678 }, { "auxiliary_loss_clip": 0.01269863, "auxiliary_loss_mlp": 0.01073669, "balance_loss_clip": 1.07271969, "balance_loss_mlp": 1.04124355, "epoch": 0.03252667969337141, "flos": 23218331431680.0, "grad_norm": 2.0489065110302636, "language_loss": 0.85458571, "learning_rate": 3.999933109315878e-06, "loss": 0.878021, "num_input_tokens_seen": 11496830, "step": 541, "time_per_iteration": 2.6079938411712646 }, { "auxiliary_loss_clip": 0.01258504, "auxiliary_loss_mlp": 0.01082451, "balance_loss_clip": 1.06961954, "balance_loss_mlp": 1.04835749, "epoch": 0.032586802946039384, "flos": 14757454926720.0, "grad_norm": 2.674731240129174, "language_loss": 0.89234567, "learning_rate": 3.9999298861441496e-06, "loss": 0.91575521, "num_input_tokens_seen": 11515605, "step": 542, "time_per_iteration": 2.597036600112915 }, { "auxiliary_loss_clip": 0.0126351, "auxiliary_loss_mlp": 0.01081041, "balance_loss_clip": 1.06974792, "balance_loss_mlp": 1.04792452, "epoch": 0.03264692619870735, "flos": 24280677100800.0, "grad_norm": 2.2714121360014334, "language_loss": 0.71123677, "learning_rate": 3.999926587134879e-06, "loss": 0.73468232, "num_input_tokens_seen": 11536230, "step": 543, "time_per_iteration": 2.634601354598999 }, { "auxiliary_loss_clip": 0.01259994, "auxiliary_loss_mlp": 0.01088763, "balance_loss_clip": 1.06379187, "balance_loss_mlp": 1.05545604, "epoch": 0.03270704945137532, "flos": 22893160584960.0, "grad_norm": 4.777521083182084, "language_loss": 0.91540575, "learning_rate": 3.999923212288192e-06, "loss": 0.93889332, "num_input_tokens_seen": 11554715, "step": 544, "time_per_iteration": 2.6173009872436523 }, { "auxiliary_loss_clip": 0.01264485, "auxiliary_loss_mlp": 0.01085684, "balance_loss_clip": 1.06989884, "balance_loss_mlp": 1.05571437, "epoch": 0.032767172704043286, "flos": 18041018757120.0, "grad_norm": 2.6951315012120025, "language_loss": 0.65799558, "learning_rate": 3.999919761604216e-06, "loss": 0.68149722, "num_input_tokens_seen": 11571370, "step": 545, "time_per_iteration": 2.6500988006591797 }, { "auxiliary_loss_clip": 0.012623, "auxiliary_loss_mlp": 0.0107161, "balance_loss_clip": 1.06693912, "balance_loss_mlp": 1.0393517, "epoch": 0.03282729595671126, "flos": 22528739151360.0, "grad_norm": 2.2564766449723908, "language_loss": 0.92221987, "learning_rate": 3.999916235083083e-06, "loss": 0.94555902, "num_input_tokens_seen": 11588560, "step": 546, "time_per_iteration": 2.673250913619995 }, { "auxiliary_loss_clip": 0.01260258, "auxiliary_loss_mlp": 0.01077296, "balance_loss_clip": 1.06488204, "balance_loss_mlp": 1.04313052, "epoch": 0.03288741920937923, "flos": 20410620001920.0, "grad_norm": 2.1923718908590653, "language_loss": 0.81706661, "learning_rate": 3.999912632724925e-06, "loss": 0.84044212, "num_input_tokens_seen": 11605685, "step": 547, "time_per_iteration": 2.725198745727539 }, { "auxiliary_loss_clip": 0.0126227, "auxiliary_loss_mlp": 0.0107871, "balance_loss_clip": 1.06794477, "balance_loss_mlp": 1.04480648, "epoch": 0.032947542462047195, "flos": 20777986350720.0, "grad_norm": 1.730652582963277, "language_loss": 0.81227565, "learning_rate": 3.999908954529881e-06, "loss": 0.83568549, "num_input_tokens_seen": 11626290, "step": 548, "time_per_iteration": 2.714073419570923 }, { "auxiliary_loss_clip": 0.01264818, "auxiliary_loss_mlp": 0.01084154, "balance_loss_clip": 1.06963027, "balance_loss_mlp": 1.04870164, "epoch": 0.03300766571471517, "flos": 19901263190400.0, "grad_norm": 3.8540092911047603, "language_loss": 0.67460287, "learning_rate": 3.999905200498087e-06, "loss": 0.69809258, "num_input_tokens_seen": 11643950, "step": 549, "time_per_iteration": 2.6747171878814697 }, { "auxiliary_loss_clip": 0.0125805, "auxiliary_loss_mlp": 0.01076001, "balance_loss_clip": 1.06968856, "balance_loss_mlp": 1.04236054, "epoch": 0.03306778896738313, "flos": 17967760968960.0, "grad_norm": 1.933615596136007, "language_loss": 0.86379111, "learning_rate": 3.999901370629689e-06, "loss": 0.88713157, "num_input_tokens_seen": 11662560, "step": 550, "time_per_iteration": 2.553386926651001 }, { "auxiliary_loss_clip": 0.01264951, "auxiliary_loss_mlp": 0.01095377, "balance_loss_clip": 1.07279766, "balance_loss_mlp": 1.06142652, "epoch": 0.033127912220051105, "flos": 21653380707840.0, "grad_norm": 3.1958143211070977, "language_loss": 0.8127178, "learning_rate": 3.99989746492483e-06, "loss": 0.83632112, "num_input_tokens_seen": 11682265, "step": 551, "time_per_iteration": 2.6231682300567627 }, { "auxiliary_loss_clip": 0.01271579, "auxiliary_loss_mlp": 0.0108998, "balance_loss_clip": 1.07285261, "balance_loss_mlp": 1.05626702, "epoch": 0.03318803547271908, "flos": 30188376927360.0, "grad_norm": 2.9473143774727606, "language_loss": 0.86134821, "learning_rate": 3.999893483383658e-06, "loss": 0.88496381, "num_input_tokens_seen": 11699300, "step": 552, "time_per_iteration": 2.7002694606781006 }, { "auxiliary_loss_clip": 0.01267081, "auxiliary_loss_mlp": 0.01081671, "balance_loss_clip": 1.07191086, "balance_loss_mlp": 1.04650474, "epoch": 0.03324815872538704, "flos": 20376038183040.0, "grad_norm": 2.990469903058063, "language_loss": 0.9301765, "learning_rate": 3.999889426006326e-06, "loss": 0.95366406, "num_input_tokens_seen": 11716955, "step": 553, "time_per_iteration": 2.6629648208618164 }, { "auxiliary_loss_clip": 0.01262345, "auxiliary_loss_mlp": 0.01077186, "balance_loss_clip": 1.06925786, "balance_loss_mlp": 1.04149485, "epoch": 0.033308281978055014, "flos": 24494560634880.0, "grad_norm": 2.1924330874053166, "language_loss": 0.78881586, "learning_rate": 3.999885292792986e-06, "loss": 0.8122111, "num_input_tokens_seen": 11736130, "step": 554, "time_per_iteration": 2.668970823287964 }, { "auxiliary_loss_clip": 0.01258048, "auxiliary_loss_mlp": 0.0108557, "balance_loss_clip": 1.06745815, "balance_loss_mlp": 1.05045104, "epoch": 0.03336840523072298, "flos": 23400326666880.0, "grad_norm": 2.2144550089326938, "language_loss": 0.81971425, "learning_rate": 3.999881083743795e-06, "loss": 0.84315038, "num_input_tokens_seen": 11754425, "step": 555, "time_per_iteration": 2.610807418823242 }, { "auxiliary_loss_clip": 0.01264442, "auxiliary_loss_mlp": 0.0108339, "balance_loss_clip": 1.06914032, "balance_loss_mlp": 1.04805672, "epoch": 0.03342852848339095, "flos": 30550571717760.0, "grad_norm": 3.7821745066525487, "language_loss": 0.88661897, "learning_rate": 3.999876798858914e-06, "loss": 0.9100973, "num_input_tokens_seen": 11772845, "step": 556, "time_per_iteration": 2.6288907527923584 }, { "auxiliary_loss_clip": 0.01262553, "auxiliary_loss_mlp": 0.01084158, "balance_loss_clip": 1.06896496, "balance_loss_mlp": 1.04863358, "epoch": 0.03348865173605892, "flos": 22893304239360.0, "grad_norm": 1.974910128087634, "language_loss": 0.83708388, "learning_rate": 3.999872438138503e-06, "loss": 0.860551, "num_input_tokens_seen": 11792850, "step": 557, "time_per_iteration": 2.649401903152466 }, { "auxiliary_loss_clip": 0.01268198, "auxiliary_loss_mlp": 0.01069057, "balance_loss_clip": 1.07400489, "balance_loss_mlp": 1.03684711, "epoch": 0.03354877498872689, "flos": 17676022705920.0, "grad_norm": 3.176542206824637, "language_loss": 0.94202292, "learning_rate": 3.999868001582729e-06, "loss": 0.96539545, "num_input_tokens_seen": 11809670, "step": 558, "time_per_iteration": 2.550515651702881 }, { "auxiliary_loss_clip": 0.01258948, "auxiliary_loss_mlp": 0.01074291, "balance_loss_clip": 1.06591845, "balance_loss_mlp": 1.04036427, "epoch": 0.03360889824139486, "flos": 21652985658240.0, "grad_norm": 2.6619487077732384, "language_loss": 0.77115649, "learning_rate": 3.99986348919176e-06, "loss": 0.79448891, "num_input_tokens_seen": 11829665, "step": 559, "time_per_iteration": 2.729597330093384 }, { "auxiliary_loss_clip": 0.01261947, "auxiliary_loss_mlp": 0.01080822, "balance_loss_clip": 1.06835234, "balance_loss_mlp": 1.04882574, "epoch": 0.033669021494062826, "flos": 21795730306560.0, "grad_norm": 1.945022837871561, "language_loss": 0.87472397, "learning_rate": 3.9998589009657675e-06, "loss": 0.89815164, "num_input_tokens_seen": 11848190, "step": 560, "time_per_iteration": 2.6082279682159424 }, { "auxiliary_loss_clip": 0.01257198, "auxiliary_loss_mlp": 0.0107356, "balance_loss_clip": 1.06704283, "balance_loss_mlp": 1.04199314, "epoch": 0.0337291447467308, "flos": 21866222747520.0, "grad_norm": 2.4061219554407502, "language_loss": 0.81578708, "learning_rate": 3.999854236904925e-06, "loss": 0.83909464, "num_input_tokens_seen": 11864795, "step": 561, "time_per_iteration": 2.602193832397461 }, { "auxiliary_loss_clip": 0.01254722, "auxiliary_loss_mlp": 0.01076361, "balance_loss_clip": 1.06685936, "balance_loss_mlp": 1.04422247, "epoch": 0.03378926799939877, "flos": 24245951627520.0, "grad_norm": 1.683217504050761, "language_loss": 0.82320511, "learning_rate": 3.999849497009409e-06, "loss": 0.84651601, "num_input_tokens_seen": 11885275, "step": 562, "time_per_iteration": 2.675872564315796 }, { "auxiliary_loss_clip": 0.01262146, "auxiliary_loss_mlp": 0.01084212, "balance_loss_clip": 1.06894755, "balance_loss_mlp": 1.0508337, "epoch": 0.033849391252066735, "flos": 16507812677760.0, "grad_norm": 2.262509698135982, "language_loss": 0.84285647, "learning_rate": 3.999844681279401e-06, "loss": 0.86632001, "num_input_tokens_seen": 11903595, "step": 563, "time_per_iteration": 2.586944103240967 }, { "auxiliary_loss_clip": 0.01258135, "auxiliary_loss_mlp": 0.01083866, "balance_loss_clip": 1.0675565, "balance_loss_mlp": 1.05094075, "epoch": 0.03390951450473471, "flos": 15669298609920.0, "grad_norm": 2.115200912185494, "language_loss": 0.94438875, "learning_rate": 3.99983978971508e-06, "loss": 0.96780878, "num_input_tokens_seen": 11917815, "step": 564, "time_per_iteration": 2.5444440841674805 }, { "auxiliary_loss_clip": 0.01259509, "auxiliary_loss_mlp": 0.01073406, "balance_loss_clip": 1.06518865, "balance_loss_mlp": 1.03907406, "epoch": 0.03396963775740267, "flos": 22674787850880.0, "grad_norm": 2.6560391741906924, "language_loss": 0.94669235, "learning_rate": 3.999834822316635e-06, "loss": 0.97002149, "num_input_tokens_seen": 11936305, "step": 565, "time_per_iteration": 2.5614171028137207 }, { "auxiliary_loss_clip": 0.01150452, "auxiliary_loss_mlp": 0.01081579, "balance_loss_clip": 1.04835606, "balance_loss_mlp": 1.07499874, "epoch": 0.034029761010070644, "flos": 64392683063040.0, "grad_norm": 1.0610477485673708, "language_loss": 0.54800498, "learning_rate": 3.9998297790842535e-06, "loss": 0.57032537, "num_input_tokens_seen": 11998940, "step": 566, "time_per_iteration": 3.229137659072876 }, { "auxiliary_loss_clip": 0.0126129, "auxiliary_loss_mlp": 0.01073482, "balance_loss_clip": 1.06798041, "balance_loss_mlp": 1.03793335, "epoch": 0.034089884262738616, "flos": 25004204755200.0, "grad_norm": 3.1955261820278564, "language_loss": 0.76836932, "learning_rate": 3.999824660018126e-06, "loss": 0.79171705, "num_input_tokens_seen": 12018860, "step": 567, "time_per_iteration": 2.632741928100586 }, { "auxiliary_loss_clip": 0.01253596, "auxiliary_loss_mlp": 0.01083559, "balance_loss_clip": 1.06611466, "balance_loss_mlp": 1.05153918, "epoch": 0.03415000751540658, "flos": 28439096584320.0, "grad_norm": 2.115683621050472, "language_loss": 0.80834144, "learning_rate": 3.999819465118447e-06, "loss": 0.83171296, "num_input_tokens_seen": 12039675, "step": 568, "time_per_iteration": 2.7206337451934814 }, { "auxiliary_loss_clip": 0.01254921, "auxiliary_loss_mlp": 0.01082401, "balance_loss_clip": 1.06888509, "balance_loss_mlp": 1.04940367, "epoch": 0.034210130768074554, "flos": 21468727866240.0, "grad_norm": 1.891360159585894, "language_loss": 0.86560667, "learning_rate": 3.999814194385413e-06, "loss": 0.88897985, "num_input_tokens_seen": 12057680, "step": 569, "time_per_iteration": 2.7271673679351807 }, { "auxiliary_loss_clip": 0.01255135, "auxiliary_loss_mlp": 0.01082251, "balance_loss_clip": 1.06644094, "balance_loss_mlp": 1.04922962, "epoch": 0.03427025402074252, "flos": 18697501676160.0, "grad_norm": 1.6888504559193653, "language_loss": 0.95945716, "learning_rate": 3.9998088478192255e-06, "loss": 0.982831, "num_input_tokens_seen": 12076135, "step": 570, "time_per_iteration": 2.5918867588043213 }, { "auxiliary_loss_clip": 0.01255487, "auxiliary_loss_mlp": 0.0108066, "balance_loss_clip": 1.06228065, "balance_loss_mlp": 1.0435617, "epoch": 0.03433037727341049, "flos": 20849987162880.0, "grad_norm": 2.39132447086081, "language_loss": 0.7964232, "learning_rate": 3.9998034254200846e-06, "loss": 0.8197847, "num_input_tokens_seen": 12094785, "step": 571, "time_per_iteration": 2.590184450149536 }, { "auxiliary_loss_clip": 0.01256218, "auxiliary_loss_mlp": 0.01091484, "balance_loss_clip": 1.06740785, "balance_loss_mlp": 1.0565083, "epoch": 0.03439050052607846, "flos": 25410282986880.0, "grad_norm": 2.0738695690993, "language_loss": 0.80214274, "learning_rate": 3.999797927188199e-06, "loss": 0.82561976, "num_input_tokens_seen": 12114590, "step": 572, "time_per_iteration": 2.6862123012542725 }, { "auxiliary_loss_clip": 0.01263024, "auxiliary_loss_mlp": 0.01074173, "balance_loss_clip": 1.06995344, "balance_loss_mlp": 1.04098535, "epoch": 0.03445062377874643, "flos": 17640147997440.0, "grad_norm": 2.2324763929909284, "language_loss": 0.84548658, "learning_rate": 3.999792353123774e-06, "loss": 0.86885858, "num_input_tokens_seen": 12132390, "step": 573, "time_per_iteration": 2.78487229347229 }, { "auxiliary_loss_clip": 0.01256326, "auxiliary_loss_mlp": 0.01068789, "balance_loss_clip": 1.0644815, "balance_loss_mlp": 1.03781831, "epoch": 0.0345107470314144, "flos": 16764502245120.0, "grad_norm": 2.576428901855709, "language_loss": 0.76602584, "learning_rate": 3.999786703227023e-06, "loss": 0.78927696, "num_input_tokens_seen": 12149035, "step": 574, "time_per_iteration": 2.5697100162506104 }, { "auxiliary_loss_clip": 0.01255191, "auxiliary_loss_mlp": 0.0107671, "balance_loss_clip": 1.06581593, "balance_loss_mlp": 1.04502439, "epoch": 0.03457087028408237, "flos": 14684448533760.0, "grad_norm": 2.156110110571344, "language_loss": 0.83854586, "learning_rate": 3.9997809774981606e-06, "loss": 0.86186486, "num_input_tokens_seen": 12167530, "step": 575, "time_per_iteration": 2.596418619155884 }, { "auxiliary_loss_clip": 0.01249695, "auxiliary_loss_mlp": 0.01076053, "balance_loss_clip": 1.06684637, "balance_loss_mlp": 1.04334211, "epoch": 0.03463099353675034, "flos": 20011293527040.0, "grad_norm": 2.350120742735315, "language_loss": 0.83990753, "learning_rate": 3.9997751759374025e-06, "loss": 0.86316502, "num_input_tokens_seen": 12186340, "step": 576, "time_per_iteration": 5.821930646896362 }, { "auxiliary_loss_clip": 0.01257114, "auxiliary_loss_mlp": 0.01079503, "balance_loss_clip": 1.07237518, "balance_loss_mlp": 1.04817426, "epoch": 0.03469111678941831, "flos": 25301150490240.0, "grad_norm": 2.138457686407641, "language_loss": 0.85803086, "learning_rate": 3.99976929854497e-06, "loss": 0.88139701, "num_input_tokens_seen": 12204090, "step": 577, "time_per_iteration": 4.225277423858643 }, { "auxiliary_loss_clip": 0.01253845, "auxiliary_loss_mlp": 0.01080214, "balance_loss_clip": 1.06869018, "balance_loss_mlp": 1.04712176, "epoch": 0.034751240042086275, "flos": 23259413612160.0, "grad_norm": 4.535240156776142, "language_loss": 0.72226608, "learning_rate": 3.9997633453210845e-06, "loss": 0.74560666, "num_input_tokens_seen": 12224850, "step": 578, "time_per_iteration": 4.486239433288574 }, { "auxiliary_loss_clip": 0.01251871, "auxiliary_loss_mlp": 0.01080519, "balance_loss_clip": 1.06461096, "balance_loss_mlp": 1.04663968, "epoch": 0.03481136329475425, "flos": 23769237300480.0, "grad_norm": 1.9496379050984929, "language_loss": 0.77785492, "learning_rate": 3.999757316265973e-06, "loss": 0.80117887, "num_input_tokens_seen": 12244935, "step": 579, "time_per_iteration": 2.6706583499908447 }, { "auxiliary_loss_clip": 0.01251647, "auxiliary_loss_mlp": 0.01087497, "balance_loss_clip": 1.06656826, "balance_loss_mlp": 1.05435717, "epoch": 0.03487148654742222, "flos": 20157521794560.0, "grad_norm": 2.054973215074824, "language_loss": 0.86841297, "learning_rate": 3.999751211379863e-06, "loss": 0.8918044, "num_input_tokens_seen": 12262140, "step": 580, "time_per_iteration": 2.639146566390991 }, { "auxiliary_loss_clip": 0.01256528, "auxiliary_loss_mlp": 0.01069029, "balance_loss_clip": 1.06636667, "balance_loss_mlp": 1.0398469, "epoch": 0.034931609800090184, "flos": 15669585918720.0, "grad_norm": 2.205850105033732, "language_loss": 0.82570344, "learning_rate": 3.999745030662987e-06, "loss": 0.84895897, "num_input_tokens_seen": 12280930, "step": 581, "time_per_iteration": 2.6505649089813232 }, { "auxiliary_loss_clip": 0.01252942, "auxiliary_loss_mlp": 0.01072317, "balance_loss_clip": 1.06823969, "balance_loss_mlp": 1.04168022, "epoch": 0.034991733052758156, "flos": 16362374509440.0, "grad_norm": 2.1922492117358146, "language_loss": 0.7733047, "learning_rate": 3.99973877411558e-06, "loss": 0.79655731, "num_input_tokens_seen": 12299125, "step": 582, "time_per_iteration": 2.7323596477508545 }, { "auxiliary_loss_clip": 0.01250253, "auxiliary_loss_mlp": 0.01082356, "balance_loss_clip": 1.06794167, "balance_loss_mlp": 1.04861939, "epoch": 0.03505185630542612, "flos": 19387309438080.0, "grad_norm": 2.1536178016194327, "language_loss": 0.87679923, "learning_rate": 3.999732441737877e-06, "loss": 0.90012532, "num_input_tokens_seen": 12316905, "step": 583, "time_per_iteration": 2.6049294471740723 }, { "auxiliary_loss_clip": 0.01255473, "auxiliary_loss_mlp": 0.01092826, "balance_loss_clip": 1.06699181, "balance_loss_mlp": 1.06104505, "epoch": 0.03511197955809409, "flos": 21323828401920.0, "grad_norm": 3.7027110169592015, "language_loss": 0.81196821, "learning_rate": 3.99972603353012e-06, "loss": 0.83545119, "num_input_tokens_seen": 12335070, "step": 584, "time_per_iteration": 2.6011815071105957 }, { "auxiliary_loss_clip": 0.01251161, "auxiliary_loss_mlp": 0.01069463, "balance_loss_clip": 1.06472683, "balance_loss_mlp": 1.03832567, "epoch": 0.035172102810762065, "flos": 14136595320960.0, "grad_norm": 3.067717812226321, "language_loss": 0.92399198, "learning_rate": 3.999719549492551e-06, "loss": 0.94719815, "num_input_tokens_seen": 12350315, "step": 585, "time_per_iteration": 2.5592780113220215 }, { "auxiliary_loss_clip": 0.01251271, "auxiliary_loss_mlp": 0.01077423, "balance_loss_clip": 1.06562734, "balance_loss_mlp": 1.04552317, "epoch": 0.03523222606343003, "flos": 20296890564480.0, "grad_norm": 2.196660024103635, "language_loss": 0.87644351, "learning_rate": 3.9997129896254165e-06, "loss": 0.89973044, "num_input_tokens_seen": 12366030, "step": 586, "time_per_iteration": 2.5486221313476562 }, { "auxiliary_loss_clip": 0.01256485, "auxiliary_loss_mlp": 0.0108018, "balance_loss_clip": 1.06803596, "balance_loss_mlp": 1.04918551, "epoch": 0.035292349316098, "flos": 20375822701440.0, "grad_norm": 2.1222089199850878, "language_loss": 0.76079381, "learning_rate": 3.999706353928965e-06, "loss": 0.78416049, "num_input_tokens_seen": 12384895, "step": 587, "time_per_iteration": 2.5923714637756348 }, { "auxiliary_loss_clip": 0.01257125, "auxiliary_loss_mlp": 0.01068649, "balance_loss_clip": 1.06683922, "balance_loss_mlp": 1.03586686, "epoch": 0.03535247256876597, "flos": 21468871520640.0, "grad_norm": 2.212352192395094, "language_loss": 0.78601038, "learning_rate": 3.999699642403449e-06, "loss": 0.80926806, "num_input_tokens_seen": 12404980, "step": 588, "time_per_iteration": 2.579280138015747 }, { "auxiliary_loss_clip": 0.0125398, "auxiliary_loss_mlp": 0.0107827, "balance_loss_clip": 1.06582928, "balance_loss_mlp": 1.04367518, "epoch": 0.03541259582143394, "flos": 23623044946560.0, "grad_norm": 2.153589114745919, "language_loss": 0.94312829, "learning_rate": 3.99969285504912e-06, "loss": 0.96645081, "num_input_tokens_seen": 12423835, "step": 589, "time_per_iteration": 2.5964701175689697 }, { "auxiliary_loss_clip": 0.01256884, "auxiliary_loss_mlp": 0.01078108, "balance_loss_clip": 1.06697679, "balance_loss_mlp": 1.04666042, "epoch": 0.03547271907410191, "flos": 33726367768320.0, "grad_norm": 2.1162556876212695, "language_loss": 0.84116042, "learning_rate": 3.99968599186624e-06, "loss": 0.8645103, "num_input_tokens_seen": 12443135, "step": 590, "time_per_iteration": 2.746436357498169 }, { "auxiliary_loss_clip": 0.01249398, "auxiliary_loss_mlp": 0.01068452, "balance_loss_clip": 1.06658125, "balance_loss_mlp": 1.03893578, "epoch": 0.03553284232676988, "flos": 21142695093120.0, "grad_norm": 1.984522351394552, "language_loss": 0.8684091, "learning_rate": 3.999679052855065e-06, "loss": 0.89158762, "num_input_tokens_seen": 12462895, "step": 591, "time_per_iteration": 2.692303419113159 }, { "auxiliary_loss_clip": 0.01250641, "auxiliary_loss_mlp": 0.01082122, "balance_loss_clip": 1.06297326, "balance_loss_mlp": 1.04883862, "epoch": 0.03559296557943785, "flos": 20046593617920.0, "grad_norm": 2.0873185001780783, "language_loss": 0.83075488, "learning_rate": 3.999672038015861e-06, "loss": 0.85408247, "num_input_tokens_seen": 12481515, "step": 592, "time_per_iteration": 2.7822203636169434 }, { "auxiliary_loss_clip": 0.01146211, "auxiliary_loss_mlp": 0.01034159, "balance_loss_clip": 1.05013406, "balance_loss_mlp": 1.02676773, "epoch": 0.035653088832105814, "flos": 60334597244160.0, "grad_norm": 0.8804992705477848, "language_loss": 0.59754086, "learning_rate": 3.999664947348893e-06, "loss": 0.61934447, "num_input_tokens_seen": 12548220, "step": 593, "time_per_iteration": 3.274080276489258 }, { "auxiliary_loss_clip": 0.01249386, "auxiliary_loss_mlp": 0.0107742, "balance_loss_clip": 1.06737614, "balance_loss_mlp": 1.04473329, "epoch": 0.035713212084773786, "flos": 20113135562880.0, "grad_norm": 1.8086551314359374, "language_loss": 0.87077361, "learning_rate": 3.999657780854429e-06, "loss": 0.89404166, "num_input_tokens_seen": 12566105, "step": 594, "time_per_iteration": 2.682236671447754 }, { "auxiliary_loss_clip": 0.012487, "auxiliary_loss_mlp": 0.01082358, "balance_loss_clip": 1.06235993, "balance_loss_mlp": 1.05057716, "epoch": 0.03577333533744176, "flos": 26285785084800.0, "grad_norm": 5.516524335860627, "language_loss": 0.83920246, "learning_rate": 3.999650538532742e-06, "loss": 0.86251307, "num_input_tokens_seen": 12586680, "step": 595, "time_per_iteration": 2.773669481277466 }, { "auxiliary_loss_clip": 0.01248678, "auxiliary_loss_mlp": 0.01090544, "balance_loss_clip": 1.06579614, "balance_loss_mlp": 1.05850017, "epoch": 0.035833458590109724, "flos": 10889732211840.0, "grad_norm": 2.3448814752825204, "language_loss": 0.96041518, "learning_rate": 3.999643220384106e-06, "loss": 0.98380733, "num_input_tokens_seen": 12601605, "step": 596, "time_per_iteration": 2.6541590690612793 }, { "auxiliary_loss_clip": 0.01252662, "auxiliary_loss_mlp": 0.01081887, "balance_loss_clip": 1.0675534, "balance_loss_mlp": 1.05165553, "epoch": 0.035893581842777696, "flos": 22090198003200.0, "grad_norm": 2.4353221882859004, "language_loss": 0.82993281, "learning_rate": 3.999635826408799e-06, "loss": 0.85327828, "num_input_tokens_seen": 12620365, "step": 597, "time_per_iteration": 2.7023818492889404 }, { "auxiliary_loss_clip": 0.01247839, "auxiliary_loss_mlp": 0.01079829, "balance_loss_clip": 1.0668776, "balance_loss_mlp": 1.04766583, "epoch": 0.03595370509544566, "flos": 23038347358080.0, "grad_norm": 2.374757318483944, "language_loss": 0.81364304, "learning_rate": 3.999628356607101e-06, "loss": 0.83691972, "num_input_tokens_seen": 12641140, "step": 598, "time_per_iteration": 2.731229782104492 }, { "auxiliary_loss_clip": 0.01243692, "auxiliary_loss_mlp": 0.01077827, "balance_loss_clip": 1.0663228, "balance_loss_mlp": 1.04587913, "epoch": 0.03601382834811363, "flos": 20777734955520.0, "grad_norm": 1.817680341814684, "language_loss": 0.81172699, "learning_rate": 3.999620810979295e-06, "loss": 0.83494222, "num_input_tokens_seen": 12661080, "step": 599, "time_per_iteration": 2.710191011428833 }, { "auxiliary_loss_clip": 0.01250419, "auxiliary_loss_mlp": 0.01074577, "balance_loss_clip": 1.06356514, "balance_loss_mlp": 1.045228, "epoch": 0.036073951600781605, "flos": 23951627585280.0, "grad_norm": 2.3963649020429627, "language_loss": 0.8651731, "learning_rate": 3.999613189525668e-06, "loss": 0.88842309, "num_input_tokens_seen": 12678270, "step": 600, "time_per_iteration": 2.682262420654297 }, { "auxiliary_loss_clip": 0.01241882, "auxiliary_loss_mlp": 0.01084809, "balance_loss_clip": 1.05918193, "balance_loss_mlp": 1.05297971, "epoch": 0.03613407485344957, "flos": 18912283050240.0, "grad_norm": 2.0308947613075423, "language_loss": 0.82355881, "learning_rate": 3.999605492246508e-06, "loss": 0.84682572, "num_input_tokens_seen": 12697295, "step": 601, "time_per_iteration": 2.6570894718170166 }, { "auxiliary_loss_clip": 0.01240868, "auxiliary_loss_mlp": 0.010708, "balance_loss_clip": 1.06129336, "balance_loss_mlp": 1.03920949, "epoch": 0.03619419810611754, "flos": 23038526926080.0, "grad_norm": 2.3080142694085555, "language_loss": 0.7502507, "learning_rate": 3.999597719142107e-06, "loss": 0.77336735, "num_input_tokens_seen": 12716165, "step": 602, "time_per_iteration": 2.6434237957000732 }, { "auxiliary_loss_clip": 0.01239543, "auxiliary_loss_mlp": 0.01066859, "balance_loss_clip": 1.0604254, "balance_loss_mlp": 1.03562629, "epoch": 0.03625432135878551, "flos": 29457774293760.0, "grad_norm": 1.9681237382646195, "language_loss": 0.79599822, "learning_rate": 3.999589870212761e-06, "loss": 0.81906223, "num_input_tokens_seen": 12735475, "step": 603, "time_per_iteration": 2.7201666831970215 }, { "auxiliary_loss_clip": 0.01244834, "auxiliary_loss_mlp": 0.01071177, "balance_loss_clip": 1.06545615, "balance_loss_mlp": 1.04130292, "epoch": 0.03631444461145348, "flos": 23508525409920.0, "grad_norm": 1.8363641170913294, "language_loss": 0.86668456, "learning_rate": 3.9995819454587664e-06, "loss": 0.88984472, "num_input_tokens_seen": 12754540, "step": 604, "time_per_iteration": 2.60249924659729 }, { "auxiliary_loss_clip": 0.01248906, "auxiliary_loss_mlp": 0.01072985, "balance_loss_clip": 1.0674324, "balance_loss_mlp": 1.04010737, "epoch": 0.03637456786412145, "flos": 16618130323200.0, "grad_norm": 2.510130211393037, "language_loss": 0.80746496, "learning_rate": 3.999573944880424e-06, "loss": 0.83068383, "num_input_tokens_seen": 12773050, "step": 605, "time_per_iteration": 2.766684055328369 }, { "auxiliary_loss_clip": 0.01244274, "auxiliary_loss_mlp": 0.0107873, "balance_loss_clip": 1.0630821, "balance_loss_mlp": 1.04846251, "epoch": 0.03643469111678942, "flos": 15851832549120.0, "grad_norm": 2.2216143800596835, "language_loss": 0.85942292, "learning_rate": 3.9995658684780375e-06, "loss": 0.882653, "num_input_tokens_seen": 12791240, "step": 606, "time_per_iteration": 2.6133925914764404 }, { "auxiliary_loss_clip": 0.01247732, "auxiliary_loss_mlp": 0.01077404, "balance_loss_clip": 1.06413972, "balance_loss_mlp": 1.04588532, "epoch": 0.03649481436945739, "flos": 23620387340160.0, "grad_norm": 2.0684825764003394, "language_loss": 0.82179952, "learning_rate": 3.999557716251912e-06, "loss": 0.84505081, "num_input_tokens_seen": 12812245, "step": 607, "time_per_iteration": 2.6805856227874756 }, { "auxiliary_loss_clip": 0.01245394, "auxiliary_loss_mlp": 0.01073743, "balance_loss_clip": 1.06585169, "balance_loss_mlp": 1.04317796, "epoch": 0.036554937622125354, "flos": 21755581879680.0, "grad_norm": 2.3717179235904533, "language_loss": 0.83567071, "learning_rate": 3.999549488202358e-06, "loss": 0.8588621, "num_input_tokens_seen": 12831085, "step": 608, "time_per_iteration": 2.6593453884124756 }, { "auxiliary_loss_clip": 0.01251062, "auxiliary_loss_mlp": 0.01073705, "balance_loss_clip": 1.06682992, "balance_loss_mlp": 1.04006422, "epoch": 0.036615060874793326, "flos": 17819772935040.0, "grad_norm": 2.4795108668903305, "language_loss": 0.8201133, "learning_rate": 3.999541184329688e-06, "loss": 0.84336102, "num_input_tokens_seen": 12849115, "step": 609, "time_per_iteration": 2.6299383640289307 }, { "auxiliary_loss_clip": 0.01255655, "auxiliary_loss_mlp": 0.01091893, "balance_loss_clip": 1.07322037, "balance_loss_mlp": 1.06158984, "epoch": 0.0366751841274613, "flos": 26753808320640.0, "grad_norm": 1.992640540297191, "language_loss": 0.79448462, "learning_rate": 3.999532804634215e-06, "loss": 0.81796008, "num_input_tokens_seen": 12868005, "step": 610, "time_per_iteration": 2.65120530128479 }, { "auxiliary_loss_clip": 0.01254423, "auxiliary_loss_mlp": 0.01088228, "balance_loss_clip": 1.06914616, "balance_loss_mlp": 1.05656588, "epoch": 0.03673530738012926, "flos": 22196960202240.0, "grad_norm": 1.9328503999291824, "language_loss": 0.87282723, "learning_rate": 3.9995243491162575e-06, "loss": 0.89625371, "num_input_tokens_seen": 12886890, "step": 611, "time_per_iteration": 2.7398059368133545 }, { "auxiliary_loss_clip": 0.01248885, "auxiliary_loss_mlp": 0.01097673, "balance_loss_clip": 1.06917143, "balance_loss_mlp": 1.06651139, "epoch": 0.036795430632797235, "flos": 24681655601280.0, "grad_norm": 3.7435200854847266, "language_loss": 0.72589231, "learning_rate": 3.999515817776136e-06, "loss": 0.74935788, "num_input_tokens_seen": 12906130, "step": 612, "time_per_iteration": 2.700406551361084 }, { "auxiliary_loss_clip": 0.01249112, "auxiliary_loss_mlp": 0.01076924, "balance_loss_clip": 1.06581926, "balance_loss_mlp": 1.04480934, "epoch": 0.0368555538854652, "flos": 17748921358080.0, "grad_norm": 3.0863603820013434, "language_loss": 0.79110008, "learning_rate": 3.999507210614175e-06, "loss": 0.81436038, "num_input_tokens_seen": 12925260, "step": 613, "time_per_iteration": 2.630472183227539 }, { "auxiliary_loss_clip": 0.01242581, "auxiliary_loss_mlp": 0.01090278, "balance_loss_clip": 1.06378841, "balance_loss_mlp": 1.05961776, "epoch": 0.03691567713813317, "flos": 20594554571520.0, "grad_norm": 2.2015687298668336, "language_loss": 0.93885028, "learning_rate": 3.9994985276307e-06, "loss": 0.96217889, "num_input_tokens_seen": 12944590, "step": 614, "time_per_iteration": 2.6977972984313965 }, { "auxiliary_loss_clip": 0.01254503, "auxiliary_loss_mlp": 0.01081137, "balance_loss_clip": 1.07009673, "balance_loss_mlp": 1.04732919, "epoch": 0.036975800390801145, "flos": 33650380546560.0, "grad_norm": 3.0661216019279576, "language_loss": 0.72932875, "learning_rate": 3.999489768826041e-06, "loss": 0.75268513, "num_input_tokens_seen": 12964785, "step": 615, "time_per_iteration": 2.697291612625122 }, { "auxiliary_loss_clip": 0.01250213, "auxiliary_loss_mlp": 0.010716, "balance_loss_clip": 1.06649876, "balance_loss_mlp": 1.04015231, "epoch": 0.03703592364346911, "flos": 28293694329600.0, "grad_norm": 2.9941392641088695, "language_loss": 0.81630868, "learning_rate": 3.999480934200528e-06, "loss": 0.83952683, "num_input_tokens_seen": 12986705, "step": 616, "time_per_iteration": 4.1762495040893555 }, { "auxiliary_loss_clip": 0.0124999, "auxiliary_loss_mlp": 0.01076541, "balance_loss_clip": 1.06807041, "balance_loss_mlp": 1.0467627, "epoch": 0.03709604689613708, "flos": 31504215853440.0, "grad_norm": 2.320593216419041, "language_loss": 0.68178958, "learning_rate": 3.999472023754499e-06, "loss": 0.70505488, "num_input_tokens_seen": 13010560, "step": 617, "time_per_iteration": 4.224538564682007 }, { "auxiliary_loss_clip": 0.01254259, "auxiliary_loss_mlp": 0.010771, "balance_loss_clip": 1.07098567, "balance_loss_mlp": 1.04415071, "epoch": 0.03715617014880505, "flos": 19609381272960.0, "grad_norm": 2.245411088847763, "language_loss": 0.80595517, "learning_rate": 3.99946303748829e-06, "loss": 0.82926875, "num_input_tokens_seen": 13028935, "step": 618, "time_per_iteration": 4.200341463088989 }, { "auxiliary_loss_clip": 0.01257669, "auxiliary_loss_mlp": 0.01079294, "balance_loss_clip": 1.06808555, "balance_loss_mlp": 1.04605901, "epoch": 0.03721629340147302, "flos": 15924192497280.0, "grad_norm": 10.155035046705617, "language_loss": 0.91591841, "learning_rate": 3.999453975402242e-06, "loss": 0.93928802, "num_input_tokens_seen": 13046000, "step": 619, "time_per_iteration": 2.5787301063537598 }, { "auxiliary_loss_clip": 0.01251145, "auxiliary_loss_mlp": 0.01083548, "balance_loss_clip": 1.06999123, "balance_loss_mlp": 1.05181432, "epoch": 0.03727641665414099, "flos": 21104090951040.0, "grad_norm": 2.0803022158745406, "language_loss": 0.94071603, "learning_rate": 3.9994448374967e-06, "loss": 0.96406299, "num_input_tokens_seen": 13062995, "step": 620, "time_per_iteration": 2.5987205505371094 }, { "auxiliary_loss_clip": 0.01249568, "auxiliary_loss_mlp": 0.0108317, "balance_loss_clip": 1.06624317, "balance_loss_mlp": 1.0502919, "epoch": 0.037336539906808956, "flos": 24131683486080.0, "grad_norm": 1.7431896174296577, "language_loss": 0.77319217, "learning_rate": 3.999435623772008e-06, "loss": 0.79651952, "num_input_tokens_seen": 13084120, "step": 621, "time_per_iteration": 2.68758225440979 }, { "auxiliary_loss_clip": 0.01247252, "auxiliary_loss_mlp": 0.01071013, "balance_loss_clip": 1.06894088, "balance_loss_mlp": 1.03792048, "epoch": 0.03739666315947693, "flos": 22346384780160.0, "grad_norm": 2.3852872810563364, "language_loss": 0.86546707, "learning_rate": 3.999426334228518e-06, "loss": 0.88864976, "num_input_tokens_seen": 13100035, "step": 622, "time_per_iteration": 2.607121467590332 }, { "auxiliary_loss_clip": 0.012499, "auxiliary_loss_mlp": 0.01072461, "balance_loss_clip": 1.06715882, "balance_loss_mlp": 1.04048872, "epoch": 0.0374567864121449, "flos": 20449511452800.0, "grad_norm": 2.2621736327299766, "language_loss": 0.90008956, "learning_rate": 3.999416968866581e-06, "loss": 0.92331314, "num_input_tokens_seen": 13118070, "step": 623, "time_per_iteration": 2.6513512134552 }, { "auxiliary_loss_clip": 0.01251762, "auxiliary_loss_mlp": 0.01090534, "balance_loss_clip": 1.07006013, "balance_loss_mlp": 1.05844235, "epoch": 0.037516909664812866, "flos": 19208043636480.0, "grad_norm": 2.760597076727266, "language_loss": 0.84095174, "learning_rate": 3.999407527686551e-06, "loss": 0.8643747, "num_input_tokens_seen": 13136355, "step": 624, "time_per_iteration": 2.66623592376709 }, { "auxiliary_loss_clip": 0.01252431, "auxiliary_loss_mlp": 0.01076353, "balance_loss_clip": 1.06697702, "balance_loss_mlp": 1.04423809, "epoch": 0.03757703291748084, "flos": 35005218664320.0, "grad_norm": 4.259276014089895, "language_loss": 0.66778994, "learning_rate": 3.999398010688788e-06, "loss": 0.69107783, "num_input_tokens_seen": 13155435, "step": 625, "time_per_iteration": 2.7288877964019775 }, { "auxiliary_loss_clip": 0.01244959, "auxiliary_loss_mlp": 0.01076274, "balance_loss_clip": 1.06605244, "balance_loss_mlp": 1.042943, "epoch": 0.0376371561701488, "flos": 25483899911040.0, "grad_norm": 3.375450269409945, "language_loss": 0.77496696, "learning_rate": 3.999388417873652e-06, "loss": 0.79817927, "num_input_tokens_seen": 13174295, "step": 626, "time_per_iteration": 2.648942470550537 }, { "auxiliary_loss_clip": 0.01249107, "auxiliary_loss_mlp": 0.0108376, "balance_loss_clip": 1.06770003, "balance_loss_mlp": 1.05200303, "epoch": 0.037697279422816775, "flos": 18185630912640.0, "grad_norm": 2.0480468386724766, "language_loss": 0.81463408, "learning_rate": 3.999378749241506e-06, "loss": 0.83796275, "num_input_tokens_seen": 13192500, "step": 627, "time_per_iteration": 2.6209845542907715 }, { "auxiliary_loss_clip": 0.01254363, "auxiliary_loss_mlp": 0.01084942, "balance_loss_clip": 1.07041132, "balance_loss_mlp": 1.05215955, "epoch": 0.03775740267548475, "flos": 24644272521600.0, "grad_norm": 1.6934072791943036, "language_loss": 0.88809037, "learning_rate": 3.999369004792719e-06, "loss": 0.91148341, "num_input_tokens_seen": 13213470, "step": 628, "time_per_iteration": 2.7221415042877197 }, { "auxiliary_loss_clip": 0.01247303, "auxiliary_loss_mlp": 0.01080197, "balance_loss_clip": 1.0627017, "balance_loss_mlp": 1.04765344, "epoch": 0.03781752592815271, "flos": 21288205088640.0, "grad_norm": 2.536151380104699, "language_loss": 0.79840028, "learning_rate": 3.999359184527658e-06, "loss": 0.82167524, "num_input_tokens_seen": 13232365, "step": 629, "time_per_iteration": 2.6535024642944336 }, { "auxiliary_loss_clip": 0.01249218, "auxiliary_loss_mlp": 0.0106958, "balance_loss_clip": 1.06675959, "balance_loss_mlp": 1.03885961, "epoch": 0.037877649180820684, "flos": 22089623385600.0, "grad_norm": 1.6861994278356789, "language_loss": 0.76824844, "learning_rate": 3.999349288446696e-06, "loss": 0.79143643, "num_input_tokens_seen": 13251920, "step": 630, "time_per_iteration": 2.6175966262817383 }, { "auxiliary_loss_clip": 0.01254291, "auxiliary_loss_mlp": 0.01075963, "balance_loss_clip": 1.06833327, "balance_loss_mlp": 1.04504025, "epoch": 0.03793777243348865, "flos": 14501339976960.0, "grad_norm": 3.12435515576561, "language_loss": 0.91593724, "learning_rate": 3.99933931655021e-06, "loss": 0.93923974, "num_input_tokens_seen": 13267440, "step": 631, "time_per_iteration": 2.565293788909912 }, { "auxiliary_loss_clip": 0.01243525, "auxiliary_loss_mlp": 0.01087901, "balance_loss_clip": 1.06386209, "balance_loss_mlp": 1.05356884, "epoch": 0.03799789568615662, "flos": 21908418249600.0, "grad_norm": 1.6822536287963328, "language_loss": 0.92157543, "learning_rate": 3.999329268838575e-06, "loss": 0.94488978, "num_input_tokens_seen": 13287850, "step": 632, "time_per_iteration": 2.6235203742980957 }, { "auxiliary_loss_clip": 0.01248362, "auxiliary_loss_mlp": 0.01067296, "balance_loss_clip": 1.06696796, "balance_loss_mlp": 1.03613472, "epoch": 0.03805801893882459, "flos": 24827021942400.0, "grad_norm": 2.1097171792430456, "language_loss": 0.83139223, "learning_rate": 3.999319145312175e-06, "loss": 0.85454881, "num_input_tokens_seen": 13307760, "step": 633, "time_per_iteration": 2.6461985111236572 }, { "auxiliary_loss_clip": 0.01247735, "auxiliary_loss_mlp": 0.01079895, "balance_loss_clip": 1.06473529, "balance_loss_mlp": 1.04811358, "epoch": 0.03811814219149256, "flos": 30482952364800.0, "grad_norm": 1.599115294194595, "language_loss": 0.69883299, "learning_rate": 3.999308945971392e-06, "loss": 0.72210932, "num_input_tokens_seen": 13331230, "step": 634, "time_per_iteration": 2.709033727645874 }, { "auxiliary_loss_clip": 0.01133204, "auxiliary_loss_mlp": 0.01009504, "balance_loss_clip": 1.04124916, "balance_loss_mlp": 1.00249422, "epoch": 0.03817826544416053, "flos": 66992577379200.0, "grad_norm": 0.893126545279708, "language_loss": 0.61645919, "learning_rate": 3.999298670816614e-06, "loss": 0.63788629, "num_input_tokens_seen": 13394760, "step": 635, "time_per_iteration": 3.2099475860595703 }, { "auxiliary_loss_clip": 0.01244276, "auxiliary_loss_mlp": 0.01072984, "balance_loss_clip": 1.06475401, "balance_loss_mlp": 1.04129851, "epoch": 0.038238388696828496, "flos": 20485350247680.0, "grad_norm": 2.0563589539657205, "language_loss": 0.83629507, "learning_rate": 3.9992883198482294e-06, "loss": 0.85946769, "num_input_tokens_seen": 13412775, "step": 636, "time_per_iteration": 2.6278960704803467 }, { "auxiliary_loss_clip": 0.01248078, "auxiliary_loss_mlp": 0.01096471, "balance_loss_clip": 1.06714165, "balance_loss_mlp": 1.06530952, "epoch": 0.03829851194949647, "flos": 17965893461760.0, "grad_norm": 2.346379148367956, "language_loss": 0.79578567, "learning_rate": 3.999277893066632e-06, "loss": 0.81923115, "num_input_tokens_seen": 13427835, "step": 637, "time_per_iteration": 2.646414279937744 }, { "auxiliary_loss_clip": 0.01247939, "auxiliary_loss_mlp": 0.01088528, "balance_loss_clip": 1.06356907, "balance_loss_mlp": 1.0562222, "epoch": 0.03835863520216444, "flos": 22456522857600.0, "grad_norm": 1.9563283234999833, "language_loss": 0.83989692, "learning_rate": 3.999267390472215e-06, "loss": 0.86326158, "num_input_tokens_seen": 13447295, "step": 638, "time_per_iteration": 2.6416285037994385 }, { "auxiliary_loss_clip": 0.01253172, "auxiliary_loss_mlp": 0.01074704, "balance_loss_clip": 1.06563985, "balance_loss_mlp": 1.04163575, "epoch": 0.038418758454832405, "flos": 22164425458560.0, "grad_norm": 2.5596504471077224, "language_loss": 0.70109725, "learning_rate": 3.999256812065381e-06, "loss": 0.72437602, "num_input_tokens_seen": 13468455, "step": 639, "time_per_iteration": 2.610682487487793 }, { "auxiliary_loss_clip": 0.01248829, "auxiliary_loss_mlp": 0.01081808, "balance_loss_clip": 1.06618333, "balance_loss_mlp": 1.04790449, "epoch": 0.03847888170750038, "flos": 22747435107840.0, "grad_norm": 2.5791624605537082, "language_loss": 0.85322344, "learning_rate": 3.999246157846526e-06, "loss": 0.87652987, "num_input_tokens_seen": 13489085, "step": 640, "time_per_iteration": 2.700456380844116 }, { "auxiliary_loss_clip": 0.01252579, "auxiliary_loss_mlp": 0.01083722, "balance_loss_clip": 1.06751871, "balance_loss_mlp": 1.04934239, "epoch": 0.03853900496016834, "flos": 22711201263360.0, "grad_norm": 2.331268680461456, "language_loss": 0.82141805, "learning_rate": 3.9992354278160574e-06, "loss": 0.84478104, "num_input_tokens_seen": 13509120, "step": 641, "time_per_iteration": 2.6572046279907227 }, { "auxiliary_loss_clip": 0.0112759, "auxiliary_loss_mlp": 0.01008008, "balance_loss_clip": 1.03825259, "balance_loss_mlp": 1.00095105, "epoch": 0.038599128212836314, "flos": 70399136355840.0, "grad_norm": 0.9037629700551453, "language_loss": 0.65444964, "learning_rate": 3.999224621974381e-06, "loss": 0.67580563, "num_input_tokens_seen": 13562005, "step": 642, "time_per_iteration": 3.199925422668457 }, { "auxiliary_loss_clip": 0.01246698, "auxiliary_loss_mlp": 0.01064563, "balance_loss_clip": 1.0651319, "balance_loss_mlp": 1.03453398, "epoch": 0.03865925146550429, "flos": 23295144666240.0, "grad_norm": 1.9113268312481755, "language_loss": 0.79272145, "learning_rate": 3.999213740321906e-06, "loss": 0.81583405, "num_input_tokens_seen": 13582185, "step": 643, "time_per_iteration": 2.641437292098999 }, { "auxiliary_loss_clip": 0.01244786, "auxiliary_loss_mlp": 0.01076057, "balance_loss_clip": 1.06219232, "balance_loss_mlp": 1.04599261, "epoch": 0.03871937471817225, "flos": 21430446946560.0, "grad_norm": 2.2104774200729262, "language_loss": 0.8294487, "learning_rate": 3.999202782859046e-06, "loss": 0.85265714, "num_input_tokens_seen": 13599555, "step": 644, "time_per_iteration": 2.600558280944824 }, { "auxiliary_loss_clip": 0.01247273, "auxiliary_loss_mlp": 0.01074554, "balance_loss_clip": 1.06383467, "balance_loss_mlp": 1.04193854, "epoch": 0.038779497970840224, "flos": 34277309550720.0, "grad_norm": 1.994902925690418, "language_loss": 0.82286513, "learning_rate": 3.9991917495862165e-06, "loss": 0.8460834, "num_input_tokens_seen": 13621160, "step": 645, "time_per_iteration": 2.6751983165740967 }, { "auxiliary_loss_clip": 0.01248631, "auxiliary_loss_mlp": 0.01070807, "balance_loss_clip": 1.06525111, "balance_loss_mlp": 1.03890657, "epoch": 0.03883962122350819, "flos": 22748189293440.0, "grad_norm": 2.290384247239265, "language_loss": 0.81889713, "learning_rate": 3.9991806405038345e-06, "loss": 0.84209144, "num_input_tokens_seen": 13641915, "step": 646, "time_per_iteration": 2.6987667083740234 }, { "auxiliary_loss_clip": 0.01250204, "auxiliary_loss_mlp": 0.01078836, "balance_loss_clip": 1.06982899, "balance_loss_mlp": 1.04791331, "epoch": 0.03889974447617616, "flos": 21945837242880.0, "grad_norm": 1.9171219640425325, "language_loss": 0.82015383, "learning_rate": 3.999169455612323e-06, "loss": 0.84344423, "num_input_tokens_seen": 13661410, "step": 647, "time_per_iteration": 2.590102195739746 }, { "auxiliary_loss_clip": 0.0124696, "auxiliary_loss_mlp": 0.01072111, "balance_loss_clip": 1.06628954, "balance_loss_mlp": 1.04216528, "epoch": 0.03895986772884413, "flos": 31504826384640.0, "grad_norm": 1.9398424653049293, "language_loss": 0.84477997, "learning_rate": 3.999158194912106e-06, "loss": 0.86797059, "num_input_tokens_seen": 13681705, "step": 648, "time_per_iteration": 2.7516121864318848 }, { "auxiliary_loss_clip": 0.01244808, "auxiliary_loss_mlp": 0.0107293, "balance_loss_clip": 1.06524062, "balance_loss_mlp": 1.04210222, "epoch": 0.0390199909815121, "flos": 19901011795200.0, "grad_norm": 2.3870859420748136, "language_loss": 0.84254295, "learning_rate": 3.9991468584036086e-06, "loss": 0.86572027, "num_input_tokens_seen": 13700400, "step": 649, "time_per_iteration": 2.6116180419921875 }, { "auxiliary_loss_clip": 0.01246653, "auxiliary_loss_mlp": 0.01073574, "balance_loss_clip": 1.06560743, "balance_loss_mlp": 1.0416739, "epoch": 0.03908011423418007, "flos": 21612478095360.0, "grad_norm": 2.00775905451926, "language_loss": 0.79783499, "learning_rate": 3.999135446087263e-06, "loss": 0.82103723, "num_input_tokens_seen": 13720145, "step": 650, "time_per_iteration": 2.574939727783203 }, { "auxiliary_loss_clip": 0.01242721, "auxiliary_loss_mlp": 0.01077536, "balance_loss_clip": 1.06209707, "balance_loss_mlp": 1.04534984, "epoch": 0.039140237486848035, "flos": 18661411486080.0, "grad_norm": 2.334811800093409, "language_loss": 0.78698987, "learning_rate": 3.9991239579635e-06, "loss": 0.81019247, "num_input_tokens_seen": 13737500, "step": 651, "time_per_iteration": 2.5930917263031006 }, { "auxiliary_loss_clip": 0.0124425, "auxiliary_loss_mlp": 0.010838, "balance_loss_clip": 1.06317663, "balance_loss_mlp": 1.05087411, "epoch": 0.03920036073951601, "flos": 18661124177280.0, "grad_norm": 3.361008988618244, "language_loss": 0.87392938, "learning_rate": 3.999112394032757e-06, "loss": 0.89720988, "num_input_tokens_seen": 13754750, "step": 652, "time_per_iteration": 2.6072869300842285 }, { "auxiliary_loss_clip": 0.01239638, "auxiliary_loss_mlp": 0.01073938, "balance_loss_clip": 1.06362963, "balance_loss_mlp": 1.0434916, "epoch": 0.03926048399218398, "flos": 31354468053120.0, "grad_norm": 2.6218665998754904, "language_loss": 0.79297256, "learning_rate": 3.999100754295471e-06, "loss": 0.81610829, "num_input_tokens_seen": 13771990, "step": 653, "time_per_iteration": 2.626145362854004 }, { "auxiliary_loss_clip": 0.01250652, "auxiliary_loss_mlp": 0.01075546, "balance_loss_clip": 1.06496143, "balance_loss_mlp": 1.04374111, "epoch": 0.039320607244851945, "flos": 29603499770880.0, "grad_norm": 2.0720296605490094, "language_loss": 0.85909009, "learning_rate": 3.999089038752085e-06, "loss": 0.88235211, "num_input_tokens_seen": 13792750, "step": 654, "time_per_iteration": 2.6775124073028564 }, { "auxiliary_loss_clip": 0.01126661, "auxiliary_loss_mlp": 0.01016641, "balance_loss_clip": 1.03977203, "balance_loss_mlp": 1.01001298, "epoch": 0.03938073049751992, "flos": 66534609951360.0, "grad_norm": 0.7366259780501333, "language_loss": 0.4997997, "learning_rate": 3.999077247403041e-06, "loss": 0.52123272, "num_input_tokens_seen": 13858570, "step": 655, "time_per_iteration": 3.3006510734558105 }, { "auxiliary_loss_clip": 0.01241143, "auxiliary_loss_mlp": 0.01076374, "balance_loss_clip": 1.0658412, "balance_loss_mlp": 1.04680991, "epoch": 0.03944085375018788, "flos": 23367827836800.0, "grad_norm": 4.17474796245144, "language_loss": 0.80903178, "learning_rate": 3.9990653802487886e-06, "loss": 0.83220696, "num_input_tokens_seen": 13876335, "step": 656, "time_per_iteration": 4.228931427001953 }, { "auxiliary_loss_clip": 0.01251519, "auxiliary_loss_mlp": 0.01093573, "balance_loss_clip": 1.06740427, "balance_loss_mlp": 1.05802524, "epoch": 0.039500977002855854, "flos": 18548292579840.0, "grad_norm": 2.068956760077258, "language_loss": 0.76289558, "learning_rate": 3.999053437289776e-06, "loss": 0.7863465, "num_input_tokens_seen": 13892640, "step": 657, "time_per_iteration": 4.218473434448242 }, { "auxiliary_loss_clip": 0.0124824, "auxiliary_loss_mlp": 0.01076812, "balance_loss_clip": 1.06641233, "balance_loss_mlp": 1.04522133, "epoch": 0.039561100255523826, "flos": 25338174433920.0, "grad_norm": 2.07475431213476, "language_loss": 0.8179062, "learning_rate": 3.999041418526457e-06, "loss": 0.84115672, "num_input_tokens_seen": 13910085, "step": 658, "time_per_iteration": 2.671675682067871 }, { "auxiliary_loss_clip": 0.01242678, "auxiliary_loss_mlp": 0.01077963, "balance_loss_clip": 1.06347871, "balance_loss_mlp": 1.0454669, "epoch": 0.03962122350819179, "flos": 18219889509120.0, "grad_norm": 2.2444983110753625, "language_loss": 0.90790772, "learning_rate": 3.999029323959287e-06, "loss": 0.93111408, "num_input_tokens_seen": 13928800, "step": 659, "time_per_iteration": 4.2601988315582275 }, { "auxiliary_loss_clip": 0.01247633, "auxiliary_loss_mlp": 0.01073069, "balance_loss_clip": 1.06654835, "balance_loss_mlp": 1.04215825, "epoch": 0.03968134676085976, "flos": 20522230536960.0, "grad_norm": 2.2083626038373656, "language_loss": 0.79760063, "learning_rate": 3.999017153588724e-06, "loss": 0.82080764, "num_input_tokens_seen": 13948325, "step": 660, "time_per_iteration": 2.62716007232666 }, { "auxiliary_loss_clip": 0.01246027, "auxiliary_loss_mlp": 0.01077579, "balance_loss_clip": 1.0675652, "balance_loss_mlp": 1.0456785, "epoch": 0.03974147001352773, "flos": 22422587483520.0, "grad_norm": 1.6747851381362888, "language_loss": 0.81757367, "learning_rate": 3.999004907415231e-06, "loss": 0.8408097, "num_input_tokens_seen": 13969090, "step": 661, "time_per_iteration": 2.645423412322998 }, { "auxiliary_loss_clip": 0.01119895, "auxiliary_loss_mlp": 0.01007167, "balance_loss_clip": 1.03320217, "balance_loss_mlp": 1.00077713, "epoch": 0.0398015932661957, "flos": 71128769322240.0, "grad_norm": 0.9117564509831767, "language_loss": 0.69349593, "learning_rate": 3.998992585439272e-06, "loss": 0.71476656, "num_input_tokens_seen": 14037555, "step": 662, "time_per_iteration": 3.3032331466674805 }, { "auxiliary_loss_clip": 0.01249217, "auxiliary_loss_mlp": 0.01074722, "balance_loss_clip": 1.06995225, "balance_loss_mlp": 1.04322648, "epoch": 0.03986171651886367, "flos": 16800951571200.0, "grad_norm": 2.160679749799672, "language_loss": 0.82765651, "learning_rate": 3.998980187661314e-06, "loss": 0.85089582, "num_input_tokens_seen": 14055765, "step": 663, "time_per_iteration": 2.6217782497406006 }, { "auxiliary_loss_clip": 0.01252759, "auxiliary_loss_mlp": 0.01063705, "balance_loss_clip": 1.06966817, "balance_loss_mlp": 1.03254378, "epoch": 0.03992183977153164, "flos": 24535068197760.0, "grad_norm": 2.19374813563436, "language_loss": 0.87302262, "learning_rate": 3.998967714081826e-06, "loss": 0.89618725, "num_input_tokens_seen": 14074195, "step": 664, "time_per_iteration": 2.6729183197021484 }, { "auxiliary_loss_clip": 0.01241647, "auxiliary_loss_mlp": 0.0106515, "balance_loss_clip": 1.06656313, "balance_loss_mlp": 1.03346384, "epoch": 0.03998196302419961, "flos": 15595897167360.0, "grad_norm": 2.036983550581997, "language_loss": 0.84821391, "learning_rate": 3.998955164701281e-06, "loss": 0.87128186, "num_input_tokens_seen": 14090215, "step": 665, "time_per_iteration": 2.593832015991211 }, { "auxiliary_loss_clip": 0.012521, "auxiliary_loss_mlp": 0.01085682, "balance_loss_clip": 1.06867695, "balance_loss_mlp": 1.05223155, "epoch": 0.04004208627686758, "flos": 25305065072640.0, "grad_norm": 2.172699570421913, "language_loss": 0.81745672, "learning_rate": 3.998942539520158e-06, "loss": 0.8408345, "num_input_tokens_seen": 14112150, "step": 666, "time_per_iteration": 2.6743290424346924 }, { "auxiliary_loss_clip": 0.01241565, "auxiliary_loss_mlp": 0.01073617, "balance_loss_clip": 1.06443083, "balance_loss_mlp": 1.04007161, "epoch": 0.04010220952953555, "flos": 23475847011840.0, "grad_norm": 2.1003520396389828, "language_loss": 0.87117827, "learning_rate": 3.998929838538932e-06, "loss": 0.89433014, "num_input_tokens_seen": 14131475, "step": 667, "time_per_iteration": 2.6147067546844482 }, { "auxiliary_loss_clip": 0.0124275, "auxiliary_loss_mlp": 0.01071583, "balance_loss_clip": 1.07009172, "balance_loss_mlp": 1.04161382, "epoch": 0.04016233278220352, "flos": 18617025254400.0, "grad_norm": 2.331266403294307, "language_loss": 0.80641299, "learning_rate": 3.998917061758087e-06, "loss": 0.82955635, "num_input_tokens_seen": 14146165, "step": 668, "time_per_iteration": 2.6015820503234863 }, { "auxiliary_loss_clip": 0.01115034, "auxiliary_loss_mlp": 0.01008949, "balance_loss_clip": 1.02975297, "balance_loss_mlp": 1.00317907, "epoch": 0.040222456034871484, "flos": 70906194696960.0, "grad_norm": 0.7870483750596657, "language_loss": 0.60066259, "learning_rate": 3.998904209178107e-06, "loss": 0.62190247, "num_input_tokens_seen": 14215005, "step": 669, "time_per_iteration": 3.2993202209472656 }, { "auxiliary_loss_clip": 0.01242272, "auxiliary_loss_mlp": 0.01071485, "balance_loss_clip": 1.06408751, "balance_loss_mlp": 1.04120564, "epoch": 0.040282579287539456, "flos": 23764712186880.0, "grad_norm": 1.7022357666604506, "language_loss": 0.86290276, "learning_rate": 3.9988912807994785e-06, "loss": 0.88604033, "num_input_tokens_seen": 14235510, "step": 670, "time_per_iteration": 2.700657844543457 }, { "auxiliary_loss_clip": 0.01242087, "auxiliary_loss_mlp": 0.01080448, "balance_loss_clip": 1.06647801, "balance_loss_mlp": 1.05014467, "epoch": 0.04034270254020743, "flos": 18478518410880.0, "grad_norm": 1.8224152334464152, "language_loss": 0.75569212, "learning_rate": 3.998878276622692e-06, "loss": 0.77891749, "num_input_tokens_seen": 14254565, "step": 671, "time_per_iteration": 2.6698572635650635 }, { "auxiliary_loss_clip": 0.01248936, "auxiliary_loss_mlp": 0.01076667, "balance_loss_clip": 1.06943047, "balance_loss_mlp": 1.04605412, "epoch": 0.040402825792875394, "flos": 17201858244480.0, "grad_norm": 1.9730812981627939, "language_loss": 0.92416775, "learning_rate": 3.998865196648242e-06, "loss": 0.94742376, "num_input_tokens_seen": 14271885, "step": 672, "time_per_iteration": 2.567563533782959 }, { "auxiliary_loss_clip": 0.01245231, "auxiliary_loss_mlp": 0.010776, "balance_loss_clip": 1.0677104, "balance_loss_mlp": 1.04422188, "epoch": 0.040462949045543366, "flos": 19172168928000.0, "grad_norm": 1.800141829654062, "language_loss": 0.90174723, "learning_rate": 3.998852040876622e-06, "loss": 0.92497551, "num_input_tokens_seen": 14289670, "step": 673, "time_per_iteration": 2.547154426574707 }, { "auxiliary_loss_clip": 0.01239752, "auxiliary_loss_mlp": 0.01084248, "balance_loss_clip": 1.06466973, "balance_loss_mlp": 1.05184698, "epoch": 0.04052307229821133, "flos": 24019821555840.0, "grad_norm": 2.3989934860433486, "language_loss": 0.75016737, "learning_rate": 3.998838809308334e-06, "loss": 0.7734074, "num_input_tokens_seen": 14309285, "step": 674, "time_per_iteration": 2.681896924972534 }, { "auxiliary_loss_clip": 0.01249861, "auxiliary_loss_mlp": 0.01064308, "balance_loss_clip": 1.06744063, "balance_loss_mlp": 1.03334963, "epoch": 0.0405831955508793, "flos": 16436601964800.0, "grad_norm": 2.55613513039197, "language_loss": 0.78289407, "learning_rate": 3.9988255019438766e-06, "loss": 0.80603576, "num_input_tokens_seen": 14328300, "step": 675, "time_per_iteration": 2.6965043544769287 }, { "auxiliary_loss_clip": 0.01241749, "auxiliary_loss_mlp": 0.01079652, "balance_loss_clip": 1.06532836, "balance_loss_mlp": 1.04648817, "epoch": 0.040643318803547275, "flos": 24279922915200.0, "grad_norm": 2.047384767684118, "language_loss": 0.76844448, "learning_rate": 3.998812118783757e-06, "loss": 0.79165846, "num_input_tokens_seen": 14346395, "step": 676, "time_per_iteration": 2.6216623783111572 }, { "auxiliary_loss_clip": 0.01248147, "auxiliary_loss_mlp": 0.01079294, "balance_loss_clip": 1.06811619, "balance_loss_mlp": 1.04813254, "epoch": 0.04070344205621524, "flos": 17712076982400.0, "grad_norm": 2.318905665785744, "language_loss": 0.85139382, "learning_rate": 3.9987986598284804e-06, "loss": 0.8746683, "num_input_tokens_seen": 14364605, "step": 677, "time_per_iteration": 2.5663015842437744 }, { "auxiliary_loss_clip": 0.01240385, "auxiliary_loss_mlp": 0.01070741, "balance_loss_clip": 1.06558609, "balance_loss_mlp": 1.03901923, "epoch": 0.04076356530888321, "flos": 26177658168960.0, "grad_norm": 2.5041724349122645, "language_loss": 0.76572061, "learning_rate": 3.998785125078559e-06, "loss": 0.78883183, "num_input_tokens_seen": 14385265, "step": 678, "time_per_iteration": 2.624689817428589 }, { "auxiliary_loss_clip": 0.01240972, "auxiliary_loss_mlp": 0.01072606, "balance_loss_clip": 1.06374967, "balance_loss_mlp": 1.04242194, "epoch": 0.04082368856155118, "flos": 35773455772800.0, "grad_norm": 1.7096242150987748, "language_loss": 0.82139099, "learning_rate": 3.998771514534505e-06, "loss": 0.84452677, "num_input_tokens_seen": 14406090, "step": 679, "time_per_iteration": 2.7073023319244385 }, { "auxiliary_loss_clip": 0.01248879, "auxiliary_loss_mlp": 0.01064116, "balance_loss_clip": 1.07185793, "balance_loss_mlp": 1.0340035, "epoch": 0.04088381181421915, "flos": 28146640049280.0, "grad_norm": 1.963288262989073, "language_loss": 0.76260424, "learning_rate": 3.998757828196835e-06, "loss": 0.78573418, "num_input_tokens_seen": 14425130, "step": 680, "time_per_iteration": 2.6767218112945557 }, { "auxiliary_loss_clip": 0.01244441, "auxiliary_loss_mlp": 0.01071738, "balance_loss_clip": 1.06458521, "balance_loss_mlp": 1.03864551, "epoch": 0.04094393506688712, "flos": 27597673514880.0, "grad_norm": 1.713943858995997, "language_loss": 0.83089912, "learning_rate": 3.9987440660660685e-06, "loss": 0.85406095, "num_input_tokens_seen": 14447355, "step": 681, "time_per_iteration": 2.6386382579803467 }, { "auxiliary_loss_clip": 0.01244279, "auxiliary_loss_mlp": 0.01073303, "balance_loss_clip": 1.06438065, "balance_loss_mlp": 1.04127121, "epoch": 0.04100405831955509, "flos": 23112036109440.0, "grad_norm": 1.706698119772261, "language_loss": 0.71538687, "learning_rate": 3.998730228142726e-06, "loss": 0.7385627, "num_input_tokens_seen": 14466790, "step": 682, "time_per_iteration": 2.618792772293091 }, { "auxiliary_loss_clip": 0.01243156, "auxiliary_loss_mlp": 0.01078429, "balance_loss_clip": 1.06440282, "balance_loss_mlp": 1.04781592, "epoch": 0.04106418157222306, "flos": 20156731695360.0, "grad_norm": 1.6947476714586034, "language_loss": 0.72599399, "learning_rate": 3.998716314427333e-06, "loss": 0.74920982, "num_input_tokens_seen": 14485195, "step": 683, "time_per_iteration": 2.676133394241333 }, { "auxiliary_loss_clip": 0.01241071, "auxiliary_loss_mlp": 0.01079531, "balance_loss_clip": 1.07077932, "balance_loss_mlp": 1.04851258, "epoch": 0.041124304824891024, "flos": 17420697855360.0, "grad_norm": 2.098652785935233, "language_loss": 0.81419414, "learning_rate": 3.998702324920417e-06, "loss": 0.8374002, "num_input_tokens_seen": 14503370, "step": 684, "time_per_iteration": 2.6538476943969727 }, { "auxiliary_loss_clip": 0.01242791, "auxiliary_loss_mlp": 0.0107365, "balance_loss_clip": 1.06783867, "balance_loss_mlp": 1.04139185, "epoch": 0.041184428077558996, "flos": 25780163287680.0, "grad_norm": 1.5053911947555274, "language_loss": 0.90680599, "learning_rate": 3.9986882596225085e-06, "loss": 0.92997038, "num_input_tokens_seen": 14526415, "step": 685, "time_per_iteration": 2.6541450023651123 }, { "auxiliary_loss_clip": 0.01244219, "auxiliary_loss_mlp": 0.01072481, "balance_loss_clip": 1.06659365, "balance_loss_mlp": 1.04093838, "epoch": 0.04124455133022697, "flos": 22964766347520.0, "grad_norm": 2.2251875217653185, "language_loss": 0.87851977, "learning_rate": 3.998674118534141e-06, "loss": 0.90168673, "num_input_tokens_seen": 14546595, "step": 686, "time_per_iteration": 2.7298531532287598 }, { "auxiliary_loss_clip": 0.01247476, "auxiliary_loss_mlp": 0.01073385, "balance_loss_clip": 1.06586432, "balance_loss_mlp": 1.04224789, "epoch": 0.04130467458289493, "flos": 21289067015040.0, "grad_norm": 1.8582614005091855, "language_loss": 0.7152915, "learning_rate": 3.998659901655851e-06, "loss": 0.73850012, "num_input_tokens_seen": 14566590, "step": 687, "time_per_iteration": 2.6284232139587402 }, { "auxiliary_loss_clip": 0.01243582, "auxiliary_loss_mlp": 0.01076448, "balance_loss_clip": 1.06979251, "balance_loss_mlp": 1.04756403, "epoch": 0.041364797835562905, "flos": 19974233669760.0, "grad_norm": 2.596672934278983, "language_loss": 0.86028284, "learning_rate": 3.998645608988177e-06, "loss": 0.88348317, "num_input_tokens_seen": 14585965, "step": 688, "time_per_iteration": 2.522634506225586 }, { "auxiliary_loss_clip": 0.01241593, "auxiliary_loss_mlp": 0.01079647, "balance_loss_clip": 1.06802177, "balance_loss_mlp": 1.04908216, "epoch": 0.04142492108823087, "flos": 21906227520000.0, "grad_norm": 2.852238187591699, "language_loss": 0.83393514, "learning_rate": 3.998631240531661e-06, "loss": 0.85714757, "num_input_tokens_seen": 14606015, "step": 689, "time_per_iteration": 2.6140944957733154 }, { "auxiliary_loss_clip": 0.01238254, "auxiliary_loss_mlp": 0.01085009, "balance_loss_clip": 1.06293654, "balance_loss_mlp": 1.05463421, "epoch": 0.04148504434089884, "flos": 27639617621760.0, "grad_norm": 2.870474577544969, "language_loss": 0.68398476, "learning_rate": 3.998616796286848e-06, "loss": 0.70721734, "num_input_tokens_seen": 14629955, "step": 690, "time_per_iteration": 2.658987522125244 }, { "auxiliary_loss_clip": 0.01235903, "auxiliary_loss_mlp": 0.01075275, "balance_loss_clip": 1.0625304, "balance_loss_mlp": 1.04565191, "epoch": 0.041545167593566815, "flos": 20518387781760.0, "grad_norm": 1.634289561889102, "language_loss": 0.74927461, "learning_rate": 3.998602276254286e-06, "loss": 0.77238643, "num_input_tokens_seen": 14648000, "step": 691, "time_per_iteration": 2.599957227706909 }, { "auxiliary_loss_clip": 0.01239089, "auxiliary_loss_mlp": 0.01081705, "balance_loss_clip": 1.06458938, "balance_loss_mlp": 1.04978108, "epoch": 0.04160529084623478, "flos": 11868907939200.0, "grad_norm": 2.123432521314224, "language_loss": 0.84469771, "learning_rate": 3.998587680434526e-06, "loss": 0.86790562, "num_input_tokens_seen": 14662235, "step": 692, "time_per_iteration": 2.5748491287231445 }, { "auxiliary_loss_clip": 0.01242126, "auxiliary_loss_mlp": 0.01076613, "balance_loss_clip": 1.06274796, "balance_loss_mlp": 1.04409313, "epoch": 0.04166541409890275, "flos": 14828306503680.0, "grad_norm": 2.3463094595874665, "language_loss": 0.88948715, "learning_rate": 3.99857300882812e-06, "loss": 0.91267455, "num_input_tokens_seen": 14676065, "step": 693, "time_per_iteration": 2.569277286529541 }, { "auxiliary_loss_clip": 0.01245438, "auxiliary_loss_mlp": 0.01071471, "balance_loss_clip": 1.06845784, "balance_loss_mlp": 1.04123962, "epoch": 0.04172553735157072, "flos": 25808137004160.0, "grad_norm": 5.499777597079252, "language_loss": 0.81987685, "learning_rate": 3.998558261435626e-06, "loss": 0.84304595, "num_input_tokens_seen": 14694955, "step": 694, "time_per_iteration": 2.6798722743988037 }, { "auxiliary_loss_clip": 0.01242101, "auxiliary_loss_mlp": 0.01073692, "balance_loss_clip": 1.06179321, "balance_loss_mlp": 1.04303181, "epoch": 0.04178566060423869, "flos": 24279815174400.0, "grad_norm": 2.051302362473346, "language_loss": 0.83672506, "learning_rate": 3.9985434382576015e-06, "loss": 0.85988301, "num_input_tokens_seen": 14715510, "step": 695, "time_per_iteration": 2.684537649154663 }, { "auxiliary_loss_clip": 0.01242205, "auxiliary_loss_mlp": 0.01080004, "balance_loss_clip": 1.06535804, "balance_loss_mlp": 1.04822254, "epoch": 0.04184578385690666, "flos": 18222008411520.0, "grad_norm": 2.113561459264794, "language_loss": 0.84351176, "learning_rate": 3.99852853929461e-06, "loss": 0.86673379, "num_input_tokens_seen": 14731755, "step": 696, "time_per_iteration": 4.1141321659088135 }, { "auxiliary_loss_clip": 0.01238462, "auxiliary_loss_mlp": 0.01083207, "balance_loss_clip": 1.06265593, "balance_loss_mlp": 1.05099702, "epoch": 0.041905907109574626, "flos": 22776342577920.0, "grad_norm": 6.921460264787684, "language_loss": 0.93193012, "learning_rate": 3.998513564547216e-06, "loss": 0.95514685, "num_input_tokens_seen": 14750810, "step": 697, "time_per_iteration": 5.71666693687439 }, { "auxiliary_loss_clip": 0.01235964, "auxiliary_loss_mlp": 0.01074448, "balance_loss_clip": 1.06324339, "balance_loss_mlp": 1.04495573, "epoch": 0.0419660303622426, "flos": 20156947176960.0, "grad_norm": 2.1002029886241904, "language_loss": 0.83775562, "learning_rate": 3.998498514015987e-06, "loss": 0.86085975, "num_input_tokens_seen": 14768435, "step": 698, "time_per_iteration": 4.194530010223389 }, { "auxiliary_loss_clip": 0.01239177, "auxiliary_loss_mlp": 0.01093516, "balance_loss_clip": 1.06274605, "balance_loss_mlp": 1.06175828, "epoch": 0.042026153614910564, "flos": 23076376882560.0, "grad_norm": 2.1234669437327955, "language_loss": 0.91715962, "learning_rate": 3.998483387701495e-06, "loss": 0.94048655, "num_input_tokens_seen": 14786690, "step": 699, "time_per_iteration": 2.6399078369140625 }, { "auxiliary_loss_clip": 0.01113327, "auxiliary_loss_mlp": 0.0102038, "balance_loss_clip": 1.03020263, "balance_loss_mlp": 1.01403797, "epoch": 0.042086276867578536, "flos": 64495243370880.0, "grad_norm": 0.9035134571641164, "language_loss": 0.67873394, "learning_rate": 3.998468185604312e-06, "loss": 0.70007098, "num_input_tokens_seen": 14853840, "step": 700, "time_per_iteration": 3.192026376724243 }, { "auxiliary_loss_clip": 0.01246765, "auxiliary_loss_mlp": 0.01082955, "balance_loss_clip": 1.06717515, "balance_loss_mlp": 1.05017269, "epoch": 0.04214640012024651, "flos": 15487016065920.0, "grad_norm": 2.2754848646841888, "language_loss": 0.884673, "learning_rate": 3.998452907725016e-06, "loss": 0.90797025, "num_input_tokens_seen": 14869580, "step": 701, "time_per_iteration": 2.5790441036224365 }, { "auxiliary_loss_clip": 0.01242428, "auxiliary_loss_mlp": 0.01080259, "balance_loss_clip": 1.06793952, "balance_loss_mlp": 1.04833448, "epoch": 0.04220652337291447, "flos": 23877040993920.0, "grad_norm": 2.000128536077818, "language_loss": 0.67100394, "learning_rate": 3.998437554064184e-06, "loss": 0.69423079, "num_input_tokens_seen": 14891065, "step": 702, "time_per_iteration": 2.6247870922088623 }, { "auxiliary_loss_clip": 0.01107168, "auxiliary_loss_mlp": 0.01005563, "balance_loss_clip": 1.02512407, "balance_loss_mlp": 0.99922067, "epoch": 0.042266646625582445, "flos": 63795451628160.0, "grad_norm": 0.8439205282656718, "language_loss": 0.60756463, "learning_rate": 3.9984221246224006e-06, "loss": 0.62869191, "num_input_tokens_seen": 14954815, "step": 703, "time_per_iteration": 3.1991655826568604 }, { "auxiliary_loss_clip": 0.01107933, "auxiliary_loss_mlp": 0.01006502, "balance_loss_clip": 1.02562141, "balance_loss_mlp": 0.99973089, "epoch": 0.04232676987825041, "flos": 50018863345920.0, "grad_norm": 1.0471369072250156, "language_loss": 0.57677412, "learning_rate": 3.9984066194002494e-06, "loss": 0.59791845, "num_input_tokens_seen": 15003050, "step": 704, "time_per_iteration": 3.037705659866333 }, { "auxiliary_loss_clip": 0.01241513, "auxiliary_loss_mlp": 0.01072126, "balance_loss_clip": 1.06549489, "balance_loss_mlp": 1.0406549, "epoch": 0.04238689313091838, "flos": 21616105368960.0, "grad_norm": 2.9488804643242488, "language_loss": 0.87553984, "learning_rate": 3.998391038398319e-06, "loss": 0.89867628, "num_input_tokens_seen": 15021990, "step": 705, "time_per_iteration": 2.6233222484588623 }, { "auxiliary_loss_clip": 0.01230342, "auxiliary_loss_mlp": 0.0107194, "balance_loss_clip": 1.0605582, "balance_loss_mlp": 1.04204249, "epoch": 0.042447016383586354, "flos": 19135109070720.0, "grad_norm": 2.556815837902013, "language_loss": 0.71071029, "learning_rate": 3.998375381617201e-06, "loss": 0.73373306, "num_input_tokens_seen": 15040700, "step": 706, "time_per_iteration": 2.560434579849243 }, { "auxiliary_loss_clip": 0.0123412, "auxiliary_loss_mlp": 0.01070349, "balance_loss_clip": 1.06249404, "balance_loss_mlp": 1.03799582, "epoch": 0.04250713963625432, "flos": 24426007528320.0, "grad_norm": 2.0814078632624167, "language_loss": 0.93418455, "learning_rate": 3.9983596490574875e-06, "loss": 0.95722926, "num_input_tokens_seen": 15056725, "step": 707, "time_per_iteration": 2.6130473613739014 }, { "auxiliary_loss_clip": 0.01237541, "auxiliary_loss_mlp": 0.01067908, "balance_loss_clip": 1.05994225, "balance_loss_mlp": 1.03617477, "epoch": 0.04256726288892229, "flos": 30367391333760.0, "grad_norm": 2.424205580643553, "language_loss": 0.81514043, "learning_rate": 3.998343840719776e-06, "loss": 0.83819497, "num_input_tokens_seen": 15077550, "step": 708, "time_per_iteration": 2.656277894973755 }, { "auxiliary_loss_clip": 0.01243932, "auxiliary_loss_mlp": 0.0108167, "balance_loss_clip": 1.06461239, "balance_loss_mlp": 1.04934049, "epoch": 0.04262738614159026, "flos": 16362661818240.0, "grad_norm": 2.0883592727868145, "language_loss": 0.82027614, "learning_rate": 3.998327956604666e-06, "loss": 0.8435322, "num_input_tokens_seen": 15094955, "step": 709, "time_per_iteration": 2.5758891105651855 }, { "auxiliary_loss_clip": 0.01243538, "auxiliary_loss_mlp": 0.01071217, "balance_loss_clip": 1.06374872, "balance_loss_mlp": 1.03960264, "epoch": 0.04268750939425823, "flos": 20412379768320.0, "grad_norm": 2.7686525844665133, "language_loss": 0.8502059, "learning_rate": 3.99831199671276e-06, "loss": 0.87335348, "num_input_tokens_seen": 15113395, "step": 710, "time_per_iteration": 2.571559429168701 }, { "auxiliary_loss_clip": 0.0124498, "auxiliary_loss_mlp": 0.01072229, "balance_loss_clip": 1.06788397, "balance_loss_mlp": 1.04166365, "epoch": 0.0427476326469262, "flos": 20302959962880.0, "grad_norm": 7.911177124524585, "language_loss": 0.84914303, "learning_rate": 3.998295961044662e-06, "loss": 0.87231517, "num_input_tokens_seen": 15132920, "step": 711, "time_per_iteration": 2.569959878921509 }, { "auxiliary_loss_clip": 0.01237769, "auxiliary_loss_mlp": 0.01074338, "balance_loss_clip": 1.06188083, "balance_loss_mlp": 1.04229426, "epoch": 0.042807755899594166, "flos": 21650794928640.0, "grad_norm": 1.7189790042473796, "language_loss": 0.85439789, "learning_rate": 3.9982798496009804e-06, "loss": 0.87751901, "num_input_tokens_seen": 15153115, "step": 712, "time_per_iteration": 2.6200509071350098 }, { "auxiliary_loss_clip": 0.01242397, "auxiliary_loss_mlp": 0.01069523, "balance_loss_clip": 1.06085837, "balance_loss_mlp": 1.03989983, "epoch": 0.04286787915226214, "flos": 21435007973760.0, "grad_norm": 5.490507523621204, "language_loss": 0.91178697, "learning_rate": 3.998263662382328e-06, "loss": 0.93490618, "num_input_tokens_seen": 15172770, "step": 713, "time_per_iteration": 2.6353416442871094 }, { "auxiliary_loss_clip": 0.01104693, "auxiliary_loss_mlp": 0.01006514, "balance_loss_clip": 1.02325606, "balance_loss_mlp": 0.99955195, "epoch": 0.04292800240493011, "flos": 66397970615040.0, "grad_norm": 0.9310328114407391, "language_loss": 0.63725489, "learning_rate": 3.9982473993893165e-06, "loss": 0.65836698, "num_input_tokens_seen": 15240055, "step": 714, "time_per_iteration": 3.2544445991516113 }, { "auxiliary_loss_clip": 0.01239175, "auxiliary_loss_mlp": 0.01085992, "balance_loss_clip": 1.06602359, "balance_loss_mlp": 1.05552244, "epoch": 0.042988125657598075, "flos": 31650264552960.0, "grad_norm": 1.8449858143817996, "language_loss": 0.75010103, "learning_rate": 3.998231060622563e-06, "loss": 0.77335274, "num_input_tokens_seen": 15261585, "step": 715, "time_per_iteration": 2.7048466205596924 }, { "auxiliary_loss_clip": 0.01242734, "auxiliary_loss_mlp": 0.01074126, "balance_loss_clip": 1.0666225, "balance_loss_mlp": 1.04227352, "epoch": 0.04304824891026605, "flos": 33248468292480.0, "grad_norm": 1.9505519101092619, "language_loss": 0.72289199, "learning_rate": 3.998214646082688e-06, "loss": 0.74606061, "num_input_tokens_seen": 15281160, "step": 716, "time_per_iteration": 2.7807397842407227 }, { "auxiliary_loss_clip": 0.01104303, "auxiliary_loss_mlp": 0.01006894, "balance_loss_clip": 1.02277207, "balance_loss_mlp": 0.99997944, "epoch": 0.04310837216293401, "flos": 64064782782720.0, "grad_norm": 0.9245106661639481, "language_loss": 0.65587437, "learning_rate": 3.998198155770314e-06, "loss": 0.67698634, "num_input_tokens_seen": 15344505, "step": 717, "time_per_iteration": 3.250870943069458 }, { "auxiliary_loss_clip": 0.01103971, "auxiliary_loss_mlp": 0.01009587, "balance_loss_clip": 1.02238059, "balance_loss_mlp": 1.00267255, "epoch": 0.043168495415601985, "flos": 61343757849600.0, "grad_norm": 0.9849394627593366, "language_loss": 0.58785796, "learning_rate": 3.998181589686065e-06, "loss": 0.60899353, "num_input_tokens_seen": 15404050, "step": 718, "time_per_iteration": 3.0402464866638184 }, { "auxiliary_loss_clip": 0.0124025, "auxiliary_loss_mlp": 0.0107507, "balance_loss_clip": 1.06784248, "balance_loss_mlp": 1.0424546, "epoch": 0.04322861866826996, "flos": 20704261685760.0, "grad_norm": 1.9557310597444375, "language_loss": 0.91440111, "learning_rate": 3.99816494783057e-06, "loss": 0.9375543, "num_input_tokens_seen": 15424190, "step": 719, "time_per_iteration": 2.6500089168548584 }, { "auxiliary_loss_clip": 0.01235843, "auxiliary_loss_mlp": 0.01072906, "balance_loss_clip": 1.06020999, "balance_loss_mlp": 1.04296041, "epoch": 0.04328874192093792, "flos": 30373352991360.0, "grad_norm": 1.7057721639328365, "language_loss": 0.66461253, "learning_rate": 3.99814823020446e-06, "loss": 0.68770003, "num_input_tokens_seen": 15446500, "step": 720, "time_per_iteration": 2.673184871673584 }, { "auxiliary_loss_clip": 0.01234245, "auxiliary_loss_mlp": 0.01072069, "balance_loss_clip": 1.06111717, "balance_loss_mlp": 1.04131258, "epoch": 0.043348865173605894, "flos": 21944795748480.0, "grad_norm": 1.9491363249287763, "language_loss": 0.77460182, "learning_rate": 3.9981314368083684e-06, "loss": 0.79766488, "num_input_tokens_seen": 15465830, "step": 721, "time_per_iteration": 2.6695611476898193 }, { "auxiliary_loss_clip": 0.01241854, "auxiliary_loss_mlp": 0.01087169, "balance_loss_clip": 1.06622314, "balance_loss_mlp": 1.05719972, "epoch": 0.04340898842627386, "flos": 15264225959040.0, "grad_norm": 2.8383174670702718, "language_loss": 0.88298881, "learning_rate": 3.998114567642933e-06, "loss": 0.90627909, "num_input_tokens_seen": 15479985, "step": 722, "time_per_iteration": 2.661313533782959 }, { "auxiliary_loss_clip": 0.01244836, "auxiliary_loss_mlp": 0.01076885, "balance_loss_clip": 1.06665182, "balance_loss_mlp": 1.0480125, "epoch": 0.04346911167894183, "flos": 27965434913280.0, "grad_norm": 5.515838365549148, "language_loss": 0.84387141, "learning_rate": 3.998097622708792e-06, "loss": 0.86708868, "num_input_tokens_seen": 15501545, "step": 723, "time_per_iteration": 2.6447954177856445 }, { "auxiliary_loss_clip": 0.01245825, "auxiliary_loss_mlp": 0.01081354, "balance_loss_clip": 1.06723523, "balance_loss_mlp": 1.05019248, "epoch": 0.0435292349316098, "flos": 29242202820480.0, "grad_norm": 1.7852936089408447, "language_loss": 0.82789439, "learning_rate": 3.99808060200659e-06, "loss": 0.85116619, "num_input_tokens_seen": 15521725, "step": 724, "time_per_iteration": 2.676985263824463 }, { "auxiliary_loss_clip": 0.0124127, "auxiliary_loss_mlp": 0.01087491, "balance_loss_clip": 1.06535757, "balance_loss_mlp": 1.05609179, "epoch": 0.04358935818427777, "flos": 20558356640640.0, "grad_norm": 2.011685360503238, "language_loss": 0.79444051, "learning_rate": 3.998063505536971e-06, "loss": 0.81772816, "num_input_tokens_seen": 15540910, "step": 725, "time_per_iteration": 2.6241447925567627 }, { "auxiliary_loss_clip": 0.01251777, "auxiliary_loss_mlp": 0.01074923, "balance_loss_clip": 1.06783843, "balance_loss_mlp": 1.04309392, "epoch": 0.04364948143694574, "flos": 14464926564480.0, "grad_norm": 2.2160842755462817, "language_loss": 0.87175703, "learning_rate": 3.998046333300584e-06, "loss": 0.89502406, "num_input_tokens_seen": 15558640, "step": 726, "time_per_iteration": 2.555551052093506 }, { "auxiliary_loss_clip": 0.01100917, "auxiliary_loss_mlp": 0.01015411, "balance_loss_clip": 1.02171838, "balance_loss_mlp": 1.00947404, "epoch": 0.043709604689613706, "flos": 50067268922880.0, "grad_norm": 0.908981905466007, "language_loss": 0.55868411, "learning_rate": 3.998029085298079e-06, "loss": 0.5798474, "num_input_tokens_seen": 15612975, "step": 727, "time_per_iteration": 3.375901699066162 }, { "auxiliary_loss_clip": 0.01245647, "auxiliary_loss_mlp": 0.0108809, "balance_loss_clip": 1.06717396, "balance_loss_mlp": 1.05614173, "epoch": 0.04376972794228168, "flos": 13991588115840.0, "grad_norm": 2.282663852625415, "language_loss": 0.82326066, "learning_rate": 3.998011761530112e-06, "loss": 0.84659809, "num_input_tokens_seen": 15631070, "step": 728, "time_per_iteration": 2.605970621109009 }, { "auxiliary_loss_clip": 0.01237902, "auxiliary_loss_mlp": 0.01073495, "balance_loss_clip": 1.06600416, "balance_loss_mlp": 1.04321551, "epoch": 0.04382985119494965, "flos": 22009901149440.0, "grad_norm": 2.1303486954703152, "language_loss": 0.76890069, "learning_rate": 3.997994361997338e-06, "loss": 0.7920146, "num_input_tokens_seen": 15647825, "step": 729, "time_per_iteration": 2.652466297149658 }, { "auxiliary_loss_clip": 0.01243746, "auxiliary_loss_mlp": 0.01079207, "balance_loss_clip": 1.06438255, "balance_loss_mlp": 1.04859376, "epoch": 0.043889974447617615, "flos": 24206521472640.0, "grad_norm": 2.1385115795714107, "language_loss": 0.95153189, "learning_rate": 3.997976886700417e-06, "loss": 0.97476137, "num_input_tokens_seen": 15668260, "step": 730, "time_per_iteration": 2.734614133834839 }, { "auxiliary_loss_clip": 0.01238581, "auxiliary_loss_mlp": 0.01074727, "balance_loss_clip": 1.06093788, "balance_loss_mlp": 1.04315984, "epoch": 0.04395009770028559, "flos": 17274541415040.0, "grad_norm": 2.333073864238008, "language_loss": 0.88456279, "learning_rate": 3.997959335640013e-06, "loss": 0.90769589, "num_input_tokens_seen": 15685630, "step": 731, "time_per_iteration": 2.5912294387817383 }, { "auxiliary_loss_clip": 0.01242247, "auxiliary_loss_mlp": 0.01076563, "balance_loss_clip": 1.06636512, "balance_loss_mlp": 1.04757094, "epoch": 0.04401022095295355, "flos": 12310286261760.0, "grad_norm": 3.0398759554531254, "language_loss": 0.88683128, "learning_rate": 3.997941708816791e-06, "loss": 0.9100194, "num_input_tokens_seen": 15698645, "step": 732, "time_per_iteration": 2.5897367000579834 }, { "auxiliary_loss_clip": 0.01242736, "auxiliary_loss_mlp": 0.01087795, "balance_loss_clip": 1.06544232, "balance_loss_mlp": 1.05646718, "epoch": 0.044070344205621524, "flos": 20959658363520.0, "grad_norm": 2.304959545118842, "language_loss": 0.85829747, "learning_rate": 3.997924006231419e-06, "loss": 0.88160276, "num_input_tokens_seen": 15716775, "step": 733, "time_per_iteration": 2.650681972503662 }, { "auxiliary_loss_clip": 0.01246603, "auxiliary_loss_mlp": 0.01088724, "balance_loss_clip": 1.06722379, "balance_loss_mlp": 1.05544066, "epoch": 0.044130467458289496, "flos": 13845288021120.0, "grad_norm": 2.207780377909299, "language_loss": 0.91189414, "learning_rate": 3.9979062278845685e-06, "loss": 0.93524742, "num_input_tokens_seen": 15733320, "step": 734, "time_per_iteration": 2.5956180095672607 }, { "auxiliary_loss_clip": 0.01238395, "auxiliary_loss_mlp": 0.01067579, "balance_loss_clip": 1.06596422, "balance_loss_mlp": 1.03781235, "epoch": 0.04419059071095746, "flos": 28655063107200.0, "grad_norm": 1.9297536072777384, "language_loss": 0.77884138, "learning_rate": 3.9978883737769125e-06, "loss": 0.8019011, "num_input_tokens_seen": 15752705, "step": 735, "time_per_iteration": 2.603809118270874 }, { "auxiliary_loss_clip": 0.01234188, "auxiliary_loss_mlp": 0.01070499, "balance_loss_clip": 1.06063068, "balance_loss_mlp": 1.04091144, "epoch": 0.04425071396362543, "flos": 28183304856960.0, "grad_norm": 2.266122200005257, "language_loss": 0.8832593, "learning_rate": 3.9978704439091305e-06, "loss": 0.90630615, "num_input_tokens_seen": 15772800, "step": 736, "time_per_iteration": 5.841086149215698 }, { "auxiliary_loss_clip": 0.01235947, "auxiliary_loss_mlp": 0.01081098, "balance_loss_clip": 1.06597185, "balance_loss_mlp": 1.05165362, "epoch": 0.0443108372162934, "flos": 23658452778240.0, "grad_norm": 1.8984177574034653, "language_loss": 0.84481263, "learning_rate": 3.997852438281901e-06, "loss": 0.8679831, "num_input_tokens_seen": 15793665, "step": 737, "time_per_iteration": 4.1386003494262695 }, { "auxiliary_loss_clip": 0.01240863, "auxiliary_loss_mlp": 0.01072388, "balance_loss_clip": 1.0653491, "balance_loss_mlp": 1.03961766, "epoch": 0.04437096046896137, "flos": 33979861025280.0, "grad_norm": 2.2366199062134706, "language_loss": 0.84712577, "learning_rate": 3.997834356895906e-06, "loss": 0.87025833, "num_input_tokens_seen": 15813175, "step": 738, "time_per_iteration": 4.447159290313721 }, { "auxiliary_loss_clip": 0.01098733, "auxiliary_loss_mlp": 0.0102196, "balance_loss_clip": 1.02144337, "balance_loss_mlp": 1.01685739, "epoch": 0.04443108372162934, "flos": 67397506375680.0, "grad_norm": 0.8779518557387592, "language_loss": 0.59179878, "learning_rate": 3.9978161997518324e-06, "loss": 0.61300576, "num_input_tokens_seen": 15872050, "step": 739, "time_per_iteration": 3.0780396461486816 }, { "auxiliary_loss_clip": 0.012386, "auxiliary_loss_mlp": 0.01067387, "balance_loss_clip": 1.06604302, "balance_loss_mlp": 1.03717899, "epoch": 0.04449120697429731, "flos": 29752672953600.0, "grad_norm": 2.295102845773205, "language_loss": 0.91329807, "learning_rate": 3.997797966850369e-06, "loss": 0.93635798, "num_input_tokens_seen": 15891085, "step": 740, "time_per_iteration": 2.6687562465667725 }, { "auxiliary_loss_clip": 0.01243424, "auxiliary_loss_mlp": 0.01067832, "balance_loss_clip": 1.06807768, "balance_loss_mlp": 1.03929377, "epoch": 0.04455133022696528, "flos": 36502119072000.0, "grad_norm": 2.0543845689042484, "language_loss": 0.71875739, "learning_rate": 3.997779658192205e-06, "loss": 0.74186987, "num_input_tokens_seen": 15914225, "step": 741, "time_per_iteration": 2.707231283187866 }, { "auxiliary_loss_clip": 0.01233192, "auxiliary_loss_mlp": 0.01084138, "balance_loss_clip": 1.062482, "balance_loss_mlp": 1.05476475, "epoch": 0.044611453479633245, "flos": 28803661672320.0, "grad_norm": 1.7086571433899975, "language_loss": 0.88933527, "learning_rate": 3.997761273778037e-06, "loss": 0.91250861, "num_input_tokens_seen": 15934540, "step": 742, "time_per_iteration": 2.6647751331329346 }, { "auxiliary_loss_clip": 0.01237248, "auxiliary_loss_mlp": 0.0106534, "balance_loss_clip": 1.06481838, "balance_loss_mlp": 1.03367805, "epoch": 0.04467157673230122, "flos": 20010970304640.0, "grad_norm": 1.9055071619943689, "language_loss": 0.83840811, "learning_rate": 3.997742813608561e-06, "loss": 0.86143398, "num_input_tokens_seen": 15952560, "step": 743, "time_per_iteration": 2.697864055633545 }, { "auxiliary_loss_clip": 0.01239398, "auxiliary_loss_mlp": 0.01073846, "balance_loss_clip": 1.06395566, "balance_loss_mlp": 1.04373407, "epoch": 0.04473169998496919, "flos": 18004964480640.0, "grad_norm": 2.2041873634107696, "language_loss": 0.80026019, "learning_rate": 3.997724277684479e-06, "loss": 0.82339263, "num_input_tokens_seen": 15970620, "step": 744, "time_per_iteration": 2.6551101207733154 }, { "auxiliary_loss_clip": 0.01236158, "auxiliary_loss_mlp": 0.01076186, "balance_loss_clip": 1.06385589, "balance_loss_mlp": 1.04665816, "epoch": 0.044791823237637154, "flos": 20631722169600.0, "grad_norm": 2.139129927663487, "language_loss": 0.85502481, "learning_rate": 3.99770566600649e-06, "loss": 0.87814826, "num_input_tokens_seen": 15987325, "step": 745, "time_per_iteration": 2.6686010360717773 }, { "auxiliary_loss_clip": 0.01235001, "auxiliary_loss_mlp": 0.01066107, "balance_loss_clip": 1.06320596, "balance_loss_mlp": 1.03594685, "epoch": 0.04485194649030513, "flos": 31176171918720.0, "grad_norm": 1.8251828520192552, "language_loss": 0.69291008, "learning_rate": 3.997686978575302e-06, "loss": 0.71592116, "num_input_tokens_seen": 16008310, "step": 746, "time_per_iteration": 2.6782095432281494 }, { "auxiliary_loss_clip": 0.01244022, "auxiliary_loss_mlp": 0.01081644, "balance_loss_clip": 1.07012939, "balance_loss_mlp": 1.05000615, "epoch": 0.04491206974297309, "flos": 26143291831680.0, "grad_norm": 3.6053643469900982, "language_loss": 0.68531066, "learning_rate": 3.997668215391625e-06, "loss": 0.70856726, "num_input_tokens_seen": 16029620, "step": 747, "time_per_iteration": 2.6589114665985107 }, { "auxiliary_loss_clip": 0.0124018, "auxiliary_loss_mlp": 0.01083594, "balance_loss_clip": 1.0652504, "balance_loss_mlp": 1.05183625, "epoch": 0.044972192995641064, "flos": 20667668705280.0, "grad_norm": 1.8376208182131786, "language_loss": 0.66778374, "learning_rate": 3.997649376456168e-06, "loss": 0.69102144, "num_input_tokens_seen": 16049065, "step": 748, "time_per_iteration": 2.674691677093506 }, { "auxiliary_loss_clip": 0.01243343, "auxiliary_loss_mlp": 0.01085665, "balance_loss_clip": 1.07101417, "balance_loss_mlp": 1.05596995, "epoch": 0.045032316248309036, "flos": 16106834177280.0, "grad_norm": 2.4197486882062322, "language_loss": 0.76684916, "learning_rate": 3.997630461769647e-06, "loss": 0.7901392, "num_input_tokens_seen": 16066765, "step": 749, "time_per_iteration": 2.5940611362457275 }, { "auxiliary_loss_clip": 0.01243381, "auxiliary_loss_mlp": 0.01083303, "balance_loss_clip": 1.06892776, "balance_loss_mlp": 1.05338168, "epoch": 0.045092439500977, "flos": 17858843953920.0, "grad_norm": 1.926675828378473, "language_loss": 0.88739896, "learning_rate": 3.997611471332778e-06, "loss": 0.91066581, "num_input_tokens_seen": 16085980, "step": 750, "time_per_iteration": 2.551717758178711 }, { "auxiliary_loss_clip": 0.01238484, "auxiliary_loss_mlp": 0.01077419, "balance_loss_clip": 1.062783, "balance_loss_mlp": 1.04404092, "epoch": 0.04515256275364497, "flos": 24462815990400.0, "grad_norm": 3.4910287963746116, "language_loss": 0.74371743, "learning_rate": 3.9975924051462825e-06, "loss": 0.76687646, "num_input_tokens_seen": 16106260, "step": 751, "time_per_iteration": 2.6299028396606445 }, { "auxiliary_loss_clip": 0.0123577, "auxiliary_loss_mlp": 0.01078322, "balance_loss_clip": 1.06347609, "balance_loss_mlp": 1.04884171, "epoch": 0.04521268600631294, "flos": 20916385453440.0, "grad_norm": 3.3938056459605583, "language_loss": 0.69115144, "learning_rate": 3.997573263210883e-06, "loss": 0.71429229, "num_input_tokens_seen": 16123475, "step": 752, "time_per_iteration": 2.571223020553589 }, { "auxiliary_loss_clip": 0.01235899, "auxiliary_loss_mlp": 0.01060876, "balance_loss_clip": 1.0627141, "balance_loss_mlp": 1.03212225, "epoch": 0.04527280925898091, "flos": 13371374954880.0, "grad_norm": 2.69328062598792, "language_loss": 0.92126763, "learning_rate": 3.997554045527305e-06, "loss": 0.94423538, "num_input_tokens_seen": 16138335, "step": 753, "time_per_iteration": 2.6100237369537354 }, { "auxiliary_loss_clip": 0.01239023, "auxiliary_loss_mlp": 0.01080271, "balance_loss_clip": 1.06628633, "balance_loss_mlp": 1.05116034, "epoch": 0.04533293251164888, "flos": 23254565276160.0, "grad_norm": 4.138305317267875, "language_loss": 0.91373456, "learning_rate": 3.997534752096277e-06, "loss": 0.93692756, "num_input_tokens_seen": 16157110, "step": 754, "time_per_iteration": 2.642747402191162 }, { "auxiliary_loss_clip": 0.01229195, "auxiliary_loss_mlp": 0.01078016, "balance_loss_clip": 1.06402516, "balance_loss_mlp": 1.04725957, "epoch": 0.04539305576431685, "flos": 12422004537600.0, "grad_norm": 4.559941934311277, "language_loss": 0.78558046, "learning_rate": 3.997515382918531e-06, "loss": 0.80865264, "num_input_tokens_seen": 16174155, "step": 755, "time_per_iteration": 2.6316659450531006 }, { "auxiliary_loss_clip": 0.01240044, "auxiliary_loss_mlp": 0.01081048, "balance_loss_clip": 1.06624937, "balance_loss_mlp": 1.05099559, "epoch": 0.04545317901698482, "flos": 16070995382400.0, "grad_norm": 2.193539224658874, "language_loss": 0.78473848, "learning_rate": 3.9974959379948015e-06, "loss": 0.80794942, "num_input_tokens_seen": 16192240, "step": 756, "time_per_iteration": 2.6390748023986816 }, { "auxiliary_loss_clip": 0.01101224, "auxiliary_loss_mlp": 0.01013849, "balance_loss_clip": 1.02455997, "balance_loss_mlp": 1.0089612, "epoch": 0.045513302269652785, "flos": 66396139021440.0, "grad_norm": 0.8202876780471967, "language_loss": 0.62756521, "learning_rate": 3.997476417325827e-06, "loss": 0.64871597, "num_input_tokens_seen": 16255775, "step": 757, "time_per_iteration": 3.2393198013305664 }, { "auxiliary_loss_clip": 0.01235136, "auxiliary_loss_mlp": 0.01071767, "balance_loss_clip": 1.06455243, "balance_loss_mlp": 1.04346693, "epoch": 0.04557342552232076, "flos": 21471169991040.0, "grad_norm": 1.6528285304744148, "language_loss": 0.84211069, "learning_rate": 3.997456820912346e-06, "loss": 0.86517978, "num_input_tokens_seen": 16277015, "step": 758, "time_per_iteration": 2.6508655548095703 }, { "auxiliary_loss_clip": 0.01228461, "auxiliary_loss_mlp": 0.01067033, "balance_loss_clip": 1.05912399, "balance_loss_mlp": 1.0391618, "epoch": 0.04563354877498873, "flos": 23732680233600.0, "grad_norm": 2.695805662282291, "language_loss": 0.88150775, "learning_rate": 3.997437148755101e-06, "loss": 0.9044627, "num_input_tokens_seen": 16296005, "step": 759, "time_per_iteration": 2.7782890796661377 }, { "auxiliary_loss_clip": 0.01240589, "auxiliary_loss_mlp": 0.01078815, "balance_loss_clip": 1.06747675, "balance_loss_mlp": 1.04846466, "epoch": 0.045693672027656694, "flos": 25735741142400.0, "grad_norm": 2.392455009776849, "language_loss": 0.73440695, "learning_rate": 3.9974174008548405e-06, "loss": 0.75760102, "num_input_tokens_seen": 16315300, "step": 760, "time_per_iteration": 2.7138822078704834 }, { "auxiliary_loss_clip": 0.01240372, "auxiliary_loss_mlp": 0.01079791, "balance_loss_clip": 1.07095265, "balance_loss_mlp": 1.05162191, "epoch": 0.045753795280324666, "flos": 19719016560000.0, "grad_norm": 3.497321311688565, "language_loss": 0.81781888, "learning_rate": 3.9973975772123105e-06, "loss": 0.84102058, "num_input_tokens_seen": 16333820, "step": 761, "time_per_iteration": 2.631303310394287 }, { "auxiliary_loss_clip": 0.01231969, "auxiliary_loss_mlp": 0.01078623, "balance_loss_clip": 1.06324267, "balance_loss_mlp": 1.04922605, "epoch": 0.04581391853299264, "flos": 23255786338560.0, "grad_norm": 2.0632320043111965, "language_loss": 0.79811668, "learning_rate": 3.997377677828266e-06, "loss": 0.82122266, "num_input_tokens_seen": 16355290, "step": 762, "time_per_iteration": 2.646928071975708 }, { "auxiliary_loss_clip": 0.01093869, "auxiliary_loss_mlp": 0.01027943, "balance_loss_clip": 1.01857328, "balance_loss_mlp": 1.02288842, "epoch": 0.0458740417856606, "flos": 64231155601920.0, "grad_norm": 1.0128965743658471, "language_loss": 0.58723813, "learning_rate": 3.9973577027034585e-06, "loss": 0.60845619, "num_input_tokens_seen": 16415995, "step": 763, "time_per_iteration": 3.1712563037872314 }, { "auxiliary_loss_clip": 0.012343, "auxiliary_loss_mlp": 0.01082461, "balance_loss_clip": 1.06205368, "balance_loss_mlp": 1.0531354, "epoch": 0.045934165038328575, "flos": 20770121272320.0, "grad_norm": 4.978761831483118, "language_loss": 0.87544954, "learning_rate": 3.9973376518386475e-06, "loss": 0.89861715, "num_input_tokens_seen": 16433120, "step": 764, "time_per_iteration": 2.5985426902770996 }, { "auxiliary_loss_clip": 0.01236145, "auxiliary_loss_mlp": 0.01087868, "balance_loss_clip": 1.06553543, "balance_loss_mlp": 1.05854285, "epoch": 0.04599428829099654, "flos": 30262891691520.0, "grad_norm": 2.0894169515773067, "language_loss": 0.85966802, "learning_rate": 3.997317525234592e-06, "loss": 0.88290817, "num_input_tokens_seen": 16453360, "step": 765, "time_per_iteration": 2.6572606563568115 }, { "auxiliary_loss_clip": 0.01239644, "auxiliary_loss_mlp": 0.01077398, "balance_loss_clip": 1.06530261, "balance_loss_mlp": 1.04573584, "epoch": 0.04605441154366451, "flos": 23038921975680.0, "grad_norm": 2.628046285830335, "language_loss": 0.88265938, "learning_rate": 3.997297322892056e-06, "loss": 0.90582979, "num_input_tokens_seen": 16471160, "step": 766, "time_per_iteration": 2.673226833343506 }, { "auxiliary_loss_clip": 0.01235506, "auxiliary_loss_mlp": 0.0107998, "balance_loss_clip": 1.06371713, "balance_loss_mlp": 1.05115545, "epoch": 0.046114534796332485, "flos": 22017407091840.0, "grad_norm": 2.343908591401411, "language_loss": 0.84302223, "learning_rate": 3.997277044811806e-06, "loss": 0.86617708, "num_input_tokens_seen": 16488940, "step": 767, "time_per_iteration": 2.683429002761841 }, { "auxiliary_loss_clip": 0.01236229, "auxiliary_loss_mlp": 0.01067844, "balance_loss_clip": 1.06769753, "balance_loss_mlp": 1.03791094, "epoch": 0.04617465804900045, "flos": 29862380067840.0, "grad_norm": 1.9268984031305718, "language_loss": 0.8669976, "learning_rate": 3.99725669099461e-06, "loss": 0.89003831, "num_input_tokens_seen": 16509505, "step": 768, "time_per_iteration": 2.8125200271606445 }, { "auxiliary_loss_clip": 0.01234175, "auxiliary_loss_mlp": 0.01076069, "balance_loss_clip": 1.06150854, "balance_loss_mlp": 1.04738712, "epoch": 0.04623478130166842, "flos": 25630056351360.0, "grad_norm": 2.115272554881108, "language_loss": 0.75152099, "learning_rate": 3.9972362614412395e-06, "loss": 0.77462339, "num_input_tokens_seen": 16528840, "step": 769, "time_per_iteration": 2.7286128997802734 }, { "auxiliary_loss_clip": 0.01229956, "auxiliary_loss_mlp": 0.01072391, "balance_loss_clip": 1.06326365, "balance_loss_mlp": 1.04462695, "epoch": 0.04629490455433639, "flos": 20449080489600.0, "grad_norm": 1.8368669953292174, "language_loss": 0.86292851, "learning_rate": 3.997215756152471e-06, "loss": 0.885952, "num_input_tokens_seen": 16548335, "step": 770, "time_per_iteration": 2.68608021736145 }, { "auxiliary_loss_clip": 0.01239009, "auxiliary_loss_mlp": 0.01072125, "balance_loss_clip": 1.06274092, "balance_loss_mlp": 1.04284704, "epoch": 0.04635502780700436, "flos": 23148736830720.0, "grad_norm": 2.058802627607224, "language_loss": 0.86842889, "learning_rate": 3.99719517512908e-06, "loss": 0.89154023, "num_input_tokens_seen": 16567725, "step": 771, "time_per_iteration": 2.637509822845459 }, { "auxiliary_loss_clip": 0.01239449, "auxiliary_loss_mlp": 0.01079651, "balance_loss_clip": 1.06184912, "balance_loss_mlp": 1.04884768, "epoch": 0.04641515105967233, "flos": 23292020183040.0, "grad_norm": 1.87920888608735, "language_loss": 0.83691382, "learning_rate": 3.997174518371848e-06, "loss": 0.8601048, "num_input_tokens_seen": 16588175, "step": 772, "time_per_iteration": 2.745006561279297 }, { "auxiliary_loss_clip": 0.01236322, "auxiliary_loss_mlp": 0.0107061, "balance_loss_clip": 1.06672883, "balance_loss_mlp": 1.04220271, "epoch": 0.046475274312340296, "flos": 25115204759040.0, "grad_norm": 1.9655107083336736, "language_loss": 0.73639083, "learning_rate": 3.997153785881557e-06, "loss": 0.75946015, "num_input_tokens_seen": 16607735, "step": 773, "time_per_iteration": 2.869290828704834 }, { "auxiliary_loss_clip": 0.01231219, "auxiliary_loss_mlp": 0.01071681, "balance_loss_clip": 1.06529772, "balance_loss_mlp": 1.04054356, "epoch": 0.04653539756500827, "flos": 25264916645760.0, "grad_norm": 2.096431798380756, "language_loss": 0.78228974, "learning_rate": 3.997132977658996e-06, "loss": 0.80531871, "num_input_tokens_seen": 16627225, "step": 774, "time_per_iteration": 2.6967568397521973 }, { "auxiliary_loss_clip": 0.01230587, "auxiliary_loss_mlp": 0.01069519, "balance_loss_clip": 1.06347871, "balance_loss_mlp": 1.04131365, "epoch": 0.046595520817676234, "flos": 35404150089600.0, "grad_norm": 2.018140205527256, "language_loss": 0.73187691, "learning_rate": 3.997112093704952e-06, "loss": 0.75487792, "num_input_tokens_seen": 16647785, "step": 775, "time_per_iteration": 2.737140417098999 }, { "auxiliary_loss_clip": 0.01231996, "auxiliary_loss_mlp": 0.01066454, "balance_loss_clip": 1.06187618, "balance_loss_mlp": 1.03650832, "epoch": 0.046655644070344206, "flos": 18112516778880.0, "grad_norm": 1.668093168561758, "language_loss": 0.77180624, "learning_rate": 3.997091134020217e-06, "loss": 0.7947908, "num_input_tokens_seen": 16667555, "step": 776, "time_per_iteration": 4.154085159301758 }, { "auxiliary_loss_clip": 0.0122577, "auxiliary_loss_mlp": 0.01071334, "balance_loss_clip": 1.06031108, "balance_loss_mlp": 1.04352236, "epoch": 0.04671576732301218, "flos": 29205286617600.0, "grad_norm": 1.9054628166827923, "language_loss": 0.7087816, "learning_rate": 3.997070098605585e-06, "loss": 0.73175263, "num_input_tokens_seen": 16686875, "step": 777, "time_per_iteration": 4.176887512207031 }, { "auxiliary_loss_clip": 0.0122979, "auxiliary_loss_mlp": 0.01076806, "balance_loss_clip": 1.06275606, "balance_loss_mlp": 1.04705119, "epoch": 0.04677589057568014, "flos": 30478319510400.0, "grad_norm": 1.8083238359854679, "language_loss": 0.77069759, "learning_rate": 3.997048987461856e-06, "loss": 0.79376352, "num_input_tokens_seen": 16706420, "step": 778, "time_per_iteration": 5.943394422531128 }, { "auxiliary_loss_clip": 0.01227067, "auxiliary_loss_mlp": 0.01064982, "balance_loss_clip": 1.06043744, "balance_loss_mlp": 1.03563297, "epoch": 0.046836013828348115, "flos": 20557674282240.0, "grad_norm": 2.1737778598926463, "language_loss": 0.79181123, "learning_rate": 3.997027800589829e-06, "loss": 0.81473172, "num_input_tokens_seen": 16726390, "step": 779, "time_per_iteration": 2.611804485321045 }, { "auxiliary_loss_clip": 0.01219629, "auxiliary_loss_mlp": 0.01070238, "balance_loss_clip": 1.05842376, "balance_loss_mlp": 1.04271269, "epoch": 0.04689613708101608, "flos": 25447378757760.0, "grad_norm": 1.888854926622149, "language_loss": 0.77364886, "learning_rate": 3.997006537990308e-06, "loss": 0.79654753, "num_input_tokens_seen": 16748965, "step": 780, "time_per_iteration": 2.668239116668701 }, { "auxiliary_loss_clip": 0.012253, "auxiliary_loss_mlp": 0.01073321, "balance_loss_clip": 1.06098521, "balance_loss_mlp": 1.04605746, "epoch": 0.04695626033368405, "flos": 23001395241600.0, "grad_norm": 1.7616538282563206, "language_loss": 0.76700419, "learning_rate": 3.996985199664099e-06, "loss": 0.78999043, "num_input_tokens_seen": 16768620, "step": 781, "time_per_iteration": 2.5979926586151123 }, { "auxiliary_loss_clip": 0.01236637, "auxiliary_loss_mlp": 0.01077479, "balance_loss_clip": 1.0639379, "balance_loss_mlp": 1.04836786, "epoch": 0.047016383586352024, "flos": 29133357632640.0, "grad_norm": 3.0946494667490856, "language_loss": 0.73786414, "learning_rate": 3.99696378561201e-06, "loss": 0.76100528, "num_input_tokens_seen": 16789755, "step": 782, "time_per_iteration": 2.708855390548706 }, { "auxiliary_loss_clip": 0.0122968, "auxiliary_loss_mlp": 0.01069368, "balance_loss_clip": 1.06431556, "balance_loss_mlp": 1.04253423, "epoch": 0.04707650683901999, "flos": 14976330451200.0, "grad_norm": 2.1459158015790183, "language_loss": 0.80524659, "learning_rate": 3.996942295834855e-06, "loss": 0.82823706, "num_input_tokens_seen": 16807585, "step": 783, "time_per_iteration": 2.6355738639831543 }, { "auxiliary_loss_clip": 0.01222415, "auxiliary_loss_mlp": 0.01063155, "balance_loss_clip": 1.06221437, "balance_loss_mlp": 1.03663135, "epoch": 0.04713663009168796, "flos": 21651118151040.0, "grad_norm": 1.9084512066318515, "language_loss": 0.81687874, "learning_rate": 3.996920730333448e-06, "loss": 0.83973444, "num_input_tokens_seen": 16827220, "step": 784, "time_per_iteration": 2.64365291595459 }, { "auxiliary_loss_clip": 0.01226632, "auxiliary_loss_mlp": 0.01074549, "balance_loss_clip": 1.0582943, "balance_loss_mlp": 1.04719007, "epoch": 0.04719675334435593, "flos": 21325408600320.0, "grad_norm": 3.970707764370453, "language_loss": 0.80619848, "learning_rate": 3.996899089108607e-06, "loss": 0.82921028, "num_input_tokens_seen": 16846230, "step": 785, "time_per_iteration": 2.682971715927124 }, { "auxiliary_loss_clip": 0.01231621, "auxiliary_loss_mlp": 0.01063774, "balance_loss_clip": 1.06683421, "balance_loss_mlp": 1.03784585, "epoch": 0.0472568765970239, "flos": 17931383470080.0, "grad_norm": 2.074448818096939, "language_loss": 0.89784658, "learning_rate": 3.996877372161152e-06, "loss": 0.92080051, "num_input_tokens_seen": 16865325, "step": 786, "time_per_iteration": 2.6072235107421875 }, { "auxiliary_loss_clip": 0.01227201, "auxiliary_loss_mlp": 0.01069453, "balance_loss_clip": 1.05475712, "balance_loss_mlp": 1.03912568, "epoch": 0.04731699984969187, "flos": 18077324428800.0, "grad_norm": 6.783818284100465, "language_loss": 0.76794451, "learning_rate": 3.9968555794919065e-06, "loss": 0.79091108, "num_input_tokens_seen": 16882930, "step": 787, "time_per_iteration": 2.595069646835327 }, { "auxiliary_loss_clip": 0.01233526, "auxiliary_loss_mlp": 0.01070856, "balance_loss_clip": 1.06563127, "balance_loss_mlp": 1.04248405, "epoch": 0.047377123102359836, "flos": 23185078416000.0, "grad_norm": 2.309745026689568, "language_loss": 0.81301165, "learning_rate": 3.996833711101698e-06, "loss": 0.83605546, "num_input_tokens_seen": 16900710, "step": 788, "time_per_iteration": 2.633812427520752 }, { "auxiliary_loss_clip": 0.01225447, "auxiliary_loss_mlp": 0.01078934, "balance_loss_clip": 1.06370282, "balance_loss_mlp": 1.04934621, "epoch": 0.04743724635502781, "flos": 22747794243840.0, "grad_norm": 2.941245147417381, "language_loss": 0.84428835, "learning_rate": 3.996811766991355e-06, "loss": 0.86733222, "num_input_tokens_seen": 16919210, "step": 789, "time_per_iteration": 2.6711082458496094 }, { "auxiliary_loss_clip": 0.01230866, "auxiliary_loss_mlp": 0.01071483, "balance_loss_clip": 1.06367648, "balance_loss_mlp": 1.0441606, "epoch": 0.04749736960769577, "flos": 17238702620160.0, "grad_norm": 2.0289407228390615, "language_loss": 0.81787878, "learning_rate": 3.996789747161709e-06, "loss": 0.84090227, "num_input_tokens_seen": 16937125, "step": 790, "time_per_iteration": 2.6136717796325684 }, { "auxiliary_loss_clip": 0.01224033, "auxiliary_loss_mlp": 0.01064065, "balance_loss_clip": 1.05880189, "balance_loss_mlp": 1.03546715, "epoch": 0.047557492860363745, "flos": 40479261592320.0, "grad_norm": 2.9735437778568965, "language_loss": 0.88116109, "learning_rate": 3.996767651613597e-06, "loss": 0.90404207, "num_input_tokens_seen": 16958610, "step": 791, "time_per_iteration": 2.747586727142334 }, { "auxiliary_loss_clip": 0.01226267, "auxiliary_loss_mlp": 0.01066471, "balance_loss_clip": 1.06144643, "balance_loss_mlp": 1.03743124, "epoch": 0.04761761611303172, "flos": 18698004466560.0, "grad_norm": 2.1239226540804537, "language_loss": 0.90671498, "learning_rate": 3.996745480347854e-06, "loss": 0.92964232, "num_input_tokens_seen": 16977300, "step": 792, "time_per_iteration": 2.591477870941162 }, { "auxiliary_loss_clip": 0.01226882, "auxiliary_loss_mlp": 0.0107926, "balance_loss_clip": 1.05968022, "balance_loss_mlp": 1.05225897, "epoch": 0.04767773936569968, "flos": 20921987975040.0, "grad_norm": 1.9120988315570397, "language_loss": 0.73246223, "learning_rate": 3.996723233365324e-06, "loss": 0.75552362, "num_input_tokens_seen": 16994950, "step": 793, "time_per_iteration": 2.6319899559020996 }, { "auxiliary_loss_clip": 0.01231301, "auxiliary_loss_mlp": 0.01070716, "balance_loss_clip": 1.06213653, "balance_loss_mlp": 1.04146254, "epoch": 0.047737862618367655, "flos": 23732680233600.0, "grad_norm": 1.86347948201136, "language_loss": 0.86139679, "learning_rate": 3.996700910666847e-06, "loss": 0.88441694, "num_input_tokens_seen": 17014760, "step": 794, "time_per_iteration": 2.6835687160491943 }, { "auxiliary_loss_clip": 0.01228204, "auxiliary_loss_mlp": 0.01077895, "balance_loss_clip": 1.05969596, "balance_loss_mlp": 1.04935622, "epoch": 0.04779798587103562, "flos": 23695764030720.0, "grad_norm": 2.370166301863074, "language_loss": 0.69069195, "learning_rate": 3.996678512253272e-06, "loss": 0.71375293, "num_input_tokens_seen": 17032715, "step": 795, "time_per_iteration": 2.669261932373047 }, { "auxiliary_loss_clip": 0.01225748, "auxiliary_loss_mlp": 0.01076275, "balance_loss_clip": 1.06129098, "balance_loss_mlp": 1.04756904, "epoch": 0.04785810912370359, "flos": 23183641872000.0, "grad_norm": 1.744925212230271, "language_loss": 0.810256, "learning_rate": 3.996656038125449e-06, "loss": 0.83327615, "num_input_tokens_seen": 17052215, "step": 796, "time_per_iteration": 2.5800065994262695 }, { "auxiliary_loss_clip": 0.01228235, "auxiliary_loss_mlp": 0.01065433, "balance_loss_clip": 1.06224668, "balance_loss_mlp": 1.03638172, "epoch": 0.047918232376371564, "flos": 18040623707520.0, "grad_norm": 1.979164246440182, "language_loss": 0.8128069, "learning_rate": 3.996633488284228e-06, "loss": 0.83574355, "num_input_tokens_seen": 17069225, "step": 797, "time_per_iteration": 2.58878493309021 }, { "auxiliary_loss_clip": 0.01100259, "auxiliary_loss_mlp": 0.01007215, "balance_loss_clip": 1.02779806, "balance_loss_mlp": 1.00266171, "epoch": 0.04797835562903953, "flos": 62442588758400.0, "grad_norm": 0.912416075283383, "language_loss": 0.64532876, "learning_rate": 3.996610862730465e-06, "loss": 0.66640353, "num_input_tokens_seen": 17126680, "step": 798, "time_per_iteration": 3.0779380798339844 }, { "auxiliary_loss_clip": 0.01229665, "auxiliary_loss_mlp": 0.01068747, "balance_loss_clip": 1.05799031, "balance_loss_mlp": 1.04121017, "epoch": 0.0480384788817075, "flos": 21507296094720.0, "grad_norm": 2.0206600610723333, "language_loss": 0.91274291, "learning_rate": 3.996588161465018e-06, "loss": 0.935727, "num_input_tokens_seen": 17144835, "step": 799, "time_per_iteration": 2.660438299179077 }, { "auxiliary_loss_clip": 0.01230751, "auxiliary_loss_mlp": 0.010715, "balance_loss_clip": 1.06640434, "balance_loss_mlp": 1.04274678, "epoch": 0.048098602134375466, "flos": 21726710323200.0, "grad_norm": 2.0752654205923866, "language_loss": 0.86825287, "learning_rate": 3.996565384488748e-06, "loss": 0.89127541, "num_input_tokens_seen": 17165030, "step": 800, "time_per_iteration": 2.6700456142425537 }, { "auxiliary_loss_clip": 0.01229893, "auxiliary_loss_mlp": 0.01072058, "balance_loss_clip": 1.06186771, "balance_loss_mlp": 1.04618931, "epoch": 0.04815872538704344, "flos": 22931082368640.0, "grad_norm": 2.5310108886746976, "language_loss": 0.83949852, "learning_rate": 3.996542531802518e-06, "loss": 0.86251807, "num_input_tokens_seen": 17184895, "step": 801, "time_per_iteration": 2.7724695205688477 }, { "auxiliary_loss_clip": 0.01227846, "auxiliary_loss_mlp": 0.010756, "balance_loss_clip": 1.06226814, "balance_loss_mlp": 1.04847932, "epoch": 0.04821884863971141, "flos": 43174716042240.0, "grad_norm": 1.9607091513106172, "language_loss": 0.79818648, "learning_rate": 3.996519603407196e-06, "loss": 0.82122099, "num_input_tokens_seen": 17208225, "step": 802, "time_per_iteration": 2.861309766769409 }, { "auxiliary_loss_clip": 0.0122832, "auxiliary_loss_mlp": 0.01069086, "balance_loss_clip": 1.06392837, "balance_loss_mlp": 1.04278886, "epoch": 0.048278971892379376, "flos": 18620006083200.0, "grad_norm": 1.798745906633195, "language_loss": 0.86600745, "learning_rate": 3.996496599303649e-06, "loss": 0.88898146, "num_input_tokens_seen": 17226305, "step": 803, "time_per_iteration": 2.612684965133667 }, { "auxiliary_loss_clip": 0.01222438, "auxiliary_loss_mlp": 0.01063116, "balance_loss_clip": 1.06214345, "balance_loss_mlp": 1.03643703, "epoch": 0.04833909514504735, "flos": 20230061310720.0, "grad_norm": 5.958214069975319, "language_loss": 0.85139012, "learning_rate": 3.996473519492753e-06, "loss": 0.8742457, "num_input_tokens_seen": 17244545, "step": 804, "time_per_iteration": 2.596965789794922 }, { "auxiliary_loss_clip": 0.01225485, "auxiliary_loss_mlp": 0.0106948, "balance_loss_clip": 1.06206632, "balance_loss_mlp": 1.04222918, "epoch": 0.04839921839771532, "flos": 24645170361600.0, "grad_norm": 1.9492340448514227, "language_loss": 0.85939878, "learning_rate": 3.99645036397538e-06, "loss": 0.88234842, "num_input_tokens_seen": 17265730, "step": 805, "time_per_iteration": 2.6773781776428223 }, { "auxiliary_loss_clip": 0.01221339, "auxiliary_loss_mlp": 0.01071867, "balance_loss_clip": 1.05968738, "balance_loss_mlp": 1.04591477, "epoch": 0.048459341650383285, "flos": 24827452905600.0, "grad_norm": 1.8764849579047527, "language_loss": 0.68025368, "learning_rate": 3.9964271327524085e-06, "loss": 0.70318574, "num_input_tokens_seen": 17284820, "step": 806, "time_per_iteration": 2.6270596981048584 }, { "auxiliary_loss_clip": 0.01221043, "auxiliary_loss_mlp": 0.01060505, "balance_loss_clip": 1.06064904, "balance_loss_mlp": 1.03384972, "epoch": 0.04851946490305126, "flos": 22163204396160.0, "grad_norm": 8.586680684018, "language_loss": 0.76488906, "learning_rate": 3.9964038258247214e-06, "loss": 0.78770459, "num_input_tokens_seen": 17305085, "step": 807, "time_per_iteration": 2.6783089637756348 }, { "auxiliary_loss_clip": 0.01218859, "auxiliary_loss_mlp": 0.01068871, "balance_loss_clip": 1.05734789, "balance_loss_mlp": 1.04290676, "epoch": 0.04857958815571922, "flos": 19792022952960.0, "grad_norm": 2.4056749627509157, "language_loss": 0.86882269, "learning_rate": 3.9963804431932005e-06, "loss": 0.89170003, "num_input_tokens_seen": 17322715, "step": 808, "time_per_iteration": 2.6447641849517822 }, { "auxiliary_loss_clip": 0.01227529, "auxiliary_loss_mlp": 0.01069446, "balance_loss_clip": 1.06140316, "balance_loss_mlp": 1.0424329, "epoch": 0.048639711408387194, "flos": 18697968552960.0, "grad_norm": 2.6040733531164424, "language_loss": 0.89710444, "learning_rate": 3.996356984858732e-06, "loss": 0.92007422, "num_input_tokens_seen": 17341455, "step": 809, "time_per_iteration": 2.6679790019989014 }, { "auxiliary_loss_clip": 0.01226608, "auxiliary_loss_mlp": 0.01067211, "balance_loss_clip": 1.0643065, "balance_loss_mlp": 1.04060316, "epoch": 0.048699834661055166, "flos": 24863507182080.0, "grad_norm": 3.0721319202916324, "language_loss": 0.84918916, "learning_rate": 3.996333450822208e-06, "loss": 0.87212729, "num_input_tokens_seen": 17360765, "step": 810, "time_per_iteration": 2.696772575378418 }, { "auxiliary_loss_clip": 0.01227202, "auxiliary_loss_mlp": 0.01067343, "balance_loss_clip": 1.0622344, "balance_loss_mlp": 1.04049683, "epoch": 0.04875995791372313, "flos": 20704010290560.0, "grad_norm": 1.8136675943398954, "language_loss": 0.80799425, "learning_rate": 3.99630984108452e-06, "loss": 0.83093977, "num_input_tokens_seen": 17380625, "step": 811, "time_per_iteration": 2.653808355331421 }, { "auxiliary_loss_clip": 0.01217843, "auxiliary_loss_mlp": 0.01070621, "balance_loss_clip": 1.05928314, "balance_loss_mlp": 1.04466903, "epoch": 0.048820081166391104, "flos": 18588297352320.0, "grad_norm": 1.7193599003225197, "language_loss": 0.74634516, "learning_rate": 3.9962861556465615e-06, "loss": 0.76922977, "num_input_tokens_seen": 17399355, "step": 812, "time_per_iteration": 2.7274649143218994 }, { "auxiliary_loss_clip": 0.01222659, "auxiliary_loss_mlp": 0.01073562, "balance_loss_clip": 1.06445217, "balance_loss_mlp": 1.04862356, "epoch": 0.04888020441905907, "flos": 22707322594560.0, "grad_norm": 1.9311665765462733, "language_loss": 0.90124279, "learning_rate": 3.996262394509233e-06, "loss": 0.92420495, "num_input_tokens_seen": 17418240, "step": 813, "time_per_iteration": 2.654874801635742 }, { "auxiliary_loss_clip": 0.0122, "auxiliary_loss_mlp": 0.01057827, "balance_loss_clip": 1.06157589, "balance_loss_mlp": 1.03248262, "epoch": 0.04894032767172704, "flos": 22784351310720.0, "grad_norm": 1.9238840150723209, "language_loss": 0.74904704, "learning_rate": 3.9962385576734335e-06, "loss": 0.77182531, "num_input_tokens_seen": 17436250, "step": 814, "time_per_iteration": 2.7381603717803955 }, { "auxiliary_loss_clip": 0.01223782, "auxiliary_loss_mlp": 0.01069686, "balance_loss_clip": 1.06125045, "balance_loss_mlp": 1.04289961, "epoch": 0.04900045092439501, "flos": 25516147345920.0, "grad_norm": 2.1966001004582596, "language_loss": 0.83816808, "learning_rate": 3.9962146451400675e-06, "loss": 0.86110282, "num_input_tokens_seen": 17455750, "step": 815, "time_per_iteration": 2.7289621829986572 }, { "auxiliary_loss_clip": 0.01227011, "auxiliary_loss_mlp": 0.01060571, "balance_loss_clip": 1.06326818, "balance_loss_mlp": 1.0344646, "epoch": 0.04906057417706298, "flos": 25958136199680.0, "grad_norm": 2.3329994981275943, "language_loss": 0.90796101, "learning_rate": 3.996190656910043e-06, "loss": 0.93083686, "num_input_tokens_seen": 17474995, "step": 816, "time_per_iteration": 4.174290180206299 }, { "auxiliary_loss_clip": 0.01226278, "auxiliary_loss_mlp": 0.0105651, "balance_loss_clip": 1.06172895, "balance_loss_mlp": 1.03054583, "epoch": 0.04912069742973095, "flos": 18624638937600.0, "grad_norm": 2.2253098946667853, "language_loss": 0.79834002, "learning_rate": 3.996166592984268e-06, "loss": 0.82116789, "num_input_tokens_seen": 17493395, "step": 817, "time_per_iteration": 4.2819907665252686 }, { "auxiliary_loss_clip": 0.01222491, "auxiliary_loss_mlp": 0.01072358, "balance_loss_clip": 1.06228495, "balance_loss_mlp": 1.04563141, "epoch": 0.049180820682398915, "flos": 23699786353920.0, "grad_norm": 1.9292138186207266, "language_loss": 0.8532303, "learning_rate": 3.996142453363656e-06, "loss": 0.8761788, "num_input_tokens_seen": 17514565, "step": 818, "time_per_iteration": 7.687308073043823 }, { "auxiliary_loss_clip": 0.01228571, "auxiliary_loss_mlp": 0.01064433, "balance_loss_clip": 1.06170368, "balance_loss_mlp": 1.0369786, "epoch": 0.04924094393506689, "flos": 22420396753920.0, "grad_norm": 2.1064810754058407, "language_loss": 0.75623614, "learning_rate": 3.996118238049124e-06, "loss": 0.77916616, "num_input_tokens_seen": 17534590, "step": 819, "time_per_iteration": 2.5708072185516357 }, { "auxiliary_loss_clip": 0.01227988, "auxiliary_loss_mlp": 0.010616, "balance_loss_clip": 1.06580663, "balance_loss_mlp": 1.03785336, "epoch": 0.04930106718773486, "flos": 15738246766080.0, "grad_norm": 2.8685299631500487, "language_loss": 0.85082126, "learning_rate": 3.996093947041586e-06, "loss": 0.87371719, "num_input_tokens_seen": 17551900, "step": 820, "time_per_iteration": 2.695204973220825 }, { "auxiliary_loss_clip": 0.01224953, "auxiliary_loss_mlp": 0.01065985, "balance_loss_clip": 1.06082845, "balance_loss_mlp": 1.04037917, "epoch": 0.049361190440402825, "flos": 26250628648320.0, "grad_norm": 1.734636988660555, "language_loss": 0.90459162, "learning_rate": 3.996069580341966e-06, "loss": 0.92750102, "num_input_tokens_seen": 17571485, "step": 821, "time_per_iteration": 2.6284992694854736 }, { "auxiliary_loss_clip": 0.01222526, "auxiliary_loss_mlp": 0.01080357, "balance_loss_clip": 1.06015635, "balance_loss_mlp": 1.05485809, "epoch": 0.0494213136930708, "flos": 21252366293760.0, "grad_norm": 1.7915267676548876, "language_loss": 0.89795959, "learning_rate": 3.996045137951188e-06, "loss": 0.92098844, "num_input_tokens_seen": 17591410, "step": 822, "time_per_iteration": 2.6085855960845947 }, { "auxiliary_loss_clip": 0.0122571, "auxiliary_loss_mlp": 0.01062887, "balance_loss_clip": 1.0639379, "balance_loss_mlp": 1.03472972, "epoch": 0.04948143694573876, "flos": 27965506740480.0, "grad_norm": 2.28747155105076, "language_loss": 0.67558801, "learning_rate": 3.996020619870178e-06, "loss": 0.69847399, "num_input_tokens_seen": 17612010, "step": 823, "time_per_iteration": 2.644277572631836 }, { "auxiliary_loss_clip": 0.01099376, "auxiliary_loss_mlp": 0.0100741, "balance_loss_clip": 1.0267303, "balance_loss_mlp": 1.00266516, "epoch": 0.049541560198406734, "flos": 66180995533440.0, "grad_norm": 1.3456360586087317, "language_loss": 0.62254131, "learning_rate": 3.995996026099866e-06, "loss": 0.64360917, "num_input_tokens_seen": 17673430, "step": 824, "time_per_iteration": 3.230381488800049 }, { "auxiliary_loss_clip": 0.01228758, "auxiliary_loss_mlp": 0.01066541, "balance_loss_clip": 1.06346989, "balance_loss_mlp": 1.03909945, "epoch": 0.049601683451074706, "flos": 22892693708160.0, "grad_norm": 1.8854339538524305, "language_loss": 0.90479428, "learning_rate": 3.995971356641185e-06, "loss": 0.92774737, "num_input_tokens_seen": 17689545, "step": 825, "time_per_iteration": 2.58868670463562 }, { "auxiliary_loss_clip": 0.01227734, "auxiliary_loss_mlp": 0.01066527, "balance_loss_clip": 1.06315517, "balance_loss_mlp": 1.03844118, "epoch": 0.04966180670374267, "flos": 21433643256960.0, "grad_norm": 2.307419213246734, "language_loss": 0.66851091, "learning_rate": 3.9959466114950695e-06, "loss": 0.69145352, "num_input_tokens_seen": 17705965, "step": 826, "time_per_iteration": 2.59468412399292 }, { "auxiliary_loss_clip": 0.01230149, "auxiliary_loss_mlp": 0.01069061, "balance_loss_clip": 1.06421614, "balance_loss_mlp": 1.04216766, "epoch": 0.04972192995641064, "flos": 23107367341440.0, "grad_norm": 1.8316571551414482, "language_loss": 0.78298402, "learning_rate": 3.995921790662459e-06, "loss": 0.80597603, "num_input_tokens_seen": 17724580, "step": 827, "time_per_iteration": 2.7148005962371826 }, { "auxiliary_loss_clip": 0.01230507, "auxiliary_loss_mlp": 0.01079145, "balance_loss_clip": 1.06385946, "balance_loss_mlp": 1.05119085, "epoch": 0.04978205320907861, "flos": 40406147458560.0, "grad_norm": 1.6017511297862308, "language_loss": 0.78696525, "learning_rate": 3.995896894144294e-06, "loss": 0.81006181, "num_input_tokens_seen": 17747755, "step": 828, "time_per_iteration": 2.86991548538208 }, { "auxiliary_loss_clip": 0.0121958, "auxiliary_loss_mlp": 0.01059689, "balance_loss_clip": 1.05939984, "balance_loss_mlp": 1.03390431, "epoch": 0.04984217646174658, "flos": 25228539146880.0, "grad_norm": 2.48577103336206, "language_loss": 0.83530867, "learning_rate": 3.995871921941519e-06, "loss": 0.85810131, "num_input_tokens_seen": 17768550, "step": 829, "time_per_iteration": 2.655895948410034 }, { "auxiliary_loss_clip": 0.01226863, "auxiliary_loss_mlp": 0.01080723, "balance_loss_clip": 1.06109536, "balance_loss_mlp": 1.05068195, "epoch": 0.04990229971441455, "flos": 15959636242560.0, "grad_norm": 2.078538436430036, "language_loss": 0.74857247, "learning_rate": 3.99584687405508e-06, "loss": 0.77164829, "num_input_tokens_seen": 17786080, "step": 830, "time_per_iteration": 2.5820400714874268 }, { "auxiliary_loss_clip": 0.0122584, "auxiliary_loss_mlp": 0.01074077, "balance_loss_clip": 1.06154907, "balance_loss_mlp": 1.04667115, "epoch": 0.04996242296708252, "flos": 18405116968320.0, "grad_norm": 1.8327841960194244, "language_loss": 0.79279459, "learning_rate": 3.995821750485929e-06, "loss": 0.81579381, "num_input_tokens_seen": 17803635, "step": 831, "time_per_iteration": 2.5980231761932373 }, { "auxiliary_loss_clip": 0.01173206, "auxiliary_loss_mlp": 0.01072743, "balance_loss_clip": 1.0542444, "balance_loss_mlp": 1.04725623, "epoch": 0.05002254621975049, "flos": 17858053854720.0, "grad_norm": 3.034319898285603, "language_loss": 0.91497368, "learning_rate": 3.995796551235016e-06, "loss": 0.93743312, "num_input_tokens_seen": 17822190, "step": 832, "time_per_iteration": 2.7498815059661865 }, { "auxiliary_loss_clip": 0.01194428, "auxiliary_loss_mlp": 0.01081719, "balance_loss_clip": 1.05826366, "balance_loss_mlp": 1.05667353, "epoch": 0.050082669472418455, "flos": 45660273367680.0, "grad_norm": 1.887029338258115, "language_loss": 0.83167893, "learning_rate": 3.9957712763032974e-06, "loss": 0.85444039, "num_input_tokens_seen": 17846915, "step": 833, "time_per_iteration": 2.863208770751953 }, { "auxiliary_loss_clip": 0.01199525, "auxiliary_loss_mlp": 0.01061962, "balance_loss_clip": 1.05888343, "balance_loss_mlp": 1.03468657, "epoch": 0.05014279272508643, "flos": 37962067363200.0, "grad_norm": 2.8753922020214033, "language_loss": 0.82409853, "learning_rate": 3.995745925691733e-06, "loss": 0.84671336, "num_input_tokens_seen": 17867270, "step": 834, "time_per_iteration": 2.7868030071258545 }, { "auxiliary_loss_clip": 0.01216246, "auxiliary_loss_mlp": 0.01064427, "balance_loss_clip": 1.06272483, "balance_loss_mlp": 1.03672278, "epoch": 0.0502029159777544, "flos": 20996179516800.0, "grad_norm": 2.2306487397141646, "language_loss": 0.92186153, "learning_rate": 3.995720499401282e-06, "loss": 0.94466823, "num_input_tokens_seen": 17884880, "step": 835, "time_per_iteration": 2.6224496364593506 }, { "auxiliary_loss_clip": 0.01229494, "auxiliary_loss_mlp": 0.01074922, "balance_loss_clip": 1.06143415, "balance_loss_mlp": 1.0464313, "epoch": 0.050263039230422364, "flos": 15888066393600.0, "grad_norm": 2.196832783808158, "language_loss": 0.76143622, "learning_rate": 3.995694997432911e-06, "loss": 0.78448039, "num_input_tokens_seen": 17903695, "step": 836, "time_per_iteration": 2.5648462772369385 }, { "auxiliary_loss_clip": 0.01211162, "auxiliary_loss_mlp": 0.01075977, "balance_loss_clip": 1.06259084, "balance_loss_mlp": 1.04992962, "epoch": 0.050323162483090336, "flos": 23732752060800.0, "grad_norm": 2.100773352560791, "language_loss": 0.83627856, "learning_rate": 3.9956694197875855e-06, "loss": 0.85914999, "num_input_tokens_seen": 17920745, "step": 837, "time_per_iteration": 2.7420156002044678 }, { "auxiliary_loss_clip": 0.01198815, "auxiliary_loss_mlp": 0.0078439, "balance_loss_clip": 1.06345344, "balance_loss_mlp": 1.00053763, "epoch": 0.0503832857357583, "flos": 20266223328000.0, "grad_norm": 2.1353335821274477, "language_loss": 0.72857559, "learning_rate": 3.995643766466275e-06, "loss": 0.7484076, "num_input_tokens_seen": 17938220, "step": 838, "time_per_iteration": 2.679177761077881 }, { "auxiliary_loss_clip": 0.01189223, "auxiliary_loss_mlp": 0.01071526, "balance_loss_clip": 1.05415273, "balance_loss_mlp": 1.04510927, "epoch": 0.05044340898842627, "flos": 17785011548160.0, "grad_norm": 1.8138261016039334, "language_loss": 0.83462799, "learning_rate": 3.995618037469953e-06, "loss": 0.85723549, "num_input_tokens_seen": 17957325, "step": 839, "time_per_iteration": 2.69063663482666 }, { "auxiliary_loss_clip": 0.01220356, "auxiliary_loss_mlp": 0.01069331, "balance_loss_clip": 1.05991399, "balance_loss_mlp": 1.04411805, "epoch": 0.050503532241094246, "flos": 22966526113920.0, "grad_norm": 1.7513762525269907, "language_loss": 0.85775483, "learning_rate": 3.995592232799595e-06, "loss": 0.88065171, "num_input_tokens_seen": 17975875, "step": 840, "time_per_iteration": 2.6477303504943848 }, { "auxiliary_loss_clip": 0.01192112, "auxiliary_loss_mlp": 0.01064377, "balance_loss_clip": 1.05451894, "balance_loss_mlp": 1.036291, "epoch": 0.05056365549376221, "flos": 22776989022720.0, "grad_norm": 1.7956760046069329, "language_loss": 0.9457823, "learning_rate": 3.99556635245618e-06, "loss": 0.96834719, "num_input_tokens_seen": 17994340, "step": 841, "time_per_iteration": 2.8354220390319824 }, { "auxiliary_loss_clip": 0.0122473, "auxiliary_loss_mlp": 0.01070125, "balance_loss_clip": 1.06219172, "balance_loss_mlp": 1.04329097, "epoch": 0.05062377874643018, "flos": 30916968399360.0, "grad_norm": 2.3106044659054104, "language_loss": 0.77566791, "learning_rate": 3.995540396440688e-06, "loss": 0.79861641, "num_input_tokens_seen": 18015260, "step": 842, "time_per_iteration": 2.6909749507904053 }, { "auxiliary_loss_clip": 0.01214637, "auxiliary_loss_mlp": 0.01071033, "balance_loss_clip": 1.06270838, "balance_loss_mlp": 1.04391265, "epoch": 0.05068390199909815, "flos": 19647159402240.0, "grad_norm": 2.8849837971101864, "language_loss": 0.78126526, "learning_rate": 3.995514364754105e-06, "loss": 0.80412203, "num_input_tokens_seen": 18033960, "step": 843, "time_per_iteration": 2.6534156799316406 }, { "auxiliary_loss_clip": 0.01212948, "auxiliary_loss_mlp": 0.01063612, "balance_loss_clip": 1.06317043, "balance_loss_mlp": 1.03894806, "epoch": 0.05074402525176612, "flos": 37962103276800.0, "grad_norm": 1.9320015451631862, "language_loss": 0.83256191, "learning_rate": 3.995488257397417e-06, "loss": 0.85532749, "num_input_tokens_seen": 18056700, "step": 844, "time_per_iteration": 2.7682149410247803 }, { "auxiliary_loss_clip": 0.01216308, "auxiliary_loss_mlp": 0.01067162, "balance_loss_clip": 1.06307864, "balance_loss_mlp": 1.04138875, "epoch": 0.05080414850443409, "flos": 22054610603520.0, "grad_norm": 2.113957107027846, "language_loss": 0.77108061, "learning_rate": 3.995462074371614e-06, "loss": 0.79391527, "num_input_tokens_seen": 18075815, "step": 845, "time_per_iteration": 2.6720399856567383 }, { "auxiliary_loss_clip": 0.01206643, "auxiliary_loss_mlp": 0.01065522, "balance_loss_clip": 1.05881417, "balance_loss_mlp": 1.03885484, "epoch": 0.05086427175710206, "flos": 20225787592320.0, "grad_norm": 1.8497392628450484, "language_loss": 0.87773871, "learning_rate": 3.99543581567769e-06, "loss": 0.90046036, "num_input_tokens_seen": 18095095, "step": 846, "time_per_iteration": 2.696049690246582 }, { "auxiliary_loss_clip": 0.01206291, "auxiliary_loss_mlp": 0.01069231, "balance_loss_clip": 1.06204462, "balance_loss_mlp": 1.04330277, "epoch": 0.05092439500977003, "flos": 15159223526400.0, "grad_norm": 1.695550491545423, "language_loss": 0.87364423, "learning_rate": 3.9954094813166394e-06, "loss": 0.89639944, "num_input_tokens_seen": 18112675, "step": 847, "time_per_iteration": 2.666907548904419 }, { "auxiliary_loss_clip": 0.01175052, "auxiliary_loss_mlp": 0.01071976, "balance_loss_clip": 1.06267309, "balance_loss_mlp": 1.0447005, "epoch": 0.050984518262437994, "flos": 22055149307520.0, "grad_norm": 2.5687168450386637, "language_loss": 0.81878662, "learning_rate": 3.995383071289462e-06, "loss": 0.84125686, "num_input_tokens_seen": 18130745, "step": 848, "time_per_iteration": 2.782135486602783 }, { "auxiliary_loss_clip": 0.0122638, "auxiliary_loss_mlp": 0.01071388, "balance_loss_clip": 1.06619906, "balance_loss_mlp": 1.04544854, "epoch": 0.05104464151510597, "flos": 30225329043840.0, "grad_norm": 1.678404869397893, "language_loss": 0.87187904, "learning_rate": 3.995356585597158e-06, "loss": 0.89485669, "num_input_tokens_seen": 18152410, "step": 849, "time_per_iteration": 2.787992000579834 }, { "auxiliary_loss_clip": 0.01220251, "auxiliary_loss_mlp": 0.0106131, "balance_loss_clip": 1.06049275, "balance_loss_mlp": 1.03545308, "epoch": 0.05110476476777394, "flos": 18332900674560.0, "grad_norm": 2.125711462362114, "language_loss": 0.8315587, "learning_rate": 3.995330024240732e-06, "loss": 0.85437429, "num_input_tokens_seen": 18170870, "step": 850, "time_per_iteration": 2.6548752784729004 }, { "auxiliary_loss_clip": 0.01210598, "auxiliary_loss_mlp": 0.01063491, "balance_loss_clip": 1.06061506, "balance_loss_mlp": 1.0379566, "epoch": 0.051164888020441904, "flos": 37998732170880.0, "grad_norm": 2.2115645013354253, "language_loss": 0.65423882, "learning_rate": 3.995303387221192e-06, "loss": 0.67697972, "num_input_tokens_seen": 18191555, "step": 851, "time_per_iteration": 2.817197322845459 }, { "auxiliary_loss_clip": 0.0120566, "auxiliary_loss_mlp": 0.01075745, "balance_loss_clip": 1.05822444, "balance_loss_mlp": 1.04761147, "epoch": 0.051225011273109876, "flos": 23038634666880.0, "grad_norm": 2.3720786299251073, "language_loss": 0.83587611, "learning_rate": 3.995276674539547e-06, "loss": 0.8586902, "num_input_tokens_seen": 18208620, "step": 852, "time_per_iteration": 2.685727119445801 }, { "auxiliary_loss_clip": 0.01193575, "auxiliary_loss_mlp": 0.01074152, "balance_loss_clip": 1.05924761, "balance_loss_mlp": 1.04737723, "epoch": 0.05128513452577785, "flos": 18259822454400.0, "grad_norm": 2.1832763559951234, "language_loss": 0.80761266, "learning_rate": 3.995249886196811e-06, "loss": 0.8302899, "num_input_tokens_seen": 18226370, "step": 853, "time_per_iteration": 2.6078240871429443 }, { "auxiliary_loss_clip": 0.01222394, "auxiliary_loss_mlp": 0.01065268, "balance_loss_clip": 1.06223083, "balance_loss_mlp": 1.03780222, "epoch": 0.05134525777844581, "flos": 27198957571200.0, "grad_norm": 1.8511550328562763, "language_loss": 0.75617325, "learning_rate": 3.995223022193999e-06, "loss": 0.77904987, "num_input_tokens_seen": 18247075, "step": 854, "time_per_iteration": 2.633543014526367 }, { "auxiliary_loss_clip": 0.01202415, "auxiliary_loss_mlp": 0.01065973, "balance_loss_clip": 1.06141627, "balance_loss_mlp": 1.03828049, "epoch": 0.051405381031113785, "flos": 28362247436160.0, "grad_norm": 2.04057054323539, "language_loss": 0.81722355, "learning_rate": 3.99519608253213e-06, "loss": 0.83990741, "num_input_tokens_seen": 18265680, "step": 855, "time_per_iteration": 2.760880708694458 }, { "auxiliary_loss_clip": 0.01076712, "auxiliary_loss_mlp": 0.00762392, "balance_loss_clip": 1.0358243, "balance_loss_mlp": 1.00074518, "epoch": 0.05146550428378175, "flos": 65618169327360.0, "grad_norm": 0.9894594919315515, "language_loss": 0.65634769, "learning_rate": 3.995169067212227e-06, "loss": 0.67473871, "num_input_tokens_seen": 18327015, "step": 856, "time_per_iteration": 6.271182298660278 }, { "auxiliary_loss_clip": 0.01194232, "auxiliary_loss_mlp": 0.01056626, "balance_loss_clip": 1.05972147, "balance_loss_mlp": 1.02994716, "epoch": 0.05152562753644972, "flos": 22054861998720.0, "grad_norm": 1.8001295724347575, "language_loss": 0.77139348, "learning_rate": 3.9951419762353116e-06, "loss": 0.79390204, "num_input_tokens_seen": 18345235, "step": 857, "time_per_iteration": 4.905239582061768 }, { "auxiliary_loss_clip": 0.01183581, "auxiliary_loss_mlp": 0.01059685, "balance_loss_clip": 1.05640614, "balance_loss_mlp": 1.03291047, "epoch": 0.051585750789117694, "flos": 18509544783360.0, "grad_norm": 2.111656321737554, "language_loss": 0.89194518, "learning_rate": 3.995114809602412e-06, "loss": 0.91437781, "num_input_tokens_seen": 18362350, "step": 858, "time_per_iteration": 2.7349045276641846 }, { "auxiliary_loss_clip": 0.01196113, "auxiliary_loss_mlp": 0.01060739, "balance_loss_clip": 1.06114125, "balance_loss_mlp": 1.03398848, "epoch": 0.05164587404178566, "flos": 23730238108800.0, "grad_norm": 2.030377637624243, "language_loss": 0.75684321, "learning_rate": 3.9950875673145605e-06, "loss": 0.77941179, "num_input_tokens_seen": 18383390, "step": 859, "time_per_iteration": 2.7611751556396484 }, { "auxiliary_loss_clip": 0.01186313, "auxiliary_loss_mlp": 0.0107269, "balance_loss_clip": 1.05708003, "balance_loss_mlp": 1.04354358, "epoch": 0.05170599729445363, "flos": 16252882876800.0, "grad_norm": 2.134655488493178, "language_loss": 0.91122925, "learning_rate": 3.995060249372788e-06, "loss": 0.93381929, "num_input_tokens_seen": 18399220, "step": 860, "time_per_iteration": 2.666740894317627 }, { "auxiliary_loss_clip": 0.0122488, "auxiliary_loss_mlp": 0.01060586, "balance_loss_clip": 1.06531346, "balance_loss_mlp": 1.03536153, "epoch": 0.0517661205471216, "flos": 23985922095360.0, "grad_norm": 1.7954568874114027, "language_loss": 0.82378531, "learning_rate": 3.99503285577813e-06, "loss": 0.84663993, "num_input_tokens_seen": 18419005, "step": 861, "time_per_iteration": 2.6337814331054688 }, { "auxiliary_loss_clip": 0.01198486, "auxiliary_loss_mlp": 0.01060236, "balance_loss_clip": 1.06147969, "balance_loss_mlp": 1.03437924, "epoch": 0.05182624379978957, "flos": 29277718392960.0, "grad_norm": 2.5785699637959776, "language_loss": 0.78664875, "learning_rate": 3.995005386531627e-06, "loss": 0.80923599, "num_input_tokens_seen": 18440550, "step": 862, "time_per_iteration": 2.7570109367370605 }, { "auxiliary_loss_clip": 0.01189664, "auxiliary_loss_mlp": 0.01070327, "balance_loss_clip": 1.058797, "balance_loss_mlp": 1.04547238, "epoch": 0.05188636705245754, "flos": 24170826332160.0, "grad_norm": 1.7880881456146414, "language_loss": 0.89090264, "learning_rate": 3.9949778416343195e-06, "loss": 0.91350257, "num_input_tokens_seen": 18461950, "step": 863, "time_per_iteration": 2.7118866443634033 }, { "auxiliary_loss_clip": 0.01201772, "auxiliary_loss_mlp": 0.01064316, "balance_loss_clip": 1.06488204, "balance_loss_mlp": 1.0369451, "epoch": 0.051946490305125506, "flos": 26760703731840.0, "grad_norm": 2.081656150811602, "language_loss": 0.76119763, "learning_rate": 3.9949502210872525e-06, "loss": 0.78385854, "num_input_tokens_seen": 18480555, "step": 864, "time_per_iteration": 2.6946637630462646 }, { "auxiliary_loss_clip": 0.01186585, "auxiliary_loss_mlp": 0.01067959, "balance_loss_clip": 1.05559874, "balance_loss_mlp": 1.04046965, "epoch": 0.05200661355779348, "flos": 21502519585920.0, "grad_norm": 1.9374308734697678, "language_loss": 0.7908361, "learning_rate": 3.994922524891474e-06, "loss": 0.81338149, "num_input_tokens_seen": 18499645, "step": 865, "time_per_iteration": 2.7700579166412354 }, { "auxiliary_loss_clip": 0.01210067, "auxiliary_loss_mlp": 0.01067568, "balance_loss_clip": 1.06164694, "balance_loss_mlp": 1.04152083, "epoch": 0.05206673681046144, "flos": 18114492026880.0, "grad_norm": 2.269489500676155, "language_loss": 0.85860598, "learning_rate": 3.994894753048032e-06, "loss": 0.88138229, "num_input_tokens_seen": 18516810, "step": 866, "time_per_iteration": 2.659614086151123 }, { "auxiliary_loss_clip": 0.01186536, "auxiliary_loss_mlp": 0.01070465, "balance_loss_clip": 1.06327558, "balance_loss_mlp": 1.04371393, "epoch": 0.052126860063129415, "flos": 17524191916800.0, "grad_norm": 2.1733876112564565, "language_loss": 0.87495244, "learning_rate": 3.9948669055579815e-06, "loss": 0.89752245, "num_input_tokens_seen": 18532510, "step": 867, "time_per_iteration": 2.740238904953003 }, { "auxiliary_loss_clip": 0.01167585, "auxiliary_loss_mlp": 0.01078445, "balance_loss_clip": 1.05696058, "balance_loss_mlp": 1.05437636, "epoch": 0.05218698331579739, "flos": 32598054771840.0, "grad_norm": 1.8498678854952728, "language_loss": 0.63917863, "learning_rate": 3.9948389824223785e-06, "loss": 0.66163892, "num_input_tokens_seen": 18557380, "step": 868, "time_per_iteration": 2.9310383796691895 }, { "auxiliary_loss_clip": 0.01225135, "auxiliary_loss_mlp": 0.01069894, "balance_loss_clip": 1.06287289, "balance_loss_mlp": 1.04173636, "epoch": 0.05224710656846535, "flos": 22127293774080.0, "grad_norm": 2.742912036955754, "language_loss": 0.83379138, "learning_rate": 3.994810983642281e-06, "loss": 0.85674161, "num_input_tokens_seen": 18575720, "step": 869, "time_per_iteration": 2.6453137397766113 }, { "auxiliary_loss_clip": 0.01216406, "auxiliary_loss_mlp": 0.01056401, "balance_loss_clip": 1.0645746, "balance_loss_mlp": 1.03053236, "epoch": 0.052307229821133325, "flos": 11145092976000.0, "grad_norm": 2.188953802542244, "language_loss": 0.87822217, "learning_rate": 3.994782909218751e-06, "loss": 0.90095031, "num_input_tokens_seen": 18592185, "step": 870, "time_per_iteration": 2.7044875621795654 }, { "auxiliary_loss_clip": 0.01226316, "auxiliary_loss_mlp": 0.01064746, "balance_loss_clip": 1.06603277, "balance_loss_mlp": 1.03965199, "epoch": 0.05236735307380129, "flos": 19128070005120.0, "grad_norm": 1.975067156516721, "language_loss": 0.80651748, "learning_rate": 3.994754759152854e-06, "loss": 0.82942802, "num_input_tokens_seen": 18609560, "step": 871, "time_per_iteration": 2.6892175674438477 }, { "auxiliary_loss_clip": 0.0119502, "auxiliary_loss_mlp": 0.01064309, "balance_loss_clip": 1.0650804, "balance_loss_mlp": 1.0396452, "epoch": 0.05242747632646926, "flos": 20960663944320.0, "grad_norm": 1.7402390708810018, "language_loss": 0.81330585, "learning_rate": 3.994726533445656e-06, "loss": 0.83589917, "num_input_tokens_seen": 18629405, "step": 872, "time_per_iteration": 2.8044185638427734 }, { "auxiliary_loss_clip": 0.0107835, "auxiliary_loss_mlp": 0.01020667, "balance_loss_clip": 1.03168392, "balance_loss_mlp": 1.01515913, "epoch": 0.052487599579137234, "flos": 65020542842880.0, "grad_norm": 0.883483589670371, "language_loss": 0.61589074, "learning_rate": 3.9946982320982274e-06, "loss": 0.63688087, "num_input_tokens_seen": 18681480, "step": 873, "time_per_iteration": 3.1711297035217285 }, { "auxiliary_loss_clip": 0.01197438, "auxiliary_loss_mlp": 0.01056818, "balance_loss_clip": 1.06202292, "balance_loss_mlp": 1.03120041, "epoch": 0.0525477228318052, "flos": 23288859786240.0, "grad_norm": 2.1995328011281488, "language_loss": 0.88965189, "learning_rate": 3.994669855111643e-06, "loss": 0.91219449, "num_input_tokens_seen": 18700390, "step": 874, "time_per_iteration": 2.8240153789520264 }, { "auxiliary_loss_clip": 0.01197247, "auxiliary_loss_mlp": 0.01063458, "balance_loss_clip": 1.0614326, "balance_loss_mlp": 1.03682709, "epoch": 0.05260784608447317, "flos": 32230221546240.0, "grad_norm": 1.858649685360537, "language_loss": 0.74537963, "learning_rate": 3.994641402486977e-06, "loss": 0.76798666, "num_input_tokens_seen": 18721280, "step": 875, "time_per_iteration": 2.9111931324005127 }, { "auxiliary_loss_clip": 0.01206205, "auxiliary_loss_mlp": 0.01058912, "balance_loss_clip": 1.06306934, "balance_loss_mlp": 1.03210175, "epoch": 0.052667969337141136, "flos": 24463211040000.0, "grad_norm": 1.7697857141051123, "language_loss": 0.92843151, "learning_rate": 3.99461287422531e-06, "loss": 0.95108265, "num_input_tokens_seen": 18741545, "step": 876, "time_per_iteration": 2.800252676010132 }, { "auxiliary_loss_clip": 0.01100151, "auxiliary_loss_mlp": 0.01006341, "balance_loss_clip": 1.02669787, "balance_loss_mlp": 1.0020256, "epoch": 0.05272809258980911, "flos": 57784329567360.0, "grad_norm": 0.8383495859932864, "language_loss": 0.62929404, "learning_rate": 3.994584270327722e-06, "loss": 0.65035897, "num_input_tokens_seen": 18801400, "step": 877, "time_per_iteration": 3.2090368270874023 }, { "auxiliary_loss_clip": 0.01200578, "auxiliary_loss_mlp": 0.0106702, "balance_loss_clip": 1.06150424, "balance_loss_mlp": 1.03931606, "epoch": 0.05278821584247708, "flos": 17420805596160.0, "grad_norm": 2.042786693643985, "language_loss": 0.85383844, "learning_rate": 3.994555590795299e-06, "loss": 0.87651443, "num_input_tokens_seen": 18819670, "step": 878, "time_per_iteration": 2.823835849761963 }, { "auxiliary_loss_clip": 0.0122514, "auxiliary_loss_mlp": 0.01061117, "balance_loss_clip": 1.0635035, "balance_loss_mlp": 1.03551078, "epoch": 0.052848339095145046, "flos": 26137258346880.0, "grad_norm": 1.7462717669338121, "language_loss": 0.83076209, "learning_rate": 3.9945268356291275e-06, "loss": 0.8536247, "num_input_tokens_seen": 18840580, "step": 879, "time_per_iteration": 2.743673086166382 }, { "auxiliary_loss_clip": 0.0119139, "auxiliary_loss_mlp": 0.01066471, "balance_loss_clip": 1.06152987, "balance_loss_mlp": 1.04013824, "epoch": 0.05290846234781302, "flos": 16472081623680.0, "grad_norm": 1.9601789563010765, "language_loss": 0.84284604, "learning_rate": 3.9944980048302985e-06, "loss": 0.86542469, "num_input_tokens_seen": 18859295, "step": 880, "time_per_iteration": 2.7560529708862305 }, { "auxiliary_loss_clip": 0.01184956, "auxiliary_loss_mlp": 0.01065063, "balance_loss_clip": 1.05969453, "balance_loss_mlp": 1.03887296, "epoch": 0.05296858560048098, "flos": 19865173000320.0, "grad_norm": 2.4477328752698564, "language_loss": 0.86870736, "learning_rate": 3.994469098399906e-06, "loss": 0.89120758, "num_input_tokens_seen": 18877485, "step": 881, "time_per_iteration": 2.855395555496216 }, { "auxiliary_loss_clip": 0.01207858, "auxiliary_loss_mlp": 0.01070235, "balance_loss_clip": 1.05984437, "balance_loss_mlp": 1.04238808, "epoch": 0.053028708853148955, "flos": 24388588535040.0, "grad_norm": 1.7611192020675561, "language_loss": 0.87967896, "learning_rate": 3.994440116339046e-06, "loss": 0.90245986, "num_input_tokens_seen": 18898275, "step": 882, "time_per_iteration": 2.8480119705200195 }, { "auxiliary_loss_clip": 0.01224906, "auxiliary_loss_mlp": 0.01057944, "balance_loss_clip": 1.06268644, "balance_loss_mlp": 1.03059733, "epoch": 0.05308883210581693, "flos": 36393166143360.0, "grad_norm": 2.3555018967788635, "language_loss": 0.69469339, "learning_rate": 3.994411058648816e-06, "loss": 0.71752191, "num_input_tokens_seen": 18920665, "step": 883, "time_per_iteration": 2.8808236122131348 }, { "auxiliary_loss_clip": 0.01166777, "auxiliary_loss_mlp": 0.01063991, "balance_loss_clip": 1.05333591, "balance_loss_mlp": 1.03855157, "epoch": 0.05314895535848489, "flos": 22855095146880.0, "grad_norm": 2.039016812023355, "language_loss": 0.76100993, "learning_rate": 3.994381925330319e-06, "loss": 0.78331757, "num_input_tokens_seen": 18939835, "step": 884, "time_per_iteration": 2.8462212085723877 }, { "auxiliary_loss_clip": 0.01172569, "auxiliary_loss_mlp": 0.01066856, "balance_loss_clip": 1.06269383, "balance_loss_mlp": 1.04147613, "epoch": 0.053209078611152864, "flos": 12860330204160.0, "grad_norm": 1.9865896222141148, "language_loss": 0.86195529, "learning_rate": 3.994352716384659e-06, "loss": 0.88434947, "num_input_tokens_seen": 18958405, "step": 885, "time_per_iteration": 2.7825753688812256 }, { "auxiliary_loss_clip": 0.0118405, "auxiliary_loss_mlp": 0.01068976, "balance_loss_clip": 1.05229151, "balance_loss_mlp": 1.04203486, "epoch": 0.05326920186382083, "flos": 12164596698240.0, "grad_norm": 2.608647457747672, "language_loss": 0.85971159, "learning_rate": 3.994323431812945e-06, "loss": 0.88224185, "num_input_tokens_seen": 18975445, "step": 886, "time_per_iteration": 2.7393639087677 }, { "auxiliary_loss_clip": 0.0117343, "auxiliary_loss_mlp": 0.01065966, "balance_loss_clip": 1.05620933, "balance_loss_mlp": 1.03879774, "epoch": 0.0533293251164888, "flos": 22704485420160.0, "grad_norm": 2.040002880698432, "language_loss": 0.8961553, "learning_rate": 3.994294071616286e-06, "loss": 0.91854936, "num_input_tokens_seen": 18991930, "step": 887, "time_per_iteration": 2.8606581687927246 }, { "auxiliary_loss_clip": 0.01144444, "auxiliary_loss_mlp": 0.01072438, "balance_loss_clip": 1.04453194, "balance_loss_mlp": 1.04411352, "epoch": 0.053389448369156774, "flos": 26940939200640.0, "grad_norm": 2.062562868466936, "language_loss": 0.74852538, "learning_rate": 3.994264635795796e-06, "loss": 0.77069414, "num_input_tokens_seen": 19009790, "step": 888, "time_per_iteration": 2.8675312995910645 }, { "auxiliary_loss_clip": 0.01164085, "auxiliary_loss_mlp": 0.01072324, "balance_loss_clip": 1.05659473, "balance_loss_mlp": 1.04525173, "epoch": 0.05344957162182474, "flos": 25556331686400.0, "grad_norm": 1.7884280759117637, "language_loss": 0.88440782, "learning_rate": 3.994235124352592e-06, "loss": 0.9067719, "num_input_tokens_seen": 19030170, "step": 889, "time_per_iteration": 2.9419636726379395 }, { "auxiliary_loss_clip": 0.0121577, "auxiliary_loss_mlp": 0.0105125, "balance_loss_clip": 1.06085157, "balance_loss_mlp": 1.02607334, "epoch": 0.05350969487449271, "flos": 19719591177600.0, "grad_norm": 1.9333059575084248, "language_loss": 0.88386381, "learning_rate": 3.994205537287791e-06, "loss": 0.90653402, "num_input_tokens_seen": 19048075, "step": 890, "time_per_iteration": 2.7030327320098877 }, { "auxiliary_loss_clip": 0.01195034, "auxiliary_loss_mlp": 0.01069003, "balance_loss_clip": 1.05835462, "balance_loss_mlp": 1.04450595, "epoch": 0.053569818127160676, "flos": 27016351804800.0, "grad_norm": 2.435204176890571, "language_loss": 0.93450797, "learning_rate": 3.994175874602517e-06, "loss": 0.95714831, "num_input_tokens_seen": 19067465, "step": 891, "time_per_iteration": 2.81527042388916 }, { "auxiliary_loss_clip": 0.01190797, "auxiliary_loss_mlp": 0.01066955, "balance_loss_clip": 1.05605483, "balance_loss_mlp": 1.03909576, "epoch": 0.05362994137982865, "flos": 13188338225280.0, "grad_norm": 2.3400199158693087, "language_loss": 0.71625131, "learning_rate": 3.994146136297893e-06, "loss": 0.73882878, "num_input_tokens_seen": 19085505, "step": 892, "time_per_iteration": 2.825984239578247 }, { "auxiliary_loss_clip": 0.01191313, "auxiliary_loss_mlp": 0.0078394, "balance_loss_clip": 1.05727172, "balance_loss_mlp": 1.00024366, "epoch": 0.05369006463249662, "flos": 28658008022400.0, "grad_norm": 1.6058100223173828, "language_loss": 0.82331586, "learning_rate": 3.994116322375049e-06, "loss": 0.84306836, "num_input_tokens_seen": 19104360, "step": 893, "time_per_iteration": 2.8618266582489014 }, { "auxiliary_loss_clip": 0.01192677, "auxiliary_loss_mlp": 0.01063531, "balance_loss_clip": 1.0572021, "balance_loss_mlp": 1.03850877, "epoch": 0.053750187885164585, "flos": 28913153304960.0, "grad_norm": 2.0228714136718122, "language_loss": 0.82052565, "learning_rate": 3.994086432835114e-06, "loss": 0.84308773, "num_input_tokens_seen": 19124680, "step": 894, "time_per_iteration": 2.8347885608673096 }, { "auxiliary_loss_clip": 0.0120111, "auxiliary_loss_mlp": 0.01065233, "balance_loss_clip": 1.0570271, "balance_loss_mlp": 1.03997254, "epoch": 0.05381031113783256, "flos": 15158828476800.0, "grad_norm": 2.260594705980758, "language_loss": 0.76133072, "learning_rate": 3.994056467679221e-06, "loss": 0.78399414, "num_input_tokens_seen": 19142895, "step": 895, "time_per_iteration": 2.7288858890533447 }, { "auxiliary_loss_clip": 0.01200143, "auxiliary_loss_mlp": 0.01060588, "balance_loss_clip": 1.06422663, "balance_loss_mlp": 1.03547084, "epoch": 0.05387043439050053, "flos": 21835232288640.0, "grad_norm": 2.0450623179174974, "language_loss": 0.86767507, "learning_rate": 3.9940264269085065e-06, "loss": 0.89028239, "num_input_tokens_seen": 19163125, "step": 896, "time_per_iteration": 4.404265642166138 }, { "auxiliary_loss_clip": 0.0122203, "auxiliary_loss_mlp": 0.00782931, "balance_loss_clip": 1.06062579, "balance_loss_mlp": 1.0002867, "epoch": 0.053930557643168495, "flos": 17310308382720.0, "grad_norm": 3.0866230440609805, "language_loss": 0.8797363, "learning_rate": 3.9939963105241115e-06, "loss": 0.89978594, "num_input_tokens_seen": 19179385, "step": 897, "time_per_iteration": 4.843130588531494 }, { "auxiliary_loss_clip": 0.01201639, "auxiliary_loss_mlp": 0.01063724, "balance_loss_clip": 1.05896854, "balance_loss_mlp": 1.03658032, "epoch": 0.05399068089583647, "flos": 17348481561600.0, "grad_norm": 1.8270040910241792, "language_loss": 0.90170419, "learning_rate": 3.993966118527175e-06, "loss": 0.92435783, "num_input_tokens_seen": 19198725, "step": 898, "time_per_iteration": 2.695235252380371 }, { "auxiliary_loss_clip": 0.01200189, "auxiliary_loss_mlp": 0.01076438, "balance_loss_clip": 1.05787873, "balance_loss_mlp": 1.05105805, "epoch": 0.05405080414850443, "flos": 17486952491520.0, "grad_norm": 2.793625116693953, "language_loss": 0.91544139, "learning_rate": 3.993935850918845e-06, "loss": 0.93820769, "num_input_tokens_seen": 19212380, "step": 899, "time_per_iteration": 2.7509548664093018 }, { "auxiliary_loss_clip": 0.01186479, "auxiliary_loss_mlp": 0.01068594, "balance_loss_clip": 1.05614042, "balance_loss_mlp": 1.04154527, "epoch": 0.054110927401172404, "flos": 24496787278080.0, "grad_norm": 1.983572968760697, "language_loss": 0.75742769, "learning_rate": 3.9939055077002665e-06, "loss": 0.77997845, "num_input_tokens_seen": 19232235, "step": 900, "time_per_iteration": 2.771371364593506 }, { "auxiliary_loss_clip": 0.01211506, "auxiliary_loss_mlp": 0.01058176, "balance_loss_clip": 1.05839145, "balance_loss_mlp": 1.03401244, "epoch": 0.054171050653840376, "flos": 22930040874240.0, "grad_norm": 2.192527627735503, "language_loss": 0.74331856, "learning_rate": 3.993875088872592e-06, "loss": 0.76601535, "num_input_tokens_seen": 19251460, "step": 901, "time_per_iteration": 2.859912157058716 }, { "auxiliary_loss_clip": 0.01177502, "auxiliary_loss_mlp": 0.01065445, "balance_loss_clip": 1.0569309, "balance_loss_mlp": 1.04166329, "epoch": 0.05423117390650834, "flos": 12933192942720.0, "grad_norm": 2.352700712836257, "language_loss": 0.85287452, "learning_rate": 3.9938445944369745e-06, "loss": 0.87530404, "num_input_tokens_seen": 19269060, "step": 902, "time_per_iteration": 2.7940642833709717 }, { "auxiliary_loss_clip": 0.01161069, "auxiliary_loss_mlp": 0.01066664, "balance_loss_clip": 1.04903233, "balance_loss_mlp": 1.04112983, "epoch": 0.05429129715917631, "flos": 19901335017600.0, "grad_norm": 1.9620711230312637, "language_loss": 0.86385572, "learning_rate": 3.993814024394569e-06, "loss": 0.88613302, "num_input_tokens_seen": 19288620, "step": 903, "time_per_iteration": 2.9258980751037598 }, { "auxiliary_loss_clip": 0.0121005, "auxiliary_loss_mlp": 0.01059616, "balance_loss_clip": 1.06094384, "balance_loss_mlp": 1.03534508, "epoch": 0.05435142041184428, "flos": 16908611610240.0, "grad_norm": 2.175127974944855, "language_loss": 0.74927866, "learning_rate": 3.993783378746537e-06, "loss": 0.7719754, "num_input_tokens_seen": 19306615, "step": 904, "time_per_iteration": 2.7239954471588135 }, { "auxiliary_loss_clip": 0.01208402, "auxiliary_loss_mlp": 0.01067543, "balance_loss_clip": 1.06052148, "balance_loss_mlp": 1.04325962, "epoch": 0.05441154366451225, "flos": 23948323534080.0, "grad_norm": 2.5191963984804535, "language_loss": 0.85946918, "learning_rate": 3.993752657494039e-06, "loss": 0.88222867, "num_input_tokens_seen": 19321680, "step": 905, "time_per_iteration": 2.693896532058716 }, { "auxiliary_loss_clip": 0.01198232, "auxiliary_loss_mlp": 0.01078072, "balance_loss_clip": 1.06483209, "balance_loss_mlp": 1.05400348, "epoch": 0.05447166691718022, "flos": 19975382904960.0, "grad_norm": 1.7753581401878566, "language_loss": 0.74413162, "learning_rate": 3.993721860638241e-06, "loss": 0.7668947, "num_input_tokens_seen": 19339760, "step": 906, "time_per_iteration": 2.6679019927978516 }, { "auxiliary_loss_clip": 0.01192373, "auxiliary_loss_mlp": 0.01064381, "balance_loss_clip": 1.05954027, "balance_loss_mlp": 1.0397284, "epoch": 0.05453179016984819, "flos": 24936513575040.0, "grad_norm": 2.3037248114268896, "language_loss": 0.87340188, "learning_rate": 3.993690988180309e-06, "loss": 0.89596951, "num_input_tokens_seen": 19359585, "step": 907, "time_per_iteration": 2.7363240718841553 }, { "auxiliary_loss_clip": 0.01205519, "auxiliary_loss_mlp": 0.01068463, "balance_loss_clip": 1.0616293, "balance_loss_mlp": 1.04332149, "epoch": 0.05459191342251616, "flos": 18115102558080.0, "grad_norm": 1.6666873589767146, "language_loss": 0.86928803, "learning_rate": 3.9936600401214165e-06, "loss": 0.89202785, "num_input_tokens_seen": 19378590, "step": 908, "time_per_iteration": 2.6266026496887207 }, { "auxiliary_loss_clip": 0.01198848, "auxiliary_loss_mlp": 0.01067336, "balance_loss_clip": 1.05974221, "balance_loss_mlp": 1.04107404, "epoch": 0.054652036675184125, "flos": 19208295031680.0, "grad_norm": 2.1282794409977215, "language_loss": 0.89792144, "learning_rate": 3.9936290164627345e-06, "loss": 0.92058325, "num_input_tokens_seen": 19397910, "step": 909, "time_per_iteration": 2.7163166999816895 }, { "auxiliary_loss_clip": 0.01200393, "auxiliary_loss_mlp": 0.01073374, "balance_loss_clip": 1.06157839, "balance_loss_mlp": 1.04742169, "epoch": 0.0547121599278521, "flos": 16325745615360.0, "grad_norm": 2.095924869989121, "language_loss": 0.70949811, "learning_rate": 3.99359791720544e-06, "loss": 0.73223579, "num_input_tokens_seen": 19415950, "step": 910, "time_per_iteration": 2.6697354316711426 }, { "auxiliary_loss_clip": 0.01187784, "auxiliary_loss_mlp": 0.01054671, "balance_loss_clip": 1.05651259, "balance_loss_mlp": 1.02975583, "epoch": 0.05477228318052007, "flos": 20339014239360.0, "grad_norm": 1.6633724338567386, "language_loss": 0.83651805, "learning_rate": 3.993566742350714e-06, "loss": 0.85894263, "num_input_tokens_seen": 19435275, "step": 911, "time_per_iteration": 2.692798137664795 }, { "auxiliary_loss_clip": 0.01187113, "auxiliary_loss_mlp": 0.01073028, "balance_loss_clip": 1.05334687, "balance_loss_mlp": 1.04719508, "epoch": 0.054832406433188034, "flos": 21973092687360.0, "grad_norm": 2.283907419545301, "language_loss": 0.76320881, "learning_rate": 3.993535491899736e-06, "loss": 0.78581023, "num_input_tokens_seen": 19452090, "step": 912, "time_per_iteration": 2.6653189659118652 }, { "auxiliary_loss_clip": 0.01186313, "auxiliary_loss_mlp": 0.01051652, "balance_loss_clip": 1.05707574, "balance_loss_mlp": 1.0271548, "epoch": 0.054892529685856006, "flos": 16398931576320.0, "grad_norm": 2.366460016615147, "language_loss": 0.82826668, "learning_rate": 3.993504165853694e-06, "loss": 0.85064626, "num_input_tokens_seen": 19470865, "step": 913, "time_per_iteration": 2.6826348304748535 }, { "auxiliary_loss_clip": 0.01194515, "auxiliary_loss_mlp": 0.01060483, "balance_loss_clip": 1.0581125, "balance_loss_mlp": 1.03651023, "epoch": 0.05495265293852397, "flos": 23912341084800.0, "grad_norm": 3.3338391252510586, "language_loss": 0.8373239, "learning_rate": 3.993472764213772e-06, "loss": 0.85987389, "num_input_tokens_seen": 19492145, "step": 914, "time_per_iteration": 2.7358829975128174 }, { "auxiliary_loss_clip": 0.0120705, "auxiliary_loss_mlp": 0.0078227, "balance_loss_clip": 1.06039774, "balance_loss_mlp": 1.00027478, "epoch": 0.055012776191191944, "flos": 23586954756480.0, "grad_norm": 2.520244909384168, "language_loss": 0.90146536, "learning_rate": 3.9934412869811655e-06, "loss": 0.92135859, "num_input_tokens_seen": 19511015, "step": 915, "time_per_iteration": 2.9398341178894043 }, { "auxiliary_loss_clip": 0.01201461, "auxiliary_loss_mlp": 0.01059252, "balance_loss_clip": 1.06274199, "balance_loss_mlp": 1.03558862, "epoch": 0.055072899443859916, "flos": 17528501548800.0, "grad_norm": 2.182721785653499, "language_loss": 0.89710975, "learning_rate": 3.993409734157064e-06, "loss": 0.91971689, "num_input_tokens_seen": 19529040, "step": 916, "time_per_iteration": 2.7210159301757812 }, { "auxiliary_loss_clip": 0.01175226, "auxiliary_loss_mlp": 0.01066073, "balance_loss_clip": 1.05741024, "balance_loss_mlp": 1.04103947, "epoch": 0.05513302269652788, "flos": 21687172427520.0, "grad_norm": 1.7899379897310368, "language_loss": 0.8016991, "learning_rate": 3.993378105742666e-06, "loss": 0.82411212, "num_input_tokens_seen": 19549540, "step": 917, "time_per_iteration": 2.7923104763031006 }, { "auxiliary_loss_clip": 0.01139072, "auxiliary_loss_mlp": 0.0105947, "balance_loss_clip": 1.05135942, "balance_loss_mlp": 1.03414989, "epoch": 0.05519314594919585, "flos": 21613340021760.0, "grad_norm": 2.106744179667805, "language_loss": 0.79437333, "learning_rate": 3.9933464017391705e-06, "loss": 0.81635869, "num_input_tokens_seen": 19567570, "step": 918, "time_per_iteration": 2.8051092624664307 }, { "auxiliary_loss_clip": 0.01196947, "auxiliary_loss_mlp": 0.01055679, "balance_loss_clip": 1.05616307, "balance_loss_mlp": 1.03166997, "epoch": 0.05525326920186382, "flos": 21798567480960.0, "grad_norm": 2.454030193031321, "language_loss": 0.89019686, "learning_rate": 3.99331462214778e-06, "loss": 0.91272312, "num_input_tokens_seen": 19585330, "step": 919, "time_per_iteration": 2.6846773624420166 }, { "auxiliary_loss_clip": 0.01213326, "auxiliary_loss_mlp": 0.01069349, "balance_loss_clip": 1.05950904, "balance_loss_mlp": 1.04417229, "epoch": 0.05531339245453179, "flos": 28439635288320.0, "grad_norm": 2.246354931091656, "language_loss": 0.8746047, "learning_rate": 3.993282766969699e-06, "loss": 0.89743137, "num_input_tokens_seen": 19604970, "step": 920, "time_per_iteration": 2.6699845790863037 }, { "auxiliary_loss_clip": 0.01190424, "auxiliary_loss_mlp": 0.0106036, "balance_loss_clip": 1.06023288, "balance_loss_mlp": 1.03657782, "epoch": 0.05537351570719976, "flos": 37375143131520.0, "grad_norm": 1.975714125194334, "language_loss": 0.6568011, "learning_rate": 3.993250836206136e-06, "loss": 0.67930895, "num_input_tokens_seen": 19626235, "step": 921, "time_per_iteration": 2.833644390106201 }, { "auxiliary_loss_clip": 0.01209678, "auxiliary_loss_mlp": 0.01065483, "balance_loss_clip": 1.06060767, "balance_loss_mlp": 1.03874445, "epoch": 0.05543363895986773, "flos": 20084479488000.0, "grad_norm": 1.7242493696651606, "language_loss": 0.71861136, "learning_rate": 3.993218829858301e-06, "loss": 0.74136293, "num_input_tokens_seen": 19644305, "step": 922, "time_per_iteration": 2.6168808937072754 }, { "auxiliary_loss_clip": 0.01187138, "auxiliary_loss_mlp": 0.01067213, "balance_loss_clip": 1.05423355, "balance_loss_mlp": 1.04223895, "epoch": 0.0554937622125357, "flos": 24533200690560.0, "grad_norm": 2.6848185900705412, "language_loss": 0.82304025, "learning_rate": 3.993186747927408e-06, "loss": 0.8455838, "num_input_tokens_seen": 19662130, "step": 923, "time_per_iteration": 2.7298316955566406 }, { "auxiliary_loss_clip": 0.01202941, "auxiliary_loss_mlp": 0.01064106, "balance_loss_clip": 1.05725455, "balance_loss_mlp": 1.03933442, "epoch": 0.055553885465203665, "flos": 14320063013760.0, "grad_norm": 1.9334372940525173, "language_loss": 0.78759122, "learning_rate": 3.993154590414675e-06, "loss": 0.81026167, "num_input_tokens_seen": 19680715, "step": 924, "time_per_iteration": 2.6869630813598633 }, { "auxiliary_loss_clip": 0.0116422, "auxiliary_loss_mlp": 0.01053758, "balance_loss_clip": 1.05395627, "balance_loss_mlp": 1.02844954, "epoch": 0.05561400871787164, "flos": 27381132374400.0, "grad_norm": 2.005203138116014, "language_loss": 1.02005315, "learning_rate": 3.993122357321319e-06, "loss": 1.04223299, "num_input_tokens_seen": 19700535, "step": 925, "time_per_iteration": 2.716089963912964 }, { "auxiliary_loss_clip": 0.01163201, "auxiliary_loss_mlp": 0.01052104, "balance_loss_clip": 1.05070591, "balance_loss_mlp": 1.02739179, "epoch": 0.05567413197053961, "flos": 23221096778880.0, "grad_norm": 2.0106641835017482, "language_loss": 0.80939209, "learning_rate": 3.993090048648564e-06, "loss": 0.83154511, "num_input_tokens_seen": 19718825, "step": 926, "time_per_iteration": 2.895803451538086 }, { "auxiliary_loss_clip": 0.01207515, "auxiliary_loss_mlp": 0.01068168, "balance_loss_clip": 1.05892682, "balance_loss_mlp": 1.0419066, "epoch": 0.055734255223207574, "flos": 25264952559360.0, "grad_norm": 2.9732625845644045, "language_loss": 0.73220479, "learning_rate": 3.993057664397634e-06, "loss": 0.75496161, "num_input_tokens_seen": 19739080, "step": 927, "time_per_iteration": 2.677725076675415 }, { "auxiliary_loss_clip": 0.01101002, "auxiliary_loss_mlp": 0.01015011, "balance_loss_clip": 1.02922702, "balance_loss_mlp": 1.01014709, "epoch": 0.055794378475875546, "flos": 66503116702080.0, "grad_norm": 0.8406874373244947, "language_loss": 0.59841412, "learning_rate": 3.9930252045697585e-06, "loss": 0.61957431, "num_input_tokens_seen": 19802960, "step": 928, "time_per_iteration": 3.187382221221924 }, { "auxiliary_loss_clip": 0.01202438, "auxiliary_loss_mlp": 0.01065066, "balance_loss_clip": 1.05921853, "balance_loss_mlp": 1.04070008, "epoch": 0.05585450172854351, "flos": 25337635729920.0, "grad_norm": 2.0668361967965994, "language_loss": 0.95411372, "learning_rate": 3.992992669166168e-06, "loss": 0.97678876, "num_input_tokens_seen": 19822765, "step": 929, "time_per_iteration": 2.6930506229400635 }, { "auxiliary_loss_clip": 0.01171806, "auxiliary_loss_mlp": 0.01068051, "balance_loss_clip": 1.05343258, "balance_loss_mlp": 1.04101443, "epoch": 0.05591462498121148, "flos": 33911738881920.0, "grad_norm": 2.1442452677256627, "language_loss": 0.71756601, "learning_rate": 3.992960058188094e-06, "loss": 0.7399646, "num_input_tokens_seen": 19843590, "step": 930, "time_per_iteration": 2.803219795227051 }, { "auxiliary_loss_clip": 0.01188277, "auxiliary_loss_mlp": 0.01058888, "balance_loss_clip": 1.05783677, "balance_loss_mlp": 1.03377056, "epoch": 0.055974748233879455, "flos": 17930880679680.0, "grad_norm": 2.381261552273062, "language_loss": 0.85279298, "learning_rate": 3.992927371636776e-06, "loss": 0.87526459, "num_input_tokens_seen": 19860230, "step": 931, "time_per_iteration": 2.6215872764587402 }, { "auxiliary_loss_clip": 0.01203533, "auxiliary_loss_mlp": 0.00783076, "balance_loss_clip": 1.05677414, "balance_loss_mlp": 1.00025761, "epoch": 0.05603487148654742, "flos": 24021976371840.0, "grad_norm": 2.2861197477099973, "language_loss": 0.83645165, "learning_rate": 3.9928946095134525e-06, "loss": 0.85631776, "num_input_tokens_seen": 19880795, "step": 932, "time_per_iteration": 2.664062261581421 }, { "auxiliary_loss_clip": 0.01200637, "auxiliary_loss_mlp": 0.0107041, "balance_loss_clip": 1.05897784, "balance_loss_mlp": 1.04407716, "epoch": 0.05609499473921539, "flos": 17307758517120.0, "grad_norm": 1.8036739452122519, "language_loss": 0.73694205, "learning_rate": 3.992861771819365e-06, "loss": 0.7596525, "num_input_tokens_seen": 19897960, "step": 933, "time_per_iteration": 2.631620168685913 }, { "auxiliary_loss_clip": 0.01153445, "auxiliary_loss_mlp": 0.01076903, "balance_loss_clip": 1.04885209, "balance_loss_mlp": 1.05060577, "epoch": 0.05615511799188336, "flos": 20994742972800.0, "grad_norm": 2.385249039382274, "language_loss": 0.86660421, "learning_rate": 3.99282885855576e-06, "loss": 0.88890779, "num_input_tokens_seen": 19913315, "step": 934, "time_per_iteration": 2.7739439010620117 }, { "auxiliary_loss_clip": 0.01164295, "auxiliary_loss_mlp": 0.0108083, "balance_loss_clip": 1.05509257, "balance_loss_mlp": 1.0557723, "epoch": 0.05621524124455133, "flos": 17273535834240.0, "grad_norm": 2.2740258482680433, "language_loss": 0.80388415, "learning_rate": 3.992795869723885e-06, "loss": 0.82633543, "num_input_tokens_seen": 19928790, "step": 935, "time_per_iteration": 5.93512487411499 }, { "auxiliary_loss_clip": 0.01093927, "auxiliary_loss_mlp": 0.01019701, "balance_loss_clip": 1.02288604, "balance_loss_mlp": 1.01540911, "epoch": 0.0562753644972193, "flos": 58719370458240.0, "grad_norm": 0.820561718243334, "language_loss": 0.69191676, "learning_rate": 3.99276280532499e-06, "loss": 0.71305299, "num_input_tokens_seen": 19988785, "step": 936, "time_per_iteration": 4.862478733062744 }, { "auxiliary_loss_clip": 0.01213648, "auxiliary_loss_mlp": 0.01068507, "balance_loss_clip": 1.05806684, "balance_loss_mlp": 1.04429567, "epoch": 0.05633548774988727, "flos": 17457039440640.0, "grad_norm": 1.9573264311231433, "language_loss": 0.7572521, "learning_rate": 3.992729665360331e-06, "loss": 0.78007358, "num_input_tokens_seen": 20007685, "step": 937, "time_per_iteration": 4.219425916671753 }, { "auxiliary_loss_clip": 0.01085529, "auxiliary_loss_mlp": 0.01013805, "balance_loss_clip": 1.02476001, "balance_loss_mlp": 1.00944233, "epoch": 0.05639561100255524, "flos": 70654928083200.0, "grad_norm": 0.9053055994078011, "language_loss": 0.64309287, "learning_rate": 3.992696449831162e-06, "loss": 0.66408622, "num_input_tokens_seen": 20072750, "step": 938, "time_per_iteration": 3.1298794746398926 }, { "auxiliary_loss_clip": 0.01171203, "auxiliary_loss_mlp": 0.01068815, "balance_loss_clip": 1.05175185, "balance_loss_mlp": 1.0426966, "epoch": 0.056455734255223204, "flos": 20485996692480.0, "grad_norm": 2.7427540631348832, "language_loss": 0.79751205, "learning_rate": 3.992663158738745e-06, "loss": 0.8199122, "num_input_tokens_seen": 20089070, "step": 939, "time_per_iteration": 2.6863484382629395 }, { "auxiliary_loss_clip": 0.01175528, "auxiliary_loss_mlp": 0.01068297, "balance_loss_clip": 1.0509069, "balance_loss_mlp": 1.04338217, "epoch": 0.056515857507891176, "flos": 22053569109120.0, "grad_norm": 1.8374791395473227, "language_loss": 0.73919088, "learning_rate": 3.992629792084341e-06, "loss": 0.76162916, "num_input_tokens_seen": 20108790, "step": 940, "time_per_iteration": 2.7111120223999023 }, { "auxiliary_loss_clip": 0.01198483, "auxiliary_loss_mlp": 0.01058511, "balance_loss_clip": 1.05900669, "balance_loss_mlp": 1.03252339, "epoch": 0.05657598076055915, "flos": 24025316336640.0, "grad_norm": 2.2993716569389813, "language_loss": 0.70622003, "learning_rate": 3.992596349869216e-06, "loss": 0.72878999, "num_input_tokens_seen": 20128455, "step": 941, "time_per_iteration": 2.657594680786133 }, { "auxiliary_loss_clip": 0.01135396, "auxiliary_loss_mlp": 0.01059543, "balance_loss_clip": 1.04961574, "balance_loss_mlp": 1.03382993, "epoch": 0.05663610401322711, "flos": 20480609652480.0, "grad_norm": 2.0678542992190847, "language_loss": 0.80921417, "learning_rate": 3.992562832094637e-06, "loss": 0.83116359, "num_input_tokens_seen": 20145775, "step": 942, "time_per_iteration": 2.7379891872406006 }, { "auxiliary_loss_clip": 0.01186767, "auxiliary_loss_mlp": 0.01062055, "balance_loss_clip": 1.05228579, "balance_loss_mlp": 1.03554332, "epoch": 0.056696227265895086, "flos": 21069042255360.0, "grad_norm": 2.245249922529115, "language_loss": 0.88858449, "learning_rate": 3.9925292387618755e-06, "loss": 0.91107273, "num_input_tokens_seen": 20164315, "step": 943, "time_per_iteration": 2.6502583026885986 }, { "auxiliary_loss_clip": 0.01199122, "auxiliary_loss_mlp": 0.0105963, "balance_loss_clip": 1.05991781, "balance_loss_mlp": 1.03534663, "epoch": 0.05675635051856306, "flos": 17821317219840.0, "grad_norm": 2.5514256959015995, "language_loss": 0.74771839, "learning_rate": 3.992495569872206e-06, "loss": 0.77030593, "num_input_tokens_seen": 20182760, "step": 944, "time_per_iteration": 2.676079034805298 }, { "auxiliary_loss_clip": 0.01204502, "auxiliary_loss_mlp": 0.01064591, "balance_loss_clip": 1.05980551, "balance_loss_mlp": 1.04085672, "epoch": 0.05681647377123102, "flos": 23114945111040.0, "grad_norm": 1.5959266123312272, "language_loss": 0.79406166, "learning_rate": 3.992461825426906e-06, "loss": 0.81675267, "num_input_tokens_seen": 20203830, "step": 945, "time_per_iteration": 2.734299421310425 }, { "auxiliary_loss_clip": 0.01195984, "auxiliary_loss_mlp": 0.0105672, "balance_loss_clip": 1.05686593, "balance_loss_mlp": 1.03156662, "epoch": 0.056876597023898995, "flos": 16070528505600.0, "grad_norm": 2.5637081249861824, "language_loss": 0.82651746, "learning_rate": 3.992428005427252e-06, "loss": 0.84904456, "num_input_tokens_seen": 20220365, "step": 946, "time_per_iteration": 2.6636929512023926 }, { "auxiliary_loss_clip": 0.0122014, "auxiliary_loss_mlp": 0.01061449, "balance_loss_clip": 1.06224144, "balance_loss_mlp": 1.03524721, "epoch": 0.05693672027656696, "flos": 16835641130880.0, "grad_norm": 1.8433174156507384, "language_loss": 0.79031301, "learning_rate": 3.992394109874529e-06, "loss": 0.81312895, "num_input_tokens_seen": 20238640, "step": 947, "time_per_iteration": 2.623671293258667 }, { "auxiliary_loss_clip": 0.0117587, "auxiliary_loss_mlp": 0.01061489, "balance_loss_clip": 1.05605412, "balance_loss_mlp": 1.03569245, "epoch": 0.05699684352923493, "flos": 21389113370880.0, "grad_norm": 6.8661947111986725, "language_loss": 0.85425055, "learning_rate": 3.9923601387700225e-06, "loss": 0.87662411, "num_input_tokens_seen": 20251025, "step": 948, "time_per_iteration": 2.7410409450531006 }, { "auxiliary_loss_clip": 0.01214005, "auxiliary_loss_mlp": 0.01063231, "balance_loss_clip": 1.05969238, "balance_loss_mlp": 1.03598022, "epoch": 0.057056966781902904, "flos": 15560309767680.0, "grad_norm": 3.649211317819821, "language_loss": 0.87346625, "learning_rate": 3.992326092115019e-06, "loss": 0.89623863, "num_input_tokens_seen": 20269775, "step": 949, "time_per_iteration": 2.6893157958984375 }, { "auxiliary_loss_clip": 0.01194543, "auxiliary_loss_mlp": 0.0106695, "balance_loss_clip": 1.05799937, "balance_loss_mlp": 1.04266715, "epoch": 0.05711709003457087, "flos": 19937856170880.0, "grad_norm": 1.8324883776363103, "language_loss": 0.7874645, "learning_rate": 3.992291969910811e-06, "loss": 0.8100794, "num_input_tokens_seen": 20287715, "step": 950, "time_per_iteration": 2.623924732208252 }, { "auxiliary_loss_clip": 0.01180518, "auxiliary_loss_mlp": 0.01068771, "balance_loss_clip": 1.05322623, "balance_loss_mlp": 1.04384422, "epoch": 0.05717721328723884, "flos": 30332701774080.0, "grad_norm": 3.8045132244795816, "language_loss": 0.82477522, "learning_rate": 3.992257772158691e-06, "loss": 0.8472681, "num_input_tokens_seen": 20307070, "step": 951, "time_per_iteration": 2.697479724884033 }, { "auxiliary_loss_clip": 0.01167302, "auxiliary_loss_mlp": 0.01061039, "balance_loss_clip": 1.04906607, "balance_loss_mlp": 1.03375173, "epoch": 0.05723733653990681, "flos": 23654358627840.0, "grad_norm": 2.4180383362968634, "language_loss": 0.86899263, "learning_rate": 3.992223498859958e-06, "loss": 0.89127606, "num_input_tokens_seen": 20324945, "step": 952, "time_per_iteration": 2.707716226577759 }, { "auxiliary_loss_clip": 0.01191405, "auxiliary_loss_mlp": 0.01064705, "balance_loss_clip": 1.05511189, "balance_loss_mlp": 1.03630924, "epoch": 0.05729745979257478, "flos": 22055759838720.0, "grad_norm": 2.195434645270168, "language_loss": 0.79087842, "learning_rate": 3.9921891500159084e-06, "loss": 0.81343949, "num_input_tokens_seen": 20346135, "step": 953, "time_per_iteration": 2.671255588531494 }, { "auxiliary_loss_clip": 0.01190026, "auxiliary_loss_mlp": 0.01066447, "balance_loss_clip": 1.05984342, "balance_loss_mlp": 1.04056656, "epoch": 0.05735758304524275, "flos": 19604353368960.0, "grad_norm": 2.2066085695914466, "language_loss": 0.86644447, "learning_rate": 3.992154725627848e-06, "loss": 0.88900924, "num_input_tokens_seen": 20364450, "step": 954, "time_per_iteration": 2.671657085418701 }, { "auxiliary_loss_clip": 0.01210569, "auxiliary_loss_mlp": 0.01062619, "balance_loss_clip": 1.06119955, "balance_loss_mlp": 1.03723955, "epoch": 0.057417706297910716, "flos": 19099018880640.0, "grad_norm": 2.2872795023766113, "language_loss": 0.88071024, "learning_rate": 3.9921202256970804e-06, "loss": 0.90344214, "num_input_tokens_seen": 20383500, "step": 955, "time_per_iteration": 2.69960880279541 }, { "auxiliary_loss_clip": 0.01179864, "auxiliary_loss_mlp": 0.01068889, "balance_loss_clip": 1.0523231, "balance_loss_mlp": 1.04209054, "epoch": 0.05747782955057869, "flos": 16654507822080.0, "grad_norm": 1.9113555723128555, "language_loss": 0.89160776, "learning_rate": 3.992085650224914e-06, "loss": 0.91409534, "num_input_tokens_seen": 20400295, "step": 956, "time_per_iteration": 2.667868137359619 }, { "auxiliary_loss_clip": 0.01167867, "auxiliary_loss_mlp": 0.01060669, "balance_loss_clip": 1.05720079, "balance_loss_mlp": 1.03450251, "epoch": 0.05753795280324665, "flos": 14502058248960.0, "grad_norm": 3.2877973901728095, "language_loss": 0.75473189, "learning_rate": 3.99205099921266e-06, "loss": 0.77701724, "num_input_tokens_seen": 20419085, "step": 957, "time_per_iteration": 2.6938796043395996 }, { "auxiliary_loss_clip": 0.0117627, "auxiliary_loss_mlp": 0.01072849, "balance_loss_clip": 1.05432248, "balance_loss_mlp": 1.0448705, "epoch": 0.057598076055914625, "flos": 18076318848000.0, "grad_norm": 2.0004055711005257, "language_loss": 0.79582155, "learning_rate": 3.992016272661633e-06, "loss": 0.81831264, "num_input_tokens_seen": 20437465, "step": 958, "time_per_iteration": 2.6933834552764893 }, { "auxiliary_loss_clip": 0.01186244, "auxiliary_loss_mlp": 0.01059908, "balance_loss_clip": 1.05851364, "balance_loss_mlp": 1.03572011, "epoch": 0.0576581993085826, "flos": 22124600254080.0, "grad_norm": 2.669863855173802, "language_loss": 0.8840394, "learning_rate": 3.99198147057315e-06, "loss": 0.906501, "num_input_tokens_seen": 20456235, "step": 959, "time_per_iteration": 2.7094578742980957 }, { "auxiliary_loss_clip": 0.01169479, "auxiliary_loss_mlp": 0.01063656, "balance_loss_clip": 1.05511999, "balance_loss_mlp": 1.03881276, "epoch": 0.05771832256125056, "flos": 33181746779520.0, "grad_norm": 2.0960373333994764, "language_loss": 0.78850955, "learning_rate": 3.991946592948529e-06, "loss": 0.8108409, "num_input_tokens_seen": 20476825, "step": 960, "time_per_iteration": 2.822922945022583 }, { "auxiliary_loss_clip": 0.0113413, "auxiliary_loss_mlp": 0.01067189, "balance_loss_clip": 1.05177355, "balance_loss_mlp": 1.04020023, "epoch": 0.057778445813918534, "flos": 24170143973760.0, "grad_norm": 2.063464892179025, "language_loss": 0.92986894, "learning_rate": 3.991911639789094e-06, "loss": 0.95188212, "num_input_tokens_seen": 20496965, "step": 961, "time_per_iteration": 2.793952226638794 }, { "auxiliary_loss_clip": 0.01182535, "auxiliary_loss_mlp": 0.0106764, "balance_loss_clip": 1.0554297, "balance_loss_mlp": 1.04091299, "epoch": 0.0578385690665865, "flos": 29643037666560.0, "grad_norm": 2.0649993155313067, "language_loss": 0.68164188, "learning_rate": 3.991876611096169e-06, "loss": 0.70414358, "num_input_tokens_seen": 20518035, "step": 962, "time_per_iteration": 2.8396694660186768 }, { "auxiliary_loss_clip": 0.01159524, "auxiliary_loss_mlp": 0.01073851, "balance_loss_clip": 1.05128908, "balance_loss_mlp": 1.04909074, "epoch": 0.05789869231925447, "flos": 20885430908160.0, "grad_norm": 2.2685465488517074, "language_loss": 0.8848027, "learning_rate": 3.991841506871084e-06, "loss": 0.90713644, "num_input_tokens_seen": 20534740, "step": 963, "time_per_iteration": 2.7077019214630127 }, { "auxiliary_loss_clip": 0.01183778, "auxiliary_loss_mlp": 0.01061251, "balance_loss_clip": 1.06018209, "balance_loss_mlp": 1.03516829, "epoch": 0.057958815571922444, "flos": 26031106679040.0, "grad_norm": 2.392959969035536, "language_loss": 0.85288298, "learning_rate": 3.99180632711517e-06, "loss": 0.87533331, "num_input_tokens_seen": 20553485, "step": 964, "time_per_iteration": 2.7218217849731445 }, { "auxiliary_loss_clip": 0.01188683, "auxiliary_loss_mlp": 0.01069422, "balance_loss_clip": 1.05959499, "balance_loss_mlp": 1.04325557, "epoch": 0.05801893882459041, "flos": 18077683564800.0, "grad_norm": 3.087349735715565, "language_loss": 0.78159416, "learning_rate": 3.99177107182976e-06, "loss": 0.80417526, "num_input_tokens_seen": 20572155, "step": 965, "time_per_iteration": 2.6902661323547363 }, { "auxiliary_loss_clip": 0.01156531, "auxiliary_loss_mlp": 0.0107109, "balance_loss_clip": 1.0523715, "balance_loss_mlp": 1.04462528, "epoch": 0.05807906207725838, "flos": 17748885444480.0, "grad_norm": 1.9742288518319486, "language_loss": 0.81403655, "learning_rate": 3.99173574101619e-06, "loss": 0.83631277, "num_input_tokens_seen": 20590395, "step": 966, "time_per_iteration": 2.7423267364501953 }, { "auxiliary_loss_clip": 0.01198908, "auxiliary_loss_mlp": 0.01065021, "balance_loss_clip": 1.058887, "balance_loss_mlp": 1.04113197, "epoch": 0.058139185329926346, "flos": 18040372312320.0, "grad_norm": 1.8776530142118544, "language_loss": 0.76480806, "learning_rate": 3.9917003346758035e-06, "loss": 0.78744727, "num_input_tokens_seen": 20608435, "step": 967, "time_per_iteration": 2.642885446548462 }, { "auxiliary_loss_clip": 0.01084339, "auxiliary_loss_mlp": 0.0103139, "balance_loss_clip": 1.02675521, "balance_loss_mlp": 1.0269078, "epoch": 0.05819930858259432, "flos": 62363297485440.0, "grad_norm": 0.985564929959949, "language_loss": 0.57357776, "learning_rate": 3.991664852809939e-06, "loss": 0.59473509, "num_input_tokens_seen": 20668575, "step": 968, "time_per_iteration": 3.1017024517059326 }, { "auxiliary_loss_clip": 0.01188824, "auxiliary_loss_mlp": 0.01057715, "balance_loss_clip": 1.05784404, "balance_loss_mlp": 1.03147697, "epoch": 0.05825943183526229, "flos": 19135360465920.0, "grad_norm": 2.1276337565108485, "language_loss": 0.82286429, "learning_rate": 3.991629295419945e-06, "loss": 0.84532964, "num_input_tokens_seen": 20687355, "step": 969, "time_per_iteration": 2.669055461883545 }, { "auxiliary_loss_clip": 0.01206272, "auxiliary_loss_mlp": 0.00782724, "balance_loss_clip": 1.06255269, "balance_loss_mlp": 1.00024962, "epoch": 0.058319555087930255, "flos": 29022465369600.0, "grad_norm": 7.916507288074279, "language_loss": 0.7803669, "learning_rate": 3.991593662507167e-06, "loss": 0.80025685, "num_input_tokens_seen": 20705710, "step": 970, "time_per_iteration": 2.733030080795288 }, { "auxiliary_loss_clip": 0.01181452, "auxiliary_loss_mlp": 0.01064945, "balance_loss_clip": 1.05691695, "balance_loss_mlp": 1.03887415, "epoch": 0.05837967834059823, "flos": 18879999701760.0, "grad_norm": 3.163102883752813, "language_loss": 0.92229038, "learning_rate": 3.991557954072958e-06, "loss": 0.94475436, "num_input_tokens_seen": 20722405, "step": 971, "time_per_iteration": 2.730377435684204 }, { "auxiliary_loss_clip": 0.01180948, "auxiliary_loss_mlp": 0.01062613, "balance_loss_clip": 1.05320477, "balance_loss_mlp": 1.03722143, "epoch": 0.05843980159326619, "flos": 25703062744320.0, "grad_norm": 1.700187330091603, "language_loss": 0.85959208, "learning_rate": 3.991522170118673e-06, "loss": 0.88202775, "num_input_tokens_seen": 20741480, "step": 972, "time_per_iteration": 2.687185049057007 }, { "auxiliary_loss_clip": 0.0116993, "auxiliary_loss_mlp": 0.01079713, "balance_loss_clip": 1.05714142, "balance_loss_mlp": 1.05601454, "epoch": 0.058499924845934165, "flos": 25552129795200.0, "grad_norm": 2.00599255988541, "language_loss": 0.87503272, "learning_rate": 3.991486310645667e-06, "loss": 0.89752913, "num_input_tokens_seen": 20759685, "step": 973, "time_per_iteration": 2.7166664600372314 }, { "auxiliary_loss_clip": 0.01206524, "auxiliary_loss_mlp": 0.00784111, "balance_loss_clip": 1.06111121, "balance_loss_mlp": 1.00026989, "epoch": 0.05856004809860214, "flos": 16436171001600.0, "grad_norm": 1.879365930358842, "language_loss": 0.74800295, "learning_rate": 3.991450375655301e-06, "loss": 0.76790935, "num_input_tokens_seen": 20778180, "step": 974, "time_per_iteration": 2.713594675064087 }, { "auxiliary_loss_clip": 0.01197101, "auxiliary_loss_mlp": 0.00782207, "balance_loss_clip": 1.059551, "balance_loss_mlp": 1.00025892, "epoch": 0.0586201713512701, "flos": 39458824116480.0, "grad_norm": 1.5923993506380014, "language_loss": 0.76874506, "learning_rate": 3.991414365148936e-06, "loss": 0.78853816, "num_input_tokens_seen": 20802705, "step": 975, "time_per_iteration": 7.600914716720581 }, { "auxiliary_loss_clip": 0.01215491, "auxiliary_loss_mlp": 0.01069506, "balance_loss_clip": 1.06030774, "balance_loss_mlp": 1.0444721, "epoch": 0.058680294603938074, "flos": 23365170230400.0, "grad_norm": 3.6132976830219734, "language_loss": 0.76748288, "learning_rate": 3.99137827912794e-06, "loss": 0.79033279, "num_input_tokens_seen": 20822540, "step": 976, "time_per_iteration": 4.324799537658691 }, { "auxiliary_loss_clip": 0.01176132, "auxiliary_loss_mlp": 0.01077003, "balance_loss_clip": 1.05271626, "balance_loss_mlp": 1.04963279, "epoch": 0.05874041785660604, "flos": 32232017226240.0, "grad_norm": 1.943198757110789, "language_loss": 0.87343585, "learning_rate": 3.991342117593679e-06, "loss": 0.89596725, "num_input_tokens_seen": 20844175, "step": 977, "time_per_iteration": 2.7742488384246826 }, { "auxiliary_loss_clip": 0.01187161, "auxiliary_loss_mlp": 0.01067914, "balance_loss_clip": 1.06209528, "balance_loss_mlp": 1.04231977, "epoch": 0.05880054110927401, "flos": 22310043194880.0, "grad_norm": 1.718987046197629, "language_loss": 0.7969116, "learning_rate": 3.991305880547527e-06, "loss": 0.81946236, "num_input_tokens_seen": 20864730, "step": 978, "time_per_iteration": 2.733372926712036 }, { "auxiliary_loss_clip": 0.01136264, "auxiliary_loss_mlp": 0.01076585, "balance_loss_clip": 1.05591321, "balance_loss_mlp": 1.04927468, "epoch": 0.05886066436194198, "flos": 27380450016000.0, "grad_norm": 1.8692877257975375, "language_loss": 0.80665666, "learning_rate": 3.991269567990855e-06, "loss": 0.82878518, "num_input_tokens_seen": 20885200, "step": 979, "time_per_iteration": 3.2624220848083496 }, { "auxiliary_loss_clip": 0.01074686, "auxiliary_loss_mlp": 0.01029701, "balance_loss_clip": 1.02640033, "balance_loss_mlp": 1.02495658, "epoch": 0.05892078761460995, "flos": 59584493525760.0, "grad_norm": 0.9436493040005753, "language_loss": 0.59004962, "learning_rate": 3.9912331799250415e-06, "loss": 0.6110934, "num_input_tokens_seen": 20940325, "step": 980, "time_per_iteration": 3.4688587188720703 }, { "auxiliary_loss_clip": 0.01211665, "auxiliary_loss_mlp": 0.01078603, "balance_loss_clip": 1.06178868, "balance_loss_mlp": 1.05242431, "epoch": 0.05898091086727792, "flos": 15414081500160.0, "grad_norm": 2.2770545408130514, "language_loss": 0.86436182, "learning_rate": 3.9911967163514665e-06, "loss": 0.88726455, "num_input_tokens_seen": 20958220, "step": 981, "time_per_iteration": 2.5824644565582275 }, { "auxiliary_loss_clip": 0.01190085, "auxiliary_loss_mlp": 0.0106921, "balance_loss_clip": 1.05943286, "balance_loss_mlp": 1.04629803, "epoch": 0.059041034119945886, "flos": 23655328295040.0, "grad_norm": 2.1333982175691855, "language_loss": 0.79293346, "learning_rate": 3.991160177271513e-06, "loss": 0.81552643, "num_input_tokens_seen": 20978920, "step": 982, "time_per_iteration": 2.68428897857666 }, { "auxiliary_loss_clip": 0.01192274, "auxiliary_loss_mlp": 0.01068234, "balance_loss_clip": 1.05926657, "balance_loss_mlp": 1.04356933, "epoch": 0.05910115737261386, "flos": 24754087376640.0, "grad_norm": 2.319627739094249, "language_loss": 0.84413779, "learning_rate": 3.9911235626865654e-06, "loss": 0.86674285, "num_input_tokens_seen": 20999490, "step": 983, "time_per_iteration": 2.7006261348724365 }, { "auxiliary_loss_clip": 0.0120015, "auxiliary_loss_mlp": 0.01072669, "balance_loss_clip": 1.05969584, "balance_loss_mlp": 1.04799283, "epoch": 0.05916128062528183, "flos": 11728749070080.0, "grad_norm": 1.8014395118859294, "language_loss": 0.84510243, "learning_rate": 3.9910868725980125e-06, "loss": 0.86783063, "num_input_tokens_seen": 21017865, "step": 984, "time_per_iteration": 2.640246868133545 }, { "auxiliary_loss_clip": 0.01188594, "auxiliary_loss_mlp": 0.01055296, "balance_loss_clip": 1.05650342, "balance_loss_mlp": 1.03171611, "epoch": 0.059221403877949795, "flos": 21902995296000.0, "grad_norm": 2.473231587287368, "language_loss": 0.77611595, "learning_rate": 3.9910501070072465e-06, "loss": 0.7985549, "num_input_tokens_seen": 21035900, "step": 985, "time_per_iteration": 2.626371383666992 }, { "auxiliary_loss_clip": 0.01150113, "auxiliary_loss_mlp": 0.01060814, "balance_loss_clip": 1.05341148, "balance_loss_mlp": 1.03542209, "epoch": 0.05928152713061777, "flos": 20514580940160.0, "grad_norm": 1.9082382068459252, "language_loss": 0.90593231, "learning_rate": 3.991013265915661e-06, "loss": 0.92804158, "num_input_tokens_seen": 21053235, "step": 986, "time_per_iteration": 2.7834935188293457 }, { "auxiliary_loss_clip": 0.01200704, "auxiliary_loss_mlp": 0.01061312, "balance_loss_clip": 1.05555892, "balance_loss_mlp": 1.03425193, "epoch": 0.05934165038328574, "flos": 24495135252480.0, "grad_norm": 2.216017383423336, "language_loss": 0.75688565, "learning_rate": 3.9909763493246525e-06, "loss": 0.77950585, "num_input_tokens_seen": 21073090, "step": 987, "time_per_iteration": 2.6669981479644775 }, { "auxiliary_loss_clip": 0.01203558, "auxiliary_loss_mlp": 0.01057756, "balance_loss_clip": 1.06134868, "balance_loss_mlp": 1.03331852, "epoch": 0.059401773635953704, "flos": 38728041914880.0, "grad_norm": 2.2869993581633827, "language_loss": 0.71867943, "learning_rate": 3.990939357235621e-06, "loss": 0.7412926, "num_input_tokens_seen": 21094895, "step": 988, "time_per_iteration": 2.805851697921753 }, { "auxiliary_loss_clip": 0.0105006, "auxiliary_loss_mlp": 0.0101134, "balance_loss_clip": 1.02230322, "balance_loss_mlp": 1.00688171, "epoch": 0.059461896888621676, "flos": 58023565125120.0, "grad_norm": 0.9416454944601763, "language_loss": 0.7124939, "learning_rate": 3.99090228964997e-06, "loss": 0.73310792, "num_input_tokens_seen": 21147555, "step": 989, "time_per_iteration": 3.100306749343872 }, { "auxiliary_loss_clip": 0.0117797, "auxiliary_loss_mlp": 0.01072264, "balance_loss_clip": 1.05793095, "balance_loss_mlp": 1.04389191, "epoch": 0.05952202014128964, "flos": 22127760650880.0, "grad_norm": 2.0167260155417113, "language_loss": 0.78245646, "learning_rate": 3.990865146569105e-06, "loss": 0.80495882, "num_input_tokens_seen": 21167845, "step": 990, "time_per_iteration": 2.8133904933929443 }, { "auxiliary_loss_clip": 0.01198295, "auxiliary_loss_mlp": 0.01053485, "balance_loss_clip": 1.06166339, "balance_loss_mlp": 1.02761686, "epoch": 0.059582143393957614, "flos": 20445776438400.0, "grad_norm": 2.2411623387553727, "language_loss": 0.86522102, "learning_rate": 3.990827927994434e-06, "loss": 0.88773882, "num_input_tokens_seen": 21185085, "step": 991, "time_per_iteration": 2.6964831352233887 }, { "auxiliary_loss_clip": 0.0121783, "auxiliary_loss_mlp": 0.01064707, "balance_loss_clip": 1.0613625, "balance_loss_mlp": 1.03943431, "epoch": 0.059642266646625586, "flos": 20594877793920.0, "grad_norm": 1.8566945591898132, "language_loss": 0.76738375, "learning_rate": 3.9907906339273674e-06, "loss": 0.79020917, "num_input_tokens_seen": 21204230, "step": 992, "time_per_iteration": 2.646942377090454 }, { "auxiliary_loss_clip": 0.01146457, "auxiliary_loss_mlp": 0.01062309, "balance_loss_clip": 1.05571234, "balance_loss_mlp": 1.03834832, "epoch": 0.05970238989929355, "flos": 19352655792000.0, "grad_norm": 2.3469050968731233, "language_loss": 0.75117075, "learning_rate": 3.9907532643693215e-06, "loss": 0.77325845, "num_input_tokens_seen": 21222655, "step": 993, "time_per_iteration": 2.7642974853515625 }, { "auxiliary_loss_clip": 0.01157785, "auxiliary_loss_mlp": 0.01075532, "balance_loss_clip": 1.05397618, "balance_loss_mlp": 1.04774487, "epoch": 0.05976251315196152, "flos": 30264040926720.0, "grad_norm": 2.725207959052886, "language_loss": 0.79177904, "learning_rate": 3.990715819321712e-06, "loss": 0.81411219, "num_input_tokens_seen": 21242310, "step": 994, "time_per_iteration": 2.8414714336395264 }, { "auxiliary_loss_clip": 0.01214724, "auxiliary_loss_mlp": 0.01079016, "balance_loss_clip": 1.06264019, "balance_loss_mlp": 1.05361295, "epoch": 0.05982263640462949, "flos": 23185150243200.0, "grad_norm": 2.8097993094234983, "language_loss": 0.79917169, "learning_rate": 3.99067829878596e-06, "loss": 0.82210916, "num_input_tokens_seen": 21261410, "step": 995, "time_per_iteration": 2.6524364948272705 }, { "auxiliary_loss_clip": 0.0116696, "auxiliary_loss_mlp": 0.01068218, "balance_loss_clip": 1.05704355, "balance_loss_mlp": 1.04208767, "epoch": 0.05988275965729746, "flos": 27850879463040.0, "grad_norm": 1.902030256537741, "language_loss": 0.87013257, "learning_rate": 3.990640702763487e-06, "loss": 0.89248431, "num_input_tokens_seen": 21280080, "step": 996, "time_per_iteration": 2.7431676387786865 }, { "auxiliary_loss_clip": 0.01177854, "auxiliary_loss_mlp": 0.01081123, "balance_loss_clip": 1.05684328, "balance_loss_mlp": 1.05055761, "epoch": 0.05994288290996543, "flos": 24680003575680.0, "grad_norm": 2.971039758986745, "language_loss": 0.87273014, "learning_rate": 3.990603031255718e-06, "loss": 0.89531994, "num_input_tokens_seen": 21296765, "step": 997, "time_per_iteration": 2.748448371887207 }, { "auxiliary_loss_clip": 0.01069915, "auxiliary_loss_mlp": 0.01014417, "balance_loss_clip": 1.02303648, "balance_loss_mlp": 1.00972033, "epoch": 0.0600030061626334, "flos": 69929568835200.0, "grad_norm": 1.0091092068179202, "language_loss": 0.75381488, "learning_rate": 3.990565284264083e-06, "loss": 0.7746582, "num_input_tokens_seen": 21363345, "step": 998, "time_per_iteration": 3.2950518131256104 }, { "auxiliary_loss_clip": 0.01170062, "auxiliary_loss_mlp": 0.01065521, "balance_loss_clip": 1.05893683, "balance_loss_mlp": 1.03893745, "epoch": 0.06006312941530137, "flos": 26540140268160.0, "grad_norm": 1.8197691299520968, "language_loss": 0.76053095, "learning_rate": 3.990527461790013e-06, "loss": 0.7828868, "num_input_tokens_seen": 21385290, "step": 999, "time_per_iteration": 2.733802556991577 }, { "auxiliary_loss_clip": 0.01197834, "auxiliary_loss_mlp": 0.01059542, "balance_loss_clip": 1.05646563, "balance_loss_mlp": 1.03339899, "epoch": 0.060123252667969335, "flos": 27344000689920.0, "grad_norm": 2.5948629341774874, "language_loss": 0.82992184, "learning_rate": 3.990489563834943e-06, "loss": 0.85249555, "num_input_tokens_seen": 21407625, "step": 1000, "time_per_iteration": 2.710981845855713 }, { "auxiliary_loss_clip": 0.0118571, "auxiliary_loss_mlp": 0.01062188, "balance_loss_clip": 1.05856955, "balance_loss_mlp": 1.03480577, "epoch": 0.06018337592063731, "flos": 27016710940800.0, "grad_norm": 2.111409807940472, "language_loss": 0.85820085, "learning_rate": 3.990451590400309e-06, "loss": 0.88067985, "num_input_tokens_seen": 21426835, "step": 1001, "time_per_iteration": 2.73445463180542 }, { "auxiliary_loss_clip": 0.01191917, "auxiliary_loss_mlp": 0.01062059, "balance_loss_clip": 1.06167853, "balance_loss_mlp": 1.03719211, "epoch": 0.06024349917330528, "flos": 25592960580480.0, "grad_norm": 1.8359711451165206, "language_loss": 0.74128318, "learning_rate": 3.990413541487551e-06, "loss": 0.76382297, "num_input_tokens_seen": 21444920, "step": 1002, "time_per_iteration": 2.8861100673675537 }, { "auxiliary_loss_clip": 0.01214316, "auxiliary_loss_mlp": 0.01062589, "balance_loss_clip": 1.06316125, "balance_loss_mlp": 1.03737664, "epoch": 0.060303622425973244, "flos": 26133271937280.0, "grad_norm": 2.1835040648243997, "language_loss": 0.75520515, "learning_rate": 3.990375417098112e-06, "loss": 0.77797419, "num_input_tokens_seen": 21463555, "step": 1003, "time_per_iteration": 2.632889747619629 }, { "auxiliary_loss_clip": 0.01187709, "auxiliary_loss_mlp": 0.01064806, "balance_loss_clip": 1.05934548, "balance_loss_mlp": 1.03928304, "epoch": 0.060363745678641216, "flos": 20377187418240.0, "grad_norm": 2.3150099602993155, "language_loss": 0.70349169, "learning_rate": 3.990337217233437e-06, "loss": 0.72601682, "num_input_tokens_seen": 21481990, "step": 1004, "time_per_iteration": 2.6947617530822754 }, { "auxiliary_loss_clip": 0.01212815, "auxiliary_loss_mlp": 0.01077454, "balance_loss_clip": 1.06629324, "balance_loss_mlp": 1.05168116, "epoch": 0.06042386893130918, "flos": 17749172753280.0, "grad_norm": 2.276868338025253, "language_loss": 0.83444524, "learning_rate": 3.990298941894976e-06, "loss": 0.85734791, "num_input_tokens_seen": 21500385, "step": 1005, "time_per_iteration": 2.581683397293091 }, { "auxiliary_loss_clip": 0.01077621, "auxiliary_loss_mlp": 0.01004707, "balance_loss_clip": 1.02541244, "balance_loss_mlp": 1.00029612, "epoch": 0.06048399218397715, "flos": 68538496872960.0, "grad_norm": 0.903813421793838, "language_loss": 0.59018111, "learning_rate": 3.9902605910841794e-06, "loss": 0.61100447, "num_input_tokens_seen": 21561040, "step": 1006, "time_per_iteration": 3.222104787826538 }, { "auxiliary_loss_clip": 0.01183553, "auxiliary_loss_mlp": 0.01059038, "balance_loss_clip": 1.05334234, "balance_loss_mlp": 1.03284812, "epoch": 0.060544115436645125, "flos": 23258515772160.0, "grad_norm": 2.1584333764290853, "language_loss": 0.74229443, "learning_rate": 3.990222164802503e-06, "loss": 0.76472032, "num_input_tokens_seen": 21580655, "step": 1007, "time_per_iteration": 2.7130653858184814 }, { "auxiliary_loss_clip": 0.0119408, "auxiliary_loss_mlp": 0.01060431, "balance_loss_clip": 1.06008601, "balance_loss_mlp": 1.03493261, "epoch": 0.06060423868931309, "flos": 23878441624320.0, "grad_norm": 1.7956876298455304, "language_loss": 0.8081426, "learning_rate": 3.9901836630514006e-06, "loss": 0.8306877, "num_input_tokens_seen": 21599650, "step": 1008, "time_per_iteration": 2.7151994705200195 }, { "auxiliary_loss_clip": 0.01175291, "auxiliary_loss_mlp": 0.01056357, "balance_loss_clip": 1.05982351, "balance_loss_mlp": 1.0305717, "epoch": 0.06066436194198106, "flos": 18728061171840.0, "grad_norm": 2.3069524306559837, "language_loss": 0.78198558, "learning_rate": 3.990145085832335e-06, "loss": 0.8043021, "num_input_tokens_seen": 21617550, "step": 1009, "time_per_iteration": 2.7313599586486816 }, { "auxiliary_loss_clip": 0.01194621, "auxiliary_loss_mlp": 0.01061233, "balance_loss_clip": 1.06150866, "balance_loss_mlp": 1.03726041, "epoch": 0.06072448519464903, "flos": 24640465680000.0, "grad_norm": 1.7452257697216769, "language_loss": 0.93148172, "learning_rate": 3.990106433146769e-06, "loss": 0.95404023, "num_input_tokens_seen": 21635865, "step": 1010, "time_per_iteration": 2.7233662605285645 }, { "auxiliary_loss_clip": 0.01148246, "auxiliary_loss_mlp": 0.00784144, "balance_loss_clip": 1.05304599, "balance_loss_mlp": 1.00029802, "epoch": 0.060784608447317, "flos": 17378825575680.0, "grad_norm": 2.9999367504779517, "language_loss": 0.72022474, "learning_rate": 3.9900677049961665e-06, "loss": 0.73954868, "num_input_tokens_seen": 21653945, "step": 1011, "time_per_iteration": 2.804858446121216 }, { "auxiliary_loss_clip": 0.01194231, "auxiliary_loss_mlp": 0.01077344, "balance_loss_clip": 1.05968046, "balance_loss_mlp": 1.04868615, "epoch": 0.06084473169998497, "flos": 23692208584320.0, "grad_norm": 1.9573218215833301, "language_loss": 0.87526691, "learning_rate": 3.990028901381999e-06, "loss": 0.89798272, "num_input_tokens_seen": 21671230, "step": 1012, "time_per_iteration": 2.6466245651245117 }, { "auxiliary_loss_clip": 0.01184459, "auxiliary_loss_mlp": 0.01064264, "balance_loss_clip": 1.05652905, "balance_loss_mlp": 1.03838325, "epoch": 0.06090485495265294, "flos": 23546339452800.0, "grad_norm": 1.9062230938156723, "language_loss": 0.76947677, "learning_rate": 3.989990022305734e-06, "loss": 0.79196405, "num_input_tokens_seen": 21691155, "step": 1013, "time_per_iteration": 4.297588586807251 }, { "auxiliary_loss_clip": 0.01207383, "auxiliary_loss_mlp": 0.00783488, "balance_loss_clip": 1.06573224, "balance_loss_mlp": 1.00034499, "epoch": 0.06096497820532091, "flos": 20339301548160.0, "grad_norm": 2.441711811862119, "language_loss": 0.86151874, "learning_rate": 3.98995106776885e-06, "loss": 0.88142747, "num_input_tokens_seen": 21707405, "step": 1014, "time_per_iteration": 4.301488637924194 }, { "auxiliary_loss_clip": 0.0121503, "auxiliary_loss_mlp": 0.01072817, "balance_loss_clip": 1.06605387, "balance_loss_mlp": 1.04508948, "epoch": 0.061025101457988874, "flos": 26939035779840.0, "grad_norm": 2.4309754772209184, "language_loss": 0.73197287, "learning_rate": 3.98991203777282e-06, "loss": 0.75485134, "num_input_tokens_seen": 21728090, "step": 1015, "time_per_iteration": 4.384514808654785 }, { "auxiliary_loss_clip": 0.01187374, "auxiliary_loss_mlp": 0.01068593, "balance_loss_clip": 1.06084347, "balance_loss_mlp": 1.04228365, "epoch": 0.061085224710656846, "flos": 25375054723200.0, "grad_norm": 1.5896529502124837, "language_loss": 0.79109907, "learning_rate": 3.9898729323191275e-06, "loss": 0.81365877, "num_input_tokens_seen": 21747950, "step": 1016, "time_per_iteration": 4.3249351978302 }, { "auxiliary_loss_clip": 0.01173015, "auxiliary_loss_mlp": 0.0105746, "balance_loss_clip": 1.06036103, "balance_loss_mlp": 1.03249741, "epoch": 0.06114534796332482, "flos": 24824759385600.0, "grad_norm": 1.6772682648410928, "language_loss": 0.76014191, "learning_rate": 3.989833751409254e-06, "loss": 0.78244662, "num_input_tokens_seen": 21767900, "step": 1017, "time_per_iteration": 2.7983243465423584 }, { "auxiliary_loss_clip": 0.01188817, "auxiliary_loss_mlp": 0.01074603, "balance_loss_clip": 1.06584609, "balance_loss_mlp": 1.0483532, "epoch": 0.061205471215992784, "flos": 20631434860800.0, "grad_norm": 2.001716657382839, "language_loss": 0.85798436, "learning_rate": 3.989794495044685e-06, "loss": 0.88061857, "num_input_tokens_seen": 21787375, "step": 1018, "time_per_iteration": 2.702399253845215 }, { "auxiliary_loss_clip": 0.01174344, "auxiliary_loss_mlp": 0.01069438, "balance_loss_clip": 1.06325769, "balance_loss_mlp": 1.04231787, "epoch": 0.061265594468660756, "flos": 16508351381760.0, "grad_norm": 2.9546929267460813, "language_loss": 0.76985347, "learning_rate": 3.989755163226909e-06, "loss": 0.79229128, "num_input_tokens_seen": 21806275, "step": 1019, "time_per_iteration": 2.780104875564575 }, { "auxiliary_loss_clip": 0.01160861, "auxiliary_loss_mlp": 0.0106141, "balance_loss_clip": 1.05355084, "balance_loss_mlp": 1.03511262, "epoch": 0.06132571772132872, "flos": 26246211275520.0, "grad_norm": 2.1848809329980288, "language_loss": 0.84122044, "learning_rate": 3.989715755957418e-06, "loss": 0.86344314, "num_input_tokens_seen": 21826430, "step": 1020, "time_per_iteration": 2.785963535308838 }, { "auxiliary_loss_clip": 0.01198473, "auxiliary_loss_mlp": 0.01063342, "balance_loss_clip": 1.06365371, "balance_loss_mlp": 1.03604269, "epoch": 0.06138584097399669, "flos": 37414788768000.0, "grad_norm": 1.933053672026977, "language_loss": 0.79114467, "learning_rate": 3.989676273237705e-06, "loss": 0.81376278, "num_input_tokens_seen": 21847800, "step": 1021, "time_per_iteration": 2.7968955039978027 }, { "auxiliary_loss_clip": 0.01189659, "auxiliary_loss_mlp": 0.01064044, "balance_loss_clip": 1.06159925, "balance_loss_mlp": 1.04114437, "epoch": 0.061445964226664665, "flos": 17420661941760.0, "grad_norm": 2.089525934673828, "language_loss": 0.87768298, "learning_rate": 3.9896367150692705e-06, "loss": 0.90022004, "num_input_tokens_seen": 21863385, "step": 1022, "time_per_iteration": 2.70906138420105 }, { "auxiliary_loss_clip": 0.01198737, "auxiliary_loss_mlp": 0.0106635, "balance_loss_clip": 1.06627858, "balance_loss_mlp": 1.04079151, "epoch": 0.06150608747933263, "flos": 22600021691520.0, "grad_norm": 1.7284486983379121, "language_loss": 0.82892007, "learning_rate": 3.989597081453611e-06, "loss": 0.85157096, "num_input_tokens_seen": 21881880, "step": 1023, "time_per_iteration": 2.71539568901062 }, { "auxiliary_loss_clip": 0.01100664, "auxiliary_loss_mlp": 0.01010751, "balance_loss_clip": 1.03727341, "balance_loss_mlp": 1.00614953, "epoch": 0.0615662107320006, "flos": 56741482005120.0, "grad_norm": 0.8894752517384502, "language_loss": 0.6505782, "learning_rate": 3.989557372392231e-06, "loss": 0.67169237, "num_input_tokens_seen": 21940550, "step": 1024, "time_per_iteration": 3.175217628479004 }, { "auxiliary_loss_clip": 0.01167458, "auxiliary_loss_mlp": 0.01073669, "balance_loss_clip": 1.05906856, "balance_loss_mlp": 1.04553604, "epoch": 0.06162633398466857, "flos": 22564793427840.0, "grad_norm": 2.320347485789288, "language_loss": 0.88069236, "learning_rate": 3.989517587886636e-06, "loss": 0.90310359, "num_input_tokens_seen": 21958390, "step": 1025, "time_per_iteration": 2.690725564956665 }, { "auxiliary_loss_clip": 0.01197064, "auxiliary_loss_mlp": 0.01066504, "balance_loss_clip": 1.06452, "balance_loss_mlp": 1.04173219, "epoch": 0.06168645723733654, "flos": 25593104234880.0, "grad_norm": 2.5217294712155414, "language_loss": 0.84536898, "learning_rate": 3.989477727938335e-06, "loss": 0.86800468, "num_input_tokens_seen": 21978625, "step": 1026, "time_per_iteration": 2.7420806884765625 }, { "auxiliary_loss_clip": 0.01160797, "auxiliary_loss_mlp": 0.0107525, "balance_loss_clip": 1.05669701, "balance_loss_mlp": 1.04934609, "epoch": 0.06174658049000451, "flos": 15997917162240.0, "grad_norm": 2.354014397396182, "language_loss": 0.8228389, "learning_rate": 3.989437792548839e-06, "loss": 0.84519935, "num_input_tokens_seen": 21996035, "step": 1027, "time_per_iteration": 2.6683874130249023 }, { "auxiliary_loss_clip": 0.01160199, "auxiliary_loss_mlp": 0.01067253, "balance_loss_clip": 1.06181073, "balance_loss_mlp": 1.04232645, "epoch": 0.06180670374267248, "flos": 11285970117120.0, "grad_norm": 4.43492107605727, "language_loss": 0.83898664, "learning_rate": 3.989397781719663e-06, "loss": 0.86126107, "num_input_tokens_seen": 22011625, "step": 1028, "time_per_iteration": 2.705387592315674 }, { "auxiliary_loss_clip": 0.0106503, "auxiliary_loss_mlp": 0.01008074, "balance_loss_clip": 1.02410197, "balance_loss_mlp": 1.00347257, "epoch": 0.06186682699534045, "flos": 65130142216320.0, "grad_norm": 0.9383255649985517, "language_loss": 0.604738, "learning_rate": 3.989357695452323e-06, "loss": 0.62546903, "num_input_tokens_seen": 22066035, "step": 1029, "time_per_iteration": 3.0268616676330566 }, { "auxiliary_loss_clip": 0.01176182, "auxiliary_loss_mlp": 0.01074173, "balance_loss_clip": 1.05641246, "balance_loss_mlp": 1.04737473, "epoch": 0.061926950248008414, "flos": 21105742976640.0, "grad_norm": 4.246634693563946, "language_loss": 0.82589179, "learning_rate": 3.98931753374834e-06, "loss": 0.84839535, "num_input_tokens_seen": 22085015, "step": 1030, "time_per_iteration": 2.7035892009735107 }, { "auxiliary_loss_clip": 0.0122298, "auxiliary_loss_mlp": 0.01077745, "balance_loss_clip": 1.06850278, "balance_loss_mlp": 1.05185235, "epoch": 0.061987073500676386, "flos": 17748454481280.0, "grad_norm": 2.585240230669548, "language_loss": 0.79576576, "learning_rate": 3.989277296609237e-06, "loss": 0.81877303, "num_input_tokens_seen": 22102775, "step": 1031, "time_per_iteration": 2.60622501373291 }, { "auxiliary_loss_clip": 0.01188957, "auxiliary_loss_mlp": 0.01076754, "balance_loss_clip": 1.06396544, "balance_loss_mlp": 1.04982424, "epoch": 0.06204719675334436, "flos": 21836237869440.0, "grad_norm": 1.8815476991595563, "language_loss": 0.77384412, "learning_rate": 3.98923698403654e-06, "loss": 0.79650116, "num_input_tokens_seen": 22121680, "step": 1032, "time_per_iteration": 2.6753971576690674 }, { "auxiliary_loss_clip": 0.01198757, "auxiliary_loss_mlp": 0.01074736, "balance_loss_clip": 1.05916619, "balance_loss_mlp": 1.04848623, "epoch": 0.06210732000601232, "flos": 19353697286400.0, "grad_norm": 3.147941025479245, "language_loss": 0.89323574, "learning_rate": 3.989196596031776e-06, "loss": 0.91597068, "num_input_tokens_seen": 22138155, "step": 1033, "time_per_iteration": 2.7313079833984375 }, { "auxiliary_loss_clip": 0.01209161, "auxiliary_loss_mlp": 0.01066082, "balance_loss_clip": 1.06214237, "balance_loss_mlp": 1.04119134, "epoch": 0.062167443258680295, "flos": 24749382695040.0, "grad_norm": 2.1035343880884145, "language_loss": 0.8455385, "learning_rate": 3.989156132596479e-06, "loss": 0.8682909, "num_input_tokens_seen": 22157420, "step": 1034, "time_per_iteration": 2.7541439533233643 }, { "auxiliary_loss_clip": 0.01180042, "auxiliary_loss_mlp": 0.01057312, "balance_loss_clip": 1.05896068, "balance_loss_mlp": 1.03155136, "epoch": 0.06222756651134827, "flos": 34458478773120.0, "grad_norm": 1.8983498110529735, "language_loss": 0.8082794, "learning_rate": 3.989115593732182e-06, "loss": 0.83065289, "num_input_tokens_seen": 22178620, "step": 1035, "time_per_iteration": 2.7965424060821533 }, { "auxiliary_loss_clip": 0.01158806, "auxiliary_loss_mlp": 0.01072478, "balance_loss_clip": 1.05936599, "balance_loss_mlp": 1.04432034, "epoch": 0.06228768976401623, "flos": 25666469763840.0, "grad_norm": 2.145216314952277, "language_loss": 0.78365827, "learning_rate": 3.989074979440421e-06, "loss": 0.80597103, "num_input_tokens_seen": 22197125, "step": 1036, "time_per_iteration": 2.7858450412750244 }, { "auxiliary_loss_clip": 0.01192097, "auxiliary_loss_mlp": 0.01071382, "balance_loss_clip": 1.05977845, "balance_loss_mlp": 1.04663444, "epoch": 0.062347813016684205, "flos": 25295619795840.0, "grad_norm": 1.9535870339716077, "language_loss": 0.86544567, "learning_rate": 3.989034289722739e-06, "loss": 0.88808048, "num_input_tokens_seen": 22217575, "step": 1037, "time_per_iteration": 2.685373306274414 }, { "auxiliary_loss_clip": 0.01197778, "auxiliary_loss_mlp": 0.01057095, "balance_loss_clip": 1.06127763, "balance_loss_mlp": 1.02966499, "epoch": 0.06240793626935217, "flos": 26907039740160.0, "grad_norm": 2.697396725345887, "language_loss": 0.8067717, "learning_rate": 3.988993524580676e-06, "loss": 0.82932043, "num_input_tokens_seen": 22236840, "step": 1038, "time_per_iteration": 2.7305831909179688 }, { "auxiliary_loss_clip": 0.01145896, "auxiliary_loss_mlp": 0.01072721, "balance_loss_clip": 1.05226004, "balance_loss_mlp": 1.04330015, "epoch": 0.06246805952202014, "flos": 21615782146560.0, "grad_norm": 1.8888526922505675, "language_loss": 0.85465872, "learning_rate": 3.98895268401578e-06, "loss": 0.87684488, "num_input_tokens_seen": 22256465, "step": 1039, "time_per_iteration": 2.7351109981536865 }, { "auxiliary_loss_clip": 0.01188545, "auxiliary_loss_mlp": 0.01070323, "balance_loss_clip": 1.05834138, "balance_loss_mlp": 1.04472923, "epoch": 0.0625281827746881, "flos": 19311896833920.0, "grad_norm": 2.217895985816133, "language_loss": 0.81172895, "learning_rate": 3.9889117680296e-06, "loss": 0.83431756, "num_input_tokens_seen": 22274025, "step": 1040, "time_per_iteration": 2.6532907485961914 }, { "auxiliary_loss_clip": 0.0121654, "auxiliary_loss_mlp": 0.0106312, "balance_loss_clip": 1.06718016, "balance_loss_mlp": 1.03808582, "epoch": 0.06258830602735609, "flos": 27745769289600.0, "grad_norm": 2.1960038080149817, "language_loss": 0.69304991, "learning_rate": 3.988870776623685e-06, "loss": 0.71584648, "num_input_tokens_seen": 22292245, "step": 1041, "time_per_iteration": 2.6445486545562744 }, { "auxiliary_loss_clip": 0.01214659, "auxiliary_loss_mlp": 0.01057975, "balance_loss_clip": 1.06247008, "balance_loss_mlp": 1.03182077, "epoch": 0.06264842928002405, "flos": 23222605150080.0, "grad_norm": 2.7326158002445, "language_loss": 0.81187552, "learning_rate": 3.9888297097995905e-06, "loss": 0.83460188, "num_input_tokens_seen": 22311455, "step": 1042, "time_per_iteration": 2.6111559867858887 }, { "auxiliary_loss_clip": 0.01211653, "auxiliary_loss_mlp": 0.01052676, "balance_loss_clip": 1.06253886, "balance_loss_mlp": 1.02871442, "epoch": 0.06270855253269202, "flos": 38399495189760.0, "grad_norm": 1.7165873820424848, "language_loss": 0.76349056, "learning_rate": 3.988788567558874e-06, "loss": 0.78613389, "num_input_tokens_seen": 22333750, "step": 1043, "time_per_iteration": 2.761768341064453 }, { "auxiliary_loss_clip": 0.0118944, "auxiliary_loss_mlp": 0.01063189, "balance_loss_clip": 1.06111181, "balance_loss_mlp": 1.03912091, "epoch": 0.06276867578535998, "flos": 22453542028800.0, "grad_norm": 8.34017761542712, "language_loss": 0.92031956, "learning_rate": 3.988747349903097e-06, "loss": 0.94284582, "num_input_tokens_seen": 22351940, "step": 1044, "time_per_iteration": 2.636179208755493 }, { "auxiliary_loss_clip": 0.01192566, "auxiliary_loss_mlp": 0.01070128, "balance_loss_clip": 1.05862689, "balance_loss_mlp": 1.0456785, "epoch": 0.06282879903802796, "flos": 22930435923840.0, "grad_norm": 2.3486674311430944, "language_loss": 0.85913992, "learning_rate": 3.988706056833821e-06, "loss": 0.88176692, "num_input_tokens_seen": 22372085, "step": 1045, "time_per_iteration": 2.7749502658843994 }, { "auxiliary_loss_clip": 0.01179197, "auxiliary_loss_mlp": 0.01065176, "balance_loss_clip": 1.05804443, "balance_loss_mlp": 1.04053521, "epoch": 0.06288892229069593, "flos": 34819237019520.0, "grad_norm": 1.9846122850853416, "language_loss": 0.7796576, "learning_rate": 3.9886646883526125e-06, "loss": 0.80210131, "num_input_tokens_seen": 22392020, "step": 1046, "time_per_iteration": 2.803135871887207 }, { "auxiliary_loss_clip": 0.01197344, "auxiliary_loss_mlp": 0.01069269, "balance_loss_clip": 1.06361508, "balance_loss_mlp": 1.04558206, "epoch": 0.06294904554336389, "flos": 19427134642560.0, "grad_norm": 2.174325060947129, "language_loss": 0.77326387, "learning_rate": 3.988623244461039e-06, "loss": 0.79592997, "num_input_tokens_seen": 22411180, "step": 1047, "time_per_iteration": 2.647446632385254 }, { "auxiliary_loss_clip": 0.01200907, "auxiliary_loss_mlp": 0.0105799, "balance_loss_clip": 1.06238222, "balance_loss_mlp": 1.03314662, "epoch": 0.06300916879603187, "flos": 40661867358720.0, "grad_norm": 2.4899372640825046, "language_loss": 0.77190751, "learning_rate": 3.988581725160672e-06, "loss": 0.79449654, "num_input_tokens_seen": 22435105, "step": 1048, "time_per_iteration": 2.8167293071746826 }, { "auxiliary_loss_clip": 0.0118184, "auxiliary_loss_mlp": 0.01064361, "balance_loss_clip": 1.0613215, "balance_loss_mlp": 1.03914821, "epoch": 0.06306929204869983, "flos": 23804142341760.0, "grad_norm": 4.606540291271834, "language_loss": 0.77258086, "learning_rate": 3.988540130453087e-06, "loss": 0.79504287, "num_input_tokens_seen": 22452710, "step": 1049, "time_per_iteration": 2.6908538341522217 }, { "auxiliary_loss_clip": 0.01194538, "auxiliary_loss_mlp": 0.0105682, "balance_loss_clip": 1.06043661, "balance_loss_mlp": 1.03290701, "epoch": 0.0631294153013678, "flos": 18915802583040.0, "grad_norm": 2.515998307474139, "language_loss": 0.83302009, "learning_rate": 3.988498460339862e-06, "loss": 0.85553372, "num_input_tokens_seen": 22470175, "step": 1050, "time_per_iteration": 2.62186861038208 }, { "auxiliary_loss_clip": 0.01210654, "auxiliary_loss_mlp": 0.01062894, "balance_loss_clip": 1.06468701, "balance_loss_mlp": 1.04008913, "epoch": 0.06318953855403578, "flos": 24280174310400.0, "grad_norm": 5.5478202090132065, "language_loss": 0.76564771, "learning_rate": 3.988456714822575e-06, "loss": 0.78838319, "num_input_tokens_seen": 22490020, "step": 1051, "time_per_iteration": 2.732269525527954 }, { "auxiliary_loss_clip": 0.01188416, "auxiliary_loss_mlp": 0.01069443, "balance_loss_clip": 1.06340146, "balance_loss_mlp": 1.04492211, "epoch": 0.06324966180670374, "flos": 22528918719360.0, "grad_norm": 1.9993900469270787, "language_loss": 0.80410004, "learning_rate": 3.98841489390281e-06, "loss": 0.82667863, "num_input_tokens_seen": 22509685, "step": 1052, "time_per_iteration": 2.7683873176574707 }, { "auxiliary_loss_clip": 0.01211333, "auxiliary_loss_mlp": 0.01058255, "balance_loss_clip": 1.06324601, "balance_loss_mlp": 1.03468728, "epoch": 0.06330978505937171, "flos": 15778107884160.0, "grad_norm": 2.370007457349547, "language_loss": 0.77433288, "learning_rate": 3.988372997582155e-06, "loss": 0.79702866, "num_input_tokens_seen": 22527905, "step": 1053, "time_per_iteration": 5.757168531417847 }, { "auxiliary_loss_clip": 0.01190721, "auxiliary_loss_mlp": 0.00780448, "balance_loss_clip": 1.06378174, "balance_loss_mlp": 1.00028598, "epoch": 0.06336990831203967, "flos": 21471098163840.0, "grad_norm": 3.085258828985267, "language_loss": 0.84931248, "learning_rate": 3.988331025862195e-06, "loss": 0.86902416, "num_input_tokens_seen": 22546335, "step": 1054, "time_per_iteration": 2.7733829021453857 }, { "auxiliary_loss_clip": 0.01172281, "auxiliary_loss_mlp": 0.01061232, "balance_loss_clip": 1.05722666, "balance_loss_mlp": 1.03753328, "epoch": 0.06343003156470765, "flos": 18478877546880.0, "grad_norm": 2.0168531459993435, "language_loss": 0.85884213, "learning_rate": 3.9882889787445225e-06, "loss": 0.88117731, "num_input_tokens_seen": 22563885, "step": 1055, "time_per_iteration": 4.490305185317993 }, { "auxiliary_loss_clip": 0.01164237, "auxiliary_loss_mlp": 0.01069785, "balance_loss_clip": 1.05727792, "balance_loss_mlp": 1.04534709, "epoch": 0.06349015481737562, "flos": 25154886309120.0, "grad_norm": 2.4509218988768, "language_loss": 0.8113938, "learning_rate": 3.988246856230734e-06, "loss": 0.83373404, "num_input_tokens_seen": 22583035, "step": 1056, "time_per_iteration": 5.345282793045044 }, { "auxiliary_loss_clip": 0.01144181, "auxiliary_loss_mlp": 0.01061125, "balance_loss_clip": 1.04991364, "balance_loss_mlp": 1.03449368, "epoch": 0.06355027807004358, "flos": 26871775562880.0, "grad_norm": 2.2117272688527128, "language_loss": 0.81083393, "learning_rate": 3.988204658322426e-06, "loss": 0.83288693, "num_input_tokens_seen": 22605055, "step": 1057, "time_per_iteration": 2.866757392883301 }, { "auxiliary_loss_clip": 0.01139076, "auxiliary_loss_mlp": 0.01061742, "balance_loss_clip": 1.04970908, "balance_loss_mlp": 1.03918755, "epoch": 0.06361040132271156, "flos": 21396691140480.0, "grad_norm": 1.9636971972870172, "language_loss": 0.83353591, "learning_rate": 3.988162385021196e-06, "loss": 0.85554409, "num_input_tokens_seen": 22623760, "step": 1058, "time_per_iteration": 2.767024278640747 }, { "auxiliary_loss_clip": 0.0117752, "auxiliary_loss_mlp": 0.01059639, "balance_loss_clip": 1.0576936, "balance_loss_mlp": 1.03408027, "epoch": 0.06367052457537953, "flos": 25733765894400.0, "grad_norm": 2.137077300251244, "language_loss": 0.87556928, "learning_rate": 3.988120036328651e-06, "loss": 0.89794087, "num_input_tokens_seen": 22643000, "step": 1059, "time_per_iteration": 2.794734239578247 }, { "auxiliary_loss_clip": 0.01169658, "auxiliary_loss_mlp": 0.01063463, "balance_loss_clip": 1.06196678, "balance_loss_mlp": 1.0383693, "epoch": 0.0637306478280475, "flos": 17631420992640.0, "grad_norm": 2.543966627588717, "language_loss": 0.91561133, "learning_rate": 3.988077612246394e-06, "loss": 0.93794256, "num_input_tokens_seen": 22660460, "step": 1060, "time_per_iteration": 2.8223626613616943 }, { "auxiliary_loss_clip": 0.01173933, "auxiliary_loss_mlp": 0.01065151, "balance_loss_clip": 1.05715585, "balance_loss_mlp": 1.03981876, "epoch": 0.06379077108071547, "flos": 13662610427520.0, "grad_norm": 1.9401711052692647, "language_loss": 0.87242293, "learning_rate": 3.988035112776035e-06, "loss": 0.89481378, "num_input_tokens_seen": 22679270, "step": 1061, "time_per_iteration": 2.7783865928649902 }, { "auxiliary_loss_clip": 0.01190039, "auxiliary_loss_mlp": 0.01059971, "balance_loss_clip": 1.05976009, "balance_loss_mlp": 1.03388786, "epoch": 0.06385089433338344, "flos": 28478849961600.0, "grad_norm": 5.360593029379932, "language_loss": 0.77407908, "learning_rate": 3.987992537919185e-06, "loss": 0.79657912, "num_input_tokens_seen": 22699330, "step": 1062, "time_per_iteration": 2.872587203979492 }, { "auxiliary_loss_clip": 0.01172912, "auxiliary_loss_mlp": 0.01061175, "balance_loss_clip": 1.05884075, "balance_loss_mlp": 1.03798842, "epoch": 0.0639110175860514, "flos": 24311057028480.0, "grad_norm": 2.2658654128491436, "language_loss": 0.86522883, "learning_rate": 3.987949887677459e-06, "loss": 0.88756967, "num_input_tokens_seen": 22717945, "step": 1063, "time_per_iteration": 2.7915029525756836 }, { "auxiliary_loss_clip": 0.01207773, "auxiliary_loss_mlp": 0.01062698, "balance_loss_clip": 1.05969334, "balance_loss_mlp": 1.03846335, "epoch": 0.06397114083871938, "flos": 22090772620800.0, "grad_norm": 2.302236346678267, "language_loss": 0.79908657, "learning_rate": 3.9879071620524744e-06, "loss": 0.82179129, "num_input_tokens_seen": 22736790, "step": 1064, "time_per_iteration": 2.6880991458892822 }, { "auxiliary_loss_clip": 0.01198826, "auxiliary_loss_mlp": 0.01066465, "balance_loss_clip": 1.0603801, "balance_loss_mlp": 1.04149103, "epoch": 0.06403126409138735, "flos": 19572824206080.0, "grad_norm": 3.1552731138796215, "language_loss": 0.84327948, "learning_rate": 3.987864361045851e-06, "loss": 0.8659324, "num_input_tokens_seen": 22754745, "step": 1065, "time_per_iteration": 2.6956398487091064 }, { "auxiliary_loss_clip": 0.01168098, "auxiliary_loss_mlp": 0.01054905, "balance_loss_clip": 1.0597136, "balance_loss_mlp": 1.03162324, "epoch": 0.06409138734405531, "flos": 40807413267840.0, "grad_norm": 1.52830872536012, "language_loss": 0.68177885, "learning_rate": 3.987821484659211e-06, "loss": 0.70400894, "num_input_tokens_seen": 22776780, "step": 1066, "time_per_iteration": 2.9867773056030273 }, { "auxiliary_loss_clip": 0.01214184, "auxiliary_loss_mlp": 0.01070649, "balance_loss_clip": 1.06780005, "balance_loss_mlp": 1.04609215, "epoch": 0.06415151059672328, "flos": 20441610460800.0, "grad_norm": 1.8546001537284342, "language_loss": 0.90349269, "learning_rate": 3.987778532894181e-06, "loss": 0.926341, "num_input_tokens_seen": 22793915, "step": 1067, "time_per_iteration": 2.685896873474121 }, { "auxiliary_loss_clip": 0.01188134, "auxiliary_loss_mlp": 0.01063022, "balance_loss_clip": 1.0623709, "balance_loss_mlp": 1.03969264, "epoch": 0.06421163384939126, "flos": 18072045129600.0, "grad_norm": 2.189788428445167, "language_loss": 0.83437371, "learning_rate": 3.987735505752391e-06, "loss": 0.85688531, "num_input_tokens_seen": 22812670, "step": 1068, "time_per_iteration": 2.851602554321289 }, { "auxiliary_loss_clip": 0.01178972, "auxiliary_loss_mlp": 0.01057745, "balance_loss_clip": 1.05909026, "balance_loss_mlp": 1.03426039, "epoch": 0.06427175710205922, "flos": 25119442563840.0, "grad_norm": 3.045176948020938, "language_loss": 0.89311272, "learning_rate": 3.987692403235471e-06, "loss": 0.9154799, "num_input_tokens_seen": 22832440, "step": 1069, "time_per_iteration": 2.7825255393981934 }, { "auxiliary_loss_clip": 0.01185672, "auxiliary_loss_mlp": 0.01071834, "balance_loss_clip": 1.06158304, "balance_loss_mlp": 1.04663301, "epoch": 0.06433188035472719, "flos": 17380549428480.0, "grad_norm": 2.7038488706194808, "language_loss": 0.95759481, "learning_rate": 3.987649225345056e-06, "loss": 0.98016989, "num_input_tokens_seen": 22845495, "step": 1070, "time_per_iteration": 2.715296506881714 }, { "auxiliary_loss_clip": 0.01140792, "auxiliary_loss_mlp": 0.01056718, "balance_loss_clip": 1.05607581, "balance_loss_mlp": 1.03027749, "epoch": 0.06439200360739517, "flos": 23546267625600.0, "grad_norm": 1.630790580283393, "language_loss": 0.8811003, "learning_rate": 3.987605972082782e-06, "loss": 0.90307534, "num_input_tokens_seen": 22865390, "step": 1071, "time_per_iteration": 2.8445394039154053 }, { "auxiliary_loss_clip": 0.01155172, "auxiliary_loss_mlp": 0.01054986, "balance_loss_clip": 1.05483651, "balance_loss_mlp": 1.03102481, "epoch": 0.06445212686006313, "flos": 21979772616960.0, "grad_norm": 1.8349443396730127, "language_loss": 0.76116478, "learning_rate": 3.987562643450292e-06, "loss": 0.78326637, "num_input_tokens_seen": 22885495, "step": 1072, "time_per_iteration": 2.8330819606781006 }, { "auxiliary_loss_clip": 0.01172997, "auxiliary_loss_mlp": 0.01070104, "balance_loss_clip": 1.05975842, "balance_loss_mlp": 1.04362798, "epoch": 0.0645122501127311, "flos": 25921291824000.0, "grad_norm": 2.724283900767911, "language_loss": 0.80849886, "learning_rate": 3.987519239449226e-06, "loss": 0.83092993, "num_input_tokens_seen": 22904845, "step": 1073, "time_per_iteration": 2.748286247253418 }, { "auxiliary_loss_clip": 0.01194712, "auxiliary_loss_mlp": 0.01062452, "balance_loss_clip": 1.06345201, "balance_loss_mlp": 1.03825283, "epoch": 0.06457237336539907, "flos": 25626034028160.0, "grad_norm": 5.0538746884234245, "language_loss": 0.80282539, "learning_rate": 3.987475760081233e-06, "loss": 0.82539707, "num_input_tokens_seen": 22925940, "step": 1074, "time_per_iteration": 2.7482337951660156 }, { "auxiliary_loss_clip": 0.01173366, "auxiliary_loss_mlp": 0.01057774, "balance_loss_clip": 1.05920076, "balance_loss_mlp": 1.03256142, "epoch": 0.06463249661806704, "flos": 19463979018240.0, "grad_norm": 2.0209756517373707, "language_loss": 0.79249811, "learning_rate": 3.987432205347958e-06, "loss": 0.8148095, "num_input_tokens_seen": 22944375, "step": 1075, "time_per_iteration": 2.6937224864959717 }, { "auxiliary_loss_clip": 0.01171297, "auxiliary_loss_mlp": 0.01063569, "balance_loss_clip": 1.05735481, "balance_loss_mlp": 1.04025126, "epoch": 0.064692619870735, "flos": 24498044254080.0, "grad_norm": 2.9028991302223357, "language_loss": 0.88208115, "learning_rate": 3.987388575251055e-06, "loss": 0.90442967, "num_input_tokens_seen": 22959145, "step": 1076, "time_per_iteration": 2.878103256225586 }, { "auxiliary_loss_clip": 0.01192915, "auxiliary_loss_mlp": 0.01052877, "balance_loss_clip": 1.06164443, "balance_loss_mlp": 1.0288558, "epoch": 0.06475274312340297, "flos": 17018677860480.0, "grad_norm": 2.225760792581628, "language_loss": 0.80876106, "learning_rate": 3.98734486979218e-06, "loss": 0.83121902, "num_input_tokens_seen": 22978100, "step": 1077, "time_per_iteration": 2.7221076488494873 }, { "auxiliary_loss_clip": 0.01200466, "auxiliary_loss_mlp": 0.01064019, "balance_loss_clip": 1.0656153, "balance_loss_mlp": 1.03866291, "epoch": 0.06481286637607095, "flos": 24572379450240.0, "grad_norm": 2.256787147683815, "language_loss": 0.91727465, "learning_rate": 3.987301088972986e-06, "loss": 0.93991947, "num_input_tokens_seen": 22997285, "step": 1078, "time_per_iteration": 2.862365484237671 }, { "auxiliary_loss_clip": 0.0122435, "auxiliary_loss_mlp": 0.01060225, "balance_loss_clip": 1.06826639, "balance_loss_mlp": 1.03552508, "epoch": 0.06487298962873891, "flos": 21105635235840.0, "grad_norm": 2.080056711608912, "language_loss": 0.78349572, "learning_rate": 3.987257232795137e-06, "loss": 0.80634147, "num_input_tokens_seen": 23016285, "step": 1079, "time_per_iteration": 2.6435368061065674 }, { "auxiliary_loss_clip": 0.01156927, "auxiliary_loss_mlp": 0.01063794, "balance_loss_clip": 1.05512071, "balance_loss_mlp": 1.03899896, "epoch": 0.06493311288140688, "flos": 24608182331520.0, "grad_norm": 2.274862403364013, "language_loss": 0.68702769, "learning_rate": 3.987213301260294e-06, "loss": 0.70923495, "num_input_tokens_seen": 23036420, "step": 1080, "time_per_iteration": 2.7782626152038574 }, { "auxiliary_loss_clip": 0.01175684, "auxiliary_loss_mlp": 0.01062351, "balance_loss_clip": 1.06640029, "balance_loss_mlp": 1.03610086, "epoch": 0.06499323613407486, "flos": 25337994865920.0, "grad_norm": 1.886196453243775, "language_loss": 0.72291583, "learning_rate": 3.987169294370123e-06, "loss": 0.74529618, "num_input_tokens_seen": 23056945, "step": 1081, "time_per_iteration": 2.7983880043029785 }, { "auxiliary_loss_clip": 0.01139671, "auxiliary_loss_mlp": 0.01066686, "balance_loss_clip": 1.0504055, "balance_loss_mlp": 1.04076982, "epoch": 0.06505335938674282, "flos": 20375714960640.0, "grad_norm": 3.3093934650613566, "language_loss": 0.84059012, "learning_rate": 3.987125212126294e-06, "loss": 0.86265367, "num_input_tokens_seen": 23074940, "step": 1082, "time_per_iteration": 2.8351900577545166 }, { "auxiliary_loss_clip": 0.01204185, "auxiliary_loss_mlp": 0.01063692, "balance_loss_clip": 1.06306195, "balance_loss_mlp": 1.03809738, "epoch": 0.06511348263941079, "flos": 25337923038720.0, "grad_norm": 2.894360492506304, "language_loss": 0.82550305, "learning_rate": 3.987081054530478e-06, "loss": 0.84818184, "num_input_tokens_seen": 23093420, "step": 1083, "time_per_iteration": 2.866729974746704 }, { "auxiliary_loss_clip": 0.01168245, "auxiliary_loss_mlp": 0.01062938, "balance_loss_clip": 1.06021011, "balance_loss_mlp": 1.03655696, "epoch": 0.06517360589207877, "flos": 20332801186560.0, "grad_norm": 2.468736383036802, "language_loss": 0.79289383, "learning_rate": 3.987036821584348e-06, "loss": 0.81520569, "num_input_tokens_seen": 23111550, "step": 1084, "time_per_iteration": 2.816601276397705 }, { "auxiliary_loss_clip": 0.01174068, "auxiliary_loss_mlp": 0.0106167, "balance_loss_clip": 1.05854714, "balance_loss_mlp": 1.03667152, "epoch": 0.06523372914474673, "flos": 31681650061440.0, "grad_norm": 2.571590277205686, "language_loss": 0.66443276, "learning_rate": 3.986992513289584e-06, "loss": 0.68679011, "num_input_tokens_seen": 23130335, "step": 1085, "time_per_iteration": 2.8260092735290527 }, { "auxiliary_loss_clip": 0.01170818, "auxiliary_loss_mlp": 0.01062435, "balance_loss_clip": 1.0600934, "balance_loss_mlp": 1.03833067, "epoch": 0.0652938523974147, "flos": 20778165918720.0, "grad_norm": 2.0478791529086977, "language_loss": 0.76548934, "learning_rate": 3.9869481296478645e-06, "loss": 0.78782183, "num_input_tokens_seen": 23152380, "step": 1086, "time_per_iteration": 2.7937023639678955 }, { "auxiliary_loss_clip": 0.01198609, "auxiliary_loss_mlp": 0.01059288, "balance_loss_clip": 1.06335294, "balance_loss_mlp": 1.03519547, "epoch": 0.06535397565008266, "flos": 16690993061760.0, "grad_norm": 2.1629448601391017, "language_loss": 0.85109925, "learning_rate": 3.986903670660872e-06, "loss": 0.87367821, "num_input_tokens_seen": 23171630, "step": 1087, "time_per_iteration": 2.7510013580322266 }, { "auxiliary_loss_clip": 0.01184978, "auxiliary_loss_mlp": 0.01059017, "balance_loss_clip": 1.06293821, "balance_loss_mlp": 1.03510392, "epoch": 0.06541409890275064, "flos": 26868220116480.0, "grad_norm": 1.7886353193129139, "language_loss": 0.77776635, "learning_rate": 3.9868591363302945e-06, "loss": 0.80020636, "num_input_tokens_seen": 23192520, "step": 1088, "time_per_iteration": 2.7792751789093018 }, { "auxiliary_loss_clip": 0.01192707, "auxiliary_loss_mlp": 0.01067634, "balance_loss_clip": 1.06569457, "balance_loss_mlp": 1.04498422, "epoch": 0.06547422215541861, "flos": 20521620005760.0, "grad_norm": 3.0334087154373375, "language_loss": 0.71050513, "learning_rate": 3.9868145266578186e-06, "loss": 0.73310852, "num_input_tokens_seen": 23210710, "step": 1089, "time_per_iteration": 2.8832852840423584 }, { "auxiliary_loss_clip": 0.01173663, "auxiliary_loss_mlp": 0.00781529, "balance_loss_clip": 1.06159782, "balance_loss_mlp": 1.00019014, "epoch": 0.06553434540808657, "flos": 22016616992640.0, "grad_norm": 2.02973275746688, "language_loss": 0.85650897, "learning_rate": 3.9867698416451366e-06, "loss": 0.87606084, "num_input_tokens_seen": 23230305, "step": 1090, "time_per_iteration": 2.7933149337768555 }, { "auxiliary_loss_clip": 0.01214666, "auxiliary_loss_mlp": 0.0105885, "balance_loss_clip": 1.06735325, "balance_loss_mlp": 1.03460288, "epoch": 0.06559446866075455, "flos": 24608649208320.0, "grad_norm": 2.137212216289862, "language_loss": 0.71829313, "learning_rate": 3.9867250812939434e-06, "loss": 0.74102825, "num_input_tokens_seen": 23249015, "step": 1091, "time_per_iteration": 2.646592855453491 }, { "auxiliary_loss_clip": 0.01121055, "auxiliary_loss_mlp": 0.0106405, "balance_loss_clip": 1.05242276, "balance_loss_mlp": 1.03961205, "epoch": 0.06565459191342252, "flos": 24274679529600.0, "grad_norm": 2.2773849385721956, "language_loss": 0.82839823, "learning_rate": 3.986680245605936e-06, "loss": 0.85024923, "num_input_tokens_seen": 23265105, "step": 1092, "time_per_iteration": 4.799649715423584 }, { "auxiliary_loss_clip": 0.01215092, "auxiliary_loss_mlp": 0.01059151, "balance_loss_clip": 1.0640471, "balance_loss_mlp": 1.03352082, "epoch": 0.06571471516609048, "flos": 24787124910720.0, "grad_norm": 2.268968080418226, "language_loss": 0.71134168, "learning_rate": 3.986635334582814e-06, "loss": 0.73408413, "num_input_tokens_seen": 23283950, "step": 1093, "time_per_iteration": 5.3356239795684814 }, { "auxiliary_loss_clip": 0.01190682, "auxiliary_loss_mlp": 0.01064498, "balance_loss_clip": 1.06751943, "balance_loss_mlp": 1.0392611, "epoch": 0.06577483841875846, "flos": 26214071581440.0, "grad_norm": 3.829837904337144, "language_loss": 0.87996346, "learning_rate": 3.986590348226282e-06, "loss": 0.90251523, "num_input_tokens_seen": 23305005, "step": 1094, "time_per_iteration": 2.853489637374878 }, { "auxiliary_loss_clip": 0.01192742, "auxiliary_loss_mlp": 0.01065068, "balance_loss_clip": 1.06367433, "balance_loss_mlp": 1.03843689, "epoch": 0.06583496167142643, "flos": 25080802508160.0, "grad_norm": 1.6736216436017588, "language_loss": 0.81483954, "learning_rate": 3.986545286538044e-06, "loss": 0.8374176, "num_input_tokens_seen": 23323220, "step": 1095, "time_per_iteration": 5.1613922119140625 }, { "auxiliary_loss_clip": 0.01166049, "auxiliary_loss_mlp": 0.01058945, "balance_loss_clip": 1.06295943, "balance_loss_mlp": 1.03598547, "epoch": 0.06589508492409439, "flos": 25629804956160.0, "grad_norm": 2.0200125290673068, "language_loss": 0.69789279, "learning_rate": 3.986500149519811e-06, "loss": 0.72014272, "num_input_tokens_seen": 23342235, "step": 1096, "time_per_iteration": 2.804025173187256 }, { "auxiliary_loss_clip": 0.01201939, "auxiliary_loss_mlp": 0.01070786, "balance_loss_clip": 1.06405246, "balance_loss_mlp": 1.04614568, "epoch": 0.06595520817676236, "flos": 23621249266560.0, "grad_norm": 1.7011375462517908, "language_loss": 0.77430046, "learning_rate": 3.986454937173292e-06, "loss": 0.79702777, "num_input_tokens_seen": 23363680, "step": 1097, "time_per_iteration": 2.7658958435058594 }, { "auxiliary_loss_clip": 0.01215996, "auxiliary_loss_mlp": 0.01063445, "balance_loss_clip": 1.06707537, "balance_loss_mlp": 1.03959155, "epoch": 0.06601533142943034, "flos": 33801708545280.0, "grad_norm": 1.8316558452843608, "language_loss": 0.78217584, "learning_rate": 3.986409649500203e-06, "loss": 0.80497026, "num_input_tokens_seen": 23385590, "step": 1098, "time_per_iteration": 2.865684747695923 }, { "auxiliary_loss_clip": 0.01197349, "auxiliary_loss_mlp": 0.01069192, "balance_loss_clip": 1.06328607, "balance_loss_mlp": 1.04443276, "epoch": 0.0660754546820983, "flos": 20259184262400.0, "grad_norm": 1.9237510259783663, "language_loss": 0.81525648, "learning_rate": 3.986364286502261e-06, "loss": 0.83792192, "num_input_tokens_seen": 23402945, "step": 1099, "time_per_iteration": 2.690377950668335 }, { "auxiliary_loss_clip": 0.01179995, "auxiliary_loss_mlp": 0.0105819, "balance_loss_clip": 1.0578239, "balance_loss_mlp": 1.03428841, "epoch": 0.06613557793476627, "flos": 19354164163200.0, "grad_norm": 1.9906927310803755, "language_loss": 0.82793295, "learning_rate": 3.986318848181186e-06, "loss": 0.8503148, "num_input_tokens_seen": 23421410, "step": 1100, "time_per_iteration": 2.7613909244537354 }, { "auxiliary_loss_clip": 0.01191263, "auxiliary_loss_mlp": 0.0105903, "balance_loss_clip": 1.06985724, "balance_loss_mlp": 1.03529549, "epoch": 0.06619570118743424, "flos": 13772568936960.0, "grad_norm": 2.079994286400427, "language_loss": 0.73502243, "learning_rate": 3.986273334538702e-06, "loss": 0.75752538, "num_input_tokens_seen": 23438870, "step": 1101, "time_per_iteration": 2.7795870304107666 }, { "auxiliary_loss_clip": 0.01199256, "auxiliary_loss_mlp": 0.01061171, "balance_loss_clip": 1.06278944, "balance_loss_mlp": 1.03773487, "epoch": 0.06625582444010221, "flos": 17857874286720.0, "grad_norm": 2.875757629612747, "language_loss": 0.85861301, "learning_rate": 3.986227745576533e-06, "loss": 0.88121736, "num_input_tokens_seen": 23456975, "step": 1102, "time_per_iteration": 2.737269401550293 }, { "auxiliary_loss_clip": 0.01191982, "auxiliary_loss_mlp": 0.01058639, "balance_loss_clip": 1.06898165, "balance_loss_mlp": 1.03410578, "epoch": 0.06631594769277017, "flos": 11838707579520.0, "grad_norm": 2.8924251757501778, "language_loss": 0.81655926, "learning_rate": 3.98618208129641e-06, "loss": 0.83906543, "num_input_tokens_seen": 23473440, "step": 1103, "time_per_iteration": 2.9345293045043945 }, { "auxiliary_loss_clip": 0.01203522, "auxiliary_loss_mlp": 0.00780451, "balance_loss_clip": 1.06721628, "balance_loss_mlp": 1.00042021, "epoch": 0.06637607094543815, "flos": 19793351756160.0, "grad_norm": 5.176370819061919, "language_loss": 0.81749105, "learning_rate": 3.986136341700063e-06, "loss": 0.83733076, "num_input_tokens_seen": 23493880, "step": 1104, "time_per_iteration": 2.753657102584839 }, { "auxiliary_loss_clip": 0.0116508, "auxiliary_loss_mlp": 0.01050687, "balance_loss_clip": 1.0576005, "balance_loss_mlp": 1.02608228, "epoch": 0.06643619419810612, "flos": 25485659677440.0, "grad_norm": 1.5448539486038575, "language_loss": 0.80422902, "learning_rate": 3.986090526789227e-06, "loss": 0.82638663, "num_input_tokens_seen": 23514920, "step": 1105, "time_per_iteration": 2.8904521465301514 }, { "auxiliary_loss_clip": 0.01179397, "auxiliary_loss_mlp": 0.0106197, "balance_loss_clip": 1.06348729, "balance_loss_mlp": 1.0391891, "epoch": 0.06649631745077408, "flos": 16946533393920.0, "grad_norm": 2.7426455725749896, "language_loss": 0.96762037, "learning_rate": 3.986044636565639e-06, "loss": 0.99003398, "num_input_tokens_seen": 23531635, "step": 1106, "time_per_iteration": 2.890073299407959 }, { "auxiliary_loss_clip": 0.01198065, "auxiliary_loss_mlp": 0.01059975, "balance_loss_clip": 1.06069684, "balance_loss_mlp": 1.03511953, "epoch": 0.06655644070344206, "flos": 17858592558720.0, "grad_norm": 1.9297768479693453, "language_loss": 0.82528949, "learning_rate": 3.985998671031039e-06, "loss": 0.84786987, "num_input_tokens_seen": 23551020, "step": 1107, "time_per_iteration": 2.778857469558716 }, { "auxiliary_loss_clip": 0.01104176, "auxiliary_loss_mlp": 0.01010935, "balance_loss_clip": 1.04708242, "balance_loss_mlp": 1.0072155, "epoch": 0.06661656395611003, "flos": 61419350021760.0, "grad_norm": 0.7967940032222198, "language_loss": 0.56789279, "learning_rate": 3.9859526301871705e-06, "loss": 0.58904392, "num_input_tokens_seen": 23610675, "step": 1108, "time_per_iteration": 3.2717819213867188 }, { "auxiliary_loss_clip": 0.0118327, "auxiliary_loss_mlp": 0.01062625, "balance_loss_clip": 1.05651307, "balance_loss_mlp": 1.0376507, "epoch": 0.066676687208778, "flos": 20662856282880.0, "grad_norm": 2.682842555407744, "language_loss": 0.7287578, "learning_rate": 3.9859065140357795e-06, "loss": 0.75121677, "num_input_tokens_seen": 23628710, "step": 1109, "time_per_iteration": 2.829623222351074 }, { "auxiliary_loss_clip": 0.01148971, "auxiliary_loss_mlp": 0.01071895, "balance_loss_clip": 1.05459642, "balance_loss_mlp": 1.04714715, "epoch": 0.06673681046144596, "flos": 20923280864640.0, "grad_norm": 1.7914435942805436, "language_loss": 0.78140426, "learning_rate": 3.985860322578614e-06, "loss": 0.80361295, "num_input_tokens_seen": 23649160, "step": 1110, "time_per_iteration": 2.892786741256714 }, { "auxiliary_loss_clip": 0.01153553, "auxiliary_loss_mlp": 0.0106147, "balance_loss_clip": 1.05590594, "balance_loss_mlp": 1.03700781, "epoch": 0.06679693371411394, "flos": 31065818359680.0, "grad_norm": 2.5260725451831805, "language_loss": 0.71425366, "learning_rate": 3.985814055817427e-06, "loss": 0.73640382, "num_input_tokens_seen": 23671995, "step": 1111, "time_per_iteration": 2.9349052906036377 }, { "auxiliary_loss_clip": 0.01170538, "auxiliary_loss_mlp": 0.01066103, "balance_loss_clip": 1.05776191, "balance_loss_mlp": 1.04199934, "epoch": 0.0668570569667819, "flos": 21726135705600.0, "grad_norm": 1.8396663794990693, "language_loss": 0.78767776, "learning_rate": 3.985767713753971e-06, "loss": 0.81004417, "num_input_tokens_seen": 23690705, "step": 1112, "time_per_iteration": 2.8676345348358154 }, { "auxiliary_loss_clip": 0.01153291, "auxiliary_loss_mlp": 0.01065421, "balance_loss_clip": 1.05340791, "balance_loss_mlp": 1.04163861, "epoch": 0.06691718021944987, "flos": 22747255539840.0, "grad_norm": 2.071048188460824, "language_loss": 0.78481978, "learning_rate": 3.985721296390005e-06, "loss": 0.80700684, "num_input_tokens_seen": 23709990, "step": 1113, "time_per_iteration": 2.8688411712646484 }, { "auxiliary_loss_clip": 0.0114872, "auxiliary_loss_mlp": 0.01057074, "balance_loss_clip": 1.05157375, "balance_loss_mlp": 1.03376842, "epoch": 0.06697730347211785, "flos": 16545626720640.0, "grad_norm": 1.7560007918285245, "language_loss": 0.82399213, "learning_rate": 3.985674803727289e-06, "loss": 0.84605002, "num_input_tokens_seen": 23728485, "step": 1114, "time_per_iteration": 2.832458019256592 }, { "auxiliary_loss_clip": 0.01075626, "auxiliary_loss_mlp": 0.01006906, "balance_loss_clip": 1.04995251, "balance_loss_mlp": 1.00271022, "epoch": 0.06703742672478581, "flos": 59782326658560.0, "grad_norm": 0.8370646888074905, "language_loss": 0.58147323, "learning_rate": 3.985628235767584e-06, "loss": 0.60229862, "num_input_tokens_seen": 23786650, "step": 1115, "time_per_iteration": 3.550837755203247 }, { "auxiliary_loss_clip": 0.01177193, "auxiliary_loss_mlp": 0.01059174, "balance_loss_clip": 1.05986214, "balance_loss_mlp": 1.03381801, "epoch": 0.06709754997745378, "flos": 16800197385600.0, "grad_norm": 2.8944873563712235, "language_loss": 0.91280693, "learning_rate": 3.985581592512658e-06, "loss": 0.93517065, "num_input_tokens_seen": 23802555, "step": 1116, "time_per_iteration": 2.994608163833618 }, { "auxiliary_loss_clip": 0.01169376, "auxiliary_loss_mlp": 0.0078227, "balance_loss_clip": 1.05839634, "balance_loss_mlp": 1.00045347, "epoch": 0.06715767323012176, "flos": 22123917895680.0, "grad_norm": 1.9249158333763592, "language_loss": 0.87154609, "learning_rate": 3.985534873964279e-06, "loss": 0.89106256, "num_input_tokens_seen": 23822945, "step": 1117, "time_per_iteration": 2.794400453567505 }, { "auxiliary_loss_clip": 0.01095782, "auxiliary_loss_mlp": 0.01003785, "balance_loss_clip": 1.0387876, "balance_loss_mlp": 0.99963647, "epoch": 0.06721779648278972, "flos": 66618100137600.0, "grad_norm": 0.8644388721740246, "language_loss": 0.5981611, "learning_rate": 3.985488080124218e-06, "loss": 0.61915678, "num_input_tokens_seen": 23874075, "step": 1118, "time_per_iteration": 3.1695809364318848 }, { "auxiliary_loss_clip": 0.01178972, "auxiliary_loss_mlp": 0.01051993, "balance_loss_clip": 1.05301392, "balance_loss_mlp": 1.02780545, "epoch": 0.06727791973545769, "flos": 22382474970240.0, "grad_norm": 3.6923711141076447, "language_loss": 0.83045954, "learning_rate": 3.985441210994251e-06, "loss": 0.85276914, "num_input_tokens_seen": 23889720, "step": 1119, "time_per_iteration": 2.7538814544677734 }, { "auxiliary_loss_clip": 0.01182384, "auxiliary_loss_mlp": 0.01058422, "balance_loss_clip": 1.06102347, "balance_loss_mlp": 1.03566504, "epoch": 0.06733804298812565, "flos": 24280210224000.0, "grad_norm": 4.541743494234462, "language_loss": 0.8451674, "learning_rate": 3.9853942665761545e-06, "loss": 0.86757541, "num_input_tokens_seen": 23909385, "step": 1120, "time_per_iteration": 2.76581072807312 }, { "auxiliary_loss_clip": 0.0121565, "auxiliary_loss_mlp": 0.01064916, "balance_loss_clip": 1.06757379, "balance_loss_mlp": 1.04028773, "epoch": 0.06739816624079363, "flos": 15918230839680.0, "grad_norm": 2.503866645162978, "language_loss": 0.78722781, "learning_rate": 3.985347246871708e-06, "loss": 0.81003344, "num_input_tokens_seen": 23926830, "step": 1121, "time_per_iteration": 2.651175022125244 }, { "auxiliary_loss_clip": 0.01080914, "auxiliary_loss_mlp": 0.01011889, "balance_loss_clip": 1.03108025, "balance_loss_mlp": 1.00802636, "epoch": 0.0674582894934616, "flos": 71398567353600.0, "grad_norm": 0.7540288133642103, "language_loss": 0.58320796, "learning_rate": 3.985300151882694e-06, "loss": 0.60413599, "num_input_tokens_seen": 23992640, "step": 1122, "time_per_iteration": 3.3794541358947754 }, { "auxiliary_loss_clip": 0.01145486, "auxiliary_loss_mlp": 0.01066136, "balance_loss_clip": 1.05581403, "balance_loss_mlp": 1.04167438, "epoch": 0.06751841274612956, "flos": 25264952559360.0, "grad_norm": 2.3361170394687076, "language_loss": 0.71965349, "learning_rate": 3.985252981610901e-06, "loss": 0.74176967, "num_input_tokens_seen": 24011135, "step": 1123, "time_per_iteration": 2.8049354553222656 }, { "auxiliary_loss_clip": 0.01144994, "auxiliary_loss_mlp": 0.01064196, "balance_loss_clip": 1.05373979, "balance_loss_mlp": 1.03612232, "epoch": 0.06757853599879754, "flos": 23802741711360.0, "grad_norm": 1.7380479869896208, "language_loss": 0.78987843, "learning_rate": 3.985205736058114e-06, "loss": 0.81197035, "num_input_tokens_seen": 24030695, "step": 1124, "time_per_iteration": 2.8595056533813477 }, { "auxiliary_loss_clip": 0.01189686, "auxiliary_loss_mlp": 0.01055169, "balance_loss_clip": 1.05663013, "balance_loss_mlp": 1.03200674, "epoch": 0.0676386592514655, "flos": 21033742164480.0, "grad_norm": 3.1450673626590793, "language_loss": 0.70999855, "learning_rate": 3.985158415226128e-06, "loss": 0.73244709, "num_input_tokens_seen": 24050680, "step": 1125, "time_per_iteration": 2.726163625717163 }, { "auxiliary_loss_clip": 0.01165518, "auxiliary_loss_mlp": 0.01068918, "balance_loss_clip": 1.05826426, "balance_loss_mlp": 1.04290628, "epoch": 0.06769878250413347, "flos": 25556331686400.0, "grad_norm": 3.340323364887528, "language_loss": 0.81440383, "learning_rate": 3.985111019116736e-06, "loss": 0.83674812, "num_input_tokens_seen": 24067205, "step": 1126, "time_per_iteration": 2.7356598377227783 }, { "auxiliary_loss_clip": 0.0107201, "auxiliary_loss_mlp": 0.01004999, "balance_loss_clip": 1.0293622, "balance_loss_mlp": 1.00092208, "epoch": 0.06775890575680145, "flos": 70655251305600.0, "grad_norm": 0.77802311726495, "language_loss": 0.59720373, "learning_rate": 3.985063547731735e-06, "loss": 0.6179738, "num_input_tokens_seen": 24131320, "step": 1127, "time_per_iteration": 3.2627320289611816 }, { "auxiliary_loss_clip": 0.01206438, "auxiliary_loss_mlp": 0.01055509, "balance_loss_clip": 1.06308687, "balance_loss_mlp": 1.03189397, "epoch": 0.06781902900946941, "flos": 24235500769920.0, "grad_norm": 2.2535941175889054, "language_loss": 0.81097019, "learning_rate": 3.985016001072925e-06, "loss": 0.83358967, "num_input_tokens_seen": 24149930, "step": 1128, "time_per_iteration": 2.6652371883392334 }, { "auxiliary_loss_clip": 0.01158345, "auxiliary_loss_mlp": 0.01052658, "balance_loss_clip": 1.05360484, "balance_loss_mlp": 1.02804112, "epoch": 0.06787915226213738, "flos": 22417523665920.0, "grad_norm": 2.24200367657907, "language_loss": 0.75559127, "learning_rate": 3.984968379142109e-06, "loss": 0.77770138, "num_input_tokens_seen": 24169590, "step": 1129, "time_per_iteration": 2.7023732662200928 }, { "auxiliary_loss_clip": 0.01117595, "auxiliary_loss_mlp": 0.01053995, "balance_loss_clip": 1.04627228, "balance_loss_mlp": 1.03006983, "epoch": 0.06793927551480534, "flos": 37706922080640.0, "grad_norm": 1.890559803272908, "language_loss": 0.71710479, "learning_rate": 3.984920681941094e-06, "loss": 0.73882067, "num_input_tokens_seen": 24189965, "step": 1130, "time_per_iteration": 3.0757689476013184 }, { "auxiliary_loss_clip": 0.01158117, "auxiliary_loss_mlp": 0.010592, "balance_loss_clip": 1.05734515, "balance_loss_mlp": 1.03481019, "epoch": 0.06799939876747332, "flos": 20631398947200.0, "grad_norm": 2.24421862356218, "language_loss": 0.80776262, "learning_rate": 3.984872909471688e-06, "loss": 0.82993579, "num_input_tokens_seen": 24208045, "step": 1131, "time_per_iteration": 5.00832724571228 }, { "auxiliary_loss_clip": 0.01195331, "auxiliary_loss_mlp": 0.01070142, "balance_loss_clip": 1.06155944, "balance_loss_mlp": 1.04614532, "epoch": 0.06805952202014129, "flos": 14864755829760.0, "grad_norm": 2.0533244923502463, "language_loss": 0.80371779, "learning_rate": 3.984825061735701e-06, "loss": 0.8263725, "num_input_tokens_seen": 24223805, "step": 1132, "time_per_iteration": 4.487931251525879 }, { "auxiliary_loss_clip": 0.01170581, "auxiliary_loss_mlp": 0.01061867, "balance_loss_clip": 1.05438542, "balance_loss_mlp": 1.03756022, "epoch": 0.06811964527280925, "flos": 48909434947200.0, "grad_norm": 1.7182324226465766, "language_loss": 0.6341064, "learning_rate": 3.9847771387349495e-06, "loss": 0.65643084, "num_input_tokens_seen": 24249475, "step": 1133, "time_per_iteration": 4.48089337348938 }, { "auxiliary_loss_clip": 0.01125599, "auxiliary_loss_mlp": 0.01055984, "balance_loss_clip": 1.04700482, "balance_loss_mlp": 1.02973366, "epoch": 0.06817976852547723, "flos": 15377273038080.0, "grad_norm": 1.9264963116598819, "language_loss": 0.74771935, "learning_rate": 3.9847291404712506e-06, "loss": 0.76953518, "num_input_tokens_seen": 24267980, "step": 1134, "time_per_iteration": 5.287277936935425 }, { "auxiliary_loss_clip": 0.01169269, "auxiliary_loss_mlp": 0.00782536, "balance_loss_clip": 1.05878353, "balance_loss_mlp": 1.00042605, "epoch": 0.0682398917781452, "flos": 20155690200960.0, "grad_norm": 2.151108605399924, "language_loss": 0.86871451, "learning_rate": 3.984681066946423e-06, "loss": 0.88823259, "num_input_tokens_seen": 24286805, "step": 1135, "time_per_iteration": 2.8024110794067383 }, { "auxiliary_loss_clip": 0.0117656, "auxiliary_loss_mlp": 0.007818, "balance_loss_clip": 1.0543226, "balance_loss_mlp": 1.00046515, "epoch": 0.06830001503081316, "flos": 23440618748160.0, "grad_norm": 2.521942237810997, "language_loss": 0.78131735, "learning_rate": 3.984632918162291e-06, "loss": 0.80090094, "num_input_tokens_seen": 24305855, "step": 1136, "time_per_iteration": 2.7595040798187256 }, { "auxiliary_loss_clip": 0.01185832, "auxiliary_loss_mlp": 0.01063587, "balance_loss_clip": 1.05952621, "balance_loss_mlp": 1.03868449, "epoch": 0.06836013828348114, "flos": 34349813153280.0, "grad_norm": 2.275643110468061, "language_loss": 0.83968467, "learning_rate": 3.984584694120679e-06, "loss": 0.86217892, "num_input_tokens_seen": 24326535, "step": 1137, "time_per_iteration": 2.7738285064697266 }, { "auxiliary_loss_clip": 0.01153105, "auxiliary_loss_mlp": 0.01059471, "balance_loss_clip": 1.05239427, "balance_loss_mlp": 1.0348897, "epoch": 0.06842026153614911, "flos": 23148844571520.0, "grad_norm": 2.068206081593879, "language_loss": 0.788486, "learning_rate": 3.984536394823418e-06, "loss": 0.81061178, "num_input_tokens_seen": 24345810, "step": 1138, "time_per_iteration": 2.804537296295166 }, { "auxiliary_loss_clip": 0.01209658, "auxiliary_loss_mlp": 0.01058353, "balance_loss_clip": 1.06288362, "balance_loss_mlp": 1.03415346, "epoch": 0.06848038478881707, "flos": 24608972430720.0, "grad_norm": 2.3335265924104096, "language_loss": 0.85507643, "learning_rate": 3.984488020272336e-06, "loss": 0.87775654, "num_input_tokens_seen": 24366095, "step": 1139, "time_per_iteration": 2.746884822845459 }, { "auxiliary_loss_clip": 0.01153855, "auxiliary_loss_mlp": 0.01063721, "balance_loss_clip": 1.05325532, "balance_loss_mlp": 1.03679228, "epoch": 0.06854050804148504, "flos": 40880994278400.0, "grad_norm": 1.9254794009430078, "language_loss": 0.74899161, "learning_rate": 3.984439570469271e-06, "loss": 0.7711674, "num_input_tokens_seen": 24388665, "step": 1140, "time_per_iteration": 2.938143253326416 }, { "auxiliary_loss_clip": 0.01186218, "auxiliary_loss_mlp": 0.00782227, "balance_loss_clip": 1.06101704, "balance_loss_mlp": 1.00036597, "epoch": 0.06860063129415302, "flos": 31686354743040.0, "grad_norm": 2.1250887020504767, "language_loss": 0.68258876, "learning_rate": 3.9843910454160574e-06, "loss": 0.70227319, "num_input_tokens_seen": 24407705, "step": 1141, "time_per_iteration": 2.8180530071258545 }, { "auxiliary_loss_clip": 0.01197117, "auxiliary_loss_mlp": 0.01067748, "balance_loss_clip": 1.05978489, "balance_loss_mlp": 1.04266596, "epoch": 0.06866075454682098, "flos": 26542007775360.0, "grad_norm": 1.8460768582410394, "language_loss": 0.78959155, "learning_rate": 3.984342445114538e-06, "loss": 0.81224018, "num_input_tokens_seen": 24428390, "step": 1142, "time_per_iteration": 2.712876558303833 }, { "auxiliary_loss_clip": 0.01186915, "auxiliary_loss_mlp": 0.01060882, "balance_loss_clip": 1.06245089, "balance_loss_mlp": 1.03702831, "epoch": 0.06872087779948895, "flos": 29789768724480.0, "grad_norm": 1.7867268614306446, "language_loss": 0.68287402, "learning_rate": 3.984293769566553e-06, "loss": 0.70535195, "num_input_tokens_seen": 24450810, "step": 1143, "time_per_iteration": 2.752659320831299 }, { "auxiliary_loss_clip": 0.01177843, "auxiliary_loss_mlp": 0.01059894, "balance_loss_clip": 1.05798244, "balance_loss_mlp": 1.03773308, "epoch": 0.06878100105215693, "flos": 26941118768640.0, "grad_norm": 1.7582250309313294, "language_loss": 0.74307454, "learning_rate": 3.98424501877395e-06, "loss": 0.76545191, "num_input_tokens_seen": 24469965, "step": 1144, "time_per_iteration": 2.6448662281036377 }, { "auxiliary_loss_clip": 0.01189197, "auxiliary_loss_mlp": 0.0106544, "balance_loss_clip": 1.0565474, "balance_loss_mlp": 1.04039407, "epoch": 0.06884112430482489, "flos": 10670748946560.0, "grad_norm": 2.699041414372958, "language_loss": 0.91755033, "learning_rate": 3.984196192738577e-06, "loss": 0.94009674, "num_input_tokens_seen": 24486370, "step": 1145, "time_per_iteration": 2.6621482372283936 }, { "auxiliary_loss_clip": 0.01212189, "auxiliary_loss_mlp": 0.0106819, "balance_loss_clip": 1.06225932, "balance_loss_mlp": 1.04258406, "epoch": 0.06890124755749286, "flos": 20193647898240.0, "grad_norm": 2.2014676012481487, "language_loss": 0.81726635, "learning_rate": 3.984147291462285e-06, "loss": 0.84007025, "num_input_tokens_seen": 24503780, "step": 1146, "time_per_iteration": 2.623964548110962 }, { "auxiliary_loss_clip": 0.01204602, "auxiliary_loss_mlp": 0.01065301, "balance_loss_clip": 1.06215203, "balance_loss_mlp": 1.04191244, "epoch": 0.06896137081016084, "flos": 20449224144000.0, "grad_norm": 2.1265245828428108, "language_loss": 0.84968954, "learning_rate": 3.98409831494693e-06, "loss": 0.8723886, "num_input_tokens_seen": 24522320, "step": 1147, "time_per_iteration": 2.5898265838623047 }, { "auxiliary_loss_clip": 0.01156886, "auxiliary_loss_mlp": 0.01064453, "balance_loss_clip": 1.05563867, "balance_loss_mlp": 1.03949046, "epoch": 0.0690214940628288, "flos": 18368703555840.0, "grad_norm": 1.7557033260323716, "language_loss": 0.86094105, "learning_rate": 3.984049263194367e-06, "loss": 0.88315445, "num_input_tokens_seen": 24540445, "step": 1148, "time_per_iteration": 2.748782157897949 }, { "auxiliary_loss_clip": 0.01173365, "auxiliary_loss_mlp": 0.01060047, "balance_loss_clip": 1.05569541, "balance_loss_mlp": 1.03370178, "epoch": 0.06908161731549677, "flos": 20558033418240.0, "grad_norm": 2.322434023005448, "language_loss": 0.69602191, "learning_rate": 3.9840001362064575e-06, "loss": 0.71835601, "num_input_tokens_seen": 24557105, "step": 1149, "time_per_iteration": 2.741854429244995 }, { "auxiliary_loss_clip": 0.01207871, "auxiliary_loss_mlp": 0.01051245, "balance_loss_clip": 1.06034219, "balance_loss_mlp": 1.02692604, "epoch": 0.06914174056816474, "flos": 27563666313600.0, "grad_norm": 1.9440351937259064, "language_loss": 0.8374452, "learning_rate": 3.983950933985064e-06, "loss": 0.86003637, "num_input_tokens_seen": 24578240, "step": 1150, "time_per_iteration": 2.6919586658477783 }, { "auxiliary_loss_clip": 0.01181406, "auxiliary_loss_mlp": 0.01058015, "balance_loss_clip": 1.06063652, "balance_loss_mlp": 1.03380394, "epoch": 0.06920186382083271, "flos": 15304015249920.0, "grad_norm": 4.11905785776886, "language_loss": 0.81464434, "learning_rate": 3.983901656532052e-06, "loss": 0.83703858, "num_input_tokens_seen": 24593585, "step": 1151, "time_per_iteration": 2.7979934215545654 }, { "auxiliary_loss_clip": 0.01206831, "auxiliary_loss_mlp": 0.01058184, "balance_loss_clip": 1.06409955, "balance_loss_mlp": 1.03434169, "epoch": 0.06926198707350067, "flos": 25191227894400.0, "grad_norm": 2.0324362571668724, "language_loss": 0.85408235, "learning_rate": 3.983852303849291e-06, "loss": 0.87673247, "num_input_tokens_seen": 24613110, "step": 1152, "time_per_iteration": 2.686021089553833 }, { "auxiliary_loss_clip": 0.01190935, "auxiliary_loss_mlp": 0.01062076, "balance_loss_clip": 1.06250155, "balance_loss_mlp": 1.03866374, "epoch": 0.06932211032616864, "flos": 13256137146240.0, "grad_norm": 2.182544196511779, "language_loss": 0.90594423, "learning_rate": 3.983802875938651e-06, "loss": 0.92847437, "num_input_tokens_seen": 24628795, "step": 1153, "time_per_iteration": 2.58366060256958 }, { "auxiliary_loss_clip": 0.01169877, "auxiliary_loss_mlp": 0.01055253, "balance_loss_clip": 1.05681062, "balance_loss_mlp": 1.03088629, "epoch": 0.06938223357883662, "flos": 24827381078400.0, "grad_norm": 2.1214794624630846, "language_loss": 0.81526846, "learning_rate": 3.983753372802008e-06, "loss": 0.83751976, "num_input_tokens_seen": 24645480, "step": 1154, "time_per_iteration": 2.696794271469116 }, { "auxiliary_loss_clip": 0.01188774, "auxiliary_loss_mlp": 0.01066335, "balance_loss_clip": 1.0691216, "balance_loss_mlp": 1.04200506, "epoch": 0.06944235683150458, "flos": 27267977554560.0, "grad_norm": 2.102018399986892, "language_loss": 0.75022292, "learning_rate": 3.983703794441237e-06, "loss": 0.77277398, "num_input_tokens_seen": 24664630, "step": 1155, "time_per_iteration": 2.7718143463134766 }, { "auxiliary_loss_clip": 0.01180696, "auxiliary_loss_mlp": 0.00782152, "balance_loss_clip": 1.05586052, "balance_loss_mlp": 1.00041056, "epoch": 0.06950248008417255, "flos": 25808065176960.0, "grad_norm": 1.7459449483933205, "language_loss": 0.7110405, "learning_rate": 3.98365414085822e-06, "loss": 0.73066902, "num_input_tokens_seen": 24684210, "step": 1156, "time_per_iteration": 2.7014200687408447 }, { "auxiliary_loss_clip": 0.01179101, "auxiliary_loss_mlp": 0.00782674, "balance_loss_clip": 1.0593586, "balance_loss_mlp": 1.00037348, "epoch": 0.06956260333684053, "flos": 22271546793600.0, "grad_norm": 2.067241397655847, "language_loss": 0.74882817, "learning_rate": 3.98360441205484e-06, "loss": 0.76844591, "num_input_tokens_seen": 24702490, "step": 1157, "time_per_iteration": 2.7571897506713867 }, { "auxiliary_loss_clip": 0.01178249, "auxiliary_loss_mlp": 0.01061737, "balance_loss_clip": 1.05653787, "balance_loss_mlp": 1.03697729, "epoch": 0.0696227265895085, "flos": 29681390413440.0, "grad_norm": 1.9827644507913538, "language_loss": 0.7165724, "learning_rate": 3.983554608032982e-06, "loss": 0.73897225, "num_input_tokens_seen": 24724340, "step": 1158, "time_per_iteration": 2.839745044708252 }, { "auxiliary_loss_clip": 0.01207855, "auxiliary_loss_mlp": 0.01058558, "balance_loss_clip": 1.0605582, "balance_loss_mlp": 1.03370285, "epoch": 0.06968284984217646, "flos": 25523545547520.0, "grad_norm": 1.9692207215605615, "language_loss": 0.79595017, "learning_rate": 3.983504728794533e-06, "loss": 0.8186143, "num_input_tokens_seen": 24745550, "step": 1159, "time_per_iteration": 2.7535817623138428 }, { "auxiliary_loss_clip": 0.01212717, "auxiliary_loss_mlp": 0.01068535, "balance_loss_clip": 1.06535673, "balance_loss_mlp": 1.04094958, "epoch": 0.06974297309484444, "flos": 20698192287360.0, "grad_norm": 3.5530789367722373, "language_loss": 0.80517769, "learning_rate": 3.983454774341387e-06, "loss": 0.82799017, "num_input_tokens_seen": 24762575, "step": 1160, "time_per_iteration": 2.7455785274505615 }, { "auxiliary_loss_clip": 0.0119075, "auxiliary_loss_mlp": 0.01057887, "balance_loss_clip": 1.05680609, "balance_loss_mlp": 1.03294837, "epoch": 0.0698030963475124, "flos": 26505199313280.0, "grad_norm": 1.6303409062485206, "language_loss": 0.7607069, "learning_rate": 3.983404744675437e-06, "loss": 0.78319323, "num_input_tokens_seen": 24782605, "step": 1161, "time_per_iteration": 2.773775100708008 }, { "auxiliary_loss_clip": 0.01175787, "auxiliary_loss_mlp": 0.01062083, "balance_loss_clip": 1.05773759, "balance_loss_mlp": 1.03673923, "epoch": 0.06986321960018037, "flos": 23040430346880.0, "grad_norm": 1.6605796421434038, "language_loss": 0.82758528, "learning_rate": 3.9833546397985794e-06, "loss": 0.84996402, "num_input_tokens_seen": 24802910, "step": 1162, "time_per_iteration": 2.7426044940948486 }, { "auxiliary_loss_clip": 0.01182513, "auxiliary_loss_mlp": 0.01058124, "balance_loss_clip": 1.05717576, "balance_loss_mlp": 1.03092098, "epoch": 0.06992334285284833, "flos": 28584822061440.0, "grad_norm": 1.9523155091610094, "language_loss": 0.79563475, "learning_rate": 3.983304459712716e-06, "loss": 0.81804121, "num_input_tokens_seen": 24823305, "step": 1163, "time_per_iteration": 2.720947742462158 }, { "auxiliary_loss_clip": 0.01190519, "auxiliary_loss_mlp": 0.01063375, "balance_loss_clip": 1.05861616, "balance_loss_mlp": 1.03722012, "epoch": 0.06998346610551631, "flos": 20595344670720.0, "grad_norm": 2.213365660843382, "language_loss": 0.79187214, "learning_rate": 3.983254204419749e-06, "loss": 0.81441104, "num_input_tokens_seen": 24842155, "step": 1164, "time_per_iteration": 2.6554183959960938 }, { "auxiliary_loss_clip": 0.01143916, "auxiliary_loss_mlp": 0.01067459, "balance_loss_clip": 1.05240798, "balance_loss_mlp": 1.03875315, "epoch": 0.07004358935818428, "flos": 22528810978560.0, "grad_norm": 1.421930435008642, "language_loss": 0.72855628, "learning_rate": 3.983203873921583e-06, "loss": 0.75067008, "num_input_tokens_seen": 24862080, "step": 1165, "time_per_iteration": 2.753063440322876 }, { "auxiliary_loss_clip": 0.01183824, "auxiliary_loss_mlp": 0.01059612, "balance_loss_clip": 1.06135893, "balance_loss_mlp": 1.03522193, "epoch": 0.07010371261085224, "flos": 28949997680640.0, "grad_norm": 2.453348821242437, "language_loss": 0.81136239, "learning_rate": 3.983153468220128e-06, "loss": 0.83379674, "num_input_tokens_seen": 24886165, "step": 1166, "time_per_iteration": 2.802016496658325 }, { "auxiliary_loss_clip": 0.011718, "auxiliary_loss_mlp": 0.01053529, "balance_loss_clip": 1.05450797, "balance_loss_mlp": 1.02754176, "epoch": 0.07016383586352022, "flos": 23659171050240.0, "grad_norm": 2.457667377154448, "language_loss": 0.84640259, "learning_rate": 3.983102987317295e-06, "loss": 0.86865586, "num_input_tokens_seen": 24905775, "step": 1167, "time_per_iteration": 2.7066097259521484 }, { "auxiliary_loss_clip": 0.01193446, "auxiliary_loss_mlp": 0.01064209, "balance_loss_clip": 1.06136739, "balance_loss_mlp": 1.03887713, "epoch": 0.07022395911618819, "flos": 19792130693760.0, "grad_norm": 2.6158204436543, "language_loss": 0.89524722, "learning_rate": 3.983052431214997e-06, "loss": 0.91782373, "num_input_tokens_seen": 24924295, "step": 1168, "time_per_iteration": 2.6258392333984375 }, { "auxiliary_loss_clip": 0.01190821, "auxiliary_loss_mlp": 0.01065905, "balance_loss_clip": 1.06090224, "balance_loss_mlp": 1.03705645, "epoch": 0.07028408236885615, "flos": 21689147675520.0, "grad_norm": 2.6445150319591035, "language_loss": 0.89008862, "learning_rate": 3.983001799915153e-06, "loss": 0.91265589, "num_input_tokens_seen": 24943210, "step": 1169, "time_per_iteration": 2.6858527660369873 }, { "auxiliary_loss_clip": 0.01211063, "auxiliary_loss_mlp": 0.01065533, "balance_loss_clip": 1.06400895, "balance_loss_mlp": 1.03950977, "epoch": 0.07034420562152413, "flos": 25630271832960.0, "grad_norm": 1.9672897290124218, "language_loss": 0.83834457, "learning_rate": 3.982951093419681e-06, "loss": 0.86111057, "num_input_tokens_seen": 24960360, "step": 1170, "time_per_iteration": 2.6278069019317627 }, { "auxiliary_loss_clip": 0.01180333, "auxiliary_loss_mlp": 0.00782328, "balance_loss_clip": 1.0613637, "balance_loss_mlp": 1.00041986, "epoch": 0.0704043288741921, "flos": 20810449267200.0, "grad_norm": 1.8542795171503503, "language_loss": 0.75687242, "learning_rate": 3.982900311730506e-06, "loss": 0.77649903, "num_input_tokens_seen": 24978290, "step": 1171, "time_per_iteration": 5.806530475616455 }, { "auxiliary_loss_clip": 0.01179645, "auxiliary_loss_mlp": 0.0106394, "balance_loss_clip": 1.06133175, "balance_loss_mlp": 1.03919196, "epoch": 0.07046445212686006, "flos": 25593176062080.0, "grad_norm": 2.482864122539831, "language_loss": 0.88865125, "learning_rate": 3.9828494548495514e-06, "loss": 0.91108704, "num_input_tokens_seen": 24997055, "step": 1172, "time_per_iteration": 4.371561288833618 }, { "auxiliary_loss_clip": 0.01197698, "auxiliary_loss_mlp": 0.01054991, "balance_loss_clip": 1.06532764, "balance_loss_mlp": 1.02858603, "epoch": 0.07052457537952803, "flos": 25556978131200.0, "grad_norm": 1.6816354314161714, "language_loss": 0.82075119, "learning_rate": 3.982798522778748e-06, "loss": 0.84327805, "num_input_tokens_seen": 25017490, "step": 1173, "time_per_iteration": 4.611542463302612 }, { "auxiliary_loss_clip": 0.01200886, "auxiliary_loss_mlp": 0.01060851, "balance_loss_clip": 1.06317592, "balance_loss_mlp": 1.03503036, "epoch": 0.070584698632196, "flos": 17968515154560.0, "grad_norm": 2.007232853627583, "language_loss": 0.82071686, "learning_rate": 3.9827475155200245e-06, "loss": 0.8433342, "num_input_tokens_seen": 25035660, "step": 1174, "time_per_iteration": 2.6334969997406006 }, { "auxiliary_loss_clip": 0.01180907, "auxiliary_loss_mlp": 0.01059972, "balance_loss_clip": 1.05857778, "balance_loss_mlp": 1.03473568, "epoch": 0.07064482188486397, "flos": 25370888745600.0, "grad_norm": 2.09222115072597, "language_loss": 0.85013211, "learning_rate": 3.982696433075317e-06, "loss": 0.87254095, "num_input_tokens_seen": 25054785, "step": 1175, "time_per_iteration": 2.861591339111328 }, { "auxiliary_loss_clip": 0.01196955, "auxiliary_loss_mlp": 0.01069941, "balance_loss_clip": 1.06447482, "balance_loss_mlp": 1.04605186, "epoch": 0.07070494513753194, "flos": 24899848767360.0, "grad_norm": 1.7270820646539309, "language_loss": 0.83103871, "learning_rate": 3.982645275446563e-06, "loss": 0.85370767, "num_input_tokens_seen": 25075180, "step": 1176, "time_per_iteration": 2.754521608352661 }, { "auxiliary_loss_clip": 0.01152261, "auxiliary_loss_mlp": 0.01062154, "balance_loss_clip": 1.05370057, "balance_loss_mlp": 1.0352838, "epoch": 0.07076506839019991, "flos": 22338447874560.0, "grad_norm": 3.4939498355716996, "language_loss": 0.74409902, "learning_rate": 3.982594042635701e-06, "loss": 0.7662431, "num_input_tokens_seen": 25093035, "step": 1177, "time_per_iteration": 2.692426919937134 }, { "auxiliary_loss_clip": 0.01188551, "auxiliary_loss_mlp": 0.0106394, "balance_loss_clip": 1.06080353, "balance_loss_mlp": 1.03801203, "epoch": 0.07082519164286788, "flos": 18660800954880.0, "grad_norm": 1.8240190288677762, "language_loss": 0.85965598, "learning_rate": 3.982542734644673e-06, "loss": 0.88218087, "num_input_tokens_seen": 25112520, "step": 1178, "time_per_iteration": 2.7197048664093018 }, { "auxiliary_loss_clip": 0.01082521, "auxiliary_loss_mlp": 0.01013999, "balance_loss_clip": 1.03661168, "balance_loss_mlp": 1.01023197, "epoch": 0.07088531489553584, "flos": 63654107610240.0, "grad_norm": 0.8453670789764802, "language_loss": 0.63256603, "learning_rate": 3.982491351475427e-06, "loss": 0.65353125, "num_input_tokens_seen": 25177760, "step": 1179, "time_per_iteration": 3.3419978618621826 }, { "auxiliary_loss_clip": 0.01211274, "auxiliary_loss_mlp": 0.01073372, "balance_loss_clip": 1.06935215, "balance_loss_mlp": 1.04858887, "epoch": 0.07094543814820382, "flos": 21572688804480.0, "grad_norm": 3.2714198066984177, "language_loss": 0.83388901, "learning_rate": 3.98243989312991e-06, "loss": 0.85673553, "num_input_tokens_seen": 25195260, "step": 1180, "time_per_iteration": 2.631992816925049 }, { "auxiliary_loss_clip": 0.01182661, "auxiliary_loss_mlp": 0.01071326, "balance_loss_clip": 1.06119037, "balance_loss_mlp": 1.04624391, "epoch": 0.07100556140087179, "flos": 22089946608000.0, "grad_norm": 2.0409456536886386, "language_loss": 0.88649988, "learning_rate": 3.982388359610074e-06, "loss": 0.90903974, "num_input_tokens_seen": 25212740, "step": 1181, "time_per_iteration": 2.696789264678955 }, { "auxiliary_loss_clip": 0.01180377, "auxiliary_loss_mlp": 0.01070036, "balance_loss_clip": 1.06187141, "balance_loss_mlp": 1.04516935, "epoch": 0.07106568465353975, "flos": 47922286400640.0, "grad_norm": 1.8294049229574356, "language_loss": 0.83244783, "learning_rate": 3.9823367509178725e-06, "loss": 0.85495198, "num_input_tokens_seen": 25236420, "step": 1182, "time_per_iteration": 2.9415605068206787 }, { "auxiliary_loss_clip": 0.01193669, "auxiliary_loss_mlp": 0.01067019, "balance_loss_clip": 1.0641923, "balance_loss_mlp": 1.04150808, "epoch": 0.07112580790620772, "flos": 23440798316160.0, "grad_norm": 3.5892595189310903, "language_loss": 0.79067838, "learning_rate": 3.982285067055262e-06, "loss": 0.81328523, "num_input_tokens_seen": 25255120, "step": 1183, "time_per_iteration": 2.7284862995147705 }, { "auxiliary_loss_clip": 0.01211976, "auxiliary_loss_mlp": 0.01064792, "balance_loss_clip": 1.06126475, "balance_loss_mlp": 1.03866172, "epoch": 0.0711859311588757, "flos": 31868888682240.0, "grad_norm": 2.5463322111759354, "language_loss": 0.788867, "learning_rate": 3.982233308024204e-06, "loss": 0.81163466, "num_input_tokens_seen": 25275150, "step": 1184, "time_per_iteration": 2.7531635761260986 }, { "auxiliary_loss_clip": 0.01152059, "auxiliary_loss_mlp": 0.01062006, "balance_loss_clip": 1.05961919, "balance_loss_mlp": 1.03752065, "epoch": 0.07124605441154366, "flos": 19610315026560.0, "grad_norm": 1.904751850318294, "language_loss": 0.76806915, "learning_rate": 3.98218147382666e-06, "loss": 0.79020983, "num_input_tokens_seen": 25293680, "step": 1185, "time_per_iteration": 2.732539176940918 }, { "auxiliary_loss_clip": 0.01208288, "auxiliary_loss_mlp": 0.01073792, "balance_loss_clip": 1.06328642, "balance_loss_mlp": 1.04903185, "epoch": 0.07130617766421163, "flos": 14684448533760.0, "grad_norm": 2.1301142092644696, "language_loss": 0.65472758, "learning_rate": 3.982129564464596e-06, "loss": 0.67754835, "num_input_tokens_seen": 25310050, "step": 1186, "time_per_iteration": 2.757812261581421 }, { "auxiliary_loss_clip": 0.01195497, "auxiliary_loss_mlp": 0.01057322, "balance_loss_clip": 1.06479859, "balance_loss_mlp": 1.03274107, "epoch": 0.07136630091687961, "flos": 26067915141120.0, "grad_norm": 2.1671481434625894, "language_loss": 0.69743419, "learning_rate": 3.98207757993998e-06, "loss": 0.71996236, "num_input_tokens_seen": 25331020, "step": 1187, "time_per_iteration": 2.746615409851074 }, { "auxiliary_loss_clip": 0.01151827, "auxiliary_loss_mlp": 0.01067347, "balance_loss_clip": 1.05412316, "balance_loss_mlp": 1.04367232, "epoch": 0.07142642416954757, "flos": 15669190869120.0, "grad_norm": 2.8037131445876597, "language_loss": 0.7861973, "learning_rate": 3.9820255202547845e-06, "loss": 0.80838895, "num_input_tokens_seen": 25347875, "step": 1188, "time_per_iteration": 2.738281726837158 }, { "auxiliary_loss_clip": 0.01203626, "auxiliary_loss_mlp": 0.01059966, "balance_loss_clip": 1.06304908, "balance_loss_mlp": 1.03530121, "epoch": 0.07148654742221554, "flos": 19755322231680.0, "grad_norm": 1.8909260147246576, "language_loss": 0.84754103, "learning_rate": 3.981973385410981e-06, "loss": 0.87017697, "num_input_tokens_seen": 25366715, "step": 1189, "time_per_iteration": 2.5770246982574463 }, { "auxiliary_loss_clip": 0.01173135, "auxiliary_loss_mlp": 0.0078213, "balance_loss_clip": 1.06234396, "balance_loss_mlp": 1.00041807, "epoch": 0.07154667067488352, "flos": 23471824688640.0, "grad_norm": 5.212083930118342, "language_loss": 0.76932275, "learning_rate": 3.9819211754105494e-06, "loss": 0.78887534, "num_input_tokens_seen": 25385450, "step": 1190, "time_per_iteration": 2.7057712078094482 }, { "auxiliary_loss_clip": 0.01208346, "auxiliary_loss_mlp": 0.01074705, "balance_loss_clip": 1.06283545, "balance_loss_mlp": 1.04751348, "epoch": 0.07160679392755148, "flos": 18332936588160.0, "grad_norm": 2.5312098602102084, "language_loss": 0.75201792, "learning_rate": 3.981868890255468e-06, "loss": 0.7748484, "num_input_tokens_seen": 25403940, "step": 1191, "time_per_iteration": 2.6071674823760986 }, { "auxiliary_loss_clip": 0.01162268, "auxiliary_loss_mlp": 0.01063437, "balance_loss_clip": 1.0519917, "balance_loss_mlp": 1.03649545, "epoch": 0.07166691718021945, "flos": 17747017937280.0, "grad_norm": 2.470839013019174, "language_loss": 0.74334443, "learning_rate": 3.981816529947719e-06, "loss": 0.76560152, "num_input_tokens_seen": 25420410, "step": 1192, "time_per_iteration": 2.661078453063965 }, { "auxiliary_loss_clip": 0.01202036, "auxiliary_loss_mlp": 0.01054727, "balance_loss_clip": 1.05904579, "balance_loss_mlp": 1.03099298, "epoch": 0.07172704043288743, "flos": 22451925916800.0, "grad_norm": 2.443309122344248, "language_loss": 0.78010541, "learning_rate": 3.9817640944892896e-06, "loss": 0.8026731, "num_input_tokens_seen": 25439415, "step": 1193, "time_per_iteration": 2.5603158473968506 }, { "auxiliary_loss_clip": 0.01186747, "auxiliary_loss_mlp": 0.01059465, "balance_loss_clip": 1.06358278, "balance_loss_mlp": 1.03319085, "epoch": 0.07178716368555539, "flos": 23222210100480.0, "grad_norm": 2.1011663585924585, "language_loss": 0.85497916, "learning_rate": 3.981711583882166e-06, "loss": 0.87744129, "num_input_tokens_seen": 25458715, "step": 1194, "time_per_iteration": 2.6819851398468018 }, { "auxiliary_loss_clip": 0.01184191, "auxiliary_loss_mlp": 0.01067737, "balance_loss_clip": 1.05706751, "balance_loss_mlp": 1.04135609, "epoch": 0.07184728693822336, "flos": 25150828072320.0, "grad_norm": 2.0205668140023185, "language_loss": 0.8183766, "learning_rate": 3.981658998128341e-06, "loss": 0.84089589, "num_input_tokens_seen": 25477985, "step": 1195, "time_per_iteration": 2.6646647453308105 }, { "auxiliary_loss_clip": 0.01165951, "auxiliary_loss_mlp": 0.01063438, "balance_loss_clip": 1.0578239, "balance_loss_mlp": 1.03976321, "epoch": 0.07190741019089132, "flos": 22711237176960.0, "grad_norm": 2.161995064372768, "language_loss": 0.80093575, "learning_rate": 3.981606337229808e-06, "loss": 0.82322967, "num_input_tokens_seen": 25497110, "step": 1196, "time_per_iteration": 2.7217979431152344 }, { "auxiliary_loss_clip": 0.01176131, "auxiliary_loss_mlp": 0.00784114, "balance_loss_clip": 1.06106043, "balance_loss_mlp": 1.00034249, "epoch": 0.0719675334435593, "flos": 29349791032320.0, "grad_norm": 2.5905261146074263, "language_loss": 0.71339291, "learning_rate": 3.9815536011885655e-06, "loss": 0.73299539, "num_input_tokens_seen": 25516555, "step": 1197, "time_per_iteration": 2.7931766510009766 }, { "auxiliary_loss_clip": 0.01157444, "auxiliary_loss_mlp": 0.01055247, "balance_loss_clip": 1.06130266, "balance_loss_mlp": 1.03074968, "epoch": 0.07202765669622727, "flos": 17639788861440.0, "grad_norm": 3.074283933156949, "language_loss": 0.85951984, "learning_rate": 3.98150079000661e-06, "loss": 0.88164675, "num_input_tokens_seen": 25533895, "step": 1198, "time_per_iteration": 2.7241532802581787 }, { "auxiliary_loss_clip": 0.01160083, "auxiliary_loss_mlp": 0.0106501, "balance_loss_clip": 1.0597434, "balance_loss_mlp": 1.03944004, "epoch": 0.07208777994889523, "flos": 21434038306560.0, "grad_norm": 2.052617638295489, "language_loss": 0.83840948, "learning_rate": 3.981447903685947e-06, "loss": 0.86066043, "num_input_tokens_seen": 25554195, "step": 1199, "time_per_iteration": 2.71362566947937 }, { "auxiliary_loss_clip": 0.01212755, "auxiliary_loss_mlp": 0.01060557, "balance_loss_clip": 1.06877887, "balance_loss_mlp": 1.03709614, "epoch": 0.07214790320156321, "flos": 26940867373440.0, "grad_norm": 3.1601590133124837, "language_loss": 0.7623595, "learning_rate": 3.981394942228581e-06, "loss": 0.78509259, "num_input_tokens_seen": 25574155, "step": 1200, "time_per_iteration": 2.6913061141967773 }, { "auxiliary_loss_clip": 0.0119008, "auxiliary_loss_mlp": 0.010701, "balance_loss_clip": 1.06442261, "balance_loss_mlp": 1.04487491, "epoch": 0.07220802645423118, "flos": 23879949995520.0, "grad_norm": 2.2017873087036226, "language_loss": 0.83013475, "learning_rate": 3.98134190563652e-06, "loss": 0.85273659, "num_input_tokens_seen": 25592735, "step": 1201, "time_per_iteration": 2.6983115673065186 }, { "auxiliary_loss_clip": 0.01196941, "auxiliary_loss_mlp": 0.01065672, "balance_loss_clip": 1.06197119, "balance_loss_mlp": 1.03952968, "epoch": 0.07226814970689914, "flos": 19243631036160.0, "grad_norm": 20.835065187143087, "language_loss": 0.68601412, "learning_rate": 3.981288793911775e-06, "loss": 0.70864022, "num_input_tokens_seen": 25611510, "step": 1202, "time_per_iteration": 2.691742420196533 }, { "auxiliary_loss_clip": 0.01182684, "auxiliary_loss_mlp": 0.00782201, "balance_loss_clip": 1.06256962, "balance_loss_mlp": 1.00038218, "epoch": 0.07232827295956712, "flos": 19172025273600.0, "grad_norm": 1.9661831136137597, "language_loss": 0.87487721, "learning_rate": 3.98123560705636e-06, "loss": 0.89452606, "num_input_tokens_seen": 25629560, "step": 1203, "time_per_iteration": 2.7832019329071045 }, { "auxiliary_loss_clip": 0.01154778, "auxiliary_loss_mlp": 0.01065748, "balance_loss_clip": 1.05210066, "balance_loss_mlp": 1.04065442, "epoch": 0.07238839621223508, "flos": 17639752947840.0, "grad_norm": 1.731721557525142, "language_loss": 0.78053147, "learning_rate": 3.981182345072293e-06, "loss": 0.80273676, "num_input_tokens_seen": 25648330, "step": 1204, "time_per_iteration": 2.7754547595977783 }, { "auxiliary_loss_clip": 0.01191832, "auxiliary_loss_mlp": 0.01065794, "balance_loss_clip": 1.06211591, "balance_loss_mlp": 1.04084373, "epoch": 0.07244851946490305, "flos": 28292401440000.0, "grad_norm": 1.5043252978087258, "language_loss": 0.82094097, "learning_rate": 3.981129007961593e-06, "loss": 0.84351724, "num_input_tokens_seen": 25669470, "step": 1205, "time_per_iteration": 2.680457353591919 }, { "auxiliary_loss_clip": 0.01180244, "auxiliary_loss_mlp": 0.00782807, "balance_loss_clip": 1.06221068, "balance_loss_mlp": 1.00036049, "epoch": 0.07250864271757101, "flos": 22564829341440.0, "grad_norm": 1.6438962430217685, "language_loss": 0.76715982, "learning_rate": 3.981075595726283e-06, "loss": 0.78679025, "num_input_tokens_seen": 25690470, "step": 1206, "time_per_iteration": 2.7028439044952393 }, { "auxiliary_loss_clip": 0.01188223, "auxiliary_loss_mlp": 0.01059861, "balance_loss_clip": 1.06262684, "balance_loss_mlp": 1.03442228, "epoch": 0.072568765970239, "flos": 21762405463680.0, "grad_norm": 1.9378198243304647, "language_loss": 0.77272987, "learning_rate": 3.981022108368387e-06, "loss": 0.79521072, "num_input_tokens_seen": 25709205, "step": 1207, "time_per_iteration": 2.779289960861206 }, { "auxiliary_loss_clip": 0.01185538, "auxiliary_loss_mlp": 0.01053693, "balance_loss_clip": 1.05844951, "balance_loss_mlp": 1.03062558, "epoch": 0.07262888922290696, "flos": 25519702792320.0, "grad_norm": 1.8716528383816402, "language_loss": 0.79480875, "learning_rate": 3.9809685458899345e-06, "loss": 0.81720108, "num_input_tokens_seen": 25728485, "step": 1208, "time_per_iteration": 2.682965040206909 }, { "auxiliary_loss_clip": 0.01184899, "auxiliary_loss_mlp": 0.01054862, "balance_loss_clip": 1.05801737, "balance_loss_mlp": 1.03198612, "epoch": 0.07268901247557492, "flos": 21246548290560.0, "grad_norm": 2.5612886109689765, "language_loss": 0.78537548, "learning_rate": 3.980914908292955e-06, "loss": 0.80777311, "num_input_tokens_seen": 25747730, "step": 1209, "time_per_iteration": 2.6582658290863037 }, { "auxiliary_loss_clip": 0.01191905, "auxiliary_loss_mlp": 0.01067741, "balance_loss_clip": 1.05931175, "balance_loss_mlp": 1.04408956, "epoch": 0.0727491357282429, "flos": 25479302970240.0, "grad_norm": 2.351303434522043, "language_loss": 0.80920583, "learning_rate": 3.980861195579486e-06, "loss": 0.83180225, "num_input_tokens_seen": 25768050, "step": 1210, "time_per_iteration": 4.241993427276611 }, { "auxiliary_loss_clip": 0.0117493, "auxiliary_loss_mlp": 0.01063711, "balance_loss_clip": 1.06087565, "balance_loss_mlp": 1.03891551, "epoch": 0.07280925898091087, "flos": 24462169545600.0, "grad_norm": 1.875347829314158, "language_loss": 0.84302205, "learning_rate": 3.98080740775156e-06, "loss": 0.86540848, "num_input_tokens_seen": 25787985, "step": 1211, "time_per_iteration": 4.289919853210449 }, { "auxiliary_loss_clip": 0.01162055, "auxiliary_loss_mlp": 0.01060218, "balance_loss_clip": 1.05356658, "balance_loss_mlp": 1.03629231, "epoch": 0.07286938223357883, "flos": 18288191220480.0, "grad_norm": 2.991110515222773, "language_loss": 0.90684664, "learning_rate": 3.98075354481122e-06, "loss": 0.92906934, "num_input_tokens_seen": 25803620, "step": 1212, "time_per_iteration": 2.660780906677246 }, { "auxiliary_loss_clip": 0.01202443, "auxiliary_loss_mlp": 0.01058817, "balance_loss_clip": 1.0623759, "balance_loss_mlp": 1.03490353, "epoch": 0.07292950548624681, "flos": 21214803646080.0, "grad_norm": 1.7918815842724805, "language_loss": 0.72358596, "learning_rate": 3.9806996067605055e-06, "loss": 0.74619853, "num_input_tokens_seen": 25823315, "step": 1213, "time_per_iteration": 4.303524017333984 }, { "auxiliary_loss_clip": 0.01153662, "auxiliary_loss_mlp": 0.01055706, "balance_loss_clip": 1.05658662, "balance_loss_mlp": 1.03089869, "epoch": 0.07298962873891478, "flos": 24642009964800.0, "grad_norm": 1.8655932637344164, "language_loss": 0.84356117, "learning_rate": 3.980645593601465e-06, "loss": 0.86565483, "num_input_tokens_seen": 25842605, "step": 1214, "time_per_iteration": 2.7505569458007812 }, { "auxiliary_loss_clip": 0.01208881, "auxiliary_loss_mlp": 0.01062075, "balance_loss_clip": 1.06484771, "balance_loss_mlp": 1.03723145, "epoch": 0.07304975199158274, "flos": 27052765217280.0, "grad_norm": 2.025651344907852, "language_loss": 0.84113681, "learning_rate": 3.980591505336144e-06, "loss": 0.86384636, "num_input_tokens_seen": 25863030, "step": 1215, "time_per_iteration": 2.7235965728759766 }, { "auxiliary_loss_clip": 0.01149957, "auxiliary_loss_mlp": 0.01062992, "balance_loss_clip": 1.05138278, "balance_loss_mlp": 1.03744531, "epoch": 0.07310987524425071, "flos": 33549544091520.0, "grad_norm": 1.9312816725096997, "language_loss": 0.80926049, "learning_rate": 3.980537341966595e-06, "loss": 0.83139002, "num_input_tokens_seen": 25888015, "step": 1216, "time_per_iteration": 2.9129130840301514 }, { "auxiliary_loss_clip": 0.01167944, "auxiliary_loss_mlp": 0.01060276, "balance_loss_clip": 1.05619049, "balance_loss_mlp": 1.03680408, "epoch": 0.07316999849691869, "flos": 28110944908800.0, "grad_norm": 3.2846247291101975, "language_loss": 0.75949144, "learning_rate": 3.980483103494872e-06, "loss": 0.78177369, "num_input_tokens_seen": 25908660, "step": 1217, "time_per_iteration": 2.7106521129608154 }, { "auxiliary_loss_clip": 0.01169026, "auxiliary_loss_mlp": 0.01056631, "balance_loss_clip": 1.06182647, "balance_loss_mlp": 1.03477991, "epoch": 0.07323012174958665, "flos": 14392602529920.0, "grad_norm": 1.9658490798069863, "language_loss": 0.86455309, "learning_rate": 3.98042878992303e-06, "loss": 0.88680959, "num_input_tokens_seen": 25927215, "step": 1218, "time_per_iteration": 2.5911786556243896 }, { "auxiliary_loss_clip": 0.01192266, "auxiliary_loss_mlp": 0.0106258, "balance_loss_clip": 1.06015348, "balance_loss_mlp": 1.03916681, "epoch": 0.07329024500225462, "flos": 21616428591360.0, "grad_norm": 2.2310702082820675, "language_loss": 0.86782354, "learning_rate": 3.9803744012531305e-06, "loss": 0.89037204, "num_input_tokens_seen": 25945500, "step": 1219, "time_per_iteration": 2.608562707901001 }, { "auxiliary_loss_clip": 0.01201545, "auxiliary_loss_mlp": 0.01058282, "balance_loss_clip": 1.06024373, "balance_loss_mlp": 1.03539419, "epoch": 0.0733503682549226, "flos": 13224141106560.0, "grad_norm": 2.095886373367052, "language_loss": 0.84608674, "learning_rate": 3.980319937487235e-06, "loss": 0.86868501, "num_input_tokens_seen": 25963105, "step": 1220, "time_per_iteration": 2.469189405441284 }, { "auxiliary_loss_clip": 0.01158855, "auxiliary_loss_mlp": 0.01063399, "balance_loss_clip": 1.05358922, "balance_loss_mlp": 1.03942597, "epoch": 0.07341049150759056, "flos": 20886975192960.0, "grad_norm": 2.648884311755534, "language_loss": 0.77114344, "learning_rate": 3.98026539862741e-06, "loss": 0.79336596, "num_input_tokens_seen": 25981690, "step": 1221, "time_per_iteration": 2.671762466430664 }, { "auxiliary_loss_clip": 0.01158201, "auxiliary_loss_mlp": 0.01064916, "balance_loss_clip": 1.05726743, "balance_loss_mlp": 1.04082406, "epoch": 0.07347061476025853, "flos": 15413614623360.0, "grad_norm": 2.5357389392469942, "language_loss": 0.91631913, "learning_rate": 3.980210784675722e-06, "loss": 0.93855029, "num_input_tokens_seen": 25999890, "step": 1222, "time_per_iteration": 2.6973063945770264 }, { "auxiliary_loss_clip": 0.01135907, "auxiliary_loss_mlp": 0.01064872, "balance_loss_clip": 1.05333126, "balance_loss_mlp": 1.04169726, "epoch": 0.0735307380129265, "flos": 11108859131520.0, "grad_norm": 2.8024324299253047, "language_loss": 0.90976465, "learning_rate": 3.980156095634242e-06, "loss": 0.93177247, "num_input_tokens_seen": 26016445, "step": 1223, "time_per_iteration": 2.8141093254089355 }, { "auxiliary_loss_clip": 0.01202875, "auxiliary_loss_mlp": 0.01077185, "balance_loss_clip": 1.06232905, "balance_loss_mlp": 1.05341494, "epoch": 0.07359086126559447, "flos": 23732392924800.0, "grad_norm": 1.9348534518871447, "language_loss": 0.82161939, "learning_rate": 3.980101331505045e-06, "loss": 0.84442002, "num_input_tokens_seen": 26036080, "step": 1224, "time_per_iteration": 2.640432119369507 }, { "auxiliary_loss_clip": 0.01200329, "auxiliary_loss_mlp": 0.01057586, "balance_loss_clip": 1.05987597, "balance_loss_mlp": 1.03229022, "epoch": 0.07365098451826244, "flos": 20993270515200.0, "grad_norm": 2.31744406237409, "language_loss": 0.83194047, "learning_rate": 3.9800464922902076e-06, "loss": 0.85451961, "num_input_tokens_seen": 26055805, "step": 1225, "time_per_iteration": 2.6159210205078125 }, { "auxiliary_loss_clip": 0.01170115, "auxiliary_loss_mlp": 0.01056068, "balance_loss_clip": 1.05743551, "balance_loss_mlp": 1.03190422, "epoch": 0.0737111077709304, "flos": 19933582452480.0, "grad_norm": 2.2959030425986544, "language_loss": 0.90388274, "learning_rate": 3.979991577991808e-06, "loss": 0.9261446, "num_input_tokens_seen": 26073905, "step": 1226, "time_per_iteration": 2.6527435779571533 }, { "auxiliary_loss_clip": 0.01207799, "auxiliary_loss_mlp": 0.0104599, "balance_loss_clip": 1.05913424, "balance_loss_mlp": 1.02080154, "epoch": 0.07377123102359838, "flos": 16581537342720.0, "grad_norm": 2.579592162134606, "language_loss": 0.76626784, "learning_rate": 3.97993658861193e-06, "loss": 0.78880572, "num_input_tokens_seen": 26091700, "step": 1227, "time_per_iteration": 2.596151351928711 }, { "auxiliary_loss_clip": 0.0118909, "auxiliary_loss_mlp": 0.01053386, "balance_loss_clip": 1.06296694, "balance_loss_mlp": 1.02954459, "epoch": 0.07383135427626634, "flos": 28328563457280.0, "grad_norm": 7.788838200212175, "language_loss": 0.8555491, "learning_rate": 3.9798815241526575e-06, "loss": 0.87797379, "num_input_tokens_seen": 26114105, "step": 1228, "time_per_iteration": 2.6955716609954834 }, { "auxiliary_loss_clip": 0.01191175, "auxiliary_loss_mlp": 0.01062669, "balance_loss_clip": 1.05897212, "balance_loss_mlp": 1.03860044, "epoch": 0.07389147752893431, "flos": 20047168235520.0, "grad_norm": 2.2575099517148898, "language_loss": 0.79598552, "learning_rate": 3.97982638461608e-06, "loss": 0.818524, "num_input_tokens_seen": 26131165, "step": 1229, "time_per_iteration": 2.6544861793518066 }, { "auxiliary_loss_clip": 0.01192886, "auxiliary_loss_mlp": 0.00782044, "balance_loss_clip": 1.05966699, "balance_loss_mlp": 1.00032902, "epoch": 0.07395160078160229, "flos": 18114132890880.0, "grad_norm": 2.2881874382496377, "language_loss": 0.78209347, "learning_rate": 3.979771170004287e-06, "loss": 0.80184281, "num_input_tokens_seen": 26150040, "step": 1230, "time_per_iteration": 2.6001133918762207 }, { "auxiliary_loss_clip": 0.0120142, "auxiliary_loss_mlp": 0.01052342, "balance_loss_clip": 1.06209648, "balance_loss_mlp": 1.02739108, "epoch": 0.07401172403427025, "flos": 23586918842880.0, "grad_norm": 2.038847041772147, "language_loss": 0.8136946, "learning_rate": 3.979715880319372e-06, "loss": 0.83623219, "num_input_tokens_seen": 26169380, "step": 1231, "time_per_iteration": 2.6364073753356934 }, { "auxiliary_loss_clip": 0.01179975, "auxiliary_loss_mlp": 0.01070917, "balance_loss_clip": 1.05690873, "balance_loss_mlp": 1.04599047, "epoch": 0.07407184728693822, "flos": 26359904799360.0, "grad_norm": 2.096832924731062, "language_loss": 0.95204866, "learning_rate": 3.979660515563434e-06, "loss": 0.97455758, "num_input_tokens_seen": 26189420, "step": 1232, "time_per_iteration": 2.7929203510284424 }, { "auxiliary_loss_clip": 0.01187282, "auxiliary_loss_mlp": 0.01059661, "balance_loss_clip": 1.06202245, "balance_loss_mlp": 1.03733301, "epoch": 0.0741319705396062, "flos": 22200443821440.0, "grad_norm": 1.7778448126368063, "language_loss": 0.80695188, "learning_rate": 3.979605075738569e-06, "loss": 0.82942128, "num_input_tokens_seen": 26209300, "step": 1233, "time_per_iteration": 2.7945051193237305 }, { "auxiliary_loss_clip": 0.01209245, "auxiliary_loss_mlp": 0.0106207, "balance_loss_clip": 1.06238747, "balance_loss_mlp": 1.03602231, "epoch": 0.07419209379227416, "flos": 39200482523520.0, "grad_norm": 2.136728864247421, "language_loss": 0.70708907, "learning_rate": 3.979549560846883e-06, "loss": 0.72980225, "num_input_tokens_seen": 26228110, "step": 1234, "time_per_iteration": 2.9646782875061035 }, { "auxiliary_loss_clip": 0.01167486, "auxiliary_loss_mlp": 0.01068879, "balance_loss_clip": 1.0542618, "balance_loss_mlp": 1.04265285, "epoch": 0.07425221704494213, "flos": 22781657790720.0, "grad_norm": 1.7921102377369336, "language_loss": 0.76852918, "learning_rate": 3.979493970890478e-06, "loss": 0.79089284, "num_input_tokens_seen": 26247020, "step": 1235, "time_per_iteration": 2.820577621459961 }, { "auxiliary_loss_clip": 0.01198028, "auxiliary_loss_mlp": 0.01055883, "balance_loss_clip": 1.05918813, "balance_loss_mlp": 1.0321244, "epoch": 0.0743123402976101, "flos": 22272983337600.0, "grad_norm": 2.3018318065058097, "language_loss": 0.82748145, "learning_rate": 3.979438305871464e-06, "loss": 0.85002053, "num_input_tokens_seen": 26265750, "step": 1236, "time_per_iteration": 2.6302287578582764 }, { "auxiliary_loss_clip": 0.01154783, "auxiliary_loss_mlp": 0.00782014, "balance_loss_clip": 1.05519629, "balance_loss_mlp": 1.00039148, "epoch": 0.07437246355027807, "flos": 29315029645440.0, "grad_norm": 1.7985383717833268, "language_loss": 0.7595011, "learning_rate": 3.979382565791951e-06, "loss": 0.77886909, "num_input_tokens_seen": 26287905, "step": 1237, "time_per_iteration": 2.721931219100952 }, { "auxiliary_loss_clip": 0.01135551, "auxiliary_loss_mlp": 0.00783311, "balance_loss_clip": 1.0505693, "balance_loss_mlp": 1.00031757, "epoch": 0.07443258680294604, "flos": 31944732249600.0, "grad_norm": 1.6915170784810407, "language_loss": 0.77458763, "learning_rate": 3.979326750654053e-06, "loss": 0.79377621, "num_input_tokens_seen": 26311795, "step": 1238, "time_per_iteration": 2.831620931625366 }, { "auxiliary_loss_clip": 0.01177529, "auxiliary_loss_mlp": 0.01057762, "balance_loss_clip": 1.05673254, "balance_loss_mlp": 1.03311002, "epoch": 0.074492710055614, "flos": 22675290641280.0, "grad_norm": 1.9053364150897723, "language_loss": 0.867737, "learning_rate": 3.9792708604598854e-06, "loss": 0.89008987, "num_input_tokens_seen": 26330330, "step": 1239, "time_per_iteration": 2.6697263717651367 }, { "auxiliary_loss_clip": 0.01159844, "auxiliary_loss_mlp": 0.01050954, "balance_loss_clip": 1.05222142, "balance_loss_mlp": 1.02532458, "epoch": 0.07455283330828198, "flos": 21284901037440.0, "grad_norm": 26.978042105238785, "language_loss": 0.89356089, "learning_rate": 3.979214895211569e-06, "loss": 0.91566885, "num_input_tokens_seen": 26348865, "step": 1240, "time_per_iteration": 2.846013069152832 }, { "auxiliary_loss_clip": 0.01174117, "auxiliary_loss_mlp": 0.01063539, "balance_loss_clip": 1.05857158, "balance_loss_mlp": 1.03713393, "epoch": 0.07461295656094995, "flos": 24388408967040.0, "grad_norm": 1.9346624045484253, "language_loss": 0.88873678, "learning_rate": 3.979158854911225e-06, "loss": 0.91111326, "num_input_tokens_seen": 26368210, "step": 1241, "time_per_iteration": 2.6926562786102295 }, { "auxiliary_loss_clip": 0.01079637, "auxiliary_loss_mlp": 0.01009562, "balance_loss_clip": 1.03489435, "balance_loss_mlp": 1.00405502, "epoch": 0.07467307981361791, "flos": 62109660574080.0, "grad_norm": 0.8973011136706247, "language_loss": 0.63067901, "learning_rate": 3.979102739560979e-06, "loss": 0.65157104, "num_input_tokens_seen": 26424890, "step": 1242, "time_per_iteration": 3.298609972000122 }, { "auxiliary_loss_clip": 0.01164269, "auxiliary_loss_mlp": 0.01068833, "balance_loss_clip": 1.05246222, "balance_loss_mlp": 1.03819644, "epoch": 0.07473320306628589, "flos": 24863148046080.0, "grad_norm": 3.87499965477456, "language_loss": 0.62926078, "learning_rate": 3.9790465491629595e-06, "loss": 0.65159178, "num_input_tokens_seen": 26446405, "step": 1243, "time_per_iteration": 2.7774572372436523 }, { "auxiliary_loss_clip": 0.01188864, "auxiliary_loss_mlp": 0.01059918, "balance_loss_clip": 1.05716145, "balance_loss_mlp": 1.03499091, "epoch": 0.07479332631895386, "flos": 24897442556160.0, "grad_norm": 1.6252135866538246, "language_loss": 0.76259589, "learning_rate": 3.978990283719296e-06, "loss": 0.78508377, "num_input_tokens_seen": 26466070, "step": 1244, "time_per_iteration": 2.714459180831909 }, { "auxiliary_loss_clip": 0.01184345, "auxiliary_loss_mlp": 0.00783076, "balance_loss_clip": 1.0611167, "balance_loss_mlp": 1.00038469, "epoch": 0.07485344957162182, "flos": 17815247821440.0, "grad_norm": 5.636002853507256, "language_loss": 0.69419599, "learning_rate": 3.978933943232123e-06, "loss": 0.71387023, "num_input_tokens_seen": 26479350, "step": 1245, "time_per_iteration": 2.640895366668701 }, { "auxiliary_loss_clip": 0.01203955, "auxiliary_loss_mlp": 0.01062684, "balance_loss_clip": 1.06098139, "balance_loss_mlp": 1.0372088, "epoch": 0.0749135728242898, "flos": 25010202326400.0, "grad_norm": 2.5525245798098757, "language_loss": 0.88635457, "learning_rate": 3.978877527703576e-06, "loss": 0.90902102, "num_input_tokens_seen": 26498255, "step": 1246, "time_per_iteration": 2.747765302658081 }, { "auxiliary_loss_clip": 0.01212369, "auxiliary_loss_mlp": 0.01077452, "balance_loss_clip": 1.06102896, "balance_loss_mlp": 1.049402, "epoch": 0.07497369607695777, "flos": 17822071405440.0, "grad_norm": 2.675073323546491, "language_loss": 0.8825295, "learning_rate": 3.9788210371357945e-06, "loss": 0.90542769, "num_input_tokens_seen": 26515375, "step": 1247, "time_per_iteration": 2.6810224056243896 }, { "auxiliary_loss_clip": 0.0118495, "auxiliary_loss_mlp": 0.01069489, "balance_loss_clip": 1.06058884, "balance_loss_mlp": 1.04383492, "epoch": 0.07503381932962573, "flos": 15121086261120.0, "grad_norm": 2.620559853720615, "language_loss": 0.64849806, "learning_rate": 3.978764471530921e-06, "loss": 0.67104244, "num_input_tokens_seen": 26533595, "step": 1248, "time_per_iteration": 2.706862449645996 }, { "auxiliary_loss_clip": 0.01181878, "auxiliary_loss_mlp": 0.00782677, "balance_loss_clip": 1.0575974, "balance_loss_mlp": 1.0004611, "epoch": 0.0750939425822937, "flos": 12816734071680.0, "grad_norm": 2.872208543000993, "language_loss": 0.74216163, "learning_rate": 3.978707830891102e-06, "loss": 0.7618072, "num_input_tokens_seen": 26549405, "step": 1249, "time_per_iteration": 4.309665679931641 }, { "auxiliary_loss_clip": 0.01168375, "auxiliary_loss_mlp": 0.01079691, "balance_loss_clip": 1.0579834, "balance_loss_mlp": 1.05296445, "epoch": 0.07515406583496168, "flos": 24206844695040.0, "grad_norm": 2.679176110316805, "language_loss": 0.82353318, "learning_rate": 3.978651115218482e-06, "loss": 0.84601378, "num_input_tokens_seen": 26567200, "step": 1250, "time_per_iteration": 4.367432594299316 }, { "auxiliary_loss_clip": 0.011507, "auxiliary_loss_mlp": 0.01064103, "balance_loss_clip": 1.05736125, "balance_loss_mlp": 1.0380677, "epoch": 0.07521418908762964, "flos": 26688164215680.0, "grad_norm": 2.015636709873133, "language_loss": 0.6679548, "learning_rate": 3.978594324515215e-06, "loss": 0.69010288, "num_input_tokens_seen": 26586190, "step": 1251, "time_per_iteration": 4.339111089706421 }, { "auxiliary_loss_clip": 0.01061099, "auxiliary_loss_mlp": 0.01007289, "balance_loss_clip": 1.02992618, "balance_loss_mlp": 1.00314093, "epoch": 0.0752743123402976, "flos": 59095140589440.0, "grad_norm": 0.9014655793512963, "language_loss": 0.7038399, "learning_rate": 3.9785374587834515e-06, "loss": 0.72452378, "num_input_tokens_seen": 26650710, "step": 1252, "time_per_iteration": 4.984445333480835 }, { "auxiliary_loss_clip": 0.0120348, "auxiliary_loss_mlp": 0.01071343, "balance_loss_clip": 1.06016684, "balance_loss_mlp": 1.04651129, "epoch": 0.07533443559296558, "flos": 23477032160640.0, "grad_norm": 2.2789224049077226, "language_loss": 0.79936707, "learning_rate": 3.97848051802535e-06, "loss": 0.82211524, "num_input_tokens_seen": 26669000, "step": 1253, "time_per_iteration": 2.613696575164795 }, { "auxiliary_loss_clip": 0.01165402, "auxiliary_loss_mlp": 0.01062493, "balance_loss_clip": 1.05703712, "balance_loss_mlp": 1.03758967, "epoch": 0.07539455884563355, "flos": 20879110114560.0, "grad_norm": 3.1057458778243263, "language_loss": 0.93360364, "learning_rate": 3.978423502243069e-06, "loss": 0.95588255, "num_input_tokens_seen": 26683075, "step": 1254, "time_per_iteration": 2.7332606315612793 }, { "auxiliary_loss_clip": 0.011733, "auxiliary_loss_mlp": 0.01064454, "balance_loss_clip": 1.06050682, "balance_loss_mlp": 1.03958726, "epoch": 0.07545468209830151, "flos": 27672906551040.0, "grad_norm": 2.090631066181037, "language_loss": 0.88087487, "learning_rate": 3.97836641143877e-06, "loss": 0.90325236, "num_input_tokens_seen": 26701875, "step": 1255, "time_per_iteration": 2.713636875152588 }, { "auxiliary_loss_clip": 0.01202338, "auxiliary_loss_mlp": 0.01071467, "balance_loss_clip": 1.06138325, "balance_loss_mlp": 1.04531264, "epoch": 0.0755148053509695, "flos": 14136990370560.0, "grad_norm": 1.9772348994273161, "language_loss": 0.79305708, "learning_rate": 3.978309245614618e-06, "loss": 0.81579506, "num_input_tokens_seen": 26719050, "step": 1256, "time_per_iteration": 2.688812255859375 }, { "auxiliary_loss_clip": 0.01064506, "auxiliary_loss_mlp": 0.01008663, "balance_loss_clip": 1.0281384, "balance_loss_mlp": 1.0043, "epoch": 0.07557492860363746, "flos": 58235257929600.0, "grad_norm": 0.7721513084275832, "language_loss": 0.58031851, "learning_rate": 3.9782520047727825e-06, "loss": 0.6010502, "num_input_tokens_seen": 26780650, "step": 1257, "time_per_iteration": 3.290971517562866 }, { "auxiliary_loss_clip": 0.01154091, "auxiliary_loss_mlp": 0.01065293, "balance_loss_clip": 1.06175375, "balance_loss_mlp": 1.04035461, "epoch": 0.07563505185630542, "flos": 24644380262400.0, "grad_norm": 2.5700283098608026, "language_loss": 0.90029764, "learning_rate": 3.978194688915432e-06, "loss": 0.92249143, "num_input_tokens_seen": 26798725, "step": 1258, "time_per_iteration": 2.800297975540161 }, { "auxiliary_loss_clip": 0.01169581, "auxiliary_loss_mlp": 0.01064585, "balance_loss_clip": 1.06184185, "balance_loss_mlp": 1.03797793, "epoch": 0.07569517510897339, "flos": 15522998515200.0, "grad_norm": 2.1868972302346377, "language_loss": 0.81404132, "learning_rate": 3.978137298044741e-06, "loss": 0.83638299, "num_input_tokens_seen": 26817005, "step": 1259, "time_per_iteration": 2.767717123031616 }, { "auxiliary_loss_clip": 0.01194891, "auxiliary_loss_mlp": 0.01062022, "balance_loss_clip": 1.06317782, "balance_loss_mlp": 1.03766739, "epoch": 0.07575529836164137, "flos": 22928532503040.0, "grad_norm": 1.8876128491153832, "language_loss": 0.7609086, "learning_rate": 3.978079832162885e-06, "loss": 0.78347778, "num_input_tokens_seen": 26836655, "step": 1260, "time_per_iteration": 2.859339714050293 }, { "auxiliary_loss_clip": 0.01160098, "auxiliary_loss_mlp": 0.01068568, "balance_loss_clip": 1.05432057, "balance_loss_mlp": 1.04222322, "epoch": 0.07581542161430933, "flos": 19500428344320.0, "grad_norm": 1.7028037437197219, "language_loss": 0.84734851, "learning_rate": 3.978022291272044e-06, "loss": 0.86963522, "num_input_tokens_seen": 26854925, "step": 1261, "time_per_iteration": 2.773087978363037 }, { "auxiliary_loss_clip": 0.01212087, "auxiliary_loss_mlp": 0.0106726, "balance_loss_clip": 1.06821966, "balance_loss_mlp": 1.04273915, "epoch": 0.0758755448669773, "flos": 24973465691520.0, "grad_norm": 1.8668314773439494, "language_loss": 0.82578814, "learning_rate": 3.977964675374399e-06, "loss": 0.84858155, "num_input_tokens_seen": 26876170, "step": 1262, "time_per_iteration": 2.681764841079712 }, { "auxiliary_loss_clip": 0.01206367, "auxiliary_loss_mlp": 0.0106285, "balance_loss_clip": 1.06333947, "balance_loss_mlp": 1.03685009, "epoch": 0.07593566811964528, "flos": 22747973811840.0, "grad_norm": 2.501362251414687, "language_loss": 0.82448232, "learning_rate": 3.977906984472136e-06, "loss": 0.84717447, "num_input_tokens_seen": 26895005, "step": 1263, "time_per_iteration": 2.6262786388397217 }, { "auxiliary_loss_clip": 0.01166059, "auxiliary_loss_mlp": 0.01068738, "balance_loss_clip": 1.06484997, "balance_loss_mlp": 1.04334641, "epoch": 0.07599579137231324, "flos": 23112395245440.0, "grad_norm": 2.171520639750579, "language_loss": 0.76149648, "learning_rate": 3.977849218567442e-06, "loss": 0.78384447, "num_input_tokens_seen": 26913930, "step": 1264, "time_per_iteration": 2.7735466957092285 }, { "auxiliary_loss_clip": 0.01181777, "auxiliary_loss_mlp": 0.01061673, "balance_loss_clip": 1.06183577, "balance_loss_mlp": 1.03704381, "epoch": 0.07605591462498121, "flos": 14502058248960.0, "grad_norm": 2.252731793921747, "language_loss": 0.80919051, "learning_rate": 3.977791377662507e-06, "loss": 0.83162498, "num_input_tokens_seen": 26931485, "step": 1265, "time_per_iteration": 2.6076793670654297 }, { "auxiliary_loss_clip": 0.01143593, "auxiliary_loss_mlp": 0.01068856, "balance_loss_clip": 1.05383801, "balance_loss_mlp": 1.0411638, "epoch": 0.07611603787764919, "flos": 23514199758720.0, "grad_norm": 2.117217065332582, "language_loss": 0.65244937, "learning_rate": 3.977733461759524e-06, "loss": 0.67457378, "num_input_tokens_seen": 26951670, "step": 1266, "time_per_iteration": 2.714848041534424 }, { "auxiliary_loss_clip": 0.0116364, "auxiliary_loss_mlp": 0.01066982, "balance_loss_clip": 1.05869627, "balance_loss_mlp": 1.04194832, "epoch": 0.07617616113031715, "flos": 21507188353920.0, "grad_norm": 2.0157381540709416, "language_loss": 0.79570109, "learning_rate": 3.977675470860691e-06, "loss": 0.81800735, "num_input_tokens_seen": 26970335, "step": 1267, "time_per_iteration": 2.692220687866211 }, { "auxiliary_loss_clip": 0.01186526, "auxiliary_loss_mlp": 0.01060572, "balance_loss_clip": 1.06368709, "balance_loss_mlp": 1.03644359, "epoch": 0.07623628438298512, "flos": 14573161221120.0, "grad_norm": 2.573855585409162, "language_loss": 0.72936547, "learning_rate": 3.977617404968205e-06, "loss": 0.75183642, "num_input_tokens_seen": 26986025, "step": 1268, "time_per_iteration": 2.666487216949463 }, { "auxiliary_loss_clip": 0.01189272, "auxiliary_loss_mlp": 0.01056943, "balance_loss_clip": 1.05925119, "balance_loss_mlp": 1.03146791, "epoch": 0.07629640763565308, "flos": 14720395069440.0, "grad_norm": 2.3531002902867018, "language_loss": 0.82087409, "learning_rate": 3.977559264084269e-06, "loss": 0.84333622, "num_input_tokens_seen": 27004045, "step": 1269, "time_per_iteration": 2.6196024417877197 }, { "auxiliary_loss_clip": 0.01198264, "auxiliary_loss_mlp": 0.01062408, "balance_loss_clip": 1.06528163, "balance_loss_mlp": 1.03656352, "epoch": 0.07635653088832106, "flos": 14902929008640.0, "grad_norm": 2.6660741307472424, "language_loss": 0.88614184, "learning_rate": 3.977501048211088e-06, "loss": 0.90874851, "num_input_tokens_seen": 27022070, "step": 1270, "time_per_iteration": 2.6423919200897217 }, { "auxiliary_loss_clip": 0.01195764, "auxiliary_loss_mlp": 0.01062092, "balance_loss_clip": 1.06443572, "balance_loss_mlp": 1.0371294, "epoch": 0.07641665414098903, "flos": 26651571235200.0, "grad_norm": 2.486841045046768, "language_loss": 0.7104162, "learning_rate": 3.977442757350869e-06, "loss": 0.73299474, "num_input_tokens_seen": 27041755, "step": 1271, "time_per_iteration": 2.6679437160491943 }, { "auxiliary_loss_clip": 0.01157818, "auxiliary_loss_mlp": 0.01068131, "balance_loss_clip": 1.05973268, "balance_loss_mlp": 1.04282308, "epoch": 0.07647677739365699, "flos": 25192808092800.0, "grad_norm": 1.5691807400142836, "language_loss": 0.82570392, "learning_rate": 3.977384391505823e-06, "loss": 0.84796339, "num_input_tokens_seen": 27061540, "step": 1272, "time_per_iteration": 2.7613680362701416 }, { "auxiliary_loss_clip": 0.01176176, "auxiliary_loss_mlp": 0.00782751, "balance_loss_clip": 1.05822372, "balance_loss_mlp": 1.00051665, "epoch": 0.07653690064632497, "flos": 20558141159040.0, "grad_norm": 1.811509476700225, "language_loss": 0.79854733, "learning_rate": 3.977325950678162e-06, "loss": 0.81813657, "num_input_tokens_seen": 27081395, "step": 1273, "time_per_iteration": 2.696317434310913 }, { "auxiliary_loss_clip": 0.01185133, "auxiliary_loss_mlp": 0.01064308, "balance_loss_clip": 1.06556833, "balance_loss_mlp": 1.03910685, "epoch": 0.07659702389899294, "flos": 22269320150400.0, "grad_norm": 1.7399681078894738, "language_loss": 0.81519866, "learning_rate": 3.977267434870103e-06, "loss": 0.83769304, "num_input_tokens_seen": 27101175, "step": 1274, "time_per_iteration": 2.8570950031280518 }, { "auxiliary_loss_clip": 0.0118748, "auxiliary_loss_mlp": 0.01078696, "balance_loss_clip": 1.06516898, "balance_loss_mlp": 1.05164731, "epoch": 0.0766571471516609, "flos": 32636120209920.0, "grad_norm": 2.6845981005996453, "language_loss": 0.73083639, "learning_rate": 3.977208844083865e-06, "loss": 0.75349814, "num_input_tokens_seen": 27124505, "step": 1275, "time_per_iteration": 2.75947904586792 }, { "auxiliary_loss_clip": 0.0121081, "auxiliary_loss_mlp": 0.01063745, "balance_loss_clip": 1.06740415, "balance_loss_mlp": 1.03694642, "epoch": 0.07671727040432888, "flos": 15267386355840.0, "grad_norm": 2.828157953752124, "language_loss": 0.79507053, "learning_rate": 3.9771501783216685e-06, "loss": 0.81781602, "num_input_tokens_seen": 27140960, "step": 1276, "time_per_iteration": 2.626683473587036 }, { "auxiliary_loss_clip": 0.01198279, "auxiliary_loss_mlp": 0.01058719, "balance_loss_clip": 1.06486118, "balance_loss_mlp": 1.03485298, "epoch": 0.07677739365699685, "flos": 28184094956160.0, "grad_norm": 2.406514987231471, "language_loss": 0.58915478, "learning_rate": 3.97709143758574e-06, "loss": 0.61172473, "num_input_tokens_seen": 27160985, "step": 1277, "time_per_iteration": 2.6684958934783936 }, { "auxiliary_loss_clip": 0.01201282, "auxiliary_loss_mlp": 0.01064396, "balance_loss_clip": 1.06430948, "balance_loss_mlp": 1.03919542, "epoch": 0.07683751690966481, "flos": 18296128126080.0, "grad_norm": 2.8024245322836046, "language_loss": 0.74957907, "learning_rate": 3.977032621878305e-06, "loss": 0.77223587, "num_input_tokens_seen": 27178390, "step": 1278, "time_per_iteration": 2.723675012588501 }, { "auxiliary_loss_clip": 0.01160972, "auxiliary_loss_mlp": 0.01063133, "balance_loss_clip": 1.0584681, "balance_loss_mlp": 1.0390408, "epoch": 0.07689764016233278, "flos": 21981101420160.0, "grad_norm": 5.339853944094037, "language_loss": 0.88594604, "learning_rate": 3.976973731201596e-06, "loss": 0.90818715, "num_input_tokens_seen": 27197505, "step": 1279, "time_per_iteration": 2.655036211013794 }, { "auxiliary_loss_clip": 0.01172627, "auxiliary_loss_mlp": 0.01066586, "balance_loss_clip": 1.06065845, "balance_loss_mlp": 1.04077685, "epoch": 0.07695776341500075, "flos": 22235995307520.0, "grad_norm": 2.4937131241937256, "language_loss": 0.8300451, "learning_rate": 3.976914765557845e-06, "loss": 0.85243726, "num_input_tokens_seen": 27214260, "step": 1280, "time_per_iteration": 2.7717065811157227 }, { "auxiliary_loss_clip": 0.01194022, "auxiliary_loss_mlp": 0.01066533, "balance_loss_clip": 1.06593037, "balance_loss_mlp": 1.04104638, "epoch": 0.07701788666766872, "flos": 16143750380160.0, "grad_norm": 2.044864943195716, "language_loss": 0.7581439, "learning_rate": 3.9768557249492875e-06, "loss": 0.78074944, "num_input_tokens_seen": 27232525, "step": 1281, "time_per_iteration": 2.7444865703582764 }, { "auxiliary_loss_clip": 0.01170775, "auxiliary_loss_mlp": 0.01062526, "balance_loss_clip": 1.05879402, "balance_loss_mlp": 1.03669322, "epoch": 0.07707800992033668, "flos": 19463045264640.0, "grad_norm": 1.8925477349429178, "language_loss": 0.75091648, "learning_rate": 3.9767966093781634e-06, "loss": 0.77324951, "num_input_tokens_seen": 27249800, "step": 1282, "time_per_iteration": 2.829145908355713 }, { "auxiliary_loss_clip": 0.01213222, "auxiliary_loss_mlp": 0.01071082, "balance_loss_clip": 1.07007408, "balance_loss_mlp": 1.04549992, "epoch": 0.07713813317300466, "flos": 18990281433600.0, "grad_norm": 2.1558853998977527, "language_loss": 0.83863324, "learning_rate": 3.976737418846713e-06, "loss": 0.8614763, "num_input_tokens_seen": 27268895, "step": 1283, "time_per_iteration": 2.6955173015594482 }, { "auxiliary_loss_clip": 0.0119621, "auxiliary_loss_mlp": 0.01066889, "balance_loss_clip": 1.06603825, "balance_loss_mlp": 1.03925657, "epoch": 0.07719825642567263, "flos": 18113953322880.0, "grad_norm": 2.520477290704422, "language_loss": 0.75147104, "learning_rate": 3.976678153357181e-06, "loss": 0.77410209, "num_input_tokens_seen": 27288180, "step": 1284, "time_per_iteration": 2.6589291095733643 }, { "auxiliary_loss_clip": 0.01182212, "auxiliary_loss_mlp": 0.01068485, "balance_loss_clip": 1.06304765, "balance_loss_mlp": 1.0438329, "epoch": 0.0772583796783406, "flos": 42194426993280.0, "grad_norm": 5.2953301239297295, "language_loss": 0.76224041, "learning_rate": 3.976618812911817e-06, "loss": 0.78474742, "num_input_tokens_seen": 27311815, "step": 1285, "time_per_iteration": 2.847702741622925 }, { "auxiliary_loss_clip": 0.01216302, "auxiliary_loss_mlp": 0.01071451, "balance_loss_clip": 1.07193899, "balance_loss_mlp": 1.04729891, "epoch": 0.07731850293100857, "flos": 24753692327040.0, "grad_norm": 2.0564733507641, "language_loss": 0.84193194, "learning_rate": 3.9765593975128685e-06, "loss": 0.86480945, "num_input_tokens_seen": 27331890, "step": 1286, "time_per_iteration": 2.713963270187378 }, { "auxiliary_loss_clip": 0.01180469, "auxiliary_loss_mlp": 0.01061062, "balance_loss_clip": 1.06331325, "balance_loss_mlp": 1.03646958, "epoch": 0.07737862618367654, "flos": 17565884628480.0, "grad_norm": 2.810253293244863, "language_loss": 0.76899689, "learning_rate": 3.97649990716259e-06, "loss": 0.79141217, "num_input_tokens_seen": 27348320, "step": 1287, "time_per_iteration": 2.669168472290039 }, { "auxiliary_loss_clip": 0.011763, "auxiliary_loss_mlp": 0.01061108, "balance_loss_clip": 1.05891848, "balance_loss_mlp": 1.03696775, "epoch": 0.0774387494363445, "flos": 25627147349760.0, "grad_norm": 1.6525652726351308, "language_loss": 0.84699571, "learning_rate": 3.976440341863237e-06, "loss": 0.86936986, "num_input_tokens_seen": 27367670, "step": 1288, "time_per_iteration": 2.7794599533081055 }, { "auxiliary_loss_clip": 0.01206182, "auxiliary_loss_mlp": 0.0106604, "balance_loss_clip": 1.06214797, "balance_loss_mlp": 1.04203176, "epoch": 0.07749887268901248, "flos": 12239865648000.0, "grad_norm": 2.0424090794957523, "language_loss": 0.85576034, "learning_rate": 3.976380701617068e-06, "loss": 0.87848258, "num_input_tokens_seen": 27385485, "step": 1289, "time_per_iteration": 4.232934236526489 }, { "auxiliary_loss_clip": 0.01207527, "auxiliary_loss_mlp": 0.01052975, "balance_loss_clip": 1.06487668, "balance_loss_mlp": 1.0291574, "epoch": 0.07755899594168045, "flos": 25081736261760.0, "grad_norm": 2.840721047922519, "language_loss": 0.85548425, "learning_rate": 3.976320986426344e-06, "loss": 0.87808931, "num_input_tokens_seen": 27405110, "step": 1290, "time_per_iteration": 4.218302965164185 }, { "auxiliary_loss_clip": 0.0117374, "auxiliary_loss_mlp": 0.01066698, "balance_loss_clip": 1.06411862, "balance_loss_mlp": 1.04041266, "epoch": 0.07761911919434841, "flos": 14246410176000.0, "grad_norm": 2.3756178078405976, "language_loss": 0.91390574, "learning_rate": 3.9762611962933315e-06, "loss": 0.93631011, "num_input_tokens_seen": 27422855, "step": 1291, "time_per_iteration": 4.468304395675659 }, { "auxiliary_loss_clip": 0.01081301, "auxiliary_loss_mlp": 0.01043026, "balance_loss_clip": 1.04092944, "balance_loss_mlp": 1.03894901, "epoch": 0.07767924244701638, "flos": 67237202954880.0, "grad_norm": 0.8973948861970446, "language_loss": 0.65065891, "learning_rate": 3.9762013312202955e-06, "loss": 0.67190224, "num_input_tokens_seen": 27487190, "step": 1292, "time_per_iteration": 3.3142755031585693 }, { "auxiliary_loss_clip": 0.01195822, "auxiliary_loss_mlp": 0.01062751, "balance_loss_clip": 1.06527543, "balance_loss_mlp": 1.03846776, "epoch": 0.07773936569968436, "flos": 28550635292160.0, "grad_norm": 1.7595227960044768, "language_loss": 0.87530363, "learning_rate": 3.9761413912095075e-06, "loss": 0.89788938, "num_input_tokens_seen": 27510465, "step": 1293, "time_per_iteration": 2.801603078842163 }, { "auxiliary_loss_clip": 0.01116633, "auxiliary_loss_mlp": 0.01078659, "balance_loss_clip": 1.05041039, "balance_loss_mlp": 1.05012059, "epoch": 0.07779948895235232, "flos": 27490264871040.0, "grad_norm": 2.2898991349098528, "language_loss": 0.84518278, "learning_rate": 3.976081376263239e-06, "loss": 0.8671357, "num_input_tokens_seen": 27528645, "step": 1294, "time_per_iteration": 2.898597002029419 }, { "auxiliary_loss_clip": 0.01158796, "auxiliary_loss_mlp": 0.01059505, "balance_loss_clip": 1.05967593, "balance_loss_mlp": 1.0342207, "epoch": 0.07785961220502029, "flos": 18223301301120.0, "grad_norm": 2.7292442592472073, "language_loss": 0.79365373, "learning_rate": 3.976021286383768e-06, "loss": 0.81583679, "num_input_tokens_seen": 27546165, "step": 1295, "time_per_iteration": 2.8481552600860596 }, { "auxiliary_loss_clip": 0.01155886, "auxiliary_loss_mlp": 0.01061351, "balance_loss_clip": 1.06015158, "balance_loss_mlp": 1.0356493, "epoch": 0.07791973545768827, "flos": 24608218245120.0, "grad_norm": 3.472740252224496, "language_loss": 0.88351864, "learning_rate": 3.975961121573371e-06, "loss": 0.90569103, "num_input_tokens_seen": 27566520, "step": 1296, "time_per_iteration": 2.697831392288208 }, { "auxiliary_loss_clip": 0.0120756, "auxiliary_loss_mlp": 0.01074146, "balance_loss_clip": 1.06552935, "balance_loss_mlp": 1.04791999, "epoch": 0.07797985871035623, "flos": 14282069402880.0, "grad_norm": 2.384603846473911, "language_loss": 0.9625901, "learning_rate": 3.9759008818343305e-06, "loss": 0.98540717, "num_input_tokens_seen": 27581960, "step": 1297, "time_per_iteration": 2.62660551071167 }, { "auxiliary_loss_clip": 0.01175852, "auxiliary_loss_mlp": 0.01069298, "balance_loss_clip": 1.06147313, "balance_loss_mlp": 1.04517019, "epoch": 0.0780399819630242, "flos": 26610453141120.0, "grad_norm": 2.15152040651991, "language_loss": 0.7600193, "learning_rate": 3.97584056716893e-06, "loss": 0.78247076, "num_input_tokens_seen": 27601415, "step": 1298, "time_per_iteration": 2.8040499687194824 }, { "auxiliary_loss_clip": 0.0114505, "auxiliary_loss_mlp": 0.00783981, "balance_loss_clip": 1.05864501, "balance_loss_mlp": 1.0006063, "epoch": 0.07810010521569218, "flos": 21834514016640.0, "grad_norm": 1.6697657327886877, "language_loss": 0.8097105, "learning_rate": 3.9757801775794575e-06, "loss": 0.82900077, "num_input_tokens_seen": 27621490, "step": 1299, "time_per_iteration": 2.7667653560638428 }, { "auxiliary_loss_clip": 0.01162638, "auxiliary_loss_mlp": 0.01064395, "balance_loss_clip": 1.06191885, "balance_loss_mlp": 1.0393368, "epoch": 0.07816022846836014, "flos": 25081233471360.0, "grad_norm": 1.9748762517467437, "language_loss": 0.86755943, "learning_rate": 3.975719713068202e-06, "loss": 0.8898297, "num_input_tokens_seen": 27640600, "step": 1300, "time_per_iteration": 2.7819204330444336 }, { "auxiliary_loss_clip": 0.0120807, "auxiliary_loss_mlp": 0.01056805, "balance_loss_clip": 1.06663537, "balance_loss_mlp": 1.03180683, "epoch": 0.0782203517210281, "flos": 40917515431680.0, "grad_norm": 3.040560411644486, "language_loss": 0.71822268, "learning_rate": 3.975659173637458e-06, "loss": 0.74087137, "num_input_tokens_seen": 27663070, "step": 1301, "time_per_iteration": 2.845107316970825 }, { "auxiliary_loss_clip": 0.01196566, "auxiliary_loss_mlp": 0.01075534, "balance_loss_clip": 1.06426311, "balance_loss_mlp": 1.05100083, "epoch": 0.07828047497369607, "flos": 41172014269440.0, "grad_norm": 1.6425838754876312, "language_loss": 0.70782864, "learning_rate": 3.97559855928952e-06, "loss": 0.73054957, "num_input_tokens_seen": 27686425, "step": 1302, "time_per_iteration": 2.898069381713867 }, { "auxiliary_loss_clip": 0.01162032, "auxiliary_loss_mlp": 0.00783256, "balance_loss_clip": 1.06019354, "balance_loss_mlp": 1.00062823, "epoch": 0.07834059822636405, "flos": 23508130360320.0, "grad_norm": 2.067506704059933, "language_loss": 0.82100385, "learning_rate": 3.9755378700266864e-06, "loss": 0.84045678, "num_input_tokens_seen": 27704900, "step": 1303, "time_per_iteration": 2.7862839698791504 }, { "auxiliary_loss_clip": 0.01191742, "auxiliary_loss_mlp": 0.01074585, "balance_loss_clip": 1.06583321, "balance_loss_mlp": 1.04908574, "epoch": 0.07840072147903202, "flos": 20193899293440.0, "grad_norm": 1.8830773419754625, "language_loss": 0.75206572, "learning_rate": 3.9754771058512585e-06, "loss": 0.77472901, "num_input_tokens_seen": 27724890, "step": 1304, "time_per_iteration": 2.7380170822143555 }, { "auxiliary_loss_clip": 0.01211207, "auxiliary_loss_mlp": 0.01074343, "balance_loss_clip": 1.07114935, "balance_loss_mlp": 1.04922605, "epoch": 0.07846084473169998, "flos": 21360816432000.0, "grad_norm": 1.6118444643214749, "language_loss": 0.76141047, "learning_rate": 3.975416266765542e-06, "loss": 0.784266, "num_input_tokens_seen": 27743115, "step": 1305, "time_per_iteration": 2.6788928508758545 }, { "auxiliary_loss_clip": 0.01137547, "auxiliary_loss_mlp": 0.01064795, "balance_loss_clip": 1.05611205, "balance_loss_mlp": 1.04021358, "epoch": 0.07852096798436796, "flos": 25410965345280.0, "grad_norm": 1.9541638070229452, "language_loss": 0.85011744, "learning_rate": 3.975355352771841e-06, "loss": 0.87214082, "num_input_tokens_seen": 27763570, "step": 1306, "time_per_iteration": 3.048137903213501 }, { "auxiliary_loss_clip": 0.01194779, "auxiliary_loss_mlp": 0.01049822, "balance_loss_clip": 1.06754708, "balance_loss_mlp": 1.02668333, "epoch": 0.07858109123703592, "flos": 24571481610240.0, "grad_norm": 6.108459548145404, "language_loss": 0.90882134, "learning_rate": 3.975294363872468e-06, "loss": 0.93126732, "num_input_tokens_seen": 27780030, "step": 1307, "time_per_iteration": 3.1597135066986084 }, { "auxiliary_loss_clip": 0.01145989, "auxiliary_loss_mlp": 0.01060478, "balance_loss_clip": 1.05529833, "balance_loss_mlp": 1.034729, "epoch": 0.07864121448970389, "flos": 20698874645760.0, "grad_norm": 3.4991416096159136, "language_loss": 0.83695096, "learning_rate": 3.975233300069735e-06, "loss": 0.85901558, "num_input_tokens_seen": 27796225, "step": 1308, "time_per_iteration": 2.749174118041992 }, { "auxiliary_loss_clip": 0.01151044, "auxiliary_loss_mlp": 0.01061966, "balance_loss_clip": 1.05445218, "balance_loss_mlp": 1.03789735, "epoch": 0.07870133774237187, "flos": 22966526113920.0, "grad_norm": 1.7092634116882437, "language_loss": 0.77521002, "learning_rate": 3.975172161365958e-06, "loss": 0.7973401, "num_input_tokens_seen": 27815975, "step": 1309, "time_per_iteration": 2.752854108810425 }, { "auxiliary_loss_clip": 0.01200102, "auxiliary_loss_mlp": 0.01070583, "balance_loss_clip": 1.06396675, "balance_loss_mlp": 1.04449987, "epoch": 0.07876146099503983, "flos": 18842832103680.0, "grad_norm": 1.8729662604656268, "language_loss": 0.80561006, "learning_rate": 3.975110947763453e-06, "loss": 0.82831693, "num_input_tokens_seen": 27832255, "step": 1310, "time_per_iteration": 2.6966710090637207 }, { "auxiliary_loss_clip": 0.01173381, "auxiliary_loss_mlp": 0.0078245, "balance_loss_clip": 1.06193507, "balance_loss_mlp": 1.00060987, "epoch": 0.0788215842477078, "flos": 23805794367360.0, "grad_norm": 1.796715978968241, "language_loss": 0.73187977, "learning_rate": 3.9750496592645435e-06, "loss": 0.75143808, "num_input_tokens_seen": 27852180, "step": 1311, "time_per_iteration": 2.7588090896606445 }, { "auxiliary_loss_clip": 0.01188438, "auxiliary_loss_mlp": 0.01078546, "balance_loss_clip": 1.06358969, "balance_loss_mlp": 1.05342865, "epoch": 0.07888170750037576, "flos": 21579907438080.0, "grad_norm": 1.7490617386556226, "language_loss": 0.86002982, "learning_rate": 3.974988295871553e-06, "loss": 0.88269973, "num_input_tokens_seen": 27871435, "step": 1312, "time_per_iteration": 2.6969683170318604 }, { "auxiliary_loss_clip": 0.01178338, "auxiliary_loss_mlp": 0.01059112, "balance_loss_clip": 1.06324685, "balance_loss_mlp": 1.03633142, "epoch": 0.07894183075304374, "flos": 19864849777920.0, "grad_norm": 1.825664315845032, "language_loss": 0.82087892, "learning_rate": 3.9749268575868085e-06, "loss": 0.84325337, "num_input_tokens_seen": 27890625, "step": 1313, "time_per_iteration": 2.6936304569244385 }, { "auxiliary_loss_clip": 0.01184798, "auxiliary_loss_mlp": 0.00783631, "balance_loss_clip": 1.06229842, "balance_loss_mlp": 1.00053823, "epoch": 0.07900195400571171, "flos": 16143463071360.0, "grad_norm": 2.837190319075622, "language_loss": 0.73569417, "learning_rate": 3.97486534441264e-06, "loss": 0.75537837, "num_input_tokens_seen": 27906530, "step": 1314, "time_per_iteration": 2.653505325317383 }, { "auxiliary_loss_clip": 0.01154585, "auxiliary_loss_mlp": 0.00782352, "balance_loss_clip": 1.05730104, "balance_loss_mlp": 1.00044668, "epoch": 0.07906207725837967, "flos": 23730417676800.0, "grad_norm": 1.6153694611764058, "language_loss": 0.79490477, "learning_rate": 3.974803756351379e-06, "loss": 0.81427419, "num_input_tokens_seen": 27926725, "step": 1315, "time_per_iteration": 2.797306776046753 }, { "auxiliary_loss_clip": 0.01189107, "auxiliary_loss_mlp": 0.01060743, "balance_loss_clip": 1.05841756, "balance_loss_mlp": 1.03487444, "epoch": 0.07912220051104765, "flos": 24315905364480.0, "grad_norm": 1.6362349035659796, "language_loss": 0.73546493, "learning_rate": 3.974742093405362e-06, "loss": 0.75796348, "num_input_tokens_seen": 27947875, "step": 1316, "time_per_iteration": 2.688997507095337 }, { "auxiliary_loss_clip": 0.01162651, "auxiliary_loss_mlp": 0.01066617, "balance_loss_clip": 1.05845332, "balance_loss_mlp": 1.0418098, "epoch": 0.07918232376371562, "flos": 18880035615360.0, "grad_norm": 2.157376902111077, "language_loss": 0.65540409, "learning_rate": 3.974680355576927e-06, "loss": 0.67769682, "num_input_tokens_seen": 27965040, "step": 1317, "time_per_iteration": 2.6998519897460938 }, { "auxiliary_loss_clip": 0.01177674, "auxiliary_loss_mlp": 0.01068635, "balance_loss_clip": 1.06280386, "balance_loss_mlp": 1.0428021, "epoch": 0.07924244701638358, "flos": 27376284038400.0, "grad_norm": 2.382161374765057, "language_loss": 0.73105192, "learning_rate": 3.974618542868415e-06, "loss": 0.75351495, "num_input_tokens_seen": 27985330, "step": 1318, "time_per_iteration": 2.8350789546966553 }, { "auxiliary_loss_clip": 0.01139638, "auxiliary_loss_mlp": 0.01058798, "balance_loss_clip": 1.05582452, "balance_loss_mlp": 1.03515935, "epoch": 0.07930257026905156, "flos": 25120340403840.0, "grad_norm": 2.635941883481154, "language_loss": 0.90381306, "learning_rate": 3.97455665528217e-06, "loss": 0.92579746, "num_input_tokens_seen": 28007615, "step": 1319, "time_per_iteration": 2.8553895950317383 }, { "auxiliary_loss_clip": 0.01175059, "auxiliary_loss_mlp": 0.01055333, "balance_loss_clip": 1.05662942, "balance_loss_mlp": 1.03122926, "epoch": 0.07936269352171953, "flos": 21834478103040.0, "grad_norm": 1.9449065990449943, "language_loss": 0.80134505, "learning_rate": 3.974494692820539e-06, "loss": 0.82364893, "num_input_tokens_seen": 28027765, "step": 1320, "time_per_iteration": 2.6651997566223145 }, { "auxiliary_loss_clip": 0.01181808, "auxiliary_loss_mlp": 0.01060151, "balance_loss_clip": 1.06380332, "balance_loss_mlp": 1.03657198, "epoch": 0.07942281677438749, "flos": 16939889377920.0, "grad_norm": 2.1078540484546746, "language_loss": 0.6901226, "learning_rate": 3.974432655485872e-06, "loss": 0.71254218, "num_input_tokens_seen": 28044225, "step": 1321, "time_per_iteration": 2.6500401496887207 }, { "auxiliary_loss_clip": 0.01189002, "auxiliary_loss_mlp": 0.01060598, "balance_loss_clip": 1.06469131, "balance_loss_mlp": 1.03688753, "epoch": 0.07948294002705546, "flos": 18986941468800.0, "grad_norm": 1.9310950096267907, "language_loss": 0.8359012, "learning_rate": 3.9743705432805195e-06, "loss": 0.85839725, "num_input_tokens_seen": 28062915, "step": 1322, "time_per_iteration": 2.684978723526001 }, { "auxiliary_loss_clip": 0.01202147, "auxiliary_loss_mlp": 0.01057117, "balance_loss_clip": 1.06135976, "balance_loss_mlp": 1.03304851, "epoch": 0.07954306327972344, "flos": 21653452535040.0, "grad_norm": 2.128262121046283, "language_loss": 0.90555447, "learning_rate": 3.974308356206838e-06, "loss": 0.92814714, "num_input_tokens_seen": 28082175, "step": 1323, "time_per_iteration": 2.6192240715026855 }, { "auxiliary_loss_clip": 0.01164151, "auxiliary_loss_mlp": 0.01062303, "balance_loss_clip": 1.06272292, "balance_loss_mlp": 1.03809166, "epoch": 0.0796031865323914, "flos": 23220270766080.0, "grad_norm": 1.8373443631598505, "language_loss": 0.82521075, "learning_rate": 3.974246094267187e-06, "loss": 0.84747529, "num_input_tokens_seen": 28102645, "step": 1324, "time_per_iteration": 2.8283956050872803 }, { "auxiliary_loss_clip": 0.01180787, "auxiliary_loss_mlp": 0.01053463, "balance_loss_clip": 1.06256735, "balance_loss_mlp": 1.02834535, "epoch": 0.07966330978505937, "flos": 23294534135040.0, "grad_norm": 2.119290865165494, "language_loss": 0.79162025, "learning_rate": 3.974183757463925e-06, "loss": 0.8139627, "num_input_tokens_seen": 28122805, "step": 1325, "time_per_iteration": 2.6996092796325684 }, { "auxiliary_loss_clip": 0.01119286, "auxiliary_loss_mlp": 0.00785175, "balance_loss_clip": 1.04844928, "balance_loss_mlp": 1.00035501, "epoch": 0.07972343303772735, "flos": 18363783392640.0, "grad_norm": 2.2621745256944448, "language_loss": 0.88038248, "learning_rate": 3.974121345799418e-06, "loss": 0.89942712, "num_input_tokens_seen": 28140530, "step": 1326, "time_per_iteration": 2.881410837173462 }, { "auxiliary_loss_clip": 0.012, "auxiliary_loss_mlp": 0.01056877, "balance_loss_clip": 1.06257951, "balance_loss_mlp": 1.03168797, "epoch": 0.07978355629039531, "flos": 21762513204480.0, "grad_norm": 1.8538865301137586, "language_loss": 0.8328709, "learning_rate": 3.974058859276032e-06, "loss": 0.85543966, "num_input_tokens_seen": 28159640, "step": 1327, "time_per_iteration": 2.7277982234954834 }, { "auxiliary_loss_clip": 0.01207207, "auxiliary_loss_mlp": 0.01056886, "balance_loss_clip": 1.06532371, "balance_loss_mlp": 1.03223395, "epoch": 0.07984367954306328, "flos": 18551309322240.0, "grad_norm": 2.3216818645515636, "language_loss": 0.78599, "learning_rate": 3.9739962978961354e-06, "loss": 0.80863088, "num_input_tokens_seen": 28177050, "step": 1328, "time_per_iteration": 4.2137157917022705 }, { "auxiliary_loss_clip": 0.01201442, "auxiliary_loss_mlp": 0.01052053, "balance_loss_clip": 1.06778932, "balance_loss_mlp": 1.02722156, "epoch": 0.07990380279573125, "flos": 16904050583040.0, "grad_norm": 4.209530911932697, "language_loss": 0.73918134, "learning_rate": 3.973933661662101e-06, "loss": 0.76171625, "num_input_tokens_seen": 28193245, "step": 1329, "time_per_iteration": 5.853717565536499 }, { "auxiliary_loss_clip": 0.01169795, "auxiliary_loss_mlp": 0.01064631, "balance_loss_clip": 1.06039059, "balance_loss_mlp": 1.04069376, "epoch": 0.07996392604839922, "flos": 24098358643200.0, "grad_norm": 1.6102544328312476, "language_loss": 0.81743932, "learning_rate": 3.973870950576305e-06, "loss": 0.83978355, "num_input_tokens_seen": 28213570, "step": 1330, "time_per_iteration": 4.307915687561035 }, { "auxiliary_loss_clip": 0.01205148, "auxiliary_loss_mlp": 0.00780735, "balance_loss_clip": 1.06445098, "balance_loss_mlp": 1.00030971, "epoch": 0.08002404930106718, "flos": 14278729438080.0, "grad_norm": 3.0935981151455865, "language_loss": 0.88962448, "learning_rate": 3.9738081646411255e-06, "loss": 0.90948325, "num_input_tokens_seen": 28229980, "step": 1331, "time_per_iteration": 2.645198345184326 }, { "auxiliary_loss_clip": 0.01196019, "auxiliary_loss_mlp": 0.00781409, "balance_loss_clip": 1.05950165, "balance_loss_mlp": 1.00032377, "epoch": 0.08008417255373516, "flos": 40406219285760.0, "grad_norm": 1.8933982437719925, "language_loss": 0.7335732, "learning_rate": 3.973745303858942e-06, "loss": 0.75334752, "num_input_tokens_seen": 28253840, "step": 1332, "time_per_iteration": 2.792128562927246 }, { "auxiliary_loss_clip": 0.01180359, "auxiliary_loss_mlp": 0.01055118, "balance_loss_clip": 1.06217384, "balance_loss_mlp": 1.03216982, "epoch": 0.08014429580640313, "flos": 18478913460480.0, "grad_norm": 1.7464568676953767, "language_loss": 0.82765031, "learning_rate": 3.973682368232138e-06, "loss": 0.85000509, "num_input_tokens_seen": 28271675, "step": 1333, "time_per_iteration": 2.635579824447632 }, { "auxiliary_loss_clip": 0.01160554, "auxiliary_loss_mlp": 0.01059025, "balance_loss_clip": 1.05944169, "balance_loss_mlp": 1.03502798, "epoch": 0.0802044190590711, "flos": 22053461368320.0, "grad_norm": 2.677615191761892, "language_loss": 0.74862051, "learning_rate": 3.9736193577631015e-06, "loss": 0.77081633, "num_input_tokens_seen": 28291850, "step": 1334, "time_per_iteration": 2.8150298595428467 }, { "auxiliary_loss_clip": 0.01176175, "auxiliary_loss_mlp": 0.01063593, "balance_loss_clip": 1.06460369, "balance_loss_mlp": 1.04010868, "epoch": 0.08026454231173906, "flos": 24572128055040.0, "grad_norm": 1.8723728369534094, "language_loss": 0.79970533, "learning_rate": 3.973556272454221e-06, "loss": 0.82210302, "num_input_tokens_seen": 28310780, "step": 1335, "time_per_iteration": 2.6858503818511963 }, { "auxiliary_loss_clip": 0.01068232, "auxiliary_loss_mlp": 0.01020395, "balance_loss_clip": 1.04101062, "balance_loss_mlp": 1.01693749, "epoch": 0.08032466556440704, "flos": 52581841459200.0, "grad_norm": 0.7491611763509133, "language_loss": 0.56056821, "learning_rate": 3.973493112307889e-06, "loss": 0.58145452, "num_input_tokens_seen": 28369985, "step": 1336, "time_per_iteration": 3.324230670928955 }, { "auxiliary_loss_clip": 0.01179495, "auxiliary_loss_mlp": 0.01064433, "balance_loss_clip": 1.06005239, "balance_loss_mlp": 1.04149771, "epoch": 0.080384788817075, "flos": 23842602829440.0, "grad_norm": 2.8990759307469256, "language_loss": 0.67587668, "learning_rate": 3.9734298773265005e-06, "loss": 0.69831598, "num_input_tokens_seen": 28388670, "step": 1337, "time_per_iteration": 2.755451202392578 }, { "auxiliary_loss_clip": 0.01171763, "auxiliary_loss_mlp": 0.0107788, "balance_loss_clip": 1.06270492, "balance_loss_mlp": 1.05304837, "epoch": 0.08044491206974297, "flos": 25300719527040.0, "grad_norm": 1.9421039451316542, "language_loss": 0.86847901, "learning_rate": 3.973366567512453e-06, "loss": 0.89097536, "num_input_tokens_seen": 28411845, "step": 1338, "time_per_iteration": 2.758418560028076 }, { "auxiliary_loss_clip": 0.01136344, "auxiliary_loss_mlp": 0.01082295, "balance_loss_clip": 1.04883683, "balance_loss_mlp": 1.05596161, "epoch": 0.08050503532241095, "flos": 22376549226240.0, "grad_norm": 2.4557709650828157, "language_loss": 0.87217385, "learning_rate": 3.973303182868147e-06, "loss": 0.89436018, "num_input_tokens_seen": 28427875, "step": 1339, "time_per_iteration": 2.72682785987854 }, { "auxiliary_loss_clip": 0.01188632, "auxiliary_loss_mlp": 0.01055953, "balance_loss_clip": 1.06334567, "balance_loss_mlp": 1.03417385, "epoch": 0.08056515857507891, "flos": 18369421827840.0, "grad_norm": 10.603370056653041, "language_loss": 0.89504963, "learning_rate": 3.973239723395988e-06, "loss": 0.91749549, "num_input_tokens_seen": 28446615, "step": 1340, "time_per_iteration": 2.639601469039917 }, { "auxiliary_loss_clip": 0.01080107, "auxiliary_loss_mlp": 0.01012224, "balance_loss_clip": 1.02943289, "balance_loss_mlp": 1.00850451, "epoch": 0.08062528182774688, "flos": 51348130980480.0, "grad_norm": 0.8861598592181924, "language_loss": 0.64834231, "learning_rate": 3.97317618909838e-06, "loss": 0.66926563, "num_input_tokens_seen": 28505290, "step": 1341, "time_per_iteration": 3.0625648498535156 }, { "auxiliary_loss_clip": 0.01197538, "auxiliary_loss_mlp": 0.01061885, "balance_loss_clip": 1.0628854, "balance_loss_mlp": 1.0364095, "epoch": 0.08068540508041486, "flos": 17599712261760.0, "grad_norm": 3.3156125209451286, "language_loss": 0.89471233, "learning_rate": 3.973112579977733e-06, "loss": 0.9173066, "num_input_tokens_seen": 28522735, "step": 1342, "time_per_iteration": 2.6123783588409424 }, { "auxiliary_loss_clip": 0.01177687, "auxiliary_loss_mlp": 0.01062063, "balance_loss_clip": 1.0644995, "balance_loss_mlp": 1.03818512, "epoch": 0.08074552833308282, "flos": 10561185486720.0, "grad_norm": 2.2904075751929365, "language_loss": 0.76354575, "learning_rate": 3.973048896036459e-06, "loss": 0.78594327, "num_input_tokens_seen": 28539460, "step": 1343, "time_per_iteration": 2.7564918994903564 }, { "auxiliary_loss_clip": 0.01064182, "auxiliary_loss_mlp": 0.01010488, "balance_loss_clip": 1.02542567, "balance_loss_mlp": 1.0066731, "epoch": 0.08080565158575079, "flos": 60840254954880.0, "grad_norm": 0.8071281523255156, "language_loss": 0.57418531, "learning_rate": 3.972985137276974e-06, "loss": 0.59493202, "num_input_tokens_seen": 28599855, "step": 1344, "time_per_iteration": 3.170443058013916 }, { "auxiliary_loss_clip": 0.01158029, "auxiliary_loss_mlp": 0.01063108, "balance_loss_clip": 1.05839872, "balance_loss_mlp": 1.03846788, "epoch": 0.08086577483841875, "flos": 18332361970560.0, "grad_norm": 2.5953739346171676, "language_loss": 0.86569476, "learning_rate": 3.972921303701695e-06, "loss": 0.88790607, "num_input_tokens_seen": 28617585, "step": 1345, "time_per_iteration": 2.765254497528076 }, { "auxiliary_loss_clip": 0.01203428, "auxiliary_loss_mlp": 0.01057879, "balance_loss_clip": 1.06629944, "balance_loss_mlp": 1.03603959, "epoch": 0.08092589809108673, "flos": 21543601766400.0, "grad_norm": 1.8653844332842058, "language_loss": 0.87646407, "learning_rate": 3.972857395313042e-06, "loss": 0.89907712, "num_input_tokens_seen": 28636355, "step": 1346, "time_per_iteration": 2.655611991882324 }, { "auxiliary_loss_clip": 0.01191822, "auxiliary_loss_mlp": 0.0105414, "balance_loss_clip": 1.06450033, "balance_loss_mlp": 1.03047693, "epoch": 0.0809860213437547, "flos": 22128012046080.0, "grad_norm": 1.7047476553504466, "language_loss": 0.9298563, "learning_rate": 3.972793412113439e-06, "loss": 0.95231593, "num_input_tokens_seen": 28656260, "step": 1347, "time_per_iteration": 2.718355417251587 }, { "auxiliary_loss_clip": 0.01188696, "auxiliary_loss_mlp": 0.01066703, "balance_loss_clip": 1.06260633, "balance_loss_mlp": 1.04144263, "epoch": 0.08104614459642266, "flos": 21725489260800.0, "grad_norm": 1.9307860049130865, "language_loss": 0.89506733, "learning_rate": 3.972729354105312e-06, "loss": 0.91762137, "num_input_tokens_seen": 28675865, "step": 1348, "time_per_iteration": 2.763735771179199 }, { "auxiliary_loss_clip": 0.01137961, "auxiliary_loss_mlp": 0.01059733, "balance_loss_clip": 1.06026649, "balance_loss_mlp": 1.03730989, "epoch": 0.08110626784909064, "flos": 23951878980480.0, "grad_norm": 1.6214351378274148, "language_loss": 0.76906884, "learning_rate": 3.97266522129109e-06, "loss": 0.79104578, "num_input_tokens_seen": 28696255, "step": 1349, "time_per_iteration": 2.778050661087036 }, { "auxiliary_loss_clip": 0.01202122, "auxiliary_loss_mlp": 0.01065092, "balance_loss_clip": 1.06290889, "balance_loss_mlp": 1.04144049, "epoch": 0.0811663911017586, "flos": 19025689265280.0, "grad_norm": 1.777484449358279, "language_loss": 0.8877703, "learning_rate": 3.972601013673205e-06, "loss": 0.91044247, "num_input_tokens_seen": 28713905, "step": 1350, "time_per_iteration": 2.5871450901031494 }, { "auxiliary_loss_clip": 0.01164889, "auxiliary_loss_mlp": 0.00780958, "balance_loss_clip": 1.06011164, "balance_loss_mlp": 1.00028801, "epoch": 0.08122651435442657, "flos": 15341290588800.0, "grad_norm": 2.7472756845793156, "language_loss": 0.82298493, "learning_rate": 3.972536731254092e-06, "loss": 0.84244347, "num_input_tokens_seen": 28732075, "step": 1351, "time_per_iteration": 2.840271234512329 }, { "auxiliary_loss_clip": 0.01198177, "auxiliary_loss_mlp": 0.01055773, "balance_loss_clip": 1.06010592, "balance_loss_mlp": 1.03090644, "epoch": 0.08128663760709455, "flos": 23221563655680.0, "grad_norm": 2.2808101252466724, "language_loss": 0.75274944, "learning_rate": 3.972472374036189e-06, "loss": 0.775289, "num_input_tokens_seen": 28751150, "step": 1352, "time_per_iteration": 2.733644485473633 }, { "auxiliary_loss_clip": 0.01194643, "auxiliary_loss_mlp": 0.00783595, "balance_loss_clip": 1.06613326, "balance_loss_mlp": 1.00036311, "epoch": 0.08134676085976252, "flos": 22965628273920.0, "grad_norm": 1.678520960707938, "language_loss": 0.82936156, "learning_rate": 3.972407942021935e-06, "loss": 0.84914398, "num_input_tokens_seen": 28773360, "step": 1353, "time_per_iteration": 2.742149829864502 }, { "auxiliary_loss_clip": 0.01068236, "auxiliary_loss_mlp": 0.01015932, "balance_loss_clip": 1.02440155, "balance_loss_mlp": 1.01242769, "epoch": 0.08140688411243048, "flos": 64322115816960.0, "grad_norm": 0.8516312511934722, "language_loss": 0.59741521, "learning_rate": 3.972343435213775e-06, "loss": 0.61825693, "num_input_tokens_seen": 28833390, "step": 1354, "time_per_iteration": 3.1912426948547363 }, { "auxiliary_loss_clip": 0.01150343, "auxiliary_loss_mlp": 0.01058874, "balance_loss_clip": 1.0546236, "balance_loss_mlp": 1.03583086, "epoch": 0.08146700736509845, "flos": 22491858862080.0, "grad_norm": 2.1234068486581643, "language_loss": 0.82310611, "learning_rate": 3.972278853614154e-06, "loss": 0.84519827, "num_input_tokens_seen": 28852430, "step": 1355, "time_per_iteration": 2.782442808151245 }, { "auxiliary_loss_clip": 0.01186948, "auxiliary_loss_mlp": 0.01062856, "balance_loss_clip": 1.0600667, "balance_loss_mlp": 1.03801262, "epoch": 0.08152713061776642, "flos": 20447823513600.0, "grad_norm": 1.8366299277102565, "language_loss": 0.7135247, "learning_rate": 3.972214197225521e-06, "loss": 0.73602271, "num_input_tokens_seen": 28870685, "step": 1356, "time_per_iteration": 2.7777554988861084 }, { "auxiliary_loss_clip": 0.01194666, "auxiliary_loss_mlp": 0.01056522, "balance_loss_clip": 1.06462216, "balance_loss_mlp": 1.03259718, "epoch": 0.08158725387043439, "flos": 23550218121600.0, "grad_norm": 2.050923525150184, "language_loss": 0.70426142, "learning_rate": 3.972149466050329e-06, "loss": 0.72677326, "num_input_tokens_seen": 28889860, "step": 1357, "time_per_iteration": 2.852046012878418 }, { "auxiliary_loss_clip": 0.01186996, "auxiliary_loss_mlp": 0.01054475, "balance_loss_clip": 1.06138206, "balance_loss_mlp": 1.03070426, "epoch": 0.08164737712310235, "flos": 22017335264640.0, "grad_norm": 2.634204556872777, "language_loss": 0.84203482, "learning_rate": 3.97208466009103e-06, "loss": 0.8644495, "num_input_tokens_seen": 28905865, "step": 1358, "time_per_iteration": 2.7127115726470947 }, { "auxiliary_loss_clip": 0.01176629, "auxiliary_loss_mlp": 0.010566, "balance_loss_clip": 1.06037402, "balance_loss_mlp": 1.03154182, "epoch": 0.08170750037577033, "flos": 23367827836800.0, "grad_norm": 2.1726272773281097, "language_loss": 1.02781308, "learning_rate": 3.972019779350084e-06, "loss": 1.05014539, "num_input_tokens_seen": 28925250, "step": 1359, "time_per_iteration": 2.7171826362609863 }, { "auxiliary_loss_clip": 0.01128357, "auxiliary_loss_mlp": 0.01056774, "balance_loss_clip": 1.05009234, "balance_loss_mlp": 1.03263426, "epoch": 0.0817676236284383, "flos": 28397978490240.0, "grad_norm": 2.0494617207464945, "language_loss": 0.8313604, "learning_rate": 3.971954823829951e-06, "loss": 0.85321164, "num_input_tokens_seen": 28943445, "step": 1360, "time_per_iteration": 2.9020919799804688 }, { "auxiliary_loss_clip": 0.01202956, "auxiliary_loss_mlp": 0.0106887, "balance_loss_clip": 1.06274688, "balance_loss_mlp": 1.04469395, "epoch": 0.08182774688110626, "flos": 19208905562880.0, "grad_norm": 5.2377005088202075, "language_loss": 0.72322488, "learning_rate": 3.971889793533093e-06, "loss": 0.74594313, "num_input_tokens_seen": 28962695, "step": 1361, "time_per_iteration": 2.6643178462982178 }, { "auxiliary_loss_clip": 0.01166556, "auxiliary_loss_mlp": 0.01056311, "balance_loss_clip": 1.0552367, "balance_loss_mlp": 1.03184962, "epoch": 0.08188787013377424, "flos": 22784099915520.0, "grad_norm": 28.302545492028134, "language_loss": 0.76657653, "learning_rate": 3.971824688461976e-06, "loss": 0.78880513, "num_input_tokens_seen": 28982120, "step": 1362, "time_per_iteration": 2.7439064979553223 }, { "auxiliary_loss_clip": 0.01199728, "auxiliary_loss_mlp": 0.01053492, "balance_loss_clip": 1.06350708, "balance_loss_mlp": 1.03104496, "epoch": 0.08194799338644221, "flos": 16468095214080.0, "grad_norm": 2.1850191919210338, "language_loss": 0.72384715, "learning_rate": 3.971759508619069e-06, "loss": 0.74637932, "num_input_tokens_seen": 28998100, "step": 1363, "time_per_iteration": 2.7082791328430176 }, { "auxiliary_loss_clip": 0.01202887, "auxiliary_loss_mlp": 0.01066374, "balance_loss_clip": 1.06580126, "balance_loss_mlp": 1.04083955, "epoch": 0.08200811663911017, "flos": 23913633974400.0, "grad_norm": 2.142285699657122, "language_loss": 0.7726444, "learning_rate": 3.971694254006844e-06, "loss": 0.79533696, "num_input_tokens_seen": 29017095, "step": 1364, "time_per_iteration": 2.777156114578247 }, { "auxiliary_loss_clip": 0.01135428, "auxiliary_loss_mlp": 0.01063854, "balance_loss_clip": 1.05182433, "balance_loss_mlp": 1.03645968, "epoch": 0.08206823989177814, "flos": 17896550256000.0, "grad_norm": 1.85589982882842, "language_loss": 0.82242119, "learning_rate": 3.971628924627776e-06, "loss": 0.844414, "num_input_tokens_seen": 29037240, "step": 1365, "time_per_iteration": 2.8192803859710693 }, { "auxiliary_loss_clip": 0.01196582, "auxiliary_loss_mlp": 0.01059945, "balance_loss_clip": 1.07006347, "balance_loss_mlp": 1.03706884, "epoch": 0.08212836314444612, "flos": 22088186841600.0, "grad_norm": 1.7803424706125983, "language_loss": 0.82062519, "learning_rate": 3.97156352048434e-06, "loss": 0.84319043, "num_input_tokens_seen": 29056250, "step": 1366, "time_per_iteration": 2.7482311725616455 }, { "auxiliary_loss_clip": 0.01153262, "auxiliary_loss_mlp": 0.0107233, "balance_loss_clip": 1.05320215, "balance_loss_mlp": 1.04779685, "epoch": 0.08218848639711408, "flos": 17597485618560.0, "grad_norm": 2.010209091244133, "language_loss": 0.81944495, "learning_rate": 3.97149804157902e-06, "loss": 0.84170091, "num_input_tokens_seen": 29073380, "step": 1367, "time_per_iteration": 4.352729797363281 }, { "auxiliary_loss_clip": 0.01206125, "auxiliary_loss_mlp": 0.01066888, "balance_loss_clip": 1.06541765, "balance_loss_mlp": 1.04241478, "epoch": 0.08224860964978205, "flos": 17857838373120.0, "grad_norm": 2.518996379768439, "language_loss": 0.8331567, "learning_rate": 3.9714324879142946e-06, "loss": 0.85588682, "num_input_tokens_seen": 29091330, "step": 1368, "time_per_iteration": 6.077457666397095 }, { "auxiliary_loss_clip": 0.01159992, "auxiliary_loss_mlp": 0.01049874, "balance_loss_clip": 1.06314564, "balance_loss_mlp": 1.02790344, "epoch": 0.08230873290245003, "flos": 25227533566080.0, "grad_norm": 3.198110530618569, "language_loss": 0.81336468, "learning_rate": 3.971366859492653e-06, "loss": 0.8354634, "num_input_tokens_seen": 29110375, "step": 1369, "time_per_iteration": 2.769972085952759 }, { "auxiliary_loss_clip": 0.01137456, "auxiliary_loss_mlp": 0.00781814, "balance_loss_clip": 1.05438268, "balance_loss_mlp": 1.00027657, "epoch": 0.08236885615511799, "flos": 31759935753600.0, "grad_norm": 2.610758273724768, "language_loss": 0.74818152, "learning_rate": 3.971301156316582e-06, "loss": 0.76737428, "num_input_tokens_seen": 29129395, "step": 1370, "time_per_iteration": 4.497304201126099 }, { "auxiliary_loss_clip": 0.0115498, "auxiliary_loss_mlp": 0.01064278, "balance_loss_clip": 1.06403351, "balance_loss_mlp": 1.03987551, "epoch": 0.08242897940778596, "flos": 23185832601600.0, "grad_norm": 1.5246391685186451, "language_loss": 0.7398203, "learning_rate": 3.971235378388573e-06, "loss": 0.76201284, "num_input_tokens_seen": 29148650, "step": 1371, "time_per_iteration": 2.758089065551758 }, { "auxiliary_loss_clip": 0.01097162, "auxiliary_loss_mlp": 0.0106614, "balance_loss_clip": 1.05124569, "balance_loss_mlp": 1.04098701, "epoch": 0.08248910266045394, "flos": 34491480393600.0, "grad_norm": 1.9670948823939327, "language_loss": 0.70851803, "learning_rate": 3.971169525711122e-06, "loss": 0.73015106, "num_input_tokens_seen": 29170785, "step": 1372, "time_per_iteration": 4.069301605224609 }, { "auxiliary_loss_clip": 0.01162292, "auxiliary_loss_mlp": 0.01056859, "balance_loss_clip": 1.0571332, "balance_loss_mlp": 1.03261209, "epoch": 0.0825492259131219, "flos": 13436228960640.0, "grad_norm": 2.750431245604494, "language_loss": 0.88363653, "learning_rate": 3.9711035982867246e-06, "loss": 0.905828, "num_input_tokens_seen": 29185210, "step": 1373, "time_per_iteration": 3.9346964359283447 }, { "auxiliary_loss_clip": 0.01147291, "auxiliary_loss_mlp": 0.01062343, "balance_loss_clip": 1.05334187, "balance_loss_mlp": 1.03878665, "epoch": 0.08260934916578987, "flos": 25812446636160.0, "grad_norm": 2.128923272573014, "language_loss": 0.82465184, "learning_rate": 3.971037596117882e-06, "loss": 0.84674811, "num_input_tokens_seen": 29205210, "step": 1374, "time_per_iteration": 2.933377981185913 }, { "auxiliary_loss_clip": 0.01044322, "auxiliary_loss_mlp": 0.01017124, "balance_loss_clip": 1.03154135, "balance_loss_mlp": 1.0135479, "epoch": 0.08266947241845783, "flos": 63460009491840.0, "grad_norm": 0.8272339650193923, "language_loss": 0.60641956, "learning_rate": 3.970971519207095e-06, "loss": 0.62703401, "num_input_tokens_seen": 29265350, "step": 1375, "time_per_iteration": 3.3287038803100586 }, { "auxiliary_loss_clip": 0.01060461, "auxiliary_loss_mlp": 0.01013653, "balance_loss_clip": 1.02398169, "balance_loss_mlp": 1.01017237, "epoch": 0.08272959567112581, "flos": 69993704568960.0, "grad_norm": 0.9162492148708097, "language_loss": 0.62171799, "learning_rate": 3.970905367556871e-06, "loss": 0.64245915, "num_input_tokens_seen": 29321475, "step": 1376, "time_per_iteration": 3.218834161758423 }, { "auxiliary_loss_clip": 0.01159103, "auxiliary_loss_mlp": 0.0106347, "balance_loss_clip": 1.06229186, "balance_loss_mlp": 1.03942561, "epoch": 0.08278971892379378, "flos": 20413205781120.0, "grad_norm": 1.9191670647860084, "language_loss": 0.82577401, "learning_rate": 3.970839141169718e-06, "loss": 0.84799975, "num_input_tokens_seen": 29341405, "step": 1377, "time_per_iteration": 2.8763558864593506 }, { "auxiliary_loss_clip": 0.01176967, "auxiliary_loss_mlp": 0.01054072, "balance_loss_clip": 1.06486619, "balance_loss_mlp": 1.03011107, "epoch": 0.08284984217646174, "flos": 26250233598720.0, "grad_norm": 1.915539507671093, "language_loss": 0.84923226, "learning_rate": 3.970772840048147e-06, "loss": 0.87154263, "num_input_tokens_seen": 29361955, "step": 1378, "time_per_iteration": 2.8232595920562744 }, { "auxiliary_loss_clip": 0.01185329, "auxiliary_loss_mlp": 0.01058999, "balance_loss_clip": 1.06043923, "balance_loss_mlp": 1.0344305, "epoch": 0.08290996542912972, "flos": 27194683852800.0, "grad_norm": 6.4689921779024795, "language_loss": 0.87319231, "learning_rate": 3.970706464194672e-06, "loss": 0.8956356, "num_input_tokens_seen": 29382395, "step": 1379, "time_per_iteration": 2.756082534790039 }, { "auxiliary_loss_clip": 0.01158173, "auxiliary_loss_mlp": 0.01061479, "balance_loss_clip": 1.05779433, "balance_loss_mlp": 1.03829277, "epoch": 0.08297008868179769, "flos": 38618191146240.0, "grad_norm": 2.078993196749275, "language_loss": 0.78545237, "learning_rate": 3.970640013611812e-06, "loss": 0.8076489, "num_input_tokens_seen": 29404460, "step": 1380, "time_per_iteration": 2.9525601863861084 }, { "auxiliary_loss_clip": 0.01183492, "auxiliary_loss_mlp": 0.01059448, "balance_loss_clip": 1.06308961, "balance_loss_mlp": 1.0344255, "epoch": 0.08303021193446565, "flos": 19974736460160.0, "grad_norm": 2.6608111668609697, "language_loss": 0.86125714, "learning_rate": 3.970573488302083e-06, "loss": 0.88368654, "num_input_tokens_seen": 29422675, "step": 1381, "time_per_iteration": 2.735203742980957 }, { "auxiliary_loss_clip": 0.01197152, "auxiliary_loss_mlp": 0.00781814, "balance_loss_clip": 1.06611753, "balance_loss_mlp": 1.00034571, "epoch": 0.08309033518713363, "flos": 13662646341120.0, "grad_norm": 2.9433398182948203, "language_loss": 0.87471211, "learning_rate": 3.970506888268011e-06, "loss": 0.89450181, "num_input_tokens_seen": 29439840, "step": 1382, "time_per_iteration": 2.6392617225646973 }, { "auxiliary_loss_clip": 0.0115996, "auxiliary_loss_mlp": 0.01055463, "balance_loss_clip": 1.06138313, "balance_loss_mlp": 1.03337312, "epoch": 0.0831504584398016, "flos": 17968551068160.0, "grad_norm": 1.9901989904031434, "language_loss": 0.77085757, "learning_rate": 3.970440213512121e-06, "loss": 0.79301178, "num_input_tokens_seen": 29457360, "step": 1383, "time_per_iteration": 2.756565809249878 }, { "auxiliary_loss_clip": 0.01191549, "auxiliary_loss_mlp": 0.01058014, "balance_loss_clip": 1.06211782, "balance_loss_mlp": 1.03395748, "epoch": 0.08321058169246956, "flos": 22601386408320.0, "grad_norm": 1.818236548161018, "language_loss": 0.82858944, "learning_rate": 3.97037346403694e-06, "loss": 0.85108507, "num_input_tokens_seen": 29477040, "step": 1384, "time_per_iteration": 2.7848587036132812 }, { "auxiliary_loss_clip": 0.01148661, "auxiliary_loss_mlp": 0.01063605, "balance_loss_clip": 1.05671442, "balance_loss_mlp": 1.03610373, "epoch": 0.08327070494513754, "flos": 22850426378880.0, "grad_norm": 3.9982776391866346, "language_loss": 0.85219657, "learning_rate": 3.970306639845e-06, "loss": 0.8743192, "num_input_tokens_seen": 29492010, "step": 1385, "time_per_iteration": 2.803893566131592 }, { "auxiliary_loss_clip": 0.01157001, "auxiliary_loss_mlp": 0.01061891, "balance_loss_clip": 1.05823874, "balance_loss_mlp": 1.03750122, "epoch": 0.0833308281978055, "flos": 22782986593920.0, "grad_norm": 1.7071515381676081, "language_loss": 0.69195282, "learning_rate": 3.970239740938835e-06, "loss": 0.71414173, "num_input_tokens_seen": 29511850, "step": 1386, "time_per_iteration": 3.004786252975464 }, { "auxiliary_loss_clip": 0.01172803, "auxiliary_loss_mlp": 0.01058809, "balance_loss_clip": 1.05489016, "balance_loss_mlp": 1.03483546, "epoch": 0.08339095145047347, "flos": 20812604083200.0, "grad_norm": 1.672791522425571, "language_loss": 0.81894958, "learning_rate": 3.97017276732098e-06, "loss": 0.84126568, "num_input_tokens_seen": 29531415, "step": 1387, "time_per_iteration": 2.7678542137145996 }, { "auxiliary_loss_clip": 0.01179554, "auxiliary_loss_mlp": 0.01074251, "balance_loss_clip": 1.06179345, "balance_loss_mlp": 1.04817975, "epoch": 0.08345107470314143, "flos": 18515326872960.0, "grad_norm": 2.071322011459688, "language_loss": 0.77205479, "learning_rate": 3.970105718993978e-06, "loss": 0.7945928, "num_input_tokens_seen": 29549525, "step": 1388, "time_per_iteration": 2.8246304988861084 }, { "auxiliary_loss_clip": 0.01130856, "auxiliary_loss_mlp": 0.01062414, "balance_loss_clip": 1.05684018, "balance_loss_mlp": 1.03742766, "epoch": 0.08351119795580941, "flos": 18807567926400.0, "grad_norm": 2.0255270252506636, "language_loss": 0.79527366, "learning_rate": 3.970038595960369e-06, "loss": 0.81720638, "num_input_tokens_seen": 29568705, "step": 1389, "time_per_iteration": 2.8606414794921875 }, { "auxiliary_loss_clip": 0.01172785, "auxiliary_loss_mlp": 0.01064077, "balance_loss_clip": 1.05787444, "balance_loss_mlp": 1.03923428, "epoch": 0.08357132120847738, "flos": 18441817689600.0, "grad_norm": 2.546615132743645, "language_loss": 0.87427586, "learning_rate": 3.969971398222699e-06, "loss": 0.89664447, "num_input_tokens_seen": 29585855, "step": 1390, "time_per_iteration": 2.795931577682495 }, { "auxiliary_loss_clip": 0.01160426, "auxiliary_loss_mlp": 0.01067723, "balance_loss_clip": 1.05447149, "balance_loss_mlp": 1.04082966, "epoch": 0.08363144446114534, "flos": 25922333318400.0, "grad_norm": 1.8703157168219726, "language_loss": 0.86833143, "learning_rate": 3.969904125783517e-06, "loss": 0.89061296, "num_input_tokens_seen": 29607280, "step": 1391, "time_per_iteration": 2.811598062515259 }, { "auxiliary_loss_clip": 0.01156119, "auxiliary_loss_mlp": 0.01076482, "balance_loss_clip": 1.05575848, "balance_loss_mlp": 1.05180562, "epoch": 0.08369156771381332, "flos": 18041306065920.0, "grad_norm": 3.7979396758909263, "language_loss": 0.87688571, "learning_rate": 3.969836778645371e-06, "loss": 0.89921176, "num_input_tokens_seen": 29624130, "step": 1392, "time_per_iteration": 2.776819944381714 }, { "auxiliary_loss_clip": 0.01183316, "auxiliary_loss_mlp": 0.01058545, "balance_loss_clip": 1.05830503, "balance_loss_mlp": 1.03500128, "epoch": 0.08375169096648129, "flos": 22675111073280.0, "grad_norm": 8.95243370865895, "language_loss": 0.80574775, "learning_rate": 3.969769356810819e-06, "loss": 0.82816637, "num_input_tokens_seen": 29643210, "step": 1393, "time_per_iteration": 2.735761880874634 }, { "auxiliary_loss_clip": 0.01197686, "auxiliary_loss_mlp": 0.01058125, "balance_loss_clip": 1.06329441, "balance_loss_mlp": 1.03466487, "epoch": 0.08381181421914925, "flos": 26103215232000.0, "grad_norm": 1.7485261130451684, "language_loss": 0.85064757, "learning_rate": 3.969701860282415e-06, "loss": 0.87320572, "num_input_tokens_seen": 29663920, "step": 1394, "time_per_iteration": 2.950211524963379 }, { "auxiliary_loss_clip": 0.01145594, "auxiliary_loss_mlp": 0.01058123, "balance_loss_clip": 1.05994248, "balance_loss_mlp": 1.03432918, "epoch": 0.08387193747181723, "flos": 20629782835200.0, "grad_norm": 1.782466846937859, "language_loss": 0.82979721, "learning_rate": 3.969634289062719e-06, "loss": 0.85183442, "num_input_tokens_seen": 29683825, "step": 1395, "time_per_iteration": 2.883977174758911 }, { "auxiliary_loss_clip": 0.01187279, "auxiliary_loss_mlp": 0.00782865, "balance_loss_clip": 1.06065941, "balance_loss_mlp": 1.00028706, "epoch": 0.0839320607244852, "flos": 13443196199040.0, "grad_norm": 3.330409107955743, "language_loss": 0.82481396, "learning_rate": 3.969566643154293e-06, "loss": 0.84451544, "num_input_tokens_seen": 29698775, "step": 1396, "time_per_iteration": 2.6729378700256348 }, { "auxiliary_loss_clip": 0.0118605, "auxiliary_loss_mlp": 0.01060468, "balance_loss_clip": 1.06378388, "balance_loss_mlp": 1.03475475, "epoch": 0.08399218397715316, "flos": 23477247642240.0, "grad_norm": 1.780410555630689, "language_loss": 0.76843297, "learning_rate": 3.969498922559703e-06, "loss": 0.79089814, "num_input_tokens_seen": 29719430, "step": 1397, "time_per_iteration": 2.64888334274292 }, { "auxiliary_loss_clip": 0.01153742, "auxiliary_loss_mlp": 0.01050759, "balance_loss_clip": 1.05790138, "balance_loss_mlp": 1.02621412, "epoch": 0.08405230722982113, "flos": 25920717206400.0, "grad_norm": 2.1323769932413184, "language_loss": 0.77941638, "learning_rate": 3.969431127281516e-06, "loss": 0.8014614, "num_input_tokens_seen": 29739685, "step": 1398, "time_per_iteration": 2.8302125930786133 }, { "auxiliary_loss_clip": 0.01191086, "auxiliary_loss_mlp": 0.01052374, "balance_loss_clip": 1.05962944, "balance_loss_mlp": 1.02943766, "epoch": 0.0841124304824891, "flos": 17967437746560.0, "grad_norm": 2.150764713624159, "language_loss": 0.94635069, "learning_rate": 3.969363257322304e-06, "loss": 0.96878529, "num_input_tokens_seen": 29756165, "step": 1399, "time_per_iteration": 2.650517702102661 }, { "auxiliary_loss_clip": 0.01172403, "auxiliary_loss_mlp": 0.0106738, "balance_loss_clip": 1.0562712, "balance_loss_mlp": 1.04168999, "epoch": 0.08417255373515707, "flos": 25629661301760.0, "grad_norm": 3.6141849657848137, "language_loss": 0.81904209, "learning_rate": 3.96929531268464e-06, "loss": 0.8414399, "num_input_tokens_seen": 29776425, "step": 1400, "time_per_iteration": 2.777369260787964 }, { "auxiliary_loss_clip": 0.01170173, "auxiliary_loss_mlp": 0.01064292, "balance_loss_clip": 1.05968165, "balance_loss_mlp": 1.03957999, "epoch": 0.08423267698782504, "flos": 26249730808320.0, "grad_norm": 8.998651919840762, "language_loss": 0.8642807, "learning_rate": 3.969227293371099e-06, "loss": 0.88662529, "num_input_tokens_seen": 29796440, "step": 1401, "time_per_iteration": 2.91375732421875 }, { "auxiliary_loss_clip": 0.01196, "auxiliary_loss_mlp": 0.01066109, "balance_loss_clip": 1.05935979, "balance_loss_mlp": 1.04053831, "epoch": 0.08429280024049302, "flos": 20119707751680.0, "grad_norm": 2.9792515680869114, "language_loss": 0.87500131, "learning_rate": 3.969159199384263e-06, "loss": 0.89762247, "num_input_tokens_seen": 29814755, "step": 1402, "time_per_iteration": 2.7827296257019043 }, { "auxiliary_loss_clip": 0.01144907, "auxiliary_loss_mlp": 0.00781428, "balance_loss_clip": 1.05105817, "balance_loss_mlp": 1.00033188, "epoch": 0.08435292349316098, "flos": 42924526836480.0, "grad_norm": 2.1517994230241566, "language_loss": 0.8905524, "learning_rate": 3.9690910307267125e-06, "loss": 0.90981579, "num_input_tokens_seen": 29834785, "step": 1403, "time_per_iteration": 2.931666374206543 }, { "auxiliary_loss_clip": 0.01165276, "auxiliary_loss_mlp": 0.01061696, "balance_loss_clip": 1.05570936, "balance_loss_mlp": 1.03715038, "epoch": 0.08441304674582895, "flos": 22857285876480.0, "grad_norm": 1.790271378285476, "language_loss": 0.80321431, "learning_rate": 3.969022787401033e-06, "loss": 0.82548404, "num_input_tokens_seen": 29854695, "step": 1404, "time_per_iteration": 2.7397725582122803 }, { "auxiliary_loss_clip": 0.01181709, "auxiliary_loss_mlp": 0.01071408, "balance_loss_clip": 1.06211567, "balance_loss_mlp": 1.04649353, "epoch": 0.08447316999849692, "flos": 18697501676160.0, "grad_norm": 2.0849305916509193, "language_loss": 0.83557045, "learning_rate": 3.968954469409811e-06, "loss": 0.85810155, "num_input_tokens_seen": 29872180, "step": 1405, "time_per_iteration": 2.8052847385406494 }, { "auxiliary_loss_clip": 0.0118246, "auxiliary_loss_mlp": 0.01058347, "balance_loss_clip": 1.05636072, "balance_loss_mlp": 1.03588748, "epoch": 0.08453329325116489, "flos": 25483971738240.0, "grad_norm": 1.5225846020503528, "language_loss": 0.7991904, "learning_rate": 3.968886076755639e-06, "loss": 0.82159847, "num_input_tokens_seen": 29893205, "step": 1406, "time_per_iteration": 4.301243305206299 }, { "auxiliary_loss_clip": 0.0117117, "auxiliary_loss_mlp": 0.01068275, "balance_loss_clip": 1.05790758, "balance_loss_mlp": 1.04406369, "epoch": 0.08459341650383286, "flos": 20920048640640.0, "grad_norm": 1.717770739318623, "language_loss": 0.79441547, "learning_rate": 3.96881760944111e-06, "loss": 0.81680995, "num_input_tokens_seen": 29911970, "step": 1407, "time_per_iteration": 2.6535613536834717 }, { "auxiliary_loss_clip": 0.01186501, "auxiliary_loss_mlp": 0.01057881, "balance_loss_clip": 1.05982685, "balance_loss_mlp": 1.03409886, "epoch": 0.08465353975650082, "flos": 13043079624960.0, "grad_norm": 2.191354041218588, "language_loss": 0.91799384, "learning_rate": 3.968749067468819e-06, "loss": 0.94043779, "num_input_tokens_seen": 29929925, "step": 1408, "time_per_iteration": 5.774486064910889 }, { "auxiliary_loss_clip": 0.01058217, "auxiliary_loss_mlp": 0.01015213, "balance_loss_clip": 1.0231359, "balance_loss_mlp": 1.01139832, "epoch": 0.0847136630091688, "flos": 60877422552960.0, "grad_norm": 0.9559717259642487, "language_loss": 0.61891782, "learning_rate": 3.968680450841368e-06, "loss": 0.63965201, "num_input_tokens_seen": 29985950, "step": 1409, "time_per_iteration": 4.9455225467681885 }, { "auxiliary_loss_clip": 0.01188186, "auxiliary_loss_mlp": 0.01061718, "balance_loss_clip": 1.05840743, "balance_loss_mlp": 1.03878236, "epoch": 0.08477378626183676, "flos": 22046530043520.0, "grad_norm": 1.6980375913788566, "language_loss": 0.86357373, "learning_rate": 3.968611759561355e-06, "loss": 0.88607281, "num_input_tokens_seen": 30004330, "step": 1410, "time_per_iteration": 2.640355110168457 }, { "auxiliary_loss_clip": 0.01181512, "auxiliary_loss_mlp": 0.01053874, "balance_loss_clip": 1.0583061, "balance_loss_mlp": 1.02870846, "epoch": 0.08483390951450473, "flos": 16690059308160.0, "grad_norm": 2.248971712939306, "language_loss": 0.74384397, "learning_rate": 3.968542993631388e-06, "loss": 0.7661978, "num_input_tokens_seen": 30022555, "step": 1411, "time_per_iteration": 2.6200830936431885 }, { "auxiliary_loss_clip": 0.01077929, "auxiliary_loss_mlp": 0.01003535, "balance_loss_clip": 1.02317524, "balance_loss_mlp": 0.99991113, "epoch": 0.08489403276717271, "flos": 51584640082560.0, "grad_norm": 0.9014663966204861, "language_loss": 0.56748837, "learning_rate": 3.968474153054073e-06, "loss": 0.58830309, "num_input_tokens_seen": 30077220, "step": 1412, "time_per_iteration": 3.0746512413024902 }, { "auxiliary_loss_clip": 0.01156137, "auxiliary_loss_mlp": 0.01067795, "balance_loss_clip": 1.05325568, "balance_loss_mlp": 1.04265356, "epoch": 0.08495415601984067, "flos": 17092330698240.0, "grad_norm": 2.2757293876932945, "language_loss": 0.88754624, "learning_rate": 3.96840523783202e-06, "loss": 0.90978551, "num_input_tokens_seen": 30094600, "step": 1413, "time_per_iteration": 2.7309420108795166 }, { "auxiliary_loss_clip": 0.01164895, "auxiliary_loss_mlp": 0.01057479, "balance_loss_clip": 1.05780244, "balance_loss_mlp": 1.03295755, "epoch": 0.08501427927250864, "flos": 23148413608320.0, "grad_norm": 1.9781781646219805, "language_loss": 0.87963474, "learning_rate": 3.968336247967844e-06, "loss": 0.90185857, "num_input_tokens_seen": 30114475, "step": 1414, "time_per_iteration": 2.692030668258667 }, { "auxiliary_loss_clip": 0.01168145, "auxiliary_loss_mlp": 0.01063751, "balance_loss_clip": 1.05704033, "balance_loss_mlp": 1.04170966, "epoch": 0.08507440252517662, "flos": 19063467394560.0, "grad_norm": 1.9706021333256292, "language_loss": 0.77636635, "learning_rate": 3.96826718346416e-06, "loss": 0.79868531, "num_input_tokens_seen": 30133350, "step": 1415, "time_per_iteration": 2.8435540199279785 }, { "auxiliary_loss_clip": 0.01182108, "auxiliary_loss_mlp": 0.01059478, "balance_loss_clip": 1.0588963, "balance_loss_mlp": 1.03701878, "epoch": 0.08513452577784458, "flos": 60182296600320.0, "grad_norm": 1.7170282174092708, "language_loss": 0.70545506, "learning_rate": 3.968198044323587e-06, "loss": 0.72787094, "num_input_tokens_seen": 30159005, "step": 1416, "time_per_iteration": 3.021360158920288 }, { "auxiliary_loss_clip": 0.01174166, "auxiliary_loss_mlp": 0.01066487, "balance_loss_clip": 1.05930233, "balance_loss_mlp": 1.04131043, "epoch": 0.08519464903051255, "flos": 27308485117440.0, "grad_norm": 2.8159853289102053, "language_loss": 0.74938154, "learning_rate": 3.968128830548748e-06, "loss": 0.771788, "num_input_tokens_seen": 30179450, "step": 1417, "time_per_iteration": 2.738301992416382 }, { "auxiliary_loss_clip": 0.01171292, "auxiliary_loss_mlp": 0.01057092, "balance_loss_clip": 1.05715823, "balance_loss_mlp": 1.03313112, "epoch": 0.08525477228318051, "flos": 20266438809600.0, "grad_norm": 2.4132423968154635, "language_loss": 0.8258723, "learning_rate": 3.968059542142265e-06, "loss": 0.84815615, "num_input_tokens_seen": 30197235, "step": 1418, "time_per_iteration": 2.671574831008911 }, { "auxiliary_loss_clip": 0.0104499, "auxiliary_loss_mlp": 0.01004818, "balance_loss_clip": 1.02242994, "balance_loss_mlp": 1.0004549, "epoch": 0.08531489553584849, "flos": 67615017183360.0, "grad_norm": 0.8667411864001444, "language_loss": 0.56638753, "learning_rate": 3.9679901791067685e-06, "loss": 0.58688557, "num_input_tokens_seen": 30257410, "step": 1419, "time_per_iteration": 3.199730396270752 }, { "auxiliary_loss_clip": 0.01192231, "auxiliary_loss_mlp": 0.01067737, "balance_loss_clip": 1.05757999, "balance_loss_mlp": 1.04369283, "epoch": 0.08537501878851646, "flos": 27526965592320.0, "grad_norm": 2.2357492693560466, "language_loss": 0.70111859, "learning_rate": 3.967920741444886e-06, "loss": 0.72371829, "num_input_tokens_seen": 30277865, "step": 1420, "time_per_iteration": 2.7176027297973633 }, { "auxiliary_loss_clip": 0.01155207, "auxiliary_loss_mlp": 0.01050755, "balance_loss_clip": 1.05377483, "balance_loss_mlp": 1.02692556, "epoch": 0.08543514204118442, "flos": 22784243569920.0, "grad_norm": 1.5975069204011494, "language_loss": 0.88011539, "learning_rate": 3.967851229159252e-06, "loss": 0.90217495, "num_input_tokens_seen": 30298545, "step": 1421, "time_per_iteration": 2.7552106380462646 }, { "auxiliary_loss_clip": 0.01077473, "auxiliary_loss_mlp": 0.01013517, "balance_loss_clip": 1.02364218, "balance_loss_mlp": 1.01020324, "epoch": 0.0854952652938524, "flos": 60990721027200.0, "grad_norm": 0.9142209544576306, "language_loss": 0.63506877, "learning_rate": 3.967781642252502e-06, "loss": 0.65597868, "num_input_tokens_seen": 30361725, "step": 1422, "time_per_iteration": 3.134183168411255 }, { "auxiliary_loss_clip": 0.01152948, "auxiliary_loss_mlp": 0.01063847, "balance_loss_clip": 1.05932307, "balance_loss_mlp": 1.0406723, "epoch": 0.08555538854652037, "flos": 28038046256640.0, "grad_norm": 1.8757015124159093, "language_loss": 0.82691669, "learning_rate": 3.967711980727276e-06, "loss": 0.84908462, "num_input_tokens_seen": 30382180, "step": 1423, "time_per_iteration": 2.789393424987793 }, { "auxiliary_loss_clip": 0.01153439, "auxiliary_loss_mlp": 0.01064169, "balance_loss_clip": 1.0526228, "balance_loss_mlp": 1.04089928, "epoch": 0.08561551179918833, "flos": 23509279595520.0, "grad_norm": 1.6593534429066656, "language_loss": 0.75424892, "learning_rate": 3.967642244586213e-06, "loss": 0.776425, "num_input_tokens_seen": 30402980, "step": 1424, "time_per_iteration": 2.7805826663970947 }, { "auxiliary_loss_clip": 0.01139579, "auxiliary_loss_mlp": 0.01060342, "balance_loss_clip": 1.05769765, "balance_loss_mlp": 1.03751373, "epoch": 0.08567563505185631, "flos": 17926930183680.0, "grad_norm": 1.7999307606718091, "language_loss": 0.75948423, "learning_rate": 3.96757243383196e-06, "loss": 0.78148341, "num_input_tokens_seen": 30420800, "step": 1425, "time_per_iteration": 2.677889823913574 }, { "auxiliary_loss_clip": 0.0118966, "auxiliary_loss_mlp": 0.01055231, "balance_loss_clip": 1.05982256, "balance_loss_mlp": 1.03230715, "epoch": 0.08573575830452428, "flos": 19719519350400.0, "grad_norm": 2.1792756220437743, "language_loss": 0.93362999, "learning_rate": 3.9675025484671624e-06, "loss": 0.95607889, "num_input_tokens_seen": 30439620, "step": 1426, "time_per_iteration": 2.6270906925201416 }, { "auxiliary_loss_clip": 0.01145994, "auxiliary_loss_mlp": 0.01066219, "balance_loss_clip": 1.05707717, "balance_loss_mlp": 1.0406251, "epoch": 0.08579588155719224, "flos": 17931563038080.0, "grad_norm": 2.3679064075186553, "language_loss": 0.75424731, "learning_rate": 3.967432588494471e-06, "loss": 0.77636945, "num_input_tokens_seen": 30457300, "step": 1427, "time_per_iteration": 2.84614634513855 }, { "auxiliary_loss_clip": 0.01190697, "auxiliary_loss_mlp": 0.01052992, "balance_loss_clip": 1.06006169, "balance_loss_mlp": 1.0305804, "epoch": 0.08585600480986022, "flos": 16033324993920.0, "grad_norm": 3.503048788198607, "language_loss": 0.82108849, "learning_rate": 3.96736255391654e-06, "loss": 0.84352541, "num_input_tokens_seen": 30471580, "step": 1428, "time_per_iteration": 2.5882396697998047 }, { "auxiliary_loss_clip": 0.01173688, "auxiliary_loss_mlp": 0.0106298, "balance_loss_clip": 1.05633736, "balance_loss_mlp": 1.03832793, "epoch": 0.08591612806252819, "flos": 28657433404800.0, "grad_norm": 2.088481658755078, "language_loss": 0.79929984, "learning_rate": 3.967292444736023e-06, "loss": 0.82166648, "num_input_tokens_seen": 30492720, "step": 1429, "time_per_iteration": 2.720500946044922 }, { "auxiliary_loss_clip": 0.01169119, "auxiliary_loss_mlp": 0.010606, "balance_loss_clip": 1.05971265, "balance_loss_mlp": 1.0379504, "epoch": 0.08597625131519615, "flos": 20959119659520.0, "grad_norm": 1.9029222975672677, "language_loss": 0.87716508, "learning_rate": 3.967222260955578e-06, "loss": 0.89946228, "num_input_tokens_seen": 30509535, "step": 1430, "time_per_iteration": 2.6914596557617188 }, { "auxiliary_loss_clip": 0.01144304, "auxiliary_loss_mlp": 0.01074633, "balance_loss_clip": 1.05802035, "balance_loss_mlp": 1.05125606, "epoch": 0.08603637456786412, "flos": 23256360956160.0, "grad_norm": 1.6366623508781384, "language_loss": 0.81859726, "learning_rate": 3.96715200257787e-06, "loss": 0.84078664, "num_input_tokens_seen": 30529490, "step": 1431, "time_per_iteration": 2.834402322769165 }, { "auxiliary_loss_clip": 0.01148362, "auxiliary_loss_mlp": 0.01054323, "balance_loss_clip": 1.05620182, "balance_loss_mlp": 1.03132737, "epoch": 0.0860964978205321, "flos": 28694170039680.0, "grad_norm": 1.5497375505717568, "language_loss": 0.78109461, "learning_rate": 3.967081669605559e-06, "loss": 0.80312145, "num_input_tokens_seen": 30550205, "step": 1432, "time_per_iteration": 2.767860174179077 }, { "auxiliary_loss_clip": 0.01167351, "auxiliary_loss_mlp": 0.0106333, "balance_loss_clip": 1.0540905, "balance_loss_mlp": 1.03914225, "epoch": 0.08615662107320006, "flos": 19318397195520.0, "grad_norm": 1.9631692713893694, "language_loss": 0.73365706, "learning_rate": 3.967011262041315e-06, "loss": 0.75596392, "num_input_tokens_seen": 30568830, "step": 1433, "time_per_iteration": 2.6930699348449707 }, { "auxiliary_loss_clip": 0.01150098, "auxiliary_loss_mlp": 0.00781967, "balance_loss_clip": 1.05335927, "balance_loss_mlp": 1.00044179, "epoch": 0.08621674432586802, "flos": 15851688894720.0, "grad_norm": 2.468588778716135, "language_loss": 0.85340321, "learning_rate": 3.9669407798878065e-06, "loss": 0.87272388, "num_input_tokens_seen": 30585730, "step": 1434, "time_per_iteration": 2.735690116882324 }, { "auxiliary_loss_clip": 0.01170363, "auxiliary_loss_mlp": 0.01057659, "balance_loss_clip": 1.05604434, "balance_loss_mlp": 1.0344249, "epoch": 0.086276867578536, "flos": 14100648785280.0, "grad_norm": 2.160640509122794, "language_loss": 0.7870298, "learning_rate": 3.966870223147707e-06, "loss": 0.80931008, "num_input_tokens_seen": 30603180, "step": 1435, "time_per_iteration": 2.776567220687866 }, { "auxiliary_loss_clip": 0.01047768, "auxiliary_loss_mlp": 0.01015597, "balance_loss_clip": 1.023893, "balance_loss_mlp": 1.01206815, "epoch": 0.08633699083120397, "flos": 70184857772160.0, "grad_norm": 0.8900716332014227, "language_loss": 0.57975936, "learning_rate": 3.96679959182369e-06, "loss": 0.60039294, "num_input_tokens_seen": 30668895, "step": 1436, "time_per_iteration": 3.344207763671875 }, { "auxiliary_loss_clip": 0.0117372, "auxiliary_loss_mlp": 0.01056829, "balance_loss_clip": 1.05617976, "balance_loss_mlp": 1.03153312, "epoch": 0.08639711408387193, "flos": 30298874140800.0, "grad_norm": 2.240343996649645, "language_loss": 0.69169062, "learning_rate": 3.966728885918437e-06, "loss": 0.71399617, "num_input_tokens_seen": 30688955, "step": 1437, "time_per_iteration": 2.7171547412872314 }, { "auxiliary_loss_clip": 0.01121044, "auxiliary_loss_mlp": 0.01055264, "balance_loss_clip": 1.05334914, "balance_loss_mlp": 1.03223276, "epoch": 0.08645723733653991, "flos": 20297680663680.0, "grad_norm": 2.1340571114707245, "language_loss": 0.72624576, "learning_rate": 3.966658105434627e-06, "loss": 0.74800885, "num_input_tokens_seen": 30706095, "step": 1438, "time_per_iteration": 2.7815651893615723 }, { "auxiliary_loss_clip": 0.01179626, "auxiliary_loss_mlp": 0.01052578, "balance_loss_clip": 1.06052637, "balance_loss_mlp": 1.02872419, "epoch": 0.08651736058920788, "flos": 32890583134080.0, "grad_norm": 1.5339762166114281, "language_loss": 0.64377135, "learning_rate": 3.966587250374945e-06, "loss": 0.66609335, "num_input_tokens_seen": 30729025, "step": 1439, "time_per_iteration": 2.8935797214508057 }, { "auxiliary_loss_clip": 0.01153286, "auxiliary_loss_mlp": 0.01056452, "balance_loss_clip": 1.05530453, "balance_loss_mlp": 1.03213322, "epoch": 0.08657748384187584, "flos": 22637368857600.0, "grad_norm": 5.193932354158579, "language_loss": 0.87521696, "learning_rate": 3.966516320742077e-06, "loss": 0.89731431, "num_input_tokens_seen": 30746155, "step": 1440, "time_per_iteration": 2.731531858444214 }, { "auxiliary_loss_clip": 0.01155923, "auxiliary_loss_mlp": 0.00782787, "balance_loss_clip": 1.05752945, "balance_loss_mlp": 1.00043201, "epoch": 0.08663760709454381, "flos": 23658380951040.0, "grad_norm": 2.023462963415533, "language_loss": 0.83434939, "learning_rate": 3.9664453165387124e-06, "loss": 0.85373652, "num_input_tokens_seen": 30761410, "step": 1441, "time_per_iteration": 2.7126500606536865 }, { "auxiliary_loss_clip": 0.01074667, "auxiliary_loss_mlp": 0.01004602, "balance_loss_clip": 1.0222367, "balance_loss_mlp": 1.00100195, "epoch": 0.08669773034721179, "flos": 62686564911360.0, "grad_norm": 0.8541685426878655, "language_loss": 0.60479522, "learning_rate": 3.966374237767545e-06, "loss": 0.62558794, "num_input_tokens_seen": 30823010, "step": 1442, "time_per_iteration": 3.25555157661438 }, { "auxiliary_loss_clip": 0.0116729, "auxiliary_loss_mlp": 0.01054262, "balance_loss_clip": 1.05768681, "balance_loss_mlp": 1.03075421, "epoch": 0.08675785359987975, "flos": 20667489137280.0, "grad_norm": 2.8449103562639073, "language_loss": 0.79304373, "learning_rate": 3.96630308443127e-06, "loss": 0.81525922, "num_input_tokens_seen": 30841980, "step": 1443, "time_per_iteration": 2.7314631938934326 }, { "auxiliary_loss_clip": 0.01180858, "auxiliary_loss_mlp": 0.01051075, "balance_loss_clip": 1.05780149, "balance_loss_mlp": 1.02755547, "epoch": 0.08681797685254772, "flos": 26941118768640.0, "grad_norm": 1.6739262813835734, "language_loss": 0.82399666, "learning_rate": 3.966231856532584e-06, "loss": 0.84631598, "num_input_tokens_seen": 30863280, "step": 1444, "time_per_iteration": 2.7341418266296387 }, { "auxiliary_loss_clip": 0.01196759, "auxiliary_loss_mlp": 0.01051473, "balance_loss_clip": 1.06044626, "balance_loss_mlp": 1.02810788, "epoch": 0.0868781001052157, "flos": 17712831168000.0, "grad_norm": 2.3015461969915747, "language_loss": 0.87354827, "learning_rate": 3.966160554074189e-06, "loss": 0.8960306, "num_input_tokens_seen": 30881710, "step": 1445, "time_per_iteration": 4.25179386138916 }, { "auxiliary_loss_clip": 0.01180784, "auxiliary_loss_mlp": 0.01055896, "balance_loss_clip": 1.06094933, "balance_loss_mlp": 1.03446186, "epoch": 0.08693822335788366, "flos": 19896522595200.0, "grad_norm": 1.8066650797875201, "language_loss": 0.81863767, "learning_rate": 3.96608917705879e-06, "loss": 0.84100449, "num_input_tokens_seen": 30900225, "step": 1446, "time_per_iteration": 4.197181940078735 }, { "auxiliary_loss_clip": 0.01056056, "auxiliary_loss_mlp": 0.01004371, "balance_loss_clip": 1.01782191, "balance_loss_mlp": 1.00031781, "epoch": 0.08699834661055163, "flos": 67023747406080.0, "grad_norm": 0.7255245569613363, "language_loss": 0.54762936, "learning_rate": 3.966017725489091e-06, "loss": 0.56823361, "num_input_tokens_seen": 30959580, "step": 1447, "time_per_iteration": 3.2158126831054688 }, { "auxiliary_loss_clip": 0.0114861, "auxiliary_loss_mlp": 0.01056824, "balance_loss_clip": 1.05373001, "balance_loss_mlp": 1.03518772, "epoch": 0.0870584698632196, "flos": 13480507451520.0, "grad_norm": 2.1586118179593696, "language_loss": 0.84592307, "learning_rate": 3.965946199367804e-06, "loss": 0.86797738, "num_input_tokens_seen": 30976775, "step": 1448, "time_per_iteration": 4.262767314910889 }, { "auxiliary_loss_clip": 0.01194173, "auxiliary_loss_mlp": 0.01050219, "balance_loss_clip": 1.05891991, "balance_loss_mlp": 1.02768826, "epoch": 0.08711859311588757, "flos": 16107013745280.0, "grad_norm": 3.4326906921347096, "language_loss": 0.80644608, "learning_rate": 3.965874598697638e-06, "loss": 0.82888997, "num_input_tokens_seen": 30990495, "step": 1449, "time_per_iteration": 4.553676128387451 }, { "auxiliary_loss_clip": 0.01138548, "auxiliary_loss_mlp": 0.01052142, "balance_loss_clip": 1.05437374, "balance_loss_mlp": 1.02946854, "epoch": 0.08717871636855554, "flos": 38472357928320.0, "grad_norm": 1.5251600336566102, "language_loss": 0.70971417, "learning_rate": 3.965802923481313e-06, "loss": 0.73162109, "num_input_tokens_seen": 31014080, "step": 1450, "time_per_iteration": 2.9082705974578857 }, { "auxiliary_loss_clip": 0.01124466, "auxiliary_loss_mlp": 0.01054883, "balance_loss_clip": 1.05164719, "balance_loss_mlp": 1.03207827, "epoch": 0.0872388396212235, "flos": 17600574188160.0, "grad_norm": 1.9392114767205617, "language_loss": 0.83684897, "learning_rate": 3.965731173721542e-06, "loss": 0.85864246, "num_input_tokens_seen": 31031210, "step": 1451, "time_per_iteration": 2.809880495071411 }, { "auxiliary_loss_clip": 0.01134251, "auxiliary_loss_mlp": 0.00780873, "balance_loss_clip": 1.05147851, "balance_loss_mlp": 1.00039482, "epoch": 0.08729896287389148, "flos": 25259385951360.0, "grad_norm": 2.5160845512367773, "language_loss": 0.74654591, "learning_rate": 3.965659349421049e-06, "loss": 0.76569718, "num_input_tokens_seen": 31049710, "step": 1452, "time_per_iteration": 2.88580060005188 }, { "auxiliary_loss_clip": 0.01157134, "auxiliary_loss_mlp": 0.01063328, "balance_loss_clip": 1.05607891, "balance_loss_mlp": 1.0388428, "epoch": 0.08735908612655945, "flos": 15632454234240.0, "grad_norm": 4.56941406999875, "language_loss": 0.80543101, "learning_rate": 3.965587450582556e-06, "loss": 0.82763565, "num_input_tokens_seen": 31066160, "step": 1453, "time_per_iteration": 2.733632802963257 }, { "auxiliary_loss_clip": 0.01169707, "auxiliary_loss_mlp": 0.01059533, "balance_loss_clip": 1.05905569, "balance_loss_mlp": 1.03625154, "epoch": 0.08741920937922741, "flos": 20339660684160.0, "grad_norm": 2.0102093196988102, "language_loss": 0.71041977, "learning_rate": 3.96551547720879e-06, "loss": 0.73271215, "num_input_tokens_seen": 31085270, "step": 1454, "time_per_iteration": 2.7568745613098145 }, { "auxiliary_loss_clip": 0.0106426, "auxiliary_loss_mlp": 0.01008112, "balance_loss_clip": 1.0215131, "balance_loss_mlp": 1.00463128, "epoch": 0.08747933263189539, "flos": 62819795433600.0, "grad_norm": 0.7713706503543015, "language_loss": 0.5859946, "learning_rate": 3.96544342930248e-06, "loss": 0.6067183, "num_input_tokens_seen": 31148445, "step": 1455, "time_per_iteration": 3.2372186183929443 }, { "auxiliary_loss_clip": 0.01189404, "auxiliary_loss_mlp": 0.01060742, "balance_loss_clip": 1.05742884, "balance_loss_mlp": 1.03688788, "epoch": 0.08753945588456336, "flos": 33035877648000.0, "grad_norm": 1.6485208275358016, "language_loss": 0.77564865, "learning_rate": 3.965371306866359e-06, "loss": 0.79815018, "num_input_tokens_seen": 31168770, "step": 1456, "time_per_iteration": 2.790663003921509 }, { "auxiliary_loss_clip": 0.01127959, "auxiliary_loss_mlp": 0.01054526, "balance_loss_clip": 1.04962158, "balance_loss_mlp": 1.03071976, "epoch": 0.08759957913723132, "flos": 35547182046720.0, "grad_norm": 1.83889407784057, "language_loss": 0.72420907, "learning_rate": 3.96529910990316e-06, "loss": 0.74603397, "num_input_tokens_seen": 31189270, "step": 1457, "time_per_iteration": 2.9099740982055664 }, { "auxiliary_loss_clip": 0.01176549, "auxiliary_loss_mlp": 0.0104866, "balance_loss_clip": 1.05627227, "balance_loss_mlp": 1.02633214, "epoch": 0.0876597023898993, "flos": 23911120022400.0, "grad_norm": 1.5250401870177361, "language_loss": 0.86412215, "learning_rate": 3.965226838415622e-06, "loss": 0.88637424, "num_input_tokens_seen": 31210385, "step": 1458, "time_per_iteration": 2.7517166137695312 }, { "auxiliary_loss_clip": 0.01169535, "auxiliary_loss_mlp": 0.01061413, "balance_loss_clip": 1.05884266, "balance_loss_mlp": 1.03825045, "epoch": 0.08771982564256726, "flos": 18114025150080.0, "grad_norm": 1.7412813512419094, "language_loss": 0.80268395, "learning_rate": 3.965154492406486e-06, "loss": 0.82499349, "num_input_tokens_seen": 31229745, "step": 1459, "time_per_iteration": 2.71455717086792 }, { "auxiliary_loss_clip": 0.01130491, "auxiliary_loss_mlp": 0.01054334, "balance_loss_clip": 1.05256546, "balance_loss_mlp": 1.03018188, "epoch": 0.08777994889523523, "flos": 17712005155200.0, "grad_norm": 2.1450339680450714, "language_loss": 0.84538847, "learning_rate": 3.9650820718784945e-06, "loss": 0.86723673, "num_input_tokens_seen": 31248280, "step": 1460, "time_per_iteration": 2.8737733364105225 }, { "auxiliary_loss_clip": 0.01177787, "auxiliary_loss_mlp": 0.01057974, "balance_loss_clip": 1.0572983, "balance_loss_mlp": 1.03640938, "epoch": 0.0878400721479032, "flos": 12819930382080.0, "grad_norm": 4.917361835698274, "language_loss": 0.79993135, "learning_rate": 3.965009576834394e-06, "loss": 0.82228899, "num_input_tokens_seen": 31262190, "step": 1461, "time_per_iteration": 2.8436062335968018 }, { "auxiliary_loss_clip": 0.01169165, "auxiliary_loss_mlp": 0.01058947, "balance_loss_clip": 1.05800629, "balance_loss_mlp": 1.03704822, "epoch": 0.08790019540057117, "flos": 26392690938240.0, "grad_norm": 1.566202508611165, "language_loss": 0.76571167, "learning_rate": 3.964937007276932e-06, "loss": 0.78799284, "num_input_tokens_seen": 31283690, "step": 1462, "time_per_iteration": 2.7895474433898926 }, { "auxiliary_loss_clip": 0.0117563, "auxiliary_loss_mlp": 0.01060064, "balance_loss_clip": 1.05839491, "balance_loss_mlp": 1.03580475, "epoch": 0.08796031865323914, "flos": 19134031662720.0, "grad_norm": 2.89717114041641, "language_loss": 0.74710488, "learning_rate": 3.9648643632088634e-06, "loss": 0.76946187, "num_input_tokens_seen": 31302505, "step": 1463, "time_per_iteration": 2.760404348373413 }, { "auxiliary_loss_clip": 0.01191543, "auxiliary_loss_mlp": 0.01061609, "balance_loss_clip": 1.06145048, "balance_loss_mlp": 1.03680158, "epoch": 0.0880204419059071, "flos": 26064287867520.0, "grad_norm": 2.431514195311041, "language_loss": 0.83797103, "learning_rate": 3.964791644632941e-06, "loss": 0.8605026, "num_input_tokens_seen": 31323070, "step": 1464, "time_per_iteration": 2.7417759895324707 }, { "auxiliary_loss_clip": 0.011733, "auxiliary_loss_mlp": 0.01063475, "balance_loss_clip": 1.05683231, "balance_loss_mlp": 1.04093289, "epoch": 0.08808056515857508, "flos": 22377842115840.0, "grad_norm": 2.1775753375634963, "language_loss": 0.78104752, "learning_rate": 3.964718851551923e-06, "loss": 0.8034153, "num_input_tokens_seen": 31341880, "step": 1465, "time_per_iteration": 2.6852309703826904 }, { "auxiliary_loss_clip": 0.01199489, "auxiliary_loss_mlp": 0.01059873, "balance_loss_clip": 1.0619812, "balance_loss_mlp": 1.03791499, "epoch": 0.08814068841124305, "flos": 23185293897600.0, "grad_norm": 2.412657222564686, "language_loss": 0.85187089, "learning_rate": 3.9646459839685675e-06, "loss": 0.87446451, "num_input_tokens_seen": 31361995, "step": 1466, "time_per_iteration": 2.706264019012451 }, { "auxiliary_loss_clip": 0.01120627, "auxiliary_loss_mlp": 0.00782645, "balance_loss_clip": 1.04989958, "balance_loss_mlp": 1.00037241, "epoch": 0.08820081166391101, "flos": 25155281358720.0, "grad_norm": 1.9900601596102498, "language_loss": 0.84168816, "learning_rate": 3.964573041885641e-06, "loss": 0.86072087, "num_input_tokens_seen": 31381515, "step": 1467, "time_per_iteration": 2.8636934757232666 }, { "auxiliary_loss_clip": 0.01178935, "auxiliary_loss_mlp": 0.01055379, "balance_loss_clip": 1.05910301, "balance_loss_mlp": 1.03219247, "epoch": 0.08826093491657899, "flos": 22231685675520.0, "grad_norm": 1.660218686828999, "language_loss": 0.75506544, "learning_rate": 3.964500025305907e-06, "loss": 0.77740854, "num_input_tokens_seen": 31400345, "step": 1468, "time_per_iteration": 2.661501884460449 }, { "auxiliary_loss_clip": 0.01181261, "auxiliary_loss_mlp": 0.01054252, "balance_loss_clip": 1.0629456, "balance_loss_mlp": 1.03266358, "epoch": 0.08832105816924696, "flos": 22126826897280.0, "grad_norm": 4.868504388441724, "language_loss": 0.80322379, "learning_rate": 3.9644269342321355e-06, "loss": 0.82557893, "num_input_tokens_seen": 31419620, "step": 1469, "time_per_iteration": 2.7473137378692627 }, { "auxiliary_loss_clip": 0.01198542, "auxiliary_loss_mlp": 0.01059353, "balance_loss_clip": 1.0627017, "balance_loss_mlp": 1.03677487, "epoch": 0.08838118142191492, "flos": 17566495159680.0, "grad_norm": 2.0179242193855806, "language_loss": 0.77437651, "learning_rate": 3.9643537686670974e-06, "loss": 0.79695547, "num_input_tokens_seen": 31437970, "step": 1470, "time_per_iteration": 2.7672410011291504 }, { "auxiliary_loss_clip": 0.01193825, "auxiliary_loss_mlp": 0.01067102, "balance_loss_clip": 1.06180143, "balance_loss_mlp": 1.04281926, "epoch": 0.0884413046745829, "flos": 20777196251520.0, "grad_norm": 1.6812425162011504, "language_loss": 0.84297001, "learning_rate": 3.964280528613569e-06, "loss": 0.86557925, "num_input_tokens_seen": 31457040, "step": 1471, "time_per_iteration": 2.7584216594696045 }, { "auxiliary_loss_clip": 0.01156315, "auxiliary_loss_mlp": 0.01054307, "balance_loss_clip": 1.05682266, "balance_loss_mlp": 1.03342199, "epoch": 0.08850142792725087, "flos": 22125462180480.0, "grad_norm": 1.6938350729430058, "language_loss": 0.83321345, "learning_rate": 3.964207214074324e-06, "loss": 0.85531968, "num_input_tokens_seen": 31477520, "step": 1472, "time_per_iteration": 2.7895469665527344 }, { "auxiliary_loss_clip": 0.01176151, "auxiliary_loss_mlp": 0.01058616, "balance_loss_clip": 1.06106544, "balance_loss_mlp": 1.03529835, "epoch": 0.08856155117991883, "flos": 22418744728320.0, "grad_norm": 2.3638705809965, "language_loss": 0.82781172, "learning_rate": 3.964133825052146e-06, "loss": 0.85015941, "num_input_tokens_seen": 31495575, "step": 1473, "time_per_iteration": 2.7361483573913574 }, { "auxiliary_loss_clip": 0.01129906, "auxiliary_loss_mlp": 0.01064148, "balance_loss_clip": 1.05552769, "balance_loss_mlp": 1.04263091, "epoch": 0.0886216744325868, "flos": 29937002572800.0, "grad_norm": 1.6022277785896435, "language_loss": 0.78712153, "learning_rate": 3.964060361549816e-06, "loss": 0.80906206, "num_input_tokens_seen": 31520020, "step": 1474, "time_per_iteration": 2.894319534301758 }, { "auxiliary_loss_clip": 0.01146238, "auxiliary_loss_mlp": 0.01068131, "balance_loss_clip": 1.05575764, "balance_loss_mlp": 1.04175043, "epoch": 0.08868179768525478, "flos": 23982833525760.0, "grad_norm": 1.6120869011213488, "language_loss": 0.79030406, "learning_rate": 3.963986823570121e-06, "loss": 0.81244779, "num_input_tokens_seen": 31539265, "step": 1475, "time_per_iteration": 2.8806042671203613 }, { "auxiliary_loss_clip": 0.01191986, "auxiliary_loss_mlp": 0.01047451, "balance_loss_clip": 1.05980015, "balance_loss_mlp": 1.02478909, "epoch": 0.08874192093792274, "flos": 43177553216640.0, "grad_norm": 1.4679464237421194, "language_loss": 0.74202317, "learning_rate": 3.963913211115848e-06, "loss": 0.76441753, "num_input_tokens_seen": 31563425, "step": 1476, "time_per_iteration": 2.8381049633026123 }, { "auxiliary_loss_clip": 0.01174628, "auxiliary_loss_mlp": 0.01059934, "balance_loss_clip": 1.06217527, "balance_loss_mlp": 1.03678358, "epoch": 0.0888020441905907, "flos": 32852445868800.0, "grad_norm": 1.712954575149443, "language_loss": 0.74220836, "learning_rate": 3.9638395241897895e-06, "loss": 0.76455402, "num_input_tokens_seen": 31584525, "step": 1477, "time_per_iteration": 2.8452210426330566 }, { "auxiliary_loss_clip": 0.01191865, "auxiliary_loss_mlp": 0.01051229, "balance_loss_clip": 1.06062829, "balance_loss_mlp": 1.0278163, "epoch": 0.08886216744325869, "flos": 23149347361920.0, "grad_norm": 1.95844459768748, "language_loss": 0.87194049, "learning_rate": 3.963765762794739e-06, "loss": 0.89437139, "num_input_tokens_seen": 31603325, "step": 1478, "time_per_iteration": 2.644918203353882 }, { "auxiliary_loss_clip": 0.01176299, "auxiliary_loss_mlp": 0.01058069, "balance_loss_clip": 1.0572443, "balance_loss_mlp": 1.03546739, "epoch": 0.08892229069592665, "flos": 23331593992320.0, "grad_norm": 1.6306868156426517, "language_loss": 0.77571511, "learning_rate": 3.963691926933495e-06, "loss": 0.79805881, "num_input_tokens_seen": 31624820, "step": 1479, "time_per_iteration": 2.738168954849243 }, { "auxiliary_loss_clip": 0.01164179, "auxiliary_loss_mlp": 0.010526, "balance_loss_clip": 1.05629039, "balance_loss_mlp": 1.02801871, "epoch": 0.08898241394859462, "flos": 26213784272640.0, "grad_norm": 2.199164032289915, "language_loss": 0.77797234, "learning_rate": 3.9636180166088555e-06, "loss": 0.80014014, "num_input_tokens_seen": 31646080, "step": 1480, "time_per_iteration": 2.837562322616577 }, { "auxiliary_loss_clip": 0.01180168, "auxiliary_loss_mlp": 0.01060894, "balance_loss_clip": 1.05762577, "balance_loss_mlp": 1.03656292, "epoch": 0.0890425372012626, "flos": 23550613171200.0, "grad_norm": 2.9471668635954273, "language_loss": 0.66437578, "learning_rate": 3.963544031823624e-06, "loss": 0.68678641, "num_input_tokens_seen": 31665770, "step": 1481, "time_per_iteration": 2.742422580718994 }, { "auxiliary_loss_clip": 0.01143445, "auxiliary_loss_mlp": 0.01055318, "balance_loss_clip": 1.05510306, "balance_loss_mlp": 1.03273988, "epoch": 0.08910266045393056, "flos": 23002795872000.0, "grad_norm": 2.124586862599894, "language_loss": 0.96630967, "learning_rate": 3.9634699725806065e-06, "loss": 0.9882974, "num_input_tokens_seen": 31683805, "step": 1482, "time_per_iteration": 2.8150243759155273 }, { "auxiliary_loss_clip": 0.0115336, "auxiliary_loss_mlp": 0.01057266, "balance_loss_clip": 1.05521989, "balance_loss_mlp": 1.03353167, "epoch": 0.08916278370659853, "flos": 31936508035200.0, "grad_norm": 1.7904792435575492, "language_loss": 0.78683239, "learning_rate": 3.96339583888261e-06, "loss": 0.80893862, "num_input_tokens_seen": 31704630, "step": 1483, "time_per_iteration": 2.869084119796753 }, { "auxiliary_loss_clip": 0.0116904, "auxiliary_loss_mlp": 0.01082082, "balance_loss_clip": 1.05540919, "balance_loss_mlp": 1.05829978, "epoch": 0.08922290695926649, "flos": 17530404969600.0, "grad_norm": 2.2229749189835677, "language_loss": 0.85424453, "learning_rate": 3.963321630732448e-06, "loss": 0.87675571, "num_input_tokens_seen": 31723255, "step": 1484, "time_per_iteration": 4.280332326889038 }, { "auxiliary_loss_clip": 0.01199312, "auxiliary_loss_mlp": 0.01060639, "balance_loss_clip": 1.06350458, "balance_loss_mlp": 1.03701186, "epoch": 0.08928303021193447, "flos": 32125075459200.0, "grad_norm": 1.7208139316694195, "language_loss": 0.80205405, "learning_rate": 3.963247348132932e-06, "loss": 0.82465357, "num_input_tokens_seen": 31747045, "step": 1485, "time_per_iteration": 2.761733055114746 }, { "auxiliary_loss_clip": 0.01173167, "auxiliary_loss_mlp": 0.01056554, "balance_loss_clip": 1.0563333, "balance_loss_mlp": 1.03228331, "epoch": 0.08934315346460243, "flos": 22125210785280.0, "grad_norm": 1.8969438127775513, "language_loss": 0.82859123, "learning_rate": 3.96317299108688e-06, "loss": 0.85088843, "num_input_tokens_seen": 31766615, "step": 1486, "time_per_iteration": 4.144649028778076 }, { "auxiliary_loss_clip": 0.01144509, "auxiliary_loss_mlp": 0.01063805, "balance_loss_clip": 1.05592823, "balance_loss_mlp": 1.04021382, "epoch": 0.0894032767172704, "flos": 22565583527040.0, "grad_norm": 2.1520807598980185, "language_loss": 0.76365155, "learning_rate": 3.963098559597111e-06, "loss": 0.78573477, "num_input_tokens_seen": 31785855, "step": 1487, "time_per_iteration": 4.432489395141602 }, { "auxiliary_loss_clip": 0.01157327, "auxiliary_loss_mlp": 0.01060261, "balance_loss_clip": 1.05041027, "balance_loss_mlp": 1.03542995, "epoch": 0.08946339996993838, "flos": 20193396503040.0, "grad_norm": 3.851280697857004, "language_loss": 0.83030224, "learning_rate": 3.963024053666449e-06, "loss": 0.85247803, "num_input_tokens_seen": 31804210, "step": 1488, "time_per_iteration": 2.7262001037597656 }, { "auxiliary_loss_clip": 0.01171869, "auxiliary_loss_mlp": 0.01051875, "balance_loss_clip": 1.05546355, "balance_loss_mlp": 1.02916527, "epoch": 0.08952352322260634, "flos": 48360181104000.0, "grad_norm": 1.7759111472560039, "language_loss": 0.71783459, "learning_rate": 3.962949473297718e-06, "loss": 0.74007201, "num_input_tokens_seen": 31826150, "step": 1489, "time_per_iteration": 4.562536954879761 }, { "auxiliary_loss_clip": 0.01150585, "auxiliary_loss_mlp": 0.01051382, "balance_loss_clip": 1.05190349, "balance_loss_mlp": 1.02830291, "epoch": 0.08958364647527431, "flos": 31793081028480.0, "grad_norm": 1.6999724957706692, "language_loss": 0.89717221, "learning_rate": 3.962874818493745e-06, "loss": 0.91919196, "num_input_tokens_seen": 31848060, "step": 1490, "time_per_iteration": 2.838327646255493 }, { "auxiliary_loss_clip": 0.01184278, "auxiliary_loss_mlp": 0.01064168, "balance_loss_clip": 1.05656135, "balance_loss_mlp": 1.04102957, "epoch": 0.08964376972794229, "flos": 23368186972800.0, "grad_norm": 3.9062133325383126, "language_loss": 0.73075998, "learning_rate": 3.9628000892573635e-06, "loss": 0.7532444, "num_input_tokens_seen": 31870040, "step": 1491, "time_per_iteration": 2.7007367610931396 }, { "auxiliary_loss_clip": 0.01189564, "auxiliary_loss_mlp": 0.00780167, "balance_loss_clip": 1.05968356, "balance_loss_mlp": 1.00023544, "epoch": 0.08970389298061025, "flos": 23294785530240.0, "grad_norm": 1.7021050418948058, "language_loss": 0.77235049, "learning_rate": 3.9627252855914055e-06, "loss": 0.79204774, "num_input_tokens_seen": 31890400, "step": 1492, "time_per_iteration": 2.7799623012542725 }, { "auxiliary_loss_clip": 0.01187114, "auxiliary_loss_mlp": 0.01057952, "balance_loss_clip": 1.05902028, "balance_loss_mlp": 1.03512359, "epoch": 0.08976401623327822, "flos": 33761703772800.0, "grad_norm": 1.9236790530591625, "language_loss": 0.71429193, "learning_rate": 3.962650407498707e-06, "loss": 0.73674262, "num_input_tokens_seen": 31913435, "step": 1493, "time_per_iteration": 2.8479840755462646 }, { "auxiliary_loss_clip": 0.01188796, "auxiliary_loss_mlp": 0.01057103, "balance_loss_clip": 1.05757976, "balance_loss_mlp": 1.03371406, "epoch": 0.08982413948594618, "flos": 23911335504000.0, "grad_norm": 2.6977604073852053, "language_loss": 0.87175488, "learning_rate": 3.962575454982109e-06, "loss": 0.8942138, "num_input_tokens_seen": 31932435, "step": 1494, "time_per_iteration": 2.855658769607544 }, { "auxiliary_loss_clip": 0.0108466, "auxiliary_loss_mlp": 0.01070478, "balance_loss_clip": 1.04641223, "balance_loss_mlp": 1.04551601, "epoch": 0.08988426273861416, "flos": 16837544551680.0, "grad_norm": 1.6162523894431247, "language_loss": 0.82929438, "learning_rate": 3.962500428044454e-06, "loss": 0.85084569, "num_input_tokens_seen": 31950125, "step": 1495, "time_per_iteration": 2.9265449047088623 }, { "auxiliary_loss_clip": 0.01171464, "auxiliary_loss_mlp": 0.01059756, "balance_loss_clip": 1.05779243, "balance_loss_mlp": 1.03682017, "epoch": 0.08994438599128213, "flos": 14793365548800.0, "grad_norm": 9.387255385257733, "language_loss": 0.70191383, "learning_rate": 3.962425326688585e-06, "loss": 0.72422606, "num_input_tokens_seen": 31968050, "step": 1496, "time_per_iteration": 2.773693799972534 }, { "auxiliary_loss_clip": 0.01164171, "auxiliary_loss_mlp": 0.01049454, "balance_loss_clip": 1.05397439, "balance_loss_mlp": 1.02888989, "epoch": 0.09000450924395009, "flos": 17384320356480.0, "grad_norm": 1.6327835891742186, "language_loss": 0.79752576, "learning_rate": 3.962350150917351e-06, "loss": 0.81966203, "num_input_tokens_seen": 31985675, "step": 1497, "time_per_iteration": 2.6850852966308594 }, { "auxiliary_loss_clip": 0.01129609, "auxiliary_loss_mlp": 0.01054903, "balance_loss_clip": 1.05307686, "balance_loss_mlp": 1.03146648, "epoch": 0.09006463249661807, "flos": 24280317964800.0, "grad_norm": 8.517000212139891, "language_loss": 0.82940567, "learning_rate": 3.9622749007336035e-06, "loss": 0.85125089, "num_input_tokens_seen": 32005180, "step": 1498, "time_per_iteration": 2.786205768585205 }, { "auxiliary_loss_clip": 0.01170006, "auxiliary_loss_mlp": 0.01059397, "balance_loss_clip": 1.0577898, "balance_loss_mlp": 1.03718853, "epoch": 0.09012475574928604, "flos": 13661928069120.0, "grad_norm": 2.220597323082783, "language_loss": 0.78609937, "learning_rate": 3.962199576140195e-06, "loss": 0.80839342, "num_input_tokens_seen": 32022970, "step": 1499, "time_per_iteration": 2.71785831451416 }, { "auxiliary_loss_clip": 0.01161539, "auxiliary_loss_mlp": 0.00780528, "balance_loss_clip": 1.05444527, "balance_loss_mlp": 1.00024021, "epoch": 0.090184879001954, "flos": 23327751237120.0, "grad_norm": 2.049001350461653, "language_loss": 0.93337607, "learning_rate": 3.962124177139981e-06, "loss": 0.95279682, "num_input_tokens_seen": 32043055, "step": 1500, "time_per_iteration": 2.7077536582946777 }, { "auxiliary_loss_clip": 0.01148009, "auxiliary_loss_mlp": 0.01055246, "balance_loss_clip": 1.05371249, "balance_loss_mlp": 1.0308435, "epoch": 0.09024500225462198, "flos": 23002688131200.0, "grad_norm": 3.0778515668575492, "language_loss": 0.74595469, "learning_rate": 3.962048703735822e-06, "loss": 0.76798725, "num_input_tokens_seen": 32061900, "step": 1501, "time_per_iteration": 2.7073416709899902 }, { "auxiliary_loss_clip": 0.01056535, "auxiliary_loss_mlp": 0.01013118, "balance_loss_clip": 1.03392363, "balance_loss_mlp": 1.00963676, "epoch": 0.09030512550728995, "flos": 62189203242240.0, "grad_norm": 0.7274487593473578, "language_loss": 0.58316052, "learning_rate": 3.96197315593058e-06, "loss": 0.60385704, "num_input_tokens_seen": 32122745, "step": 1502, "time_per_iteration": 3.274049997329712 }, { "auxiliary_loss_clip": 0.0114469, "auxiliary_loss_mlp": 0.01062533, "balance_loss_clip": 1.04626393, "balance_loss_mlp": 1.03896546, "epoch": 0.09036524875995791, "flos": 38800689171840.0, "grad_norm": 2.1727281711500095, "language_loss": 0.69501173, "learning_rate": 3.961897533727119e-06, "loss": 0.71708393, "num_input_tokens_seen": 32145125, "step": 1503, "time_per_iteration": 2.87554669380188 }, { "auxiliary_loss_clip": 0.01133108, "auxiliary_loss_mlp": 0.0105903, "balance_loss_clip": 1.04783726, "balance_loss_mlp": 1.03660655, "epoch": 0.09042537201262588, "flos": 21690081429120.0, "grad_norm": 2.169205134580129, "language_loss": 0.86124271, "learning_rate": 3.961821837128306e-06, "loss": 0.88316405, "num_input_tokens_seen": 32166255, "step": 1504, "time_per_iteration": 2.844688892364502 }, { "auxiliary_loss_clip": 0.01146301, "auxiliary_loss_mlp": 0.01069714, "balance_loss_clip": 1.05341232, "balance_loss_mlp": 1.04261804, "epoch": 0.09048549526529386, "flos": 22267021680000.0, "grad_norm": 2.178155372989796, "language_loss": 0.7233696, "learning_rate": 3.961746066137014e-06, "loss": 0.74552977, "num_input_tokens_seen": 32184010, "step": 1505, "time_per_iteration": 2.7992677688598633 }, { "auxiliary_loss_clip": 0.01137399, "auxiliary_loss_mlp": 0.01056414, "balance_loss_clip": 1.05097985, "balance_loss_mlp": 1.03302479, "epoch": 0.09054561851796182, "flos": 14610939350400.0, "grad_norm": 2.5107188210784526, "language_loss": 0.80730999, "learning_rate": 3.961670220756114e-06, "loss": 0.82924813, "num_input_tokens_seen": 32201635, "step": 1506, "time_per_iteration": 2.7458760738372803 }, { "auxiliary_loss_clip": 0.01140643, "auxiliary_loss_mlp": 0.01053315, "balance_loss_clip": 1.05161858, "balance_loss_mlp": 1.03197718, "epoch": 0.09060574177062979, "flos": 27636169916160.0, "grad_norm": 2.166956120197676, "language_loss": 0.75915337, "learning_rate": 3.961594300988482e-06, "loss": 0.78109294, "num_input_tokens_seen": 32221940, "step": 1507, "time_per_iteration": 2.873826742172241 }, { "auxiliary_loss_clip": 0.01051873, "auxiliary_loss_mlp": 0.01005715, "balance_loss_clip": 1.02043629, "balance_loss_mlp": 1.00175714, "epoch": 0.09066586502329776, "flos": 66085797513600.0, "grad_norm": 0.7272435825555993, "language_loss": 0.57699698, "learning_rate": 3.961518306836998e-06, "loss": 0.59757286, "num_input_tokens_seen": 32276495, "step": 1508, "time_per_iteration": 3.064926862716675 }, { "auxiliary_loss_clip": 0.01165416, "auxiliary_loss_mlp": 0.01054804, "balance_loss_clip": 1.055233, "balance_loss_mlp": 1.03155804, "epoch": 0.09072598827596573, "flos": 18916449027840.0, "grad_norm": 1.7601330807914457, "language_loss": 0.85090744, "learning_rate": 3.961442238304543e-06, "loss": 0.87310958, "num_input_tokens_seen": 32294130, "step": 1509, "time_per_iteration": 2.6664113998413086 }, { "auxiliary_loss_clip": 0.01168837, "auxiliary_loss_mlp": 0.01064138, "balance_loss_clip": 1.05745769, "balance_loss_mlp": 1.03949761, "epoch": 0.0907861115286337, "flos": 24821742643200.0, "grad_norm": 2.3794507710009203, "language_loss": 0.84110659, "learning_rate": 3.961366095394002e-06, "loss": 0.8634364, "num_input_tokens_seen": 32313555, "step": 1510, "time_per_iteration": 2.783484697341919 }, { "auxiliary_loss_clip": 0.01153141, "auxiliary_loss_mlp": 0.01058569, "balance_loss_clip": 1.05423617, "balance_loss_mlp": 1.03482211, "epoch": 0.09084623478130167, "flos": 21652842003840.0, "grad_norm": 1.8490761573484715, "language_loss": 0.85247588, "learning_rate": 3.961289878108262e-06, "loss": 0.87459302, "num_input_tokens_seen": 32331430, "step": 1511, "time_per_iteration": 2.714620351791382 }, { "auxiliary_loss_clip": 0.01145395, "auxiliary_loss_mlp": 0.01052919, "balance_loss_clip": 1.05182219, "balance_loss_mlp": 1.02983987, "epoch": 0.09090635803396964, "flos": 27639258485760.0, "grad_norm": 1.5734326837562458, "language_loss": 0.84977764, "learning_rate": 3.9612135864502135e-06, "loss": 0.87176073, "num_input_tokens_seen": 32353705, "step": 1512, "time_per_iteration": 2.75361704826355 }, { "auxiliary_loss_clip": 0.01155239, "auxiliary_loss_mlp": 0.01053669, "balance_loss_clip": 1.05740952, "balance_loss_mlp": 1.03185391, "epoch": 0.0909664812866376, "flos": 17669127294720.0, "grad_norm": 3.0235926431973654, "language_loss": 0.87346804, "learning_rate": 3.961137220422749e-06, "loss": 0.89555705, "num_input_tokens_seen": 32370520, "step": 1513, "time_per_iteration": 2.6864211559295654 }, { "auxiliary_loss_clip": 0.01168585, "auxiliary_loss_mlp": 0.01049408, "balance_loss_clip": 1.05562937, "balance_loss_mlp": 1.02841544, "epoch": 0.09102660453930557, "flos": 23951448017280.0, "grad_norm": 1.7883280971870592, "language_loss": 0.86802679, "learning_rate": 3.961060780028764e-06, "loss": 0.89020675, "num_input_tokens_seen": 32389105, "step": 1514, "time_per_iteration": 2.6788065433502197 }, { "auxiliary_loss_clip": 0.01134005, "auxiliary_loss_mlp": 0.01064386, "balance_loss_clip": 1.05571628, "balance_loss_mlp": 1.04252315, "epoch": 0.09108672779197355, "flos": 25812949426560.0, "grad_norm": 1.7666120550996132, "language_loss": 0.89944756, "learning_rate": 3.960984265271159e-06, "loss": 0.92143154, "num_input_tokens_seen": 32408065, "step": 1515, "time_per_iteration": 2.757390022277832 }, { "auxiliary_loss_clip": 0.01162518, "auxiliary_loss_mlp": 0.01056937, "balance_loss_clip": 1.05547726, "balance_loss_mlp": 1.03360808, "epoch": 0.09114685104464151, "flos": 29639482220160.0, "grad_norm": 2.1090985009837646, "language_loss": 0.85576892, "learning_rate": 3.9609076761528335e-06, "loss": 0.87796342, "num_input_tokens_seen": 32427225, "step": 1516, "time_per_iteration": 2.704784870147705 }, { "auxiliary_loss_clip": 0.01158781, "auxiliary_loss_mlp": 0.01057165, "balance_loss_clip": 1.05135357, "balance_loss_mlp": 1.03451526, "epoch": 0.09120697429730948, "flos": 33729635905920.0, "grad_norm": 2.086405156201108, "language_loss": 0.81167233, "learning_rate": 3.960831012676692e-06, "loss": 0.83383185, "num_input_tokens_seen": 32450510, "step": 1517, "time_per_iteration": 2.8586854934692383 }, { "auxiliary_loss_clip": 0.0117857, "auxiliary_loss_mlp": 0.01065492, "balance_loss_clip": 1.05741739, "balance_loss_mlp": 1.04280686, "epoch": 0.09126709754997746, "flos": 18401381953920.0, "grad_norm": 2.104468567304263, "language_loss": 0.78067243, "learning_rate": 3.960754274845642e-06, "loss": 0.80311304, "num_input_tokens_seen": 32468425, "step": 1518, "time_per_iteration": 2.7862088680267334 }, { "auxiliary_loss_clip": 0.01165395, "auxiliary_loss_mlp": 0.01061371, "balance_loss_clip": 1.05285823, "balance_loss_mlp": 1.03900695, "epoch": 0.09132722080264542, "flos": 22091957769600.0, "grad_norm": 1.6816479812467473, "language_loss": 0.86124098, "learning_rate": 3.960677462662594e-06, "loss": 0.88350856, "num_input_tokens_seen": 32487510, "step": 1519, "time_per_iteration": 2.723714828491211 }, { "auxiliary_loss_clip": 0.01163599, "auxiliary_loss_mlp": 0.01052792, "balance_loss_clip": 1.05454183, "balance_loss_mlp": 1.02914131, "epoch": 0.09138734405531339, "flos": 21033131633280.0, "grad_norm": 1.9681293960876167, "language_loss": 0.73279071, "learning_rate": 3.96060057613046e-06, "loss": 0.75495458, "num_input_tokens_seen": 32507250, "step": 1520, "time_per_iteration": 2.8098628520965576 }, { "auxiliary_loss_clip": 0.01161166, "auxiliary_loss_mlp": 0.01058035, "balance_loss_clip": 1.05696058, "balance_loss_mlp": 1.03469419, "epoch": 0.09144746730798137, "flos": 20083940784000.0, "grad_norm": 2.6988457876937066, "language_loss": 0.85236609, "learning_rate": 3.960523615252156e-06, "loss": 0.87455815, "num_input_tokens_seen": 32526045, "step": 1521, "time_per_iteration": 2.7134172916412354 }, { "auxiliary_loss_clip": 0.01120174, "auxiliary_loss_mlp": 0.01063979, "balance_loss_clip": 1.05189717, "balance_loss_mlp": 1.03991079, "epoch": 0.09150759056064933, "flos": 22778210085120.0, "grad_norm": 1.6991603177293335, "language_loss": 0.83933008, "learning_rate": 3.960446580030599e-06, "loss": 0.8611716, "num_input_tokens_seen": 32546575, "step": 1522, "time_per_iteration": 2.93745493888855 }, { "auxiliary_loss_clip": 0.01182362, "auxiliary_loss_mlp": 0.01064589, "balance_loss_clip": 1.05630755, "balance_loss_mlp": 1.04153395, "epoch": 0.0915677138133173, "flos": 27564205017600.0, "grad_norm": 1.647915064434875, "language_loss": 0.81012994, "learning_rate": 3.960369470468711e-06, "loss": 0.8325994, "num_input_tokens_seen": 32568795, "step": 1523, "time_per_iteration": 4.378152847290039 }, { "auxiliary_loss_clip": 0.01157976, "auxiliary_loss_mlp": 0.00781395, "balance_loss_clip": 1.05422449, "balance_loss_mlp": 1.00037968, "epoch": 0.09162783706598528, "flos": 17674765729920.0, "grad_norm": 2.106497620262502, "language_loss": 0.7460072, "learning_rate": 3.960292286569418e-06, "loss": 0.76540089, "num_input_tokens_seen": 32587010, "step": 1524, "time_per_iteration": 2.7146124839782715 }, { "auxiliary_loss_clip": 0.01135228, "auxiliary_loss_mlp": 0.0106119, "balance_loss_clip": 1.05092478, "balance_loss_mlp": 1.03782487, "epoch": 0.09168796031865324, "flos": 18478195188480.0, "grad_norm": 2.0992608845945413, "language_loss": 0.86498803, "learning_rate": 3.960215028335644e-06, "loss": 0.88695222, "num_input_tokens_seen": 32602375, "step": 1525, "time_per_iteration": 4.314826965332031 }, { "auxiliary_loss_clip": 0.01164396, "auxiliary_loss_mlp": 0.01049506, "balance_loss_clip": 1.05688822, "balance_loss_mlp": 1.0263319, "epoch": 0.0917480835713212, "flos": 29387605075200.0, "grad_norm": 2.1146348399758237, "language_loss": 0.74512708, "learning_rate": 3.96013769577032e-06, "loss": 0.76726609, "num_input_tokens_seen": 32621460, "step": 1526, "time_per_iteration": 5.878855466842651 }, { "auxiliary_loss_clip": 0.01186002, "auxiliary_loss_mlp": 0.01055817, "balance_loss_clip": 1.05732703, "balance_loss_mlp": 1.03392982, "epoch": 0.09180820682398917, "flos": 19829262378240.0, "grad_norm": 2.5135282962071215, "language_loss": 0.77581728, "learning_rate": 3.960060288876378e-06, "loss": 0.79823542, "num_input_tokens_seen": 32640440, "step": 1527, "time_per_iteration": 2.693847179412842 }, { "auxiliary_loss_clip": 0.01173605, "auxiliary_loss_mlp": 0.01052264, "balance_loss_clip": 1.0534333, "balance_loss_mlp": 1.02868414, "epoch": 0.09186833007665715, "flos": 23841848643840.0, "grad_norm": 2.655631139677705, "language_loss": 0.78546697, "learning_rate": 3.959982807656753e-06, "loss": 0.80772561, "num_input_tokens_seen": 32660020, "step": 1528, "time_per_iteration": 2.774219512939453 }, { "auxiliary_loss_clip": 0.01146017, "auxiliary_loss_mlp": 0.01050376, "balance_loss_clip": 1.0499053, "balance_loss_mlp": 1.02827477, "epoch": 0.09192845332932512, "flos": 12932726065920.0, "grad_norm": 2.682547324044482, "language_loss": 0.76732361, "learning_rate": 3.959905252114384e-06, "loss": 0.78928751, "num_input_tokens_seen": 32678170, "step": 1529, "time_per_iteration": 4.603156089782715 }, { "auxiliary_loss_clip": 0.01186538, "auxiliary_loss_mlp": 0.00780856, "balance_loss_clip": 1.05415928, "balance_loss_mlp": 1.00045025, "epoch": 0.09198857658199308, "flos": 24568177559040.0, "grad_norm": 1.7410660090049153, "language_loss": 0.82906747, "learning_rate": 3.959827622252211e-06, "loss": 0.84874141, "num_input_tokens_seen": 32697540, "step": 1530, "time_per_iteration": 2.7118582725524902 }, { "auxiliary_loss_clip": 0.01130108, "auxiliary_loss_mlp": 0.0106509, "balance_loss_clip": 1.04975331, "balance_loss_mlp": 1.04220152, "epoch": 0.09204869983466106, "flos": 20266941600000.0, "grad_norm": 2.182960664479704, "language_loss": 0.84001881, "learning_rate": 3.959749918073179e-06, "loss": 0.86197078, "num_input_tokens_seen": 32716805, "step": 1531, "time_per_iteration": 2.791947603225708 }, { "auxiliary_loss_clip": 0.0113655, "auxiliary_loss_mlp": 0.01051554, "balance_loss_clip": 1.04906452, "balance_loss_mlp": 1.02853465, "epoch": 0.09210882308732903, "flos": 20885646389760.0, "grad_norm": 1.7570281394880602, "language_loss": 0.81253195, "learning_rate": 3.959672139580233e-06, "loss": 0.83441293, "num_input_tokens_seen": 32736385, "step": 1532, "time_per_iteration": 2.737739324569702 }, { "auxiliary_loss_clip": 0.01157728, "auxiliary_loss_mlp": 0.01056753, "balance_loss_clip": 1.052163, "balance_loss_mlp": 1.03385305, "epoch": 0.09216894633999699, "flos": 30956326727040.0, "grad_norm": 2.2821036564882182, "language_loss": 0.84194255, "learning_rate": 3.9595942867763235e-06, "loss": 0.86408734, "num_input_tokens_seen": 32757140, "step": 1533, "time_per_iteration": 2.7542598247528076 }, { "auxiliary_loss_clip": 0.01149262, "auxiliary_loss_mlp": 0.01053623, "balance_loss_clip": 1.05813503, "balance_loss_mlp": 1.03190327, "epoch": 0.09222906959266497, "flos": 13151565676800.0, "grad_norm": 1.9396914937933663, "language_loss": 0.9009546, "learning_rate": 3.959516359664402e-06, "loss": 0.92298347, "num_input_tokens_seen": 32774860, "step": 1534, "time_per_iteration": 2.6450984477996826 }, { "auxiliary_loss_clip": 0.01150273, "auxiliary_loss_mlp": 0.0106298, "balance_loss_clip": 1.0495038, "balance_loss_mlp": 1.03849435, "epoch": 0.09228919284533293, "flos": 25994477784960.0, "grad_norm": 5.065477266086046, "language_loss": 0.75779241, "learning_rate": 3.959438358247424e-06, "loss": 0.77992499, "num_input_tokens_seen": 32795250, "step": 1535, "time_per_iteration": 2.730915069580078 }, { "auxiliary_loss_clip": 0.01168283, "auxiliary_loss_mlp": 0.01045276, "balance_loss_clip": 1.05278873, "balance_loss_mlp": 1.02403271, "epoch": 0.0923493160980009, "flos": 18660800954880.0, "grad_norm": 1.8085584532497372, "language_loss": 0.81631637, "learning_rate": 3.959360282528346e-06, "loss": 0.83845198, "num_input_tokens_seen": 32813805, "step": 1536, "time_per_iteration": 2.7326817512512207 }, { "auxiliary_loss_clip": 0.01181977, "auxiliary_loss_mlp": 0.01053699, "balance_loss_clip": 1.05431938, "balance_loss_mlp": 1.03224182, "epoch": 0.09240943935066886, "flos": 21140576190720.0, "grad_norm": 2.0929096884707556, "language_loss": 0.89092755, "learning_rate": 3.959282132510131e-06, "loss": 0.9132843, "num_input_tokens_seen": 32830960, "step": 1537, "time_per_iteration": 2.675771713256836 }, { "auxiliary_loss_clip": 0.01157238, "auxiliary_loss_mlp": 0.01058647, "balance_loss_clip": 1.05114293, "balance_loss_mlp": 1.03605688, "epoch": 0.09246956260333684, "flos": 20592435669120.0, "grad_norm": 1.9480116987165197, "language_loss": 0.80702311, "learning_rate": 3.959203908195741e-06, "loss": 0.82918191, "num_input_tokens_seen": 32848275, "step": 1538, "time_per_iteration": 2.71618390083313 }, { "auxiliary_loss_clip": 0.01060495, "auxiliary_loss_mlp": 0.0101237, "balance_loss_clip": 1.03095436, "balance_loss_mlp": 1.00872231, "epoch": 0.09252968585600481, "flos": 67558710614400.0, "grad_norm": 0.7534074452314953, "language_loss": 0.57429332, "learning_rate": 3.959125609588142e-06, "loss": 0.59502202, "num_input_tokens_seen": 32917730, "step": 1539, "time_per_iteration": 3.3933441638946533 }, { "auxiliary_loss_clip": 0.01159831, "auxiliary_loss_mlp": 0.01050602, "balance_loss_clip": 1.05638027, "balance_loss_mlp": 1.02863121, "epoch": 0.09258980910867277, "flos": 17383853479680.0, "grad_norm": 2.849299216868502, "language_loss": 0.67554641, "learning_rate": 3.959047236690304e-06, "loss": 0.69765073, "num_input_tokens_seen": 32934910, "step": 1540, "time_per_iteration": 2.757084608078003 }, { "auxiliary_loss_clip": 0.01144239, "auxiliary_loss_mlp": 0.01048444, "balance_loss_clip": 1.04954028, "balance_loss_mlp": 1.026438, "epoch": 0.09264993236134075, "flos": 19865927185920.0, "grad_norm": 2.044335478602743, "language_loss": 0.83917534, "learning_rate": 3.958968789505198e-06, "loss": 0.86110216, "num_input_tokens_seen": 32953840, "step": 1541, "time_per_iteration": 2.8497180938720703 }, { "auxiliary_loss_clip": 0.01077839, "auxiliary_loss_mlp": 0.01013078, "balance_loss_clip": 1.02602255, "balance_loss_mlp": 1.0097636, "epoch": 0.09271005561400872, "flos": 62284401262080.0, "grad_norm": 0.8790732834061692, "language_loss": 0.61881655, "learning_rate": 3.9588902680358e-06, "loss": 0.63972563, "num_input_tokens_seen": 33011410, "step": 1542, "time_per_iteration": 3.3079330921173096 }, { "auxiliary_loss_clip": 0.01161232, "auxiliary_loss_mlp": 0.01059438, "balance_loss_clip": 1.05441117, "balance_loss_mlp": 1.03808808, "epoch": 0.09277017886667668, "flos": 23329870139520.0, "grad_norm": 1.6256118826429122, "language_loss": 0.82802349, "learning_rate": 3.958811672285086e-06, "loss": 0.85023022, "num_input_tokens_seen": 33031675, "step": 1543, "time_per_iteration": 2.7408807277679443 }, { "auxiliary_loss_clip": 0.01135873, "auxiliary_loss_mlp": 0.01060295, "balance_loss_clip": 1.04848838, "balance_loss_mlp": 1.03863442, "epoch": 0.09283030211934466, "flos": 54745169875200.0, "grad_norm": 1.706948475246468, "language_loss": 0.72265279, "learning_rate": 3.958733002256038e-06, "loss": 0.74461448, "num_input_tokens_seen": 33056355, "step": 1544, "time_per_iteration": 3.104156255722046 }, { "auxiliary_loss_clip": 0.01166071, "auxiliary_loss_mlp": 0.01055881, "balance_loss_clip": 1.05165935, "balance_loss_mlp": 1.03138375, "epoch": 0.09289042537201263, "flos": 30334784762880.0, "grad_norm": 1.7720844214030114, "language_loss": 0.77286768, "learning_rate": 3.958654257951637e-06, "loss": 0.79508722, "num_input_tokens_seen": 33079520, "step": 1545, "time_per_iteration": 2.808180570602417 }, { "auxiliary_loss_clip": 0.01140161, "auxiliary_loss_mlp": 0.01050495, "balance_loss_clip": 1.0526737, "balance_loss_mlp": 1.02872682, "epoch": 0.09295054862468059, "flos": 17746838369280.0, "grad_norm": 2.7089619481030076, "language_loss": 0.74396008, "learning_rate": 3.9585754393748706e-06, "loss": 0.76586664, "num_input_tokens_seen": 33096135, "step": 1546, "time_per_iteration": 2.7634081840515137 }, { "auxiliary_loss_clip": 0.01163775, "auxiliary_loss_mlp": 0.0105305, "balance_loss_clip": 1.05357957, "balance_loss_mlp": 1.02956545, "epoch": 0.09301067187734856, "flos": 23658021815040.0, "grad_norm": 1.9423225100503794, "language_loss": 0.84200966, "learning_rate": 3.9584965465287275e-06, "loss": 0.86417794, "num_input_tokens_seen": 33115245, "step": 1547, "time_per_iteration": 2.790003776550293 }, { "auxiliary_loss_clip": 0.01141839, "auxiliary_loss_mlp": 0.01053941, "balance_loss_clip": 1.04740989, "balance_loss_mlp": 1.03195918, "epoch": 0.09307079513001654, "flos": 27527719777920.0, "grad_norm": 2.6545433694843488, "language_loss": 0.67698336, "learning_rate": 3.958417579416199e-06, "loss": 0.69894123, "num_input_tokens_seen": 33136640, "step": 1548, "time_per_iteration": 2.8367013931274414 }, { "auxiliary_loss_clip": 0.01123899, "auxiliary_loss_mlp": 0.01059885, "balance_loss_clip": 1.04744387, "balance_loss_mlp": 1.03754544, "epoch": 0.0931309183826845, "flos": 20627340710400.0, "grad_norm": 1.6829727803454704, "language_loss": 0.8326273, "learning_rate": 3.9583385380402795e-06, "loss": 0.85446513, "num_input_tokens_seen": 33155060, "step": 1549, "time_per_iteration": 2.8462016582489014 }, { "auxiliary_loss_clip": 0.01176243, "auxiliary_loss_mlp": 0.0104617, "balance_loss_clip": 1.05815506, "balance_loss_mlp": 1.02473652, "epoch": 0.09319104163535247, "flos": 29020921084800.0, "grad_norm": 1.5528514681372962, "language_loss": 0.75838119, "learning_rate": 3.958259422403966e-06, "loss": 0.78060532, "num_input_tokens_seen": 33175420, "step": 1550, "time_per_iteration": 2.7325351238250732 }, { "auxiliary_loss_clip": 0.01150315, "auxiliary_loss_mlp": 0.01069257, "balance_loss_clip": 1.05249369, "balance_loss_mlp": 1.04483092, "epoch": 0.09325116488802045, "flos": 25301545539840.0, "grad_norm": 2.1922696027472233, "language_loss": 0.82828665, "learning_rate": 3.95818023251026e-06, "loss": 0.85048234, "num_input_tokens_seen": 33194120, "step": 1551, "time_per_iteration": 2.852602481842041 }, { "auxiliary_loss_clip": 0.01064371, "auxiliary_loss_mlp": 0.00760109, "balance_loss_clip": 1.02203059, "balance_loss_mlp": 0.99984246, "epoch": 0.09331128814068841, "flos": 61536203942400.0, "grad_norm": 0.7384225982202158, "language_loss": 0.61837572, "learning_rate": 3.958100968362163e-06, "loss": 0.63662052, "num_input_tokens_seen": 33261080, "step": 1552, "time_per_iteration": 3.3453099727630615 }, { "auxiliary_loss_clip": 0.01059175, "auxiliary_loss_mlp": 0.01016654, "balance_loss_clip": 1.02415061, "balance_loss_mlp": 1.01338792, "epoch": 0.09337141139335638, "flos": 53293700171520.0, "grad_norm": 0.8524917480784928, "language_loss": 0.58986926, "learning_rate": 3.958021629962681e-06, "loss": 0.61062753, "num_input_tokens_seen": 33330235, "step": 1553, "time_per_iteration": 3.37673282623291 }, { "auxiliary_loss_clip": 0.01146955, "auxiliary_loss_mlp": 0.01056683, "balance_loss_clip": 1.05026984, "balance_loss_mlp": 1.03336585, "epoch": 0.09343153464602436, "flos": 23476852592640.0, "grad_norm": 2.3365109182487, "language_loss": 0.87665397, "learning_rate": 3.957942217314823e-06, "loss": 0.8986904, "num_input_tokens_seen": 33349035, "step": 1554, "time_per_iteration": 2.8098127841949463 }, { "auxiliary_loss_clip": 0.01153047, "auxiliary_loss_mlp": 0.01057257, "balance_loss_clip": 1.05439448, "balance_loss_mlp": 1.03393972, "epoch": 0.09349165789869232, "flos": 19353481804800.0, "grad_norm": 4.388884220182432, "language_loss": 0.81678319, "learning_rate": 3.957862730421599e-06, "loss": 0.83888626, "num_input_tokens_seen": 33368060, "step": 1555, "time_per_iteration": 2.726207971572876 }, { "auxiliary_loss_clip": 0.01058869, "auxiliary_loss_mlp": 0.01003892, "balance_loss_clip": 1.0202632, "balance_loss_mlp": 1.00045919, "epoch": 0.09355178115136029, "flos": 67502580635520.0, "grad_norm": 0.8683826280274983, "language_loss": 0.59606886, "learning_rate": 3.957783169286024e-06, "loss": 0.61669648, "num_input_tokens_seen": 33430825, "step": 1556, "time_per_iteration": 3.209326982498169 }, { "auxiliary_loss_clip": 0.01174249, "auxiliary_loss_mlp": 0.01059741, "balance_loss_clip": 1.05518138, "balance_loss_mlp": 1.03727031, "epoch": 0.09361190440402825, "flos": 37341638720640.0, "grad_norm": 1.6803158790244075, "language_loss": 0.84290808, "learning_rate": 3.9577035339111155e-06, "loss": 0.86524796, "num_input_tokens_seen": 33454855, "step": 1557, "time_per_iteration": 2.831650733947754 }, { "auxiliary_loss_clip": 0.01110857, "auxiliary_loss_mlp": 0.01065156, "balance_loss_clip": 1.04900038, "balance_loss_mlp": 1.04112351, "epoch": 0.09367202765669623, "flos": 24899705112960.0, "grad_norm": 1.6725809358966677, "language_loss": 0.780913, "learning_rate": 3.957623824299893e-06, "loss": 0.8026731, "num_input_tokens_seen": 33476000, "step": 1558, "time_per_iteration": 3.0111780166625977 }, { "auxiliary_loss_clip": 0.01164994, "auxiliary_loss_mlp": 0.01051229, "balance_loss_clip": 1.0558666, "balance_loss_mlp": 1.02881753, "epoch": 0.0937321509093642, "flos": 15705568368000.0, "grad_norm": 2.0141986314124414, "language_loss": 0.80066288, "learning_rate": 3.957544040455379e-06, "loss": 0.82282507, "num_input_tokens_seen": 33493845, "step": 1559, "time_per_iteration": 3.024117946624756 }, { "auxiliary_loss_clip": 0.01141277, "auxiliary_loss_mlp": 0.01061718, "balance_loss_clip": 1.05060387, "balance_loss_mlp": 1.04012942, "epoch": 0.09379227416203216, "flos": 20483698222080.0, "grad_norm": 1.8358373674042003, "language_loss": 0.76418209, "learning_rate": 3.957464182380599e-06, "loss": 0.78621197, "num_input_tokens_seen": 33510850, "step": 1560, "time_per_iteration": 2.68558406829834 }, { "auxiliary_loss_clip": 0.01137939, "auxiliary_loss_mlp": 0.01054925, "balance_loss_clip": 1.05014277, "balance_loss_mlp": 1.03213274, "epoch": 0.09385239741470014, "flos": 24352498344960.0, "grad_norm": 3.575155933252121, "language_loss": 0.80784953, "learning_rate": 3.95738425007858e-06, "loss": 0.82977819, "num_input_tokens_seen": 33530430, "step": 1561, "time_per_iteration": 2.759148359298706 }, { "auxiliary_loss_clip": 0.01173652, "auxiliary_loss_mlp": 0.01052448, "balance_loss_clip": 1.05276573, "balance_loss_mlp": 1.02989376, "epoch": 0.0939125206673681, "flos": 33291489807360.0, "grad_norm": 2.448664627367939, "language_loss": 0.6140722, "learning_rate": 3.957304243552354e-06, "loss": 0.63633323, "num_input_tokens_seen": 33551975, "step": 1562, "time_per_iteration": 2.9014978408813477 }, { "auxiliary_loss_clip": 0.01162693, "auxiliary_loss_mlp": 0.0106374, "balance_loss_clip": 1.05719543, "balance_loss_mlp": 1.04213953, "epoch": 0.09397264392003607, "flos": 19244923925760.0, "grad_norm": 3.5098220300578555, "language_loss": 0.8496151, "learning_rate": 3.957224162804956e-06, "loss": 0.87187934, "num_input_tokens_seen": 33569850, "step": 1563, "time_per_iteration": 4.404061555862427 }, { "auxiliary_loss_clip": 0.01164811, "auxiliary_loss_mlp": 0.01047932, "balance_loss_clip": 1.05775142, "balance_loss_mlp": 1.02652228, "epoch": 0.09403276717270405, "flos": 19317930318720.0, "grad_norm": 1.6765528861156813, "language_loss": 0.76511294, "learning_rate": 3.9571440078394205e-06, "loss": 0.78724039, "num_input_tokens_seen": 33590510, "step": 1564, "time_per_iteration": 4.255565166473389 }, { "auxiliary_loss_clip": 0.01151297, "auxiliary_loss_mlp": 0.01063256, "balance_loss_clip": 1.05196142, "balance_loss_mlp": 1.04172707, "epoch": 0.09409289042537201, "flos": 23583471137280.0, "grad_norm": 1.9762038777899962, "language_loss": 0.80134326, "learning_rate": 3.9570637786587895e-06, "loss": 0.82348871, "num_input_tokens_seen": 33608810, "step": 1565, "time_per_iteration": 2.8548545837402344 }, { "auxiliary_loss_clip": 0.01158602, "auxiliary_loss_mlp": 0.01063767, "balance_loss_clip": 1.05420566, "balance_loss_mlp": 1.04233313, "epoch": 0.09415301367803998, "flos": 20078446003200.0, "grad_norm": 1.6810250981626251, "language_loss": 0.75134379, "learning_rate": 3.956983475266103e-06, "loss": 0.77356744, "num_input_tokens_seen": 33627265, "step": 1566, "time_per_iteration": 4.889045715332031 }, { "auxiliary_loss_clip": 0.01145856, "auxiliary_loss_mlp": 0.00780689, "balance_loss_clip": 1.05168366, "balance_loss_mlp": 1.00022864, "epoch": 0.09421313693070796, "flos": 21062075016960.0, "grad_norm": 1.6828919748843199, "language_loss": 0.77958012, "learning_rate": 3.956903097664407e-06, "loss": 0.79884553, "num_input_tokens_seen": 33644810, "step": 1567, "time_per_iteration": 4.445765972137451 }, { "auxiliary_loss_clip": 0.01156815, "auxiliary_loss_mlp": 0.01056228, "balance_loss_clip": 1.05256855, "balance_loss_mlp": 1.03591454, "epoch": 0.09427326018337592, "flos": 24316156759680.0, "grad_norm": 2.008686295040646, "language_loss": 0.82608044, "learning_rate": 3.956822645856749e-06, "loss": 0.84821093, "num_input_tokens_seen": 33665665, "step": 1568, "time_per_iteration": 2.881535768508911 }, { "auxiliary_loss_clip": 0.01187915, "auxiliary_loss_mlp": 0.01051731, "balance_loss_clip": 1.05717778, "balance_loss_mlp": 1.02927184, "epoch": 0.09433338343604389, "flos": 20263888944000.0, "grad_norm": 1.9573151026586577, "language_loss": 0.76943743, "learning_rate": 3.9567421198461814e-06, "loss": 0.79183388, "num_input_tokens_seen": 33684760, "step": 1569, "time_per_iteration": 2.6097726821899414 }, { "auxiliary_loss_clip": 0.01120191, "auxiliary_loss_mlp": 0.01060805, "balance_loss_clip": 1.04771852, "balance_loss_mlp": 1.03625941, "epoch": 0.09439350668871185, "flos": 12742973493120.0, "grad_norm": 3.3813700161908917, "language_loss": 0.85488856, "learning_rate": 3.956661519635756e-06, "loss": 0.87669849, "num_input_tokens_seen": 33700750, "step": 1570, "time_per_iteration": 2.7571377754211426 }, { "auxiliary_loss_clip": 0.01122458, "auxiliary_loss_mlp": 0.01055939, "balance_loss_clip": 1.04927301, "balance_loss_mlp": 1.03183508, "epoch": 0.09445362994137983, "flos": 25962266263680.0, "grad_norm": 1.540414635950846, "language_loss": 0.76415235, "learning_rate": 3.95658084522853e-06, "loss": 0.7859363, "num_input_tokens_seen": 33724430, "step": 1571, "time_per_iteration": 2.913569211959839 }, { "auxiliary_loss_clip": 0.01135683, "auxiliary_loss_mlp": 0.01057111, "balance_loss_clip": 1.0490278, "balance_loss_mlp": 1.0349735, "epoch": 0.0945137531940478, "flos": 19715353372800.0, "grad_norm": 1.6745378641752047, "language_loss": 0.79397607, "learning_rate": 3.956500096627561e-06, "loss": 0.81590402, "num_input_tokens_seen": 33743455, "step": 1572, "time_per_iteration": 2.813410758972168 }, { "auxiliary_loss_clip": 0.01148251, "auxiliary_loss_mlp": 0.0106927, "balance_loss_clip": 1.05619979, "balance_loss_mlp": 1.04524922, "epoch": 0.09457387644671576, "flos": 23617047375360.0, "grad_norm": 1.7559396294879055, "language_loss": 0.87707287, "learning_rate": 3.956419273835913e-06, "loss": 0.89924812, "num_input_tokens_seen": 33763435, "step": 1573, "time_per_iteration": 2.776535987854004 }, { "auxiliary_loss_clip": 0.01161183, "auxiliary_loss_mlp": 0.01063326, "balance_loss_clip": 1.05485129, "balance_loss_mlp": 1.03804219, "epoch": 0.09463399969938374, "flos": 26907291135360.0, "grad_norm": 2.9707854698090097, "language_loss": 0.81982428, "learning_rate": 3.95633837685665e-06, "loss": 0.84206939, "num_input_tokens_seen": 33784325, "step": 1574, "time_per_iteration": 2.7604806423187256 }, { "auxiliary_loss_clip": 0.01156287, "auxiliary_loss_mlp": 0.01055594, "balance_loss_clip": 1.05234718, "balance_loss_mlp": 1.0344342, "epoch": 0.0946941229520517, "flos": 23659566099840.0, "grad_norm": 1.7178511535677499, "language_loss": 0.80855322, "learning_rate": 3.95625740569284e-06, "loss": 0.83067203, "num_input_tokens_seen": 33802510, "step": 1575, "time_per_iteration": 2.713247299194336 }, { "auxiliary_loss_clip": 0.01182326, "auxiliary_loss_mlp": 0.01068689, "balance_loss_clip": 1.05578864, "balance_loss_mlp": 1.04581285, "epoch": 0.09475424620471967, "flos": 24134053783680.0, "grad_norm": 1.9110861379460222, "language_loss": 0.86483347, "learning_rate": 3.956176360347553e-06, "loss": 0.88734365, "num_input_tokens_seen": 33819980, "step": 1576, "time_per_iteration": 2.682644844055176 }, { "auxiliary_loss_clip": 0.01056441, "auxiliary_loss_mlp": 0.01027284, "balance_loss_clip": 1.0225811, "balance_loss_mlp": 1.02344561, "epoch": 0.09481436945738765, "flos": 68426168065920.0, "grad_norm": 0.9789918611127905, "language_loss": 0.6582402, "learning_rate": 3.956095240823862e-06, "loss": 0.67907751, "num_input_tokens_seen": 33878925, "step": 1577, "time_per_iteration": 3.2106685638427734 }, { "auxiliary_loss_clip": 0.01147668, "auxiliary_loss_mlp": 0.01051958, "balance_loss_clip": 1.05218005, "balance_loss_mlp": 1.03098869, "epoch": 0.09487449271005562, "flos": 16654076858880.0, "grad_norm": 1.8223175005615506, "language_loss": 0.79152733, "learning_rate": 3.956014047124844e-06, "loss": 0.81352365, "num_input_tokens_seen": 33897600, "step": 1578, "time_per_iteration": 2.820089340209961 }, { "auxiliary_loss_clip": 0.01185941, "auxiliary_loss_mlp": 0.01066432, "balance_loss_clip": 1.05838132, "balance_loss_mlp": 1.04437804, "epoch": 0.09493461596272358, "flos": 24275685110400.0, "grad_norm": 3.480730999818176, "language_loss": 0.78161818, "learning_rate": 3.955932779253578e-06, "loss": 0.80414188, "num_input_tokens_seen": 33917365, "step": 1579, "time_per_iteration": 2.6518983840942383 }, { "auxiliary_loss_clip": 0.01128319, "auxiliary_loss_mlp": 0.01065633, "balance_loss_clip": 1.04771328, "balance_loss_mlp": 1.04001498, "epoch": 0.09499473921539155, "flos": 21870173243520.0, "grad_norm": 2.0084876987684526, "language_loss": 0.73410392, "learning_rate": 3.955851437213144e-06, "loss": 0.75604343, "num_input_tokens_seen": 33936680, "step": 1580, "time_per_iteration": 2.679461717605591 }, { "auxiliary_loss_clip": 0.01157568, "auxiliary_loss_mlp": 0.01062628, "balance_loss_clip": 1.05573344, "balance_loss_mlp": 1.04095626, "epoch": 0.09505486246805953, "flos": 33547137880320.0, "grad_norm": 14.809542792179553, "language_loss": 0.77565914, "learning_rate": 3.955770021006627e-06, "loss": 0.7978611, "num_input_tokens_seen": 33960685, "step": 1581, "time_per_iteration": 2.765394449234009 }, { "auxiliary_loss_clip": 0.01144835, "auxiliary_loss_mlp": 0.0106468, "balance_loss_clip": 1.05426359, "balance_loss_mlp": 1.04276967, "epoch": 0.09511498572072749, "flos": 21215342350080.0, "grad_norm": 1.8617167187056045, "language_loss": 0.87230825, "learning_rate": 3.955688530637116e-06, "loss": 0.89440346, "num_input_tokens_seen": 33980015, "step": 1582, "time_per_iteration": 2.691364288330078 }, { "auxiliary_loss_clip": 0.01174295, "auxiliary_loss_mlp": 0.0106431, "balance_loss_clip": 1.05508888, "balance_loss_mlp": 1.04039705, "epoch": 0.09517510897339546, "flos": 14611262572800.0, "grad_norm": 1.8512060219658202, "language_loss": 0.67043924, "learning_rate": 3.955606966107699e-06, "loss": 0.69282532, "num_input_tokens_seen": 33997705, "step": 1583, "time_per_iteration": 2.6693732738494873 }, { "auxiliary_loss_clip": 0.01177751, "auxiliary_loss_mlp": 0.01053743, "balance_loss_clip": 1.0593859, "balance_loss_mlp": 1.03035378, "epoch": 0.09523523222606343, "flos": 27817339138560.0, "grad_norm": 2.144216926782962, "language_loss": 0.70752859, "learning_rate": 3.95552532742147e-06, "loss": 0.7298435, "num_input_tokens_seen": 34017465, "step": 1584, "time_per_iteration": 2.7164390087127686 }, { "auxiliary_loss_clip": 0.01138507, "auxiliary_loss_mlp": 0.0105762, "balance_loss_clip": 1.05243039, "balance_loss_mlp": 1.03584039, "epoch": 0.0952953554787314, "flos": 20706272847360.0, "grad_norm": 1.4654737580846544, "language_loss": 0.8080442, "learning_rate": 3.955443614581525e-06, "loss": 0.83000553, "num_input_tokens_seen": 34038550, "step": 1585, "time_per_iteration": 2.879831314086914 }, { "auxiliary_loss_clip": 0.01159374, "auxiliary_loss_mlp": 0.01057717, "balance_loss_clip": 1.05387473, "balance_loss_mlp": 1.03355336, "epoch": 0.09535547873139937, "flos": 24787627701120.0, "grad_norm": 1.638250735795891, "language_loss": 0.71921158, "learning_rate": 3.955361827590961e-06, "loss": 0.74138248, "num_input_tokens_seen": 34058665, "step": 1586, "time_per_iteration": 2.750436544418335 }, { "auxiliary_loss_clip": 0.01048565, "auxiliary_loss_mlp": 0.01003302, "balance_loss_clip": 1.03115988, "balance_loss_mlp": 0.99901009, "epoch": 0.09541560198406734, "flos": 71912194905600.0, "grad_norm": 0.8099482252624973, "language_loss": 0.55475175, "learning_rate": 3.955279966452883e-06, "loss": 0.57527041, "num_input_tokens_seen": 34109655, "step": 1587, "time_per_iteration": 3.0975699424743652 }, { "auxiliary_loss_clip": 0.01128884, "auxiliary_loss_mlp": 0.0105965, "balance_loss_clip": 1.04768586, "balance_loss_mlp": 1.03661847, "epoch": 0.09547572523673531, "flos": 28982604251520.0, "grad_norm": 1.708481785076906, "language_loss": 0.81062275, "learning_rate": 3.955198031170391e-06, "loss": 0.83250809, "num_input_tokens_seen": 34131115, "step": 1588, "time_per_iteration": 2.7718451023101807 }, { "auxiliary_loss_clip": 0.01131602, "auxiliary_loss_mlp": 0.01056117, "balance_loss_clip": 1.04894614, "balance_loss_mlp": 1.03438473, "epoch": 0.09553584848940327, "flos": 24133910129280.0, "grad_norm": 1.5119879232668088, "language_loss": 0.81481898, "learning_rate": 3.955116021746594e-06, "loss": 0.83669615, "num_input_tokens_seen": 34151925, "step": 1589, "time_per_iteration": 2.782468795776367 }, { "auxiliary_loss_clip": 0.0112194, "auxiliary_loss_mlp": 0.00780573, "balance_loss_clip": 1.0508883, "balance_loss_mlp": 1.00013089, "epoch": 0.09559597174207124, "flos": 42851376789120.0, "grad_norm": 1.525287399882202, "language_loss": 0.64882791, "learning_rate": 3.955033938184601e-06, "loss": 0.667853, "num_input_tokens_seen": 34175395, "step": 1590, "time_per_iteration": 3.0783450603485107 }, { "auxiliary_loss_clip": 0.01143501, "auxiliary_loss_mlp": 0.01058399, "balance_loss_clip": 1.05087948, "balance_loss_mlp": 1.0358206, "epoch": 0.09565609499473922, "flos": 32670845683200.0, "grad_norm": 2.0745314237741916, "language_loss": 0.83290577, "learning_rate": 3.954951780487526e-06, "loss": 0.85492468, "num_input_tokens_seen": 34197760, "step": 1591, "time_per_iteration": 2.8393962383270264 }, { "auxiliary_loss_clip": 0.01163486, "auxiliary_loss_mlp": 0.01065588, "balance_loss_clip": 1.0522387, "balance_loss_mlp": 1.04266405, "epoch": 0.09571621824740718, "flos": 18478410670080.0, "grad_norm": 2.825705290827541, "language_loss": 0.74087322, "learning_rate": 3.9548695486584835e-06, "loss": 0.76316392, "num_input_tokens_seen": 34215330, "step": 1592, "time_per_iteration": 2.6828882694244385 }, { "auxiliary_loss_clip": 0.01169239, "auxiliary_loss_mlp": 0.01055073, "balance_loss_clip": 1.05161428, "balance_loss_mlp": 1.03337741, "epoch": 0.09577634150007515, "flos": 29387497334400.0, "grad_norm": 2.18277080043521, "language_loss": 0.74483889, "learning_rate": 3.954787242700592e-06, "loss": 0.76708198, "num_input_tokens_seen": 34237745, "step": 1593, "time_per_iteration": 2.7193498611450195 }, { "auxiliary_loss_clip": 0.01177343, "auxiliary_loss_mlp": 0.01055096, "balance_loss_clip": 1.05910873, "balance_loss_mlp": 1.03307831, "epoch": 0.09583646475274313, "flos": 22747830157440.0, "grad_norm": 1.887493467708827, "language_loss": 0.69782627, "learning_rate": 3.954704862616971e-06, "loss": 0.72015071, "num_input_tokens_seen": 34256565, "step": 1594, "time_per_iteration": 2.635383367538452 }, { "auxiliary_loss_clip": 0.01173222, "auxiliary_loss_mlp": 0.01051806, "balance_loss_clip": 1.05618978, "balance_loss_mlp": 1.03037214, "epoch": 0.0958965880054111, "flos": 23218367345280.0, "grad_norm": 2.1411006117727682, "language_loss": 0.82780552, "learning_rate": 3.954622408410747e-06, "loss": 0.85005581, "num_input_tokens_seen": 34275970, "step": 1595, "time_per_iteration": 2.7158257961273193 }, { "auxiliary_loss_clip": 0.01153253, "auxiliary_loss_mlp": 0.01054246, "balance_loss_clip": 1.05143809, "balance_loss_mlp": 1.0301652, "epoch": 0.09595671125807906, "flos": 21324438933120.0, "grad_norm": 1.7751890788987925, "language_loss": 0.84513396, "learning_rate": 3.954539880085045e-06, "loss": 0.86720896, "num_input_tokens_seen": 34295490, "step": 1596, "time_per_iteration": 2.710228204727173 }, { "auxiliary_loss_clip": 0.01166586, "auxiliary_loss_mlp": 0.0105804, "balance_loss_clip": 1.05440903, "balance_loss_mlp": 1.03376901, "epoch": 0.09601683451074704, "flos": 39603472185600.0, "grad_norm": 1.8335529067237837, "language_loss": 0.69328064, "learning_rate": 3.9544572776429945e-06, "loss": 0.71552688, "num_input_tokens_seen": 34319990, "step": 1597, "time_per_iteration": 2.802959442138672 }, { "auxiliary_loss_clip": 0.01167235, "auxiliary_loss_mlp": 0.00780978, "balance_loss_clip": 1.0503217, "balance_loss_mlp": 1.00010371, "epoch": 0.096076957763415, "flos": 23732716147200.0, "grad_norm": 2.0491570740921885, "language_loss": 0.7486403, "learning_rate": 3.954374601087729e-06, "loss": 0.76812243, "num_input_tokens_seen": 34339225, "step": 1598, "time_per_iteration": 2.6502270698547363 }, { "auxiliary_loss_clip": 0.01176661, "auxiliary_loss_mlp": 0.01053936, "balance_loss_clip": 1.05745888, "balance_loss_mlp": 1.03009462, "epoch": 0.09613708101608297, "flos": 34678108483200.0, "grad_norm": 1.6831440826618358, "language_loss": 0.68804371, "learning_rate": 3.954291850422382e-06, "loss": 0.71034968, "num_input_tokens_seen": 34361020, "step": 1599, "time_per_iteration": 2.74243426322937 }, { "auxiliary_loss_clip": 0.01157322, "auxiliary_loss_mlp": 0.01059883, "balance_loss_clip": 1.05754852, "balance_loss_mlp": 1.0371263, "epoch": 0.09619720426875093, "flos": 20740028653440.0, "grad_norm": 2.9774251326108367, "language_loss": 0.83950365, "learning_rate": 3.954209025650093e-06, "loss": 0.86167574, "num_input_tokens_seen": 34378630, "step": 1600, "time_per_iteration": 2.702907085418701 }, { "auxiliary_loss_clip": 0.01150263, "auxiliary_loss_mlp": 0.01054168, "balance_loss_clip": 1.05129707, "balance_loss_mlp": 1.03093433, "epoch": 0.09625732752141891, "flos": 13042720488960.0, "grad_norm": 2.287254549480118, "language_loss": 0.80520785, "learning_rate": 3.954126126774001e-06, "loss": 0.82725215, "num_input_tokens_seen": 34397110, "step": 1601, "time_per_iteration": 2.693399429321289 }, { "auxiliary_loss_clip": 0.01181247, "auxiliary_loss_mlp": 0.01054578, "balance_loss_clip": 1.05711937, "balance_loss_mlp": 1.03133249, "epoch": 0.09631745077408688, "flos": 22273629782400.0, "grad_norm": 2.4356926646094954, "language_loss": 0.81959623, "learning_rate": 3.954043153797251e-06, "loss": 0.84195447, "num_input_tokens_seen": 34414165, "step": 1602, "time_per_iteration": 2.639479875564575 }, { "auxiliary_loss_clip": 0.01137855, "auxiliary_loss_mlp": 0.01051495, "balance_loss_clip": 1.05295444, "balance_loss_mlp": 1.02681863, "epoch": 0.09637757402675484, "flos": 24754266944640.0, "grad_norm": 3.099164686790191, "language_loss": 0.62498438, "learning_rate": 3.953960106722989e-06, "loss": 0.64687788, "num_input_tokens_seen": 34434445, "step": 1603, "time_per_iteration": 4.341834306716919 }, { "auxiliary_loss_clip": 0.01189954, "auxiliary_loss_mlp": 0.01054376, "balance_loss_clip": 1.05902839, "balance_loss_mlp": 1.02918696, "epoch": 0.09643769727942282, "flos": 22525758322560.0, "grad_norm": 3.121905357886113, "language_loss": 0.70996022, "learning_rate": 3.953876985554364e-06, "loss": 0.73240346, "num_input_tokens_seen": 34453095, "step": 1604, "time_per_iteration": 2.6520893573760986 }, { "auxiliary_loss_clip": 0.01176446, "auxiliary_loss_mlp": 0.01055314, "balance_loss_clip": 1.0570209, "balance_loss_mlp": 1.03358221, "epoch": 0.09649782053209079, "flos": 30921026636160.0, "grad_norm": 2.082890345500055, "language_loss": 0.7993719, "learning_rate": 3.953793790294527e-06, "loss": 0.82168949, "num_input_tokens_seen": 34473680, "step": 1605, "time_per_iteration": 4.5557661056518555 }, { "auxiliary_loss_clip": 0.01161047, "auxiliary_loss_mlp": 0.01047918, "balance_loss_clip": 1.05455577, "balance_loss_mlp": 1.0245893, "epoch": 0.09655794378475875, "flos": 25337635729920.0, "grad_norm": 1.990204665194141, "language_loss": 0.74550986, "learning_rate": 3.953710520946634e-06, "loss": 0.76759952, "num_input_tokens_seen": 34492610, "step": 1606, "time_per_iteration": 2.7172651290893555 }, { "auxiliary_loss_clip": 0.01172416, "auxiliary_loss_mlp": 0.01046772, "balance_loss_clip": 1.05834222, "balance_loss_mlp": 1.02378857, "epoch": 0.09661806703742673, "flos": 22346061557760.0, "grad_norm": 1.6403710807101601, "language_loss": 0.7571919, "learning_rate": 3.953627177513843e-06, "loss": 0.77938372, "num_input_tokens_seen": 34511855, "step": 1607, "time_per_iteration": 4.302686452865601 }, { "auxiliary_loss_clip": 0.01139491, "auxiliary_loss_mlp": 0.01051546, "balance_loss_clip": 1.04833579, "balance_loss_mlp": 1.0289799, "epoch": 0.0966781902900947, "flos": 17457578144640.0, "grad_norm": 1.975850982703557, "language_loss": 0.86756283, "learning_rate": 3.953543759999312e-06, "loss": 0.88947326, "num_input_tokens_seen": 34528905, "step": 1608, "time_per_iteration": 2.6280455589294434 }, { "auxiliary_loss_clip": 0.01126253, "auxiliary_loss_mlp": 0.01064704, "balance_loss_clip": 1.05433142, "balance_loss_mlp": 1.03940821, "epoch": 0.09673831354276266, "flos": 36903995412480.0, "grad_norm": 2.3082762386200266, "language_loss": 0.71363097, "learning_rate": 3.953460268406207e-06, "loss": 0.73554057, "num_input_tokens_seen": 34548480, "step": 1609, "time_per_iteration": 2.9116146564483643 }, { "auxiliary_loss_clip": 0.01149353, "auxiliary_loss_mlp": 0.01058179, "balance_loss_clip": 1.0546515, "balance_loss_mlp": 1.03606534, "epoch": 0.09679843679543064, "flos": 20701388597760.0, "grad_norm": 1.9988414994799784, "language_loss": 0.84810984, "learning_rate": 3.953376702737693e-06, "loss": 0.87018514, "num_input_tokens_seen": 34565410, "step": 1610, "time_per_iteration": 2.8005051612854004 }, { "auxiliary_loss_clip": 0.01161389, "auxiliary_loss_mlp": 0.01056267, "balance_loss_clip": 1.05790925, "balance_loss_mlp": 1.03228188, "epoch": 0.0968585600480986, "flos": 23514415240320.0, "grad_norm": 2.176236379770122, "language_loss": 0.6696198, "learning_rate": 3.953293062996939e-06, "loss": 0.69179636, "num_input_tokens_seen": 34584840, "step": 1611, "time_per_iteration": 2.731931447982788 }, { "auxiliary_loss_clip": 0.01125259, "auxiliary_loss_mlp": 0.01057116, "balance_loss_clip": 1.04740572, "balance_loss_mlp": 1.03385806, "epoch": 0.09691868330076657, "flos": 20121072468480.0, "grad_norm": 1.6508278294088392, "language_loss": 0.81067657, "learning_rate": 3.953209349187115e-06, "loss": 0.83250034, "num_input_tokens_seen": 34603360, "step": 1612, "time_per_iteration": 2.7998390197753906 }, { "auxiliary_loss_clip": 0.01182404, "auxiliary_loss_mlp": 0.01069551, "balance_loss_clip": 1.06046534, "balance_loss_mlp": 1.04600716, "epoch": 0.09697880655343454, "flos": 16544692967040.0, "grad_norm": 3.304939197664143, "language_loss": 0.80836105, "learning_rate": 3.953125561311398e-06, "loss": 0.83088064, "num_input_tokens_seen": 34620760, "step": 1613, "time_per_iteration": 2.624218702316284 }, { "auxiliary_loss_clip": 0.01148565, "auxiliary_loss_mlp": 0.01054743, "balance_loss_clip": 1.05542159, "balance_loss_mlp": 1.03047192, "epoch": 0.09703892980610251, "flos": 26104184899200.0, "grad_norm": 1.7164386274315457, "language_loss": 0.84289789, "learning_rate": 3.953041699372964e-06, "loss": 0.86493099, "num_input_tokens_seen": 34640695, "step": 1614, "time_per_iteration": 2.744340419769287 }, { "auxiliary_loss_clip": 0.01066618, "auxiliary_loss_mlp": 0.00759744, "balance_loss_clip": 1.02654934, "balance_loss_mlp": 1.00008702, "epoch": 0.09709905305877048, "flos": 60443622000000.0, "grad_norm": 0.7127167896900892, "language_loss": 0.54629624, "learning_rate": 3.952957763374992e-06, "loss": 0.56455994, "num_input_tokens_seen": 34702395, "step": 1615, "time_per_iteration": 3.1547679901123047 }, { "auxiliary_loss_clip": 0.01033143, "auxiliary_loss_mlp": 0.01017555, "balance_loss_clip": 1.02384067, "balance_loss_mlp": 1.01381195, "epoch": 0.09715917631143844, "flos": 57639932893440.0, "grad_norm": 0.7689373847786285, "language_loss": 0.58190405, "learning_rate": 3.952873753320666e-06, "loss": 0.60241103, "num_input_tokens_seen": 34768910, "step": 1616, "time_per_iteration": 3.3940556049346924 }, { "auxiliary_loss_clip": 0.01155533, "auxiliary_loss_mlp": 0.01067983, "balance_loss_clip": 1.05504358, "balance_loss_mlp": 1.04205465, "epoch": 0.09721929956410642, "flos": 20558212986240.0, "grad_norm": 1.8932449927934136, "language_loss": 0.69031835, "learning_rate": 3.952789669213172e-06, "loss": 0.7125535, "num_input_tokens_seen": 34787680, "step": 1617, "time_per_iteration": 2.714629888534546 }, { "auxiliary_loss_clip": 0.01152637, "auxiliary_loss_mlp": 0.01057882, "balance_loss_clip": 1.05386162, "balance_loss_mlp": 1.03127456, "epoch": 0.09727942281677439, "flos": 27344359825920.0, "grad_norm": 1.755493071880773, "language_loss": 0.80910909, "learning_rate": 3.952705511055698e-06, "loss": 0.83121431, "num_input_tokens_seen": 34808330, "step": 1618, "time_per_iteration": 2.8081507682800293 }, { "auxiliary_loss_clip": 0.01168356, "auxiliary_loss_mlp": 0.01058179, "balance_loss_clip": 1.06048679, "balance_loss_mlp": 1.03678131, "epoch": 0.09733954606944235, "flos": 24900028335360.0, "grad_norm": 1.667659488760432, "language_loss": 0.92901695, "learning_rate": 3.952621278851435e-06, "loss": 0.95128226, "num_input_tokens_seen": 34830020, "step": 1619, "time_per_iteration": 2.7752275466918945 }, { "auxiliary_loss_clip": 0.01175515, "auxiliary_loss_mlp": 0.01058252, "balance_loss_clip": 1.05952573, "balance_loss_mlp": 1.03512526, "epoch": 0.09739966932211033, "flos": 31503928544640.0, "grad_norm": 2.1973967195348902, "language_loss": 0.88978708, "learning_rate": 3.9525369726035784e-06, "loss": 0.91212475, "num_input_tokens_seen": 34850330, "step": 1620, "time_per_iteration": 2.771176338195801 }, { "auxiliary_loss_clip": 0.01153763, "auxiliary_loss_mlp": 0.01065329, "balance_loss_clip": 1.05353975, "balance_loss_mlp": 1.0397464, "epoch": 0.0974597925747783, "flos": 23878764846720.0, "grad_norm": 2.154793183838835, "language_loss": 0.77331412, "learning_rate": 3.952452592315324e-06, "loss": 0.79550499, "num_input_tokens_seen": 34871640, "step": 1621, "time_per_iteration": 2.6740832328796387 }, { "auxiliary_loss_clip": 0.01131342, "auxiliary_loss_mlp": 0.01082359, "balance_loss_clip": 1.04798269, "balance_loss_mlp": 1.05640674, "epoch": 0.09751991582744626, "flos": 17019575700480.0, "grad_norm": 1.9420195171733425, "language_loss": 0.77671158, "learning_rate": 3.952368137989871e-06, "loss": 0.79884863, "num_input_tokens_seen": 34888100, "step": 1622, "time_per_iteration": 2.7247347831726074 }, { "auxiliary_loss_clip": 0.01150185, "auxiliary_loss_mlp": 0.01064277, "balance_loss_clip": 1.05335355, "balance_loss_mlp": 1.04025626, "epoch": 0.09758003908011423, "flos": 28402826826240.0, "grad_norm": 1.8603109065807166, "language_loss": 0.85784447, "learning_rate": 3.9522836096304225e-06, "loss": 0.87998909, "num_input_tokens_seen": 34910485, "step": 1623, "time_per_iteration": 2.785388469696045 }, { "auxiliary_loss_clip": 0.0117659, "auxiliary_loss_mlp": 0.01064102, "balance_loss_clip": 1.05769634, "balance_loss_mlp": 1.04043913, "epoch": 0.09764016233278221, "flos": 18144297336960.0, "grad_norm": 2.39630116599036, "language_loss": 0.80534065, "learning_rate": 3.952199007240184e-06, "loss": 0.82774758, "num_input_tokens_seen": 34928615, "step": 1624, "time_per_iteration": 2.6818184852600098 }, { "auxiliary_loss_clip": 0.01176335, "auxiliary_loss_mlp": 0.01056788, "balance_loss_clip": 1.05616927, "balance_loss_mlp": 1.03465128, "epoch": 0.09770028558545017, "flos": 15265842071040.0, "grad_norm": 2.44379144971104, "language_loss": 0.85556966, "learning_rate": 3.952114330822364e-06, "loss": 0.8779009, "num_input_tokens_seen": 34946045, "step": 1625, "time_per_iteration": 2.6594324111938477 }, { "auxiliary_loss_clip": 0.01181411, "auxiliary_loss_mlp": 0.0106682, "balance_loss_clip": 1.06004012, "balance_loss_mlp": 1.04411101, "epoch": 0.09776040883811814, "flos": 23472435219840.0, "grad_norm": 2.058269503362464, "language_loss": 0.85431635, "learning_rate": 3.952029580380172e-06, "loss": 0.87679869, "num_input_tokens_seen": 34962865, "step": 1626, "time_per_iteration": 2.7384841442108154 }, { "auxiliary_loss_clip": 0.01165311, "auxiliary_loss_mlp": 0.007823, "balance_loss_clip": 1.05467701, "balance_loss_mlp": 1.000211, "epoch": 0.09782053209078612, "flos": 24499480798080.0, "grad_norm": 2.0701580273163036, "language_loss": 0.83370024, "learning_rate": 3.9519447559168234e-06, "loss": 0.85317636, "num_input_tokens_seen": 34983505, "step": 1627, "time_per_iteration": 2.8269948959350586 }, { "auxiliary_loss_clip": 0.01168188, "auxiliary_loss_mlp": 0.01065332, "balance_loss_clip": 1.05557203, "balance_loss_mlp": 1.04275417, "epoch": 0.09788065534345408, "flos": 21580158833280.0, "grad_norm": 1.8143281262319713, "language_loss": 0.84674478, "learning_rate": 3.951859857435534e-06, "loss": 0.86907995, "num_input_tokens_seen": 35001825, "step": 1628, "time_per_iteration": 2.6151821613311768 }, { "auxiliary_loss_clip": 0.01170257, "auxiliary_loss_mlp": 0.01058367, "balance_loss_clip": 1.05374515, "balance_loss_mlp": 1.03558636, "epoch": 0.09794077859612205, "flos": 23842459175040.0, "grad_norm": 1.5658807312485334, "language_loss": 0.75531614, "learning_rate": 3.951774884939523e-06, "loss": 0.77760237, "num_input_tokens_seen": 35023075, "step": 1629, "time_per_iteration": 2.6794557571411133 }, { "auxiliary_loss_clip": 0.01129604, "auxiliary_loss_mlp": 0.01056904, "balance_loss_clip": 1.0577755, "balance_loss_mlp": 1.03169131, "epoch": 0.09800090184879003, "flos": 23659889322240.0, "grad_norm": 1.6755762488260617, "language_loss": 0.78487194, "learning_rate": 3.951689838432013e-06, "loss": 0.80673707, "num_input_tokens_seen": 35043480, "step": 1630, "time_per_iteration": 2.7986228466033936 }, { "auxiliary_loss_clip": 0.01167766, "auxiliary_loss_mlp": 0.01063441, "balance_loss_clip": 1.05938148, "balance_loss_mlp": 1.03804946, "epoch": 0.09806102510145799, "flos": 17055773631360.0, "grad_norm": 1.8175370389297836, "language_loss": 0.86677933, "learning_rate": 3.951604717916228e-06, "loss": 0.88909143, "num_input_tokens_seen": 35061490, "step": 1631, "time_per_iteration": 2.6350157260894775 }, { "auxiliary_loss_clip": 0.01171369, "auxiliary_loss_mlp": 0.01058643, "balance_loss_clip": 1.0610745, "balance_loss_mlp": 1.03625536, "epoch": 0.09812114835412596, "flos": 23878477537920.0, "grad_norm": 2.2030333753544773, "language_loss": 0.82996809, "learning_rate": 3.9515195233953975e-06, "loss": 0.85226822, "num_input_tokens_seen": 35079670, "step": 1632, "time_per_iteration": 2.7990314960479736 }, { "auxiliary_loss_clip": 0.01148453, "auxiliary_loss_mlp": 0.01064004, "balance_loss_clip": 1.05554819, "balance_loss_mlp": 1.04102039, "epoch": 0.09818127160679392, "flos": 20595488325120.0, "grad_norm": 1.531777801288569, "language_loss": 0.7882973, "learning_rate": 3.951434254872751e-06, "loss": 0.81042188, "num_input_tokens_seen": 35099205, "step": 1633, "time_per_iteration": 2.735353708267212 }, { "auxiliary_loss_clip": 0.01170992, "auxiliary_loss_mlp": 0.01061681, "balance_loss_clip": 1.05558002, "balance_loss_mlp": 1.03731489, "epoch": 0.0982413948594619, "flos": 15487339288320.0, "grad_norm": 2.4037572513069687, "language_loss": 0.73209554, "learning_rate": 3.951348912351521e-06, "loss": 0.75442231, "num_input_tokens_seen": 35115270, "step": 1634, "time_per_iteration": 2.688596248626709 }, { "auxiliary_loss_clip": 0.01162743, "auxiliary_loss_mlp": 0.01071164, "balance_loss_clip": 1.05591321, "balance_loss_mlp": 1.04672611, "epoch": 0.09830151811212987, "flos": 24207958016640.0, "grad_norm": 3.2244021303311405, "language_loss": 0.72553629, "learning_rate": 3.951263495834947e-06, "loss": 0.74787533, "num_input_tokens_seen": 35134065, "step": 1635, "time_per_iteration": 2.720266342163086 }, { "auxiliary_loss_clip": 0.01154765, "auxiliary_loss_mlp": 0.01068349, "balance_loss_clip": 1.05526268, "balance_loss_mlp": 1.04177701, "epoch": 0.09836164136479783, "flos": 20594590485120.0, "grad_norm": 1.7699592352066487, "language_loss": 0.78026646, "learning_rate": 3.951178005326264e-06, "loss": 0.80249763, "num_input_tokens_seen": 35154870, "step": 1636, "time_per_iteration": 2.9618239402770996 }, { "auxiliary_loss_clip": 0.01162744, "auxiliary_loss_mlp": 0.01060716, "balance_loss_clip": 1.05561686, "balance_loss_mlp": 1.0368979, "epoch": 0.09842176461746581, "flos": 19934157070080.0, "grad_norm": 1.8332710343018006, "language_loss": 0.69524407, "learning_rate": 3.951092440828715e-06, "loss": 0.71747863, "num_input_tokens_seen": 35171850, "step": 1637, "time_per_iteration": 2.671178102493286 }, { "auxiliary_loss_clip": 0.01188316, "auxiliary_loss_mlp": 0.01058851, "balance_loss_clip": 1.05926394, "balance_loss_mlp": 1.03500926, "epoch": 0.09848188787013377, "flos": 21214659991680.0, "grad_norm": 2.3775286970935503, "language_loss": 0.77050996, "learning_rate": 3.951006802345545e-06, "loss": 0.79298162, "num_input_tokens_seen": 35188795, "step": 1638, "time_per_iteration": 2.62457537651062 }, { "auxiliary_loss_clip": 0.01140265, "auxiliary_loss_mlp": 0.01052026, "balance_loss_clip": 1.05538166, "balance_loss_mlp": 1.02941203, "epoch": 0.09854201112280174, "flos": 30154226071680.0, "grad_norm": 1.4014263071342075, "language_loss": 0.72620296, "learning_rate": 3.950921089880003e-06, "loss": 0.74812591, "num_input_tokens_seen": 35212100, "step": 1639, "time_per_iteration": 2.7499618530273438 }, { "auxiliary_loss_clip": 0.01173752, "auxiliary_loss_mlp": 0.01051382, "balance_loss_clip": 1.0582087, "balance_loss_mlp": 1.02831531, "epoch": 0.09860213437546972, "flos": 21795730306560.0, "grad_norm": 1.7213189449892274, "language_loss": 0.88679075, "learning_rate": 3.950835303435337e-06, "loss": 0.90904212, "num_input_tokens_seen": 35230390, "step": 1640, "time_per_iteration": 2.664133071899414 }, { "auxiliary_loss_clip": 0.01177786, "auxiliary_loss_mlp": 0.01044457, "balance_loss_clip": 1.05981517, "balance_loss_mlp": 1.02130616, "epoch": 0.09866225762813768, "flos": 21835555511040.0, "grad_norm": 2.0701766566296915, "language_loss": 0.80567038, "learning_rate": 3.950749443014801e-06, "loss": 0.82789278, "num_input_tokens_seen": 35250405, "step": 1641, "time_per_iteration": 2.645353317260742 }, { "auxiliary_loss_clip": 0.011756, "auxiliary_loss_mlp": 0.01062641, "balance_loss_clip": 1.05896795, "balance_loss_mlp": 1.03742838, "epoch": 0.09872238088080565, "flos": 17599855916160.0, "grad_norm": 2.64335263522248, "language_loss": 0.86117625, "learning_rate": 3.95066350862165e-06, "loss": 0.88355863, "num_input_tokens_seen": 35262820, "step": 1642, "time_per_iteration": 5.81004524230957 }, { "auxiliary_loss_clip": 0.01151329, "auxiliary_loss_mlp": 0.01056693, "balance_loss_clip": 1.05857074, "balance_loss_mlp": 1.03404331, "epoch": 0.09878250413347361, "flos": 27636134002560.0, "grad_norm": 2.7092208079201607, "language_loss": 0.8058275, "learning_rate": 3.950577500259144e-06, "loss": 0.82790768, "num_input_tokens_seen": 35284490, "step": 1643, "time_per_iteration": 2.7235090732574463 }, { "auxiliary_loss_clip": 0.01174075, "auxiliary_loss_mlp": 0.01077435, "balance_loss_clip": 1.05761337, "balance_loss_mlp": 1.05470192, "epoch": 0.0988426273861416, "flos": 16544728880640.0, "grad_norm": 2.0561742686210676, "language_loss": 0.82546467, "learning_rate": 3.950491417930543e-06, "loss": 0.84797978, "num_input_tokens_seen": 35302815, "step": 1644, "time_per_iteration": 4.318823575973511 }, { "auxiliary_loss_clip": 0.01163142, "auxiliary_loss_mlp": 0.00782463, "balance_loss_clip": 1.05607629, "balance_loss_mlp": 1.00010633, "epoch": 0.09890275063880956, "flos": 21215270522880.0, "grad_norm": 1.6945489721625269, "language_loss": 0.68219113, "learning_rate": 3.9504052616391124e-06, "loss": 0.70164716, "num_input_tokens_seen": 35321175, "step": 1645, "time_per_iteration": 2.6626670360565186 }, { "auxiliary_loss_clip": 0.01059795, "auxiliary_loss_mlp": 0.01047617, "balance_loss_clip": 1.02852345, "balance_loss_mlp": 1.04404068, "epoch": 0.09896287389147752, "flos": 59379372910080.0, "grad_norm": 0.8512889940087613, "language_loss": 0.60885167, "learning_rate": 3.950319031388119e-06, "loss": 0.62992585, "num_input_tokens_seen": 35381740, "step": 1646, "time_per_iteration": 4.752669095993042 }, { "auxiliary_loss_clip": 0.01147006, "auxiliary_loss_mlp": 0.0105976, "balance_loss_clip": 1.0574733, "balance_loss_mlp": 1.03464222, "epoch": 0.0990229971441455, "flos": 29642678530560.0, "grad_norm": 5.785751121573768, "language_loss": 0.73211443, "learning_rate": 3.950232727180833e-06, "loss": 0.7541821, "num_input_tokens_seen": 35403760, "step": 1647, "time_per_iteration": 2.783442974090576 }, { "auxiliary_loss_clip": 0.01161789, "auxiliary_loss_mlp": 0.01066314, "balance_loss_clip": 1.06016421, "balance_loss_mlp": 1.04445136, "epoch": 0.09908312039681347, "flos": 21834873152640.0, "grad_norm": 1.828428298130997, "language_loss": 0.84094375, "learning_rate": 3.950146349020525e-06, "loss": 0.86322474, "num_input_tokens_seen": 35424050, "step": 1648, "time_per_iteration": 2.709559679031372 }, { "auxiliary_loss_clip": 0.01065954, "auxiliary_loss_mlp": 0.01020799, "balance_loss_clip": 1.02565169, "balance_loss_mlp": 1.01722264, "epoch": 0.09914324364948143, "flos": 57564304807680.0, "grad_norm": 0.7317434537206132, "language_loss": 0.55672908, "learning_rate": 3.950059896910473e-06, "loss": 0.5775966, "num_input_tokens_seen": 35481690, "step": 1649, "time_per_iteration": 3.0944156646728516 }, { "auxiliary_loss_clip": 0.0117133, "auxiliary_loss_mlp": 0.01049543, "balance_loss_clip": 1.05603158, "balance_loss_mlp": 1.02723897, "epoch": 0.09920336690214941, "flos": 34123934476800.0, "grad_norm": 2.195431109372502, "language_loss": 0.8975327, "learning_rate": 3.949973370853954e-06, "loss": 0.91974139, "num_input_tokens_seen": 35498635, "step": 1650, "time_per_iteration": 2.7438554763793945 }, { "auxiliary_loss_clip": 0.01033978, "auxiliary_loss_mlp": 0.00758727, "balance_loss_clip": 1.02943921, "balance_loss_mlp": 0.9997822, "epoch": 0.09926349015481738, "flos": 71216428464000.0, "grad_norm": 0.8036050505402587, "language_loss": 0.63734978, "learning_rate": 3.94988677085425e-06, "loss": 0.65527683, "num_input_tokens_seen": 35565720, "step": 1651, "time_per_iteration": 3.40269136428833 }, { "auxiliary_loss_clip": 0.01170347, "auxiliary_loss_mlp": 0.01062486, "balance_loss_clip": 1.05790281, "balance_loss_mlp": 1.03842974, "epoch": 0.09932361340748534, "flos": 23148700917120.0, "grad_norm": 1.9744130417114842, "language_loss": 0.88115525, "learning_rate": 3.949800096914643e-06, "loss": 0.90348363, "num_input_tokens_seen": 35586000, "step": 1652, "time_per_iteration": 2.6695117950439453 }, { "auxiliary_loss_clip": 0.0116773, "auxiliary_loss_mlp": 0.01062073, "balance_loss_clip": 1.06095552, "balance_loss_mlp": 1.03895831, "epoch": 0.09938373666015332, "flos": 19828651847040.0, "grad_norm": 2.166773052437996, "language_loss": 0.81789082, "learning_rate": 3.949713349038422e-06, "loss": 0.84018886, "num_input_tokens_seen": 35604355, "step": 1653, "time_per_iteration": 2.7136831283569336 }, { "auxiliary_loss_clip": 0.01173152, "auxiliary_loss_mlp": 0.00780466, "balance_loss_clip": 1.05683279, "balance_loss_mlp": 1.00016594, "epoch": 0.09944385991282129, "flos": 22090664880000.0, "grad_norm": 1.662037391605293, "language_loss": 0.79489207, "learning_rate": 3.949626527228875e-06, "loss": 0.81442821, "num_input_tokens_seen": 35625495, "step": 1654, "time_per_iteration": 2.645875930786133 }, { "auxiliary_loss_clip": 0.01187918, "auxiliary_loss_mlp": 0.01056849, "balance_loss_clip": 1.06405056, "balance_loss_mlp": 1.03561759, "epoch": 0.09950398316548925, "flos": 19828867328640.0, "grad_norm": 1.7263610037420916, "language_loss": 0.81038272, "learning_rate": 3.949539631489295e-06, "loss": 0.83283037, "num_input_tokens_seen": 35645030, "step": 1655, "time_per_iteration": 2.630404233932495 }, { "auxiliary_loss_clip": 0.01181205, "auxiliary_loss_mlp": 0.01055977, "balance_loss_clip": 1.05679035, "balance_loss_mlp": 1.03294599, "epoch": 0.09956410641815722, "flos": 25003701964800.0, "grad_norm": 2.426795421082641, "language_loss": 0.80429518, "learning_rate": 3.9494526618229765e-06, "loss": 0.82666701, "num_input_tokens_seen": 35664305, "step": 1656, "time_per_iteration": 2.6283950805664062 }, { "auxiliary_loss_clip": 0.01170003, "auxiliary_loss_mlp": 0.01061881, "balance_loss_clip": 1.05787742, "balance_loss_mlp": 1.03870714, "epoch": 0.0996242296708252, "flos": 19317714837120.0, "grad_norm": 1.4960238412267362, "language_loss": 0.89040691, "learning_rate": 3.949365618233217e-06, "loss": 0.91272575, "num_input_tokens_seen": 35684060, "step": 1657, "time_per_iteration": 2.653674602508545 }, { "auxiliary_loss_clip": 0.01165842, "auxiliary_loss_mlp": 0.01057352, "balance_loss_clip": 1.05830753, "balance_loss_mlp": 1.0329144, "epoch": 0.09968435292349316, "flos": 21871609787520.0, "grad_norm": 2.1866084372248062, "language_loss": 0.84684521, "learning_rate": 3.9492785007233195e-06, "loss": 0.86907715, "num_input_tokens_seen": 35703250, "step": 1658, "time_per_iteration": 2.6897473335266113 }, { "auxiliary_loss_clip": 0.01069806, "auxiliary_loss_mlp": 0.01015844, "balance_loss_clip": 1.02042234, "balance_loss_mlp": 1.01292348, "epoch": 0.09974447617616113, "flos": 65384533313280.0, "grad_norm": 0.9123227767672076, "language_loss": 0.60828507, "learning_rate": 3.949191309296585e-06, "loss": 0.62914157, "num_input_tokens_seen": 35762165, "step": 1659, "time_per_iteration": 3.273890495300293 }, { "auxiliary_loss_clip": 0.01152432, "auxiliary_loss_mlp": 0.01051829, "balance_loss_clip": 1.05082798, "balance_loss_mlp": 1.02814245, "epoch": 0.0998045994288291, "flos": 23659817495040.0, "grad_norm": 1.9344290476513741, "language_loss": 0.84892076, "learning_rate": 3.949104043956321e-06, "loss": 0.87096334, "num_input_tokens_seen": 35781520, "step": 1660, "time_per_iteration": 2.788018226623535 }, { "auxiliary_loss_clip": 0.01149163, "auxiliary_loss_mlp": 0.01060092, "balance_loss_clip": 1.05374026, "balance_loss_mlp": 1.03514171, "epoch": 0.09986472268149707, "flos": 19609704495360.0, "grad_norm": 1.9493882663610318, "language_loss": 0.80024737, "learning_rate": 3.949016704705836e-06, "loss": 0.82234001, "num_input_tokens_seen": 35799565, "step": 1661, "time_per_iteration": 2.6537399291992188 }, { "auxiliary_loss_clip": 0.01172787, "auxiliary_loss_mlp": 0.01055532, "balance_loss_clip": 1.05715156, "balance_loss_mlp": 1.03153503, "epoch": 0.09992484593416504, "flos": 26213317395840.0, "grad_norm": 2.0152235709188377, "language_loss": 0.83560598, "learning_rate": 3.948929291548443e-06, "loss": 0.85788912, "num_input_tokens_seen": 35821085, "step": 1662, "time_per_iteration": 2.753807783126831 }, { "auxiliary_loss_clip": 0.01154838, "auxiliary_loss_mlp": 0.01061466, "balance_loss_clip": 1.05079484, "balance_loss_mlp": 1.03616929, "epoch": 0.09998496918683301, "flos": 17493632421120.0, "grad_norm": 1.9355779644050557, "language_loss": 0.88865256, "learning_rate": 3.9488418044874546e-06, "loss": 0.91081554, "num_input_tokens_seen": 35839840, "step": 1663, "time_per_iteration": 2.6829047203063965 }, { "auxiliary_loss_clip": 0.0118246, "auxiliary_loss_mlp": 0.01061692, "balance_loss_clip": 1.06228638, "balance_loss_mlp": 1.03825521, "epoch": 0.10004509243950098, "flos": 22784925928320.0, "grad_norm": 1.7925330820671084, "language_loss": 0.70140731, "learning_rate": 3.948754243526191e-06, "loss": 0.72384882, "num_input_tokens_seen": 35861545, "step": 1664, "time_per_iteration": 2.809300184249878 }, { "auxiliary_loss_clip": 0.01142878, "auxiliary_loss_mlp": 0.01055306, "balance_loss_clip": 1.05475903, "balance_loss_mlp": 1.03312087, "epoch": 0.10010521569216894, "flos": 16253385667200.0, "grad_norm": 2.4978474602303895, "language_loss": 0.78981555, "learning_rate": 3.94866660866797e-06, "loss": 0.81179744, "num_input_tokens_seen": 35878295, "step": 1665, "time_per_iteration": 2.7010488510131836 }, { "auxiliary_loss_clip": 0.01175861, "auxiliary_loss_mlp": 0.01070341, "balance_loss_clip": 1.06286561, "balance_loss_mlp": 1.04742861, "epoch": 0.10016533894483691, "flos": 23402589223680.0, "grad_norm": 3.1438625724360945, "language_loss": 0.70054829, "learning_rate": 3.9485788999161165e-06, "loss": 0.7230103, "num_input_tokens_seen": 35898990, "step": 1666, "time_per_iteration": 2.689879894256592 }, { "auxiliary_loss_clip": 0.01110848, "auxiliary_loss_mlp": 0.01074593, "balance_loss_clip": 1.05082703, "balance_loss_mlp": 1.04946339, "epoch": 0.10022546219750489, "flos": 19354164163200.0, "grad_norm": 1.7583449522195267, "language_loss": 0.78647351, "learning_rate": 3.948491117273956e-06, "loss": 0.80832791, "num_input_tokens_seen": 35916225, "step": 1667, "time_per_iteration": 2.8973352909088135 }, { "auxiliary_loss_clip": 0.01153352, "auxiliary_loss_mlp": 0.01062819, "balance_loss_clip": 1.05452693, "balance_loss_mlp": 1.03752255, "epoch": 0.10028558545017285, "flos": 27085766837760.0, "grad_norm": 2.4011089045072187, "language_loss": 0.77357388, "learning_rate": 3.948403260744817e-06, "loss": 0.7957356, "num_input_tokens_seen": 35934630, "step": 1668, "time_per_iteration": 3.2600321769714355 }, { "auxiliary_loss_clip": 0.01184879, "auxiliary_loss_mlp": 0.01059367, "balance_loss_clip": 1.05833495, "balance_loss_mlp": 1.03523922, "epoch": 0.10034570870284082, "flos": 25847136195840.0, "grad_norm": 1.7407668002390366, "language_loss": 0.77520061, "learning_rate": 3.948315330332031e-06, "loss": 0.79764307, "num_input_tokens_seen": 35953855, "step": 1669, "time_per_iteration": 2.6899471282958984 }, { "auxiliary_loss_clip": 0.0118887, "auxiliary_loss_mlp": 0.01067842, "balance_loss_clip": 1.05948365, "balance_loss_mlp": 1.04416728, "epoch": 0.1004058319555088, "flos": 26249587153920.0, "grad_norm": 5.441134829238958, "language_loss": 0.85160148, "learning_rate": 3.948227326038933e-06, "loss": 0.87416857, "num_input_tokens_seen": 35974555, "step": 1670, "time_per_iteration": 2.616867780685425 }, { "auxiliary_loss_clip": 0.011763, "auxiliary_loss_mlp": 0.01055607, "balance_loss_clip": 1.05584121, "balance_loss_mlp": 1.03354108, "epoch": 0.10046595520817676, "flos": 25374480105600.0, "grad_norm": 1.4849262119454174, "language_loss": 0.76836258, "learning_rate": 3.9481392478688586e-06, "loss": 0.79068166, "num_input_tokens_seen": 35996830, "step": 1671, "time_per_iteration": 2.658254384994507 }, { "auxiliary_loss_clip": 0.01061447, "auxiliary_loss_mlp": 0.01017561, "balance_loss_clip": 1.02178144, "balance_loss_mlp": 1.01454473, "epoch": 0.10052607846084473, "flos": 67461821677440.0, "grad_norm": 0.7781454358921105, "language_loss": 0.60718858, "learning_rate": 3.948051095825149e-06, "loss": 0.62797856, "num_input_tokens_seen": 36054465, "step": 1672, "time_per_iteration": 3.1269097328186035 }, { "auxiliary_loss_clip": 0.01143177, "auxiliary_loss_mlp": 0.01063346, "balance_loss_clip": 1.05112922, "balance_loss_mlp": 1.04055333, "epoch": 0.10058620171351271, "flos": 21360493209600.0, "grad_norm": 2.433278134910662, "language_loss": 0.7711426, "learning_rate": 3.947962869911147e-06, "loss": 0.79320776, "num_input_tokens_seen": 36073480, "step": 1673, "time_per_iteration": 2.6931638717651367 }, { "auxiliary_loss_clip": 0.01132094, "auxiliary_loss_mlp": 0.01056611, "balance_loss_clip": 1.04989302, "balance_loss_mlp": 1.03262639, "epoch": 0.10064632496618067, "flos": 16800125558400.0, "grad_norm": 2.074683072839241, "language_loss": 0.73173523, "learning_rate": 3.947874570130197e-06, "loss": 0.75362229, "num_input_tokens_seen": 36091830, "step": 1674, "time_per_iteration": 2.7188127040863037 }, { "auxiliary_loss_clip": 0.01172389, "auxiliary_loss_mlp": 0.00779533, "balance_loss_clip": 1.0556165, "balance_loss_mlp": 1.00024796, "epoch": 0.10070644821884864, "flos": 23624445576960.0, "grad_norm": 2.1982379565146872, "language_loss": 0.79456973, "learning_rate": 3.947786196485649e-06, "loss": 0.81408894, "num_input_tokens_seen": 36111400, "step": 1675, "time_per_iteration": 2.712090253829956 }, { "auxiliary_loss_clip": 0.01182659, "auxiliary_loss_mlp": 0.01063327, "balance_loss_clip": 1.05801332, "balance_loss_mlp": 1.04239404, "epoch": 0.1007665714715166, "flos": 24462564595200.0, "grad_norm": 2.408955682155161, "language_loss": 0.8120935, "learning_rate": 3.947697748980853e-06, "loss": 0.83455336, "num_input_tokens_seen": 36129345, "step": 1676, "time_per_iteration": 2.685472249984741 }, { "auxiliary_loss_clip": 0.01175397, "auxiliary_loss_mlp": 0.01057105, "balance_loss_clip": 1.05950332, "balance_loss_mlp": 1.03546858, "epoch": 0.10082669472418458, "flos": 16799119977600.0, "grad_norm": 2.008035557658629, "language_loss": 0.86132157, "learning_rate": 3.947609227619163e-06, "loss": 0.88364655, "num_input_tokens_seen": 36146255, "step": 1677, "time_per_iteration": 2.6589157581329346 }, { "auxiliary_loss_clip": 0.01162997, "auxiliary_loss_mlp": 0.010508, "balance_loss_clip": 1.05363441, "balance_loss_mlp": 1.02896047, "epoch": 0.10088681797685255, "flos": 13553513844480.0, "grad_norm": 2.160847391025828, "language_loss": 0.86006588, "learning_rate": 3.947520632403936e-06, "loss": 0.88220382, "num_input_tokens_seen": 36164050, "step": 1678, "time_per_iteration": 2.694347858428955 }, { "auxiliary_loss_clip": 0.0116292, "auxiliary_loss_mlp": 0.01056376, "balance_loss_clip": 1.0587275, "balance_loss_mlp": 1.03406048, "epoch": 0.10094694122952051, "flos": 25265706744960.0, "grad_norm": 12.700254532531051, "language_loss": 0.89978886, "learning_rate": 3.947431963338532e-06, "loss": 0.92198181, "num_input_tokens_seen": 36183530, "step": 1679, "time_per_iteration": 2.6741397380828857 }, { "auxiliary_loss_clip": 0.01071086, "auxiliary_loss_mlp": 0.0101685, "balance_loss_clip": 1.02328789, "balance_loss_mlp": 1.01360798, "epoch": 0.10100706448218849, "flos": 69854299885440.0, "grad_norm": 0.7882499243548835, "language_loss": 0.52985126, "learning_rate": 3.947343220426312e-06, "loss": 0.55073065, "num_input_tokens_seen": 36248550, "step": 1680, "time_per_iteration": 3.169893503189087 }, { "auxiliary_loss_clip": 0.01185252, "auxiliary_loss_mlp": 0.00779951, "balance_loss_clip": 1.06022644, "balance_loss_mlp": 1.00017488, "epoch": 0.10106718773485646, "flos": 20007163463040.0, "grad_norm": 1.6642182724084642, "language_loss": 0.76869059, "learning_rate": 3.947254403670641e-06, "loss": 0.7883426, "num_input_tokens_seen": 36266065, "step": 1681, "time_per_iteration": 4.146950006484985 }, { "auxiliary_loss_clip": 0.01156046, "auxiliary_loss_mlp": 0.01059972, "balance_loss_clip": 1.0539515, "balance_loss_mlp": 1.03469992, "epoch": 0.10112731098752442, "flos": 13479825093120.0, "grad_norm": 2.3884003317971225, "language_loss": 0.93957508, "learning_rate": 3.947165513074889e-06, "loss": 0.96173531, "num_input_tokens_seen": 36280960, "step": 1682, "time_per_iteration": 4.220505237579346 }, { "auxiliary_loss_clip": 0.01173183, "auxiliary_loss_mlp": 0.01053261, "balance_loss_clip": 1.05487084, "balance_loss_mlp": 1.03133821, "epoch": 0.1011874342401924, "flos": 18515901490560.0, "grad_norm": 3.5300660189263917, "language_loss": 0.87618893, "learning_rate": 3.947076548642425e-06, "loss": 0.89845335, "num_input_tokens_seen": 36299010, "step": 1683, "time_per_iteration": 2.635636329650879 }, { "auxiliary_loss_clip": 0.01128888, "auxiliary_loss_mlp": 0.01063089, "balance_loss_clip": 1.04814756, "balance_loss_mlp": 1.04008126, "epoch": 0.10124755749286037, "flos": 20702861055360.0, "grad_norm": 2.3337760241024923, "language_loss": 0.74566805, "learning_rate": 3.946987510376624e-06, "loss": 0.76758784, "num_input_tokens_seen": 36318400, "step": 1684, "time_per_iteration": 4.417364835739136 }, { "auxiliary_loss_clip": 0.01053031, "auxiliary_loss_mlp": 0.0101182, "balance_loss_clip": 1.02547038, "balance_loss_mlp": 1.00853014, "epoch": 0.10130768074552833, "flos": 56109456247680.0, "grad_norm": 0.7564631726021327, "language_loss": 0.61085057, "learning_rate": 3.9468983982808615e-06, "loss": 0.63149905, "num_input_tokens_seen": 36381815, "step": 1685, "time_per_iteration": 4.87179970741272 }, { "auxiliary_loss_clip": 0.01157045, "auxiliary_loss_mlp": 0.01056064, "balance_loss_clip": 1.05233479, "balance_loss_mlp": 1.0341655, "epoch": 0.1013678039981963, "flos": 33402346156800.0, "grad_norm": 4.297801792672815, "language_loss": 0.61381406, "learning_rate": 3.946809212358516e-06, "loss": 0.6359452, "num_input_tokens_seen": 36404320, "step": 1686, "time_per_iteration": 2.8289108276367188 }, { "auxiliary_loss_clip": 0.01144631, "auxiliary_loss_mlp": 0.01059888, "balance_loss_clip": 1.05645001, "balance_loss_mlp": 1.03678524, "epoch": 0.10142792725086427, "flos": 31905338008320.0, "grad_norm": 2.21923850158845, "language_loss": 0.81216162, "learning_rate": 3.946719952612972e-06, "loss": 0.83420682, "num_input_tokens_seen": 36427510, "step": 1687, "time_per_iteration": 2.947535276412964 }, { "auxiliary_loss_clip": 0.0117612, "auxiliary_loss_mlp": 0.0105614, "balance_loss_clip": 1.05933213, "balance_loss_mlp": 1.03403926, "epoch": 0.10148805050353224, "flos": 28475905046400.0, "grad_norm": 1.7955898786084035, "language_loss": 0.71943259, "learning_rate": 3.94663061904761e-06, "loss": 0.74175525, "num_input_tokens_seen": 36448230, "step": 1688, "time_per_iteration": 2.693249225616455 }, { "auxiliary_loss_clip": 0.01151953, "auxiliary_loss_mlp": 0.01063362, "balance_loss_clip": 1.05288756, "balance_loss_mlp": 1.04079556, "epoch": 0.1015481737562002, "flos": 25148888737920.0, "grad_norm": 2.636795901714516, "language_loss": 0.86876953, "learning_rate": 3.94654121166582e-06, "loss": 0.89092261, "num_input_tokens_seen": 36464395, "step": 1689, "time_per_iteration": 2.677992820739746 }, { "auxiliary_loss_clip": 0.01172188, "auxiliary_loss_mlp": 0.01057982, "balance_loss_clip": 1.05476904, "balance_loss_mlp": 1.0378834, "epoch": 0.10160829700886818, "flos": 30882781630080.0, "grad_norm": 2.2211105929909696, "language_loss": 0.88170946, "learning_rate": 3.946451730470993e-06, "loss": 0.90401113, "num_input_tokens_seen": 36486475, "step": 1690, "time_per_iteration": 2.707209348678589 }, { "auxiliary_loss_clip": 0.01158767, "auxiliary_loss_mlp": 0.01052386, "balance_loss_clip": 1.05507553, "balance_loss_mlp": 1.02973664, "epoch": 0.10166842026153615, "flos": 20412020632320.0, "grad_norm": 2.08291471600754, "language_loss": 0.83348423, "learning_rate": 3.946362175466521e-06, "loss": 0.85559577, "num_input_tokens_seen": 36505310, "step": 1691, "time_per_iteration": 2.6521170139312744 }, { "auxiliary_loss_clip": 0.01162159, "auxiliary_loss_mlp": 0.01051716, "balance_loss_clip": 1.05550599, "balance_loss_mlp": 1.03016281, "epoch": 0.10172854351420411, "flos": 33476968661760.0, "grad_norm": 1.704519528530946, "language_loss": 0.66773653, "learning_rate": 3.946272546655801e-06, "loss": 0.68987525, "num_input_tokens_seen": 36529820, "step": 1692, "time_per_iteration": 2.799353837966919 }, { "auxiliary_loss_clip": 0.01144502, "auxiliary_loss_mlp": 0.0107473, "balance_loss_clip": 1.05057836, "balance_loss_mlp": 1.05258095, "epoch": 0.1017886667668721, "flos": 23550325862400.0, "grad_norm": 1.8345924563029705, "language_loss": 0.75939322, "learning_rate": 3.94618284404223e-06, "loss": 0.78158557, "num_input_tokens_seen": 36549000, "step": 1693, "time_per_iteration": 2.6711113452911377 }, { "auxiliary_loss_clip": 0.01132621, "auxiliary_loss_mlp": 0.01057162, "balance_loss_clip": 1.04893303, "balance_loss_mlp": 1.03289056, "epoch": 0.10184879001954006, "flos": 23296078419840.0, "grad_norm": 1.7027745569702395, "language_loss": 0.87503564, "learning_rate": 3.9460930676292105e-06, "loss": 0.89693356, "num_input_tokens_seen": 36567515, "step": 1694, "time_per_iteration": 2.749119520187378 }, { "auxiliary_loss_clip": 0.01130673, "auxiliary_loss_mlp": 0.01058451, "balance_loss_clip": 1.04954553, "balance_loss_mlp": 1.033095, "epoch": 0.10190891327220802, "flos": 18333116156160.0, "grad_norm": 1.7649462193878245, "language_loss": 0.79299057, "learning_rate": 3.946003217420147e-06, "loss": 0.8148818, "num_input_tokens_seen": 36586190, "step": 1695, "time_per_iteration": 2.839081048965454 }, { "auxiliary_loss_clip": 0.0112732, "auxiliary_loss_mlp": 0.01061103, "balance_loss_clip": 1.04818296, "balance_loss_mlp": 1.03772628, "epoch": 0.10196903652487599, "flos": 26465374108800.0, "grad_norm": 2.7190993931598446, "language_loss": 0.86494684, "learning_rate": 3.945913293418447e-06, "loss": 0.88683105, "num_input_tokens_seen": 36607495, "step": 1696, "time_per_iteration": 2.7802348136901855 }, { "auxiliary_loss_clip": 0.01168675, "auxiliary_loss_mlp": 0.01054661, "balance_loss_clip": 1.05711746, "balance_loss_mlp": 1.03315568, "epoch": 0.10202915977754397, "flos": 21869526798720.0, "grad_norm": 1.7889048836535288, "language_loss": 0.82350796, "learning_rate": 3.945823295627519e-06, "loss": 0.84574133, "num_input_tokens_seen": 36628555, "step": 1697, "time_per_iteration": 2.667962074279785 }, { "auxiliary_loss_clip": 0.01184333, "auxiliary_loss_mlp": 0.01055548, "balance_loss_clip": 1.05680871, "balance_loss_mlp": 1.033149, "epoch": 0.10208928303021193, "flos": 22309755886080.0, "grad_norm": 2.0464291543972006, "language_loss": 0.81198204, "learning_rate": 3.9457332240507775e-06, "loss": 0.83438087, "num_input_tokens_seen": 36646250, "step": 1698, "time_per_iteration": 2.6484432220458984 }, { "auxiliary_loss_clip": 0.01150498, "auxiliary_loss_mlp": 0.01053546, "balance_loss_clip": 1.05696845, "balance_loss_mlp": 1.03226686, "epoch": 0.1021494062828799, "flos": 22125569921280.0, "grad_norm": 2.3020250981163226, "language_loss": 0.75612724, "learning_rate": 3.945643078691637e-06, "loss": 0.77816761, "num_input_tokens_seen": 36666675, "step": 1699, "time_per_iteration": 2.8040614128112793 }, { "auxiliary_loss_clip": 0.01162088, "auxiliary_loss_mlp": 0.01050379, "balance_loss_clip": 1.06041551, "balance_loss_mlp": 1.02827764, "epoch": 0.10220952953554788, "flos": 19646728439040.0, "grad_norm": 1.6839869206777538, "language_loss": 0.80395639, "learning_rate": 3.945552859553516e-06, "loss": 0.8260811, "num_input_tokens_seen": 36685225, "step": 1700, "time_per_iteration": 2.6701290607452393 }, { "auxiliary_loss_clip": 0.0117076, "auxiliary_loss_mlp": 0.0104804, "balance_loss_clip": 1.05714083, "balance_loss_mlp": 1.02653444, "epoch": 0.10226965278821584, "flos": 29787290686080.0, "grad_norm": 2.102621975458346, "language_loss": 0.76877582, "learning_rate": 3.945462566639836e-06, "loss": 0.79096377, "num_input_tokens_seen": 36705985, "step": 1701, "time_per_iteration": 2.748201847076416 }, { "auxiliary_loss_clip": 0.01182259, "auxiliary_loss_mlp": 0.01050364, "balance_loss_clip": 1.06157088, "balance_loss_mlp": 1.02852523, "epoch": 0.10232977604088381, "flos": 27016818681600.0, "grad_norm": 2.1099726588763965, "language_loss": 0.77922845, "learning_rate": 3.945372199954019e-06, "loss": 0.80155474, "num_input_tokens_seen": 36725815, "step": 1702, "time_per_iteration": 2.6703274250030518 }, { "auxiliary_loss_clip": 0.01156323, "auxiliary_loss_mlp": 0.01052524, "balance_loss_clip": 1.05596721, "balance_loss_mlp": 1.03126872, "epoch": 0.10238989929355179, "flos": 20777519473920.0, "grad_norm": 2.2326457826946293, "language_loss": 0.94093609, "learning_rate": 3.945281759499494e-06, "loss": 0.96302462, "num_input_tokens_seen": 36742345, "step": 1703, "time_per_iteration": 2.6712698936462402 }, { "auxiliary_loss_clip": 0.01034483, "auxiliary_loss_mlp": 0.01037784, "balance_loss_clip": 1.02765131, "balance_loss_mlp": 1.03315914, "epoch": 0.10245002254621975, "flos": 57698322451200.0, "grad_norm": 0.8815387598011586, "language_loss": 0.55096036, "learning_rate": 3.94519124527969e-06, "loss": 0.57168299, "num_input_tokens_seen": 36798775, "step": 1704, "time_per_iteration": 3.2863855361938477 }, { "auxiliary_loss_clip": 0.01186822, "auxiliary_loss_mlp": 0.01053701, "balance_loss_clip": 1.06026638, "balance_loss_mlp": 1.03088403, "epoch": 0.10251014579888772, "flos": 16800125558400.0, "grad_norm": 2.051901555713709, "language_loss": 0.84025991, "learning_rate": 3.945100657298039e-06, "loss": 0.86266518, "num_input_tokens_seen": 36816295, "step": 1705, "time_per_iteration": 2.8991851806640625 }, { "auxiliary_loss_clip": 0.01045354, "auxiliary_loss_mlp": 0.01018361, "balance_loss_clip": 1.02622223, "balance_loss_mlp": 1.01526153, "epoch": 0.1025702690515557, "flos": 68565500922240.0, "grad_norm": 0.7692746082941451, "language_loss": 0.60408181, "learning_rate": 3.9450099955579765e-06, "loss": 0.62471896, "num_input_tokens_seen": 36882030, "step": 1706, "time_per_iteration": 3.2174558639526367 }, { "auxiliary_loss_clip": 0.01149922, "auxiliary_loss_mlp": 0.01051211, "balance_loss_clip": 1.05388391, "balance_loss_mlp": 1.02812052, "epoch": 0.10263039230422366, "flos": 14866623336960.0, "grad_norm": 2.201796189576969, "language_loss": 0.85937822, "learning_rate": 3.94491926006294e-06, "loss": 0.88138962, "num_input_tokens_seen": 36899245, "step": 1707, "time_per_iteration": 2.689208507537842 }, { "auxiliary_loss_clip": 0.01169165, "auxiliary_loss_mlp": 0.0105297, "balance_loss_clip": 1.05941081, "balance_loss_mlp": 1.03114319, "epoch": 0.10269051555689163, "flos": 25337599816320.0, "grad_norm": 1.471109036018689, "language_loss": 0.73299325, "learning_rate": 3.944828450816369e-06, "loss": 0.75521457, "num_input_tokens_seen": 36920950, "step": 1708, "time_per_iteration": 2.679760456085205 }, { "auxiliary_loss_clip": 0.01155833, "auxiliary_loss_mlp": 0.00780571, "balance_loss_clip": 1.05718231, "balance_loss_mlp": 1.00042295, "epoch": 0.10275063880955959, "flos": 21068826773760.0, "grad_norm": 1.7051644476897239, "language_loss": 0.91616452, "learning_rate": 3.944737567821709e-06, "loss": 0.93552846, "num_input_tokens_seen": 36938900, "step": 1709, "time_per_iteration": 2.6754679679870605 }, { "auxiliary_loss_clip": 0.01124911, "auxiliary_loss_mlp": 0.01057008, "balance_loss_clip": 1.05144072, "balance_loss_mlp": 1.0343945, "epoch": 0.10281076206222757, "flos": 30366780802560.0, "grad_norm": 2.1056252966717275, "language_loss": 0.88004494, "learning_rate": 3.944646611082406e-06, "loss": 0.90186411, "num_input_tokens_seen": 36957010, "step": 1710, "time_per_iteration": 2.708723306655884 }, { "auxiliary_loss_clip": 0.01171004, "auxiliary_loss_mlp": 0.0105967, "balance_loss_clip": 1.05658317, "balance_loss_mlp": 1.036973, "epoch": 0.10287088531489554, "flos": 22418313765120.0, "grad_norm": 1.7046493271202992, "language_loss": 0.79370153, "learning_rate": 3.944555580601908e-06, "loss": 0.81600821, "num_input_tokens_seen": 36977690, "step": 1711, "time_per_iteration": 2.631908416748047 }, { "auxiliary_loss_clip": 0.01156003, "auxiliary_loss_mlp": 0.01055126, "balance_loss_clip": 1.05841637, "balance_loss_mlp": 1.03189242, "epoch": 0.1029310085675635, "flos": 25115994858240.0, "grad_norm": 3.2168061349371135, "language_loss": 0.73666596, "learning_rate": 3.944464476383668e-06, "loss": 0.75877726, "num_input_tokens_seen": 36997300, "step": 1712, "time_per_iteration": 2.7107467651367188 }, { "auxiliary_loss_clip": 0.01133407, "auxiliary_loss_mlp": 0.01056055, "balance_loss_clip": 1.05496907, "balance_loss_mlp": 1.03334546, "epoch": 0.10299113182023148, "flos": 19865639877120.0, "grad_norm": 1.974447377126898, "language_loss": 0.87049067, "learning_rate": 3.94437329843114e-06, "loss": 0.89238536, "num_input_tokens_seen": 37016110, "step": 1713, "time_per_iteration": 2.6532411575317383 }, { "auxiliary_loss_clip": 0.0116832, "auxiliary_loss_mlp": 0.01060237, "balance_loss_clip": 1.05669498, "balance_loss_mlp": 1.03877962, "epoch": 0.10305125507289944, "flos": 20447608032000.0, "grad_norm": 1.57388574383124, "language_loss": 0.72406238, "learning_rate": 3.944282046747782e-06, "loss": 0.74634796, "num_input_tokens_seen": 37036405, "step": 1714, "time_per_iteration": 2.5987610816955566 }, { "auxiliary_loss_clip": 0.01174482, "auxiliary_loss_mlp": 0.01063165, "balance_loss_clip": 1.05715692, "balance_loss_mlp": 1.03934693, "epoch": 0.10311137832556741, "flos": 26250772302720.0, "grad_norm": 2.1959530175190434, "language_loss": 0.91065919, "learning_rate": 3.944190721337053e-06, "loss": 0.93303567, "num_input_tokens_seen": 37057580, "step": 1715, "time_per_iteration": 2.743833303451538 }, { "auxiliary_loss_clip": 0.01170297, "auxiliary_loss_mlp": 0.01054891, "balance_loss_clip": 1.05448914, "balance_loss_mlp": 1.03305221, "epoch": 0.10317150157823539, "flos": 35298932175360.0, "grad_norm": 1.8741123562687005, "language_loss": 0.75969976, "learning_rate": 3.944099322202418e-06, "loss": 0.78195167, "num_input_tokens_seen": 37079120, "step": 1716, "time_per_iteration": 2.748903274536133 }, { "auxiliary_loss_clip": 0.01162664, "auxiliary_loss_mlp": 0.01061895, "balance_loss_clip": 1.05617428, "balance_loss_mlp": 1.03804111, "epoch": 0.10323162483090335, "flos": 25739943033600.0, "grad_norm": 3.178190042364093, "language_loss": 0.85308528, "learning_rate": 3.944007849347342e-06, "loss": 0.87533092, "num_input_tokens_seen": 37099710, "step": 1717, "time_per_iteration": 2.690772533416748 }, { "auxiliary_loss_clip": 0.01127019, "auxiliary_loss_mlp": 0.01067935, "balance_loss_clip": 1.05048633, "balance_loss_mlp": 1.04436755, "epoch": 0.10329174808357132, "flos": 16289870906880.0, "grad_norm": 1.8474438265561113, "language_loss": 0.82945001, "learning_rate": 3.943916302775292e-06, "loss": 0.85139954, "num_input_tokens_seen": 37117775, "step": 1718, "time_per_iteration": 2.7029476165771484 }, { "auxiliary_loss_clip": 0.01171184, "auxiliary_loss_mlp": 0.01049869, "balance_loss_clip": 1.05912328, "balance_loss_mlp": 1.02701616, "epoch": 0.10335187133623928, "flos": 36687166963200.0, "grad_norm": 1.7728224248964342, "language_loss": 0.73396438, "learning_rate": 3.943824682489742e-06, "loss": 0.75617492, "num_input_tokens_seen": 37140280, "step": 1719, "time_per_iteration": 2.7653820514678955 }, { "auxiliary_loss_clip": 0.01168859, "auxiliary_loss_mlp": 0.01048444, "balance_loss_clip": 1.05861163, "balance_loss_mlp": 1.02786827, "epoch": 0.10341199458890726, "flos": 14975648092800.0, "grad_norm": 1.7819459058763836, "language_loss": 0.92692196, "learning_rate": 3.9437329884941665e-06, "loss": 0.94909501, "num_input_tokens_seen": 37158350, "step": 1720, "time_per_iteration": 4.1962480545043945 }, { "auxiliary_loss_clip": 0.01139894, "auxiliary_loss_mlp": 0.01051033, "balance_loss_clip": 1.05092323, "balance_loss_mlp": 1.02827597, "epoch": 0.10347211784157523, "flos": 21031587348480.0, "grad_norm": 1.6861044154168399, "language_loss": 0.79497123, "learning_rate": 3.943641220792039e-06, "loss": 0.81688046, "num_input_tokens_seen": 37177120, "step": 1721, "time_per_iteration": 4.524151802062988 }, { "auxiliary_loss_clip": 0.01130482, "auxiliary_loss_mlp": 0.01067754, "balance_loss_clip": 1.05380797, "balance_loss_mlp": 1.04109859, "epoch": 0.1035322410942432, "flos": 19792094780160.0, "grad_norm": 1.951940775381607, "language_loss": 0.80707669, "learning_rate": 3.9435493793868434e-06, "loss": 0.829059, "num_input_tokens_seen": 37195895, "step": 1722, "time_per_iteration": 2.7972562313079834 }, { "auxiliary_loss_clip": 0.01059018, "auxiliary_loss_mlp": 0.01038991, "balance_loss_clip": 1.02668202, "balance_loss_mlp": 1.03536737, "epoch": 0.10359236434691117, "flos": 52698874947840.0, "grad_norm": 0.9413879826908518, "language_loss": 0.67161834, "learning_rate": 3.943457464282059e-06, "loss": 0.69259846, "num_input_tokens_seen": 37247270, "step": 1723, "time_per_iteration": 4.899553060531616 }, { "auxiliary_loss_clip": 0.01169875, "auxiliary_loss_mlp": 0.01062977, "balance_loss_clip": 1.05482125, "balance_loss_mlp": 1.04193664, "epoch": 0.10365248759957914, "flos": 18405404277120.0, "grad_norm": 2.8641520576523116, "language_loss": 0.77715755, "learning_rate": 3.9433654754811745e-06, "loss": 0.7994861, "num_input_tokens_seen": 37265595, "step": 1724, "time_per_iteration": 2.7613437175750732 }, { "auxiliary_loss_clip": 0.01151829, "auxiliary_loss_mlp": 0.01069246, "balance_loss_clip": 1.05667496, "balance_loss_mlp": 1.04753852, "epoch": 0.1037126108522471, "flos": 47553555335040.0, "grad_norm": 2.6433978354033543, "language_loss": 0.74533165, "learning_rate": 3.943273412987676e-06, "loss": 0.76754242, "num_input_tokens_seen": 37286660, "step": 1725, "time_per_iteration": 4.557274580001831 }, { "auxiliary_loss_clip": 0.01137065, "auxiliary_loss_mlp": 0.01081067, "balance_loss_clip": 1.05264461, "balance_loss_mlp": 1.05832207, "epoch": 0.10377273410491508, "flos": 22816670572800.0, "grad_norm": 2.2241153649877865, "language_loss": 0.75043738, "learning_rate": 3.943181276805054e-06, "loss": 0.77261865, "num_input_tokens_seen": 37304915, "step": 1726, "time_per_iteration": 2.7098495960235596 }, { "auxiliary_loss_clip": 0.01150932, "auxiliary_loss_mlp": 0.0107864, "balance_loss_clip": 1.05345368, "balance_loss_mlp": 1.05610991, "epoch": 0.10383285735758305, "flos": 26138694890880.0, "grad_norm": 2.783771441956431, "language_loss": 0.73243797, "learning_rate": 3.9430890669368035e-06, "loss": 0.75473368, "num_input_tokens_seen": 37325265, "step": 1727, "time_per_iteration": 2.74774169921875 }, { "auxiliary_loss_clip": 0.01157922, "auxiliary_loss_mlp": 0.01068007, "balance_loss_clip": 1.05303776, "balance_loss_mlp": 1.04625082, "epoch": 0.10389298061025101, "flos": 17091791994240.0, "grad_norm": 2.172978726198527, "language_loss": 0.84373868, "learning_rate": 3.942996783386422e-06, "loss": 0.86599791, "num_input_tokens_seen": 37341650, "step": 1728, "time_per_iteration": 2.675724744796753 }, { "auxiliary_loss_clip": 0.01154897, "auxiliary_loss_mlp": 0.01060505, "balance_loss_clip": 1.0545603, "balance_loss_mlp": 1.0393219, "epoch": 0.10395310386291898, "flos": 20776513893120.0, "grad_norm": 2.1406499008555513, "language_loss": 0.70776087, "learning_rate": 3.942904426157406e-06, "loss": 0.7299149, "num_input_tokens_seen": 37360270, "step": 1729, "time_per_iteration": 2.6885008811950684 }, { "auxiliary_loss_clip": 0.01158623, "auxiliary_loss_mlp": 0.01068311, "balance_loss_clip": 1.05437422, "balance_loss_mlp": 1.04520774, "epoch": 0.10401322711558696, "flos": 12820540913280.0, "grad_norm": 2.4133379049648283, "language_loss": 0.81237471, "learning_rate": 3.9428119952532605e-06, "loss": 0.83464402, "num_input_tokens_seen": 37375225, "step": 1730, "time_per_iteration": 2.6659536361694336 }, { "auxiliary_loss_clip": 0.01085856, "auxiliary_loss_mlp": 0.01063394, "balance_loss_clip": 1.04733562, "balance_loss_mlp": 1.04314065, "epoch": 0.10407335036825492, "flos": 23184683366400.0, "grad_norm": 1.6634499611984725, "language_loss": 0.75829297, "learning_rate": 3.942719490677489e-06, "loss": 0.77978551, "num_input_tokens_seen": 37395165, "step": 1731, "time_per_iteration": 3.043125629425049 }, { "auxiliary_loss_clip": 0.01129913, "auxiliary_loss_mlp": 0.01065783, "balance_loss_clip": 1.0526607, "balance_loss_mlp": 1.04604149, "epoch": 0.10413347362092289, "flos": 26104184899200.0, "grad_norm": 1.8280179918091173, "language_loss": 0.8268069, "learning_rate": 3.9426269124336e-06, "loss": 0.84876388, "num_input_tokens_seen": 37414845, "step": 1732, "time_per_iteration": 2.96221661567688 }, { "auxiliary_loss_clip": 0.01141505, "auxiliary_loss_mlp": 0.01067805, "balance_loss_clip": 1.05805755, "balance_loss_mlp": 1.04852867, "epoch": 0.10419359687359087, "flos": 12641059630080.0, "grad_norm": 1.9919813178368582, "language_loss": 0.83320522, "learning_rate": 3.942534260525104e-06, "loss": 0.85529828, "num_input_tokens_seen": 37432490, "step": 1733, "time_per_iteration": 2.7364420890808105 }, { "auxiliary_loss_clip": 0.01153374, "auxiliary_loss_mlp": 0.0106675, "balance_loss_clip": 1.05592012, "balance_loss_mlp": 1.04654372, "epoch": 0.10425372012625883, "flos": 12125094716160.0, "grad_norm": 2.4441875881355597, "language_loss": 0.76683885, "learning_rate": 3.942441534955514e-06, "loss": 0.78904009, "num_input_tokens_seen": 37449435, "step": 1734, "time_per_iteration": 2.669623851776123 }, { "auxiliary_loss_clip": 0.0113597, "auxiliary_loss_mlp": 0.01052567, "balance_loss_clip": 1.05042601, "balance_loss_mlp": 1.03255177, "epoch": 0.1043138433789268, "flos": 25337563902720.0, "grad_norm": 1.6775801166329647, "language_loss": 0.74826896, "learning_rate": 3.9423487357283465e-06, "loss": 0.7701543, "num_input_tokens_seen": 37469105, "step": 1735, "time_per_iteration": 2.8477160930633545 }, { "auxiliary_loss_clip": 0.01167698, "auxiliary_loss_mlp": 0.01055716, "balance_loss_clip": 1.05678105, "balance_loss_mlp": 1.0344727, "epoch": 0.10437396663159478, "flos": 29167149352320.0, "grad_norm": 1.7228393064183538, "language_loss": 0.78835273, "learning_rate": 3.94225586284712e-06, "loss": 0.81058681, "num_input_tokens_seen": 37490540, "step": 1736, "time_per_iteration": 2.690453052520752 }, { "auxiliary_loss_clip": 0.0116734, "auxiliary_loss_mlp": 0.01064692, "balance_loss_clip": 1.05800533, "balance_loss_mlp": 1.04357982, "epoch": 0.10443408988426274, "flos": 25080946162560.0, "grad_norm": 1.8549131823334455, "language_loss": 0.7058785, "learning_rate": 3.942162916315356e-06, "loss": 0.72819883, "num_input_tokens_seen": 37511905, "step": 1737, "time_per_iteration": 2.6296744346618652 }, { "auxiliary_loss_clip": 0.01150138, "auxiliary_loss_mlp": 0.01059407, "balance_loss_clip": 1.04806042, "balance_loss_mlp": 1.03600669, "epoch": 0.1044942131369307, "flos": 26759662237440.0, "grad_norm": 2.415613377802324, "language_loss": 0.81624997, "learning_rate": 3.942069896136581e-06, "loss": 0.83834541, "num_input_tokens_seen": 37533635, "step": 1738, "time_per_iteration": 2.7436723709106445 }, { "auxiliary_loss_clip": 0.01181471, "auxiliary_loss_mlp": 0.01062035, "balance_loss_clip": 1.05579174, "balance_loss_mlp": 1.03950453, "epoch": 0.10455433638959867, "flos": 18442571875200.0, "grad_norm": 2.1004590024567897, "language_loss": 0.75419426, "learning_rate": 3.9419768023143196e-06, "loss": 0.77662933, "num_input_tokens_seen": 37552035, "step": 1739, "time_per_iteration": 2.585538148880005 }, { "auxiliary_loss_clip": 0.01146716, "auxiliary_loss_mlp": 0.01054893, "balance_loss_clip": 1.05417264, "balance_loss_mlp": 1.03348303, "epoch": 0.10461445964226665, "flos": 23218977876480.0, "grad_norm": 1.586314706443492, "language_loss": 0.77523744, "learning_rate": 3.941883634852104e-06, "loss": 0.79725355, "num_input_tokens_seen": 37571540, "step": 1740, "time_per_iteration": 2.8947789669036865 }, { "auxiliary_loss_clip": 0.01152077, "auxiliary_loss_mlp": 0.01049503, "balance_loss_clip": 1.05725431, "balance_loss_mlp": 1.0288676, "epoch": 0.10467458289493461, "flos": 24345243797760.0, "grad_norm": 1.964868695493703, "language_loss": 0.85976374, "learning_rate": 3.941790393753467e-06, "loss": 0.88177955, "num_input_tokens_seen": 37588265, "step": 1741, "time_per_iteration": 2.7706260681152344 }, { "auxiliary_loss_clip": 0.01158134, "auxiliary_loss_mlp": 0.01056311, "balance_loss_clip": 1.05614483, "balance_loss_mlp": 1.03350592, "epoch": 0.10473470614760258, "flos": 21287953693440.0, "grad_norm": 5.197245251055922, "language_loss": 0.75592613, "learning_rate": 3.941697079021942e-06, "loss": 0.77807057, "num_input_tokens_seen": 37606860, "step": 1742, "time_per_iteration": 2.784748077392578 }, { "auxiliary_loss_clip": 0.0113066, "auxiliary_loss_mlp": 0.01057571, "balance_loss_clip": 1.05678856, "balance_loss_mlp": 1.03735304, "epoch": 0.10479482940027056, "flos": 21687208341120.0, "grad_norm": 2.1426857583950416, "language_loss": 0.87614191, "learning_rate": 3.94160369066107e-06, "loss": 0.89802414, "num_input_tokens_seen": 37625210, "step": 1743, "time_per_iteration": 2.819350004196167 }, { "auxiliary_loss_clip": 0.01139959, "auxiliary_loss_mlp": 0.01048534, "balance_loss_clip": 1.0552268, "balance_loss_mlp": 1.0254786, "epoch": 0.10485495265293852, "flos": 21573694385280.0, "grad_norm": 2.060686178474056, "language_loss": 0.75927812, "learning_rate": 3.941510228674391e-06, "loss": 0.7811631, "num_input_tokens_seen": 37644110, "step": 1744, "time_per_iteration": 2.7817211151123047 }, { "auxiliary_loss_clip": 0.01170232, "auxiliary_loss_mlp": 0.01054483, "balance_loss_clip": 1.05992889, "balance_loss_mlp": 1.03442037, "epoch": 0.10491507590560649, "flos": 37961923708800.0, "grad_norm": 1.9689383181633062, "language_loss": 0.78905094, "learning_rate": 3.941416693065451e-06, "loss": 0.81129813, "num_input_tokens_seen": 37665800, "step": 1745, "time_per_iteration": 2.88080096244812 }, { "auxiliary_loss_clip": 0.01180482, "auxiliary_loss_mlp": 0.01060479, "balance_loss_clip": 1.05740213, "balance_loss_mlp": 1.03920031, "epoch": 0.10497519915827447, "flos": 26396282298240.0, "grad_norm": 2.64819141351011, "language_loss": 0.82568693, "learning_rate": 3.941323083837794e-06, "loss": 0.84809649, "num_input_tokens_seen": 37685095, "step": 1746, "time_per_iteration": 2.7068004608154297 }, { "auxiliary_loss_clip": 0.01158367, "auxiliary_loss_mlp": 0.0105595, "balance_loss_clip": 1.05737162, "balance_loss_mlp": 1.03448033, "epoch": 0.10503532241094243, "flos": 40662190581120.0, "grad_norm": 1.6274602877205533, "language_loss": 0.70573747, "learning_rate": 3.941229400994971e-06, "loss": 0.7278806, "num_input_tokens_seen": 37707445, "step": 1747, "time_per_iteration": 2.8689963817596436 }, { "auxiliary_loss_clip": 0.01159389, "auxiliary_loss_mlp": 0.01056346, "balance_loss_clip": 1.06035507, "balance_loss_mlp": 1.03492367, "epoch": 0.1050954456636104, "flos": 29789409588480.0, "grad_norm": 2.386885173400054, "language_loss": 0.8447504, "learning_rate": 3.941135644540535e-06, "loss": 0.86690772, "num_input_tokens_seen": 37728325, "step": 1748, "time_per_iteration": 2.8022749423980713 }, { "auxiliary_loss_clip": 0.01175489, "auxiliary_loss_mlp": 0.01049407, "balance_loss_clip": 1.05471563, "balance_loss_mlp": 1.02701974, "epoch": 0.10515556891627838, "flos": 23948754497280.0, "grad_norm": 1.759895679837136, "language_loss": 0.71681082, "learning_rate": 3.941041814478041e-06, "loss": 0.73905981, "num_input_tokens_seen": 37748910, "step": 1749, "time_per_iteration": 2.6568849086761475 }, { "auxiliary_loss_clip": 0.01158221, "auxiliary_loss_mlp": 0.01058697, "balance_loss_clip": 1.05427456, "balance_loss_mlp": 1.03590393, "epoch": 0.10521569216894634, "flos": 18259606972800.0, "grad_norm": 2.95022560634889, "language_loss": 0.81510806, "learning_rate": 3.940947910811047e-06, "loss": 0.83727717, "num_input_tokens_seen": 37765745, "step": 1750, "time_per_iteration": 2.6282739639282227 }, { "auxiliary_loss_clip": 0.01156475, "auxiliary_loss_mlp": 0.01062657, "balance_loss_clip": 1.06022298, "balance_loss_mlp": 1.03973269, "epoch": 0.10527581542161431, "flos": 15630909949440.0, "grad_norm": 2.2218325288878953, "language_loss": 0.92364043, "learning_rate": 3.940853933543114e-06, "loss": 0.94583178, "num_input_tokens_seen": 37780520, "step": 1751, "time_per_iteration": 2.703376531600952 }, { "auxiliary_loss_clip": 0.01165779, "auxiliary_loss_mlp": 0.01053304, "balance_loss_clip": 1.0570029, "balance_loss_mlp": 1.03171563, "epoch": 0.10533593867428227, "flos": 18296559089280.0, "grad_norm": 2.0356912608722877, "language_loss": 0.79293752, "learning_rate": 3.940759882677805e-06, "loss": 0.81512833, "num_input_tokens_seen": 37799515, "step": 1752, "time_per_iteration": 2.6501150131225586 }, { "auxiliary_loss_clip": 0.01116865, "auxiliary_loss_mlp": 0.01055489, "balance_loss_clip": 1.05116987, "balance_loss_mlp": 1.03264856, "epoch": 0.10539606192695025, "flos": 29023219555200.0, "grad_norm": 2.022904639316529, "language_loss": 0.75978744, "learning_rate": 3.940665758218686e-06, "loss": 0.78151095, "num_input_tokens_seen": 37818695, "step": 1753, "time_per_iteration": 2.871335744857788 }, { "auxiliary_loss_clip": 0.01141721, "auxiliary_loss_mlp": 0.01057356, "balance_loss_clip": 1.05547547, "balance_loss_mlp": 1.03415775, "epoch": 0.10545618517961822, "flos": 19969313506560.0, "grad_norm": 2.0563919939847914, "language_loss": 0.83969283, "learning_rate": 3.940571560169328e-06, "loss": 0.86168355, "num_input_tokens_seen": 37837860, "step": 1754, "time_per_iteration": 2.685591459274292 }, { "auxiliary_loss_clip": 0.01136802, "auxiliary_loss_mlp": 0.01053577, "balance_loss_clip": 1.05587101, "balance_loss_mlp": 1.03034329, "epoch": 0.10551630843228618, "flos": 16143427157760.0, "grad_norm": 2.7567281016961087, "language_loss": 0.68732727, "learning_rate": 3.940477288533302e-06, "loss": 0.70923102, "num_input_tokens_seen": 37856260, "step": 1755, "time_per_iteration": 2.754117727279663 }, { "auxiliary_loss_clip": 0.01161626, "auxiliary_loss_mlp": 0.010623, "balance_loss_clip": 1.05367684, "balance_loss_mlp": 1.040187, "epoch": 0.10557643168495416, "flos": 23440115957760.0, "grad_norm": 2.26658946748733, "language_loss": 0.76382339, "learning_rate": 3.940382943314182e-06, "loss": 0.7860626, "num_input_tokens_seen": 37876960, "step": 1756, "time_per_iteration": 2.686790943145752 }, { "auxiliary_loss_clip": 0.01182062, "auxiliary_loss_mlp": 0.01062906, "balance_loss_clip": 1.05688286, "balance_loss_mlp": 1.04203284, "epoch": 0.10563655493762213, "flos": 21799034357760.0, "grad_norm": 1.5917029795724482, "language_loss": 0.79926664, "learning_rate": 3.940288524515547e-06, "loss": 0.82171631, "num_input_tokens_seen": 37897070, "step": 1757, "time_per_iteration": 2.6543681621551514 }, { "auxiliary_loss_clip": 0.01149304, "auxiliary_loss_mlp": 0.01057523, "balance_loss_clip": 1.0524838, "balance_loss_mlp": 1.03563643, "epoch": 0.10569667819029009, "flos": 53800863275520.0, "grad_norm": 1.6583181970862437, "language_loss": 0.78714895, "learning_rate": 3.940194032140976e-06, "loss": 0.80921721, "num_input_tokens_seen": 37923635, "step": 1758, "time_per_iteration": 3.013157367706299 }, { "auxiliary_loss_clip": 0.01165597, "auxiliary_loss_mlp": 0.01054919, "balance_loss_clip": 1.05894113, "balance_loss_mlp": 1.03347349, "epoch": 0.10575680144295807, "flos": 22925515760640.0, "grad_norm": 1.870482409236857, "language_loss": 0.91388202, "learning_rate": 3.940099466194054e-06, "loss": 0.93608713, "num_input_tokens_seen": 37942650, "step": 1759, "time_per_iteration": 4.1841137409210205 }, { "auxiliary_loss_clip": 0.0115455, "auxiliary_loss_mlp": 0.01056708, "balance_loss_clip": 1.05242109, "balance_loss_mlp": 1.03346229, "epoch": 0.10581692469562604, "flos": 14136667148160.0, "grad_norm": 2.509404173865799, "language_loss": 0.77406812, "learning_rate": 3.940004826678365e-06, "loss": 0.79618067, "num_input_tokens_seen": 37960660, "step": 1760, "time_per_iteration": 4.476959228515625 }, { "auxiliary_loss_clip": 0.01161737, "auxiliary_loss_mlp": 0.01064522, "balance_loss_clip": 1.0536418, "balance_loss_mlp": 1.04053712, "epoch": 0.105877047948294, "flos": 25958674903680.0, "grad_norm": 2.27300461956159, "language_loss": 0.88896096, "learning_rate": 3.939910113597498e-06, "loss": 0.91122353, "num_input_tokens_seen": 37978625, "step": 1761, "time_per_iteration": 2.6907520294189453 }, { "auxiliary_loss_clip": 0.01110571, "auxiliary_loss_mlp": 0.00782389, "balance_loss_clip": 1.04964042, "balance_loss_mlp": 1.00012767, "epoch": 0.10593717120096197, "flos": 30664768032000.0, "grad_norm": 2.010693315376097, "language_loss": 0.7809304, "learning_rate": 3.9398153269550464e-06, "loss": 0.79986, "num_input_tokens_seen": 38000005, "step": 1762, "time_per_iteration": 2.869051456451416 }, { "auxiliary_loss_clip": 0.01053171, "auxiliary_loss_mlp": 0.0105371, "balance_loss_clip": 1.02694225, "balance_loss_mlp": 1.05056334, "epoch": 0.10599729445362994, "flos": 66436682497920.0, "grad_norm": 0.8956567750819878, "language_loss": 0.60503203, "learning_rate": 3.939720466754602e-06, "loss": 0.6261009, "num_input_tokens_seen": 38066165, "step": 1763, "time_per_iteration": 5.049196720123291 }, { "auxiliary_loss_clip": 0.01156865, "auxiliary_loss_mlp": 0.01048706, "balance_loss_clip": 1.05424261, "balance_loss_mlp": 1.02708137, "epoch": 0.10605741770629791, "flos": 23948179879680.0, "grad_norm": 2.0510547250099633, "language_loss": 0.80232942, "learning_rate": 3.939625532999763e-06, "loss": 0.82438517, "num_input_tokens_seen": 38086150, "step": 1764, "time_per_iteration": 4.288762807846069 }, { "auxiliary_loss_clip": 0.01136032, "auxiliary_loss_mlp": 0.01055975, "balance_loss_clip": 1.04879069, "balance_loss_mlp": 1.03218043, "epoch": 0.10611754095896588, "flos": 19387524919680.0, "grad_norm": 1.693202084864273, "language_loss": 0.801691, "learning_rate": 3.9395305256941314e-06, "loss": 0.82361102, "num_input_tokens_seen": 38104205, "step": 1765, "time_per_iteration": 2.931269407272339 }, { "auxiliary_loss_clip": 0.01163261, "auxiliary_loss_mlp": 0.01058956, "balance_loss_clip": 1.05457163, "balance_loss_mlp": 1.0367949, "epoch": 0.10617766421163385, "flos": 22237755073920.0, "grad_norm": 1.7665774264343403, "language_loss": 0.76864165, "learning_rate": 3.939435444841306e-06, "loss": 0.79086387, "num_input_tokens_seen": 38122005, "step": 1766, "time_per_iteration": 2.5976176261901855 }, { "auxiliary_loss_clip": 0.01182495, "auxiliary_loss_mlp": 0.01059246, "balance_loss_clip": 1.05923963, "balance_loss_mlp": 1.03766894, "epoch": 0.10623778746430182, "flos": 28404407024640.0, "grad_norm": 1.6265727447650185, "language_loss": 0.77311498, "learning_rate": 3.939340290444895e-06, "loss": 0.79553241, "num_input_tokens_seen": 38143365, "step": 1767, "time_per_iteration": 2.6356630325317383 }, { "auxiliary_loss_clip": 0.01006515, "auxiliary_loss_mlp": 0.01018751, "balance_loss_clip": 1.03004837, "balance_loss_mlp": 1.0151509, "epoch": 0.10629791071696978, "flos": 64234639221120.0, "grad_norm": 0.9172341423433896, "language_loss": 0.57889944, "learning_rate": 3.939245062508506e-06, "loss": 0.59915209, "num_input_tokens_seen": 38210035, "step": 1768, "time_per_iteration": 3.6866471767425537 }, { "auxiliary_loss_clip": 0.01144481, "auxiliary_loss_mlp": 0.01047419, "balance_loss_clip": 1.0546546, "balance_loss_mlp": 1.02687907, "epoch": 0.10635803396963776, "flos": 22747578762240.0, "grad_norm": 1.4529696494540971, "language_loss": 0.86711109, "learning_rate": 3.939149761035749e-06, "loss": 0.8890301, "num_input_tokens_seen": 38231230, "step": 1769, "time_per_iteration": 3.936905860900879 }, { "auxiliary_loss_clip": 0.01141219, "auxiliary_loss_mlp": 0.00780338, "balance_loss_clip": 1.05321527, "balance_loss_mlp": 1.00008726, "epoch": 0.10641815722230573, "flos": 31395586147200.0, "grad_norm": 1.8275276693890916, "language_loss": 0.61906171, "learning_rate": 3.9390543860302395e-06, "loss": 0.63827729, "num_input_tokens_seen": 38253890, "step": 1770, "time_per_iteration": 2.8926138877868652 }, { "auxiliary_loss_clip": 0.01057689, "auxiliary_loss_mlp": 0.01010808, "balance_loss_clip": 1.02007711, "balance_loss_mlp": 1.00775671, "epoch": 0.1064782804749737, "flos": 58552527784320.0, "grad_norm": 0.9163874753670794, "language_loss": 0.57049137, "learning_rate": 3.9389589374955925e-06, "loss": 0.59117633, "num_input_tokens_seen": 38304290, "step": 1771, "time_per_iteration": 3.0783088207244873 }, { "auxiliary_loss_clip": 0.01146276, "auxiliary_loss_mlp": 0.01065918, "balance_loss_clip": 1.05574095, "balance_loss_mlp": 1.04465103, "epoch": 0.10653840372764166, "flos": 23987825516160.0, "grad_norm": 12.794881398939157, "language_loss": 0.88265753, "learning_rate": 3.938863415435429e-06, "loss": 0.90477949, "num_input_tokens_seen": 38324725, "step": 1772, "time_per_iteration": 2.770202159881592 }, { "auxiliary_loss_clip": 0.0118421, "auxiliary_loss_mlp": 0.01058161, "balance_loss_clip": 1.05697048, "balance_loss_mlp": 1.03497458, "epoch": 0.10659852698030964, "flos": 18294655668480.0, "grad_norm": 2.576940958490313, "language_loss": 0.76030588, "learning_rate": 3.93876781985337e-06, "loss": 0.78272957, "num_input_tokens_seen": 38340735, "step": 1773, "time_per_iteration": 2.6177070140838623 }, { "auxiliary_loss_clip": 0.01122733, "auxiliary_loss_mlp": 0.01067657, "balance_loss_clip": 1.04691553, "balance_loss_mlp": 1.04205084, "epoch": 0.1066586502329776, "flos": 32160591031680.0, "grad_norm": 1.868288871406422, "language_loss": 0.8330853, "learning_rate": 3.938672150753041e-06, "loss": 0.85498923, "num_input_tokens_seen": 38361315, "step": 1774, "time_per_iteration": 2.7396061420440674 }, { "auxiliary_loss_clip": 0.01156305, "auxiliary_loss_mlp": 0.00780518, "balance_loss_clip": 1.05627465, "balance_loss_mlp": 1.00011277, "epoch": 0.10671877348564557, "flos": 17785155202560.0, "grad_norm": 2.73383407032925, "language_loss": 0.76446521, "learning_rate": 3.9385764081380704e-06, "loss": 0.78383344, "num_input_tokens_seen": 38377425, "step": 1775, "time_per_iteration": 2.624208927154541 }, { "auxiliary_loss_clip": 0.01063199, "auxiliary_loss_mlp": 0.01007654, "balance_loss_clip": 1.01726675, "balance_loss_mlp": 1.00443542, "epoch": 0.10677889673831355, "flos": 63510177813120.0, "grad_norm": 0.8200823962511624, "language_loss": 0.57477289, "learning_rate": 3.9384805920120876e-06, "loss": 0.5954814, "num_input_tokens_seen": 38440275, "step": 1776, "time_per_iteration": 3.1782386302948 }, { "auxiliary_loss_clip": 0.01150087, "auxiliary_loss_mlp": 0.01066244, "balance_loss_clip": 1.05192852, "balance_loss_mlp": 1.0407691, "epoch": 0.10683901999098151, "flos": 22017694400640.0, "grad_norm": 1.4232532718517703, "language_loss": 0.83442962, "learning_rate": 3.938384702378727e-06, "loss": 0.85659301, "num_input_tokens_seen": 38461820, "step": 1777, "time_per_iteration": 2.7342305183410645 }, { "auxiliary_loss_clip": 0.01113855, "auxiliary_loss_mlp": 0.00780712, "balance_loss_clip": 1.04919302, "balance_loss_mlp": 1.00015831, "epoch": 0.10689914324364948, "flos": 25042952551680.0, "grad_norm": 1.8326039994575831, "language_loss": 0.87207437, "learning_rate": 3.938288739241625e-06, "loss": 0.89102006, "num_input_tokens_seen": 38482235, "step": 1778, "time_per_iteration": 2.859834671020508 }, { "auxiliary_loss_clip": 0.01152509, "auxiliary_loss_mlp": 0.00780436, "balance_loss_clip": 1.06804752, "balance_loss_mlp": 1.00019765, "epoch": 0.10695926649631746, "flos": 16435129507200.0, "grad_norm": 2.4525249429301823, "language_loss": 0.84165859, "learning_rate": 3.938192702604417e-06, "loss": 0.86098808, "num_input_tokens_seen": 38500690, "step": 1779, "time_per_iteration": 2.81423020362854 }, { "auxiliary_loss_clip": 0.01141718, "auxiliary_loss_mlp": 0.00779857, "balance_loss_clip": 1.05215359, "balance_loss_mlp": 1.0001775, "epoch": 0.10701938974898542, "flos": 16979211792000.0, "grad_norm": 1.9378348403129941, "language_loss": 0.66915894, "learning_rate": 3.9380965924707495e-06, "loss": 0.68837464, "num_input_tokens_seen": 38518405, "step": 1780, "time_per_iteration": 2.616684913635254 }, { "auxiliary_loss_clip": 0.01166288, "auxiliary_loss_mlp": 0.01054109, "balance_loss_clip": 1.05843914, "balance_loss_mlp": 1.03268683, "epoch": 0.10707951300165339, "flos": 15888102307200.0, "grad_norm": 1.9168180254288365, "language_loss": 0.92058647, "learning_rate": 3.938000408844265e-06, "loss": 0.94279045, "num_input_tokens_seen": 38535060, "step": 1781, "time_per_iteration": 2.6167802810668945 }, { "auxiliary_loss_clip": 0.0113109, "auxiliary_loss_mlp": 0.01064554, "balance_loss_clip": 1.0531441, "balance_loss_mlp": 1.04344225, "epoch": 0.10713963625432135, "flos": 14247164361600.0, "grad_norm": 1.8357670097294174, "language_loss": 0.79336482, "learning_rate": 3.9379041517286105e-06, "loss": 0.81532121, "num_input_tokens_seen": 38552855, "step": 1782, "time_per_iteration": 2.7669336795806885 }, { "auxiliary_loss_clip": 0.01158369, "auxiliary_loss_mlp": 0.01061646, "balance_loss_clip": 1.05510604, "balance_loss_mlp": 1.04016423, "epoch": 0.10719975950698933, "flos": 16756780821120.0, "grad_norm": 2.0914095256513945, "language_loss": 0.79086542, "learning_rate": 3.937807821127436e-06, "loss": 0.81306553, "num_input_tokens_seen": 38570075, "step": 1783, "time_per_iteration": 2.6349542140960693 }, { "auxiliary_loss_clip": 0.01164267, "auxiliary_loss_mlp": 0.01065333, "balance_loss_clip": 1.0570296, "balance_loss_mlp": 1.04299295, "epoch": 0.1072598827596573, "flos": 22710626645760.0, "grad_norm": 2.1874612027367806, "language_loss": 0.86421812, "learning_rate": 3.937711417044395e-06, "loss": 0.88651407, "num_input_tokens_seen": 38587970, "step": 1784, "time_per_iteration": 2.8452541828155518 }, { "auxiliary_loss_clip": 0.01153461, "auxiliary_loss_mlp": 0.01055605, "balance_loss_clip": 1.05502176, "balance_loss_mlp": 1.03321707, "epoch": 0.10732000601232526, "flos": 23258264376960.0, "grad_norm": 2.4649130783319553, "language_loss": 1.01192284, "learning_rate": 3.937614939483143e-06, "loss": 1.03401351, "num_input_tokens_seen": 38605840, "step": 1785, "time_per_iteration": 2.690018653869629 }, { "auxiliary_loss_clip": 0.01168517, "auxiliary_loss_mlp": 0.01060763, "balance_loss_clip": 1.05854678, "balance_loss_mlp": 1.03984189, "epoch": 0.10738012926499324, "flos": 24207060176640.0, "grad_norm": 1.397915549237645, "language_loss": 0.84951413, "learning_rate": 3.937518388447339e-06, "loss": 0.87180698, "num_input_tokens_seen": 38627070, "step": 1786, "time_per_iteration": 2.637430191040039 }, { "auxiliary_loss_clip": 0.01183118, "auxiliary_loss_mlp": 0.01059079, "balance_loss_clip": 1.05716729, "balance_loss_mlp": 1.03520155, "epoch": 0.1074402525176612, "flos": 20923065383040.0, "grad_norm": 1.7951357311742837, "language_loss": 0.78861409, "learning_rate": 3.937421763940642e-06, "loss": 0.81103605, "num_input_tokens_seen": 38645840, "step": 1787, "time_per_iteration": 2.54508900642395 }, { "auxiliary_loss_clip": 0.01174896, "auxiliary_loss_mlp": 0.01047406, "balance_loss_clip": 1.05971575, "balance_loss_mlp": 1.02528071, "epoch": 0.10750037577032917, "flos": 16946928443520.0, "grad_norm": 1.8536072321218278, "language_loss": 0.82307518, "learning_rate": 3.937325065966719e-06, "loss": 0.84529817, "num_input_tokens_seen": 38664770, "step": 1788, "time_per_iteration": 2.706247568130493 }, { "auxiliary_loss_clip": 0.01180896, "auxiliary_loss_mlp": 0.01064682, "balance_loss_clip": 1.05843878, "balance_loss_mlp": 1.04427314, "epoch": 0.10756049902299715, "flos": 20266546550400.0, "grad_norm": 2.110245519520894, "language_loss": 0.77840686, "learning_rate": 3.9372282945292335e-06, "loss": 0.80086267, "num_input_tokens_seen": 38683865, "step": 1789, "time_per_iteration": 2.6274654865264893 }, { "auxiliary_loss_clip": 0.01185566, "auxiliary_loss_mlp": 0.01065099, "balance_loss_clip": 1.0604099, "balance_loss_mlp": 1.04049408, "epoch": 0.10762062227566511, "flos": 23586523793280.0, "grad_norm": 2.7248977042722524, "language_loss": 0.74817526, "learning_rate": 3.937131449631859e-06, "loss": 0.77068192, "num_input_tokens_seen": 38702485, "step": 1790, "time_per_iteration": 2.624382972717285 }, { "auxiliary_loss_clip": 0.01178128, "auxiliary_loss_mlp": 0.00780572, "balance_loss_clip": 1.06110644, "balance_loss_mlp": 1.00021124, "epoch": 0.10768074552833308, "flos": 24310626065280.0, "grad_norm": 2.350797373347828, "language_loss": 0.78764236, "learning_rate": 3.9370345312782645e-06, "loss": 0.80722934, "num_input_tokens_seen": 38722475, "step": 1791, "time_per_iteration": 2.696162223815918 }, { "auxiliary_loss_clip": 0.01134133, "auxiliary_loss_mlp": 0.01065057, "balance_loss_clip": 1.05280125, "balance_loss_mlp": 1.04117918, "epoch": 0.10774086878100106, "flos": 25299965341440.0, "grad_norm": 1.5879424734455678, "language_loss": 0.70638013, "learning_rate": 3.936937539472126e-06, "loss": 0.7283721, "num_input_tokens_seen": 38743285, "step": 1792, "time_per_iteration": 2.770874261856079 }, { "auxiliary_loss_clip": 0.01149934, "auxiliary_loss_mlp": 0.01051019, "balance_loss_clip": 1.05610943, "balance_loss_mlp": 1.02764249, "epoch": 0.10780099203366902, "flos": 22054035985920.0, "grad_norm": 1.920104493539276, "language_loss": 0.76565266, "learning_rate": 3.9368404742171236e-06, "loss": 0.78766215, "num_input_tokens_seen": 38763035, "step": 1793, "time_per_iteration": 2.7218761444091797 }, { "auxiliary_loss_clip": 0.01116412, "auxiliary_loss_mlp": 0.01064574, "balance_loss_clip": 1.05029237, "balance_loss_mlp": 1.0414238, "epoch": 0.10786111528633699, "flos": 22747471021440.0, "grad_norm": 1.7475786500241859, "language_loss": 0.85103315, "learning_rate": 3.936743335516936e-06, "loss": 0.87284303, "num_input_tokens_seen": 38784900, "step": 1794, "time_per_iteration": 2.7590620517730713 }, { "auxiliary_loss_clip": 0.01115198, "auxiliary_loss_mlp": 0.01055294, "balance_loss_clip": 1.04807687, "balance_loss_mlp": 1.03146446, "epoch": 0.10792123853900495, "flos": 20851064570880.0, "grad_norm": 2.5236234593460924, "language_loss": 0.74585378, "learning_rate": 3.936646123375246e-06, "loss": 0.76755869, "num_input_tokens_seen": 38804695, "step": 1795, "time_per_iteration": 2.8500585556030273 }, { "auxiliary_loss_clip": 0.01124895, "auxiliary_loss_mlp": 0.01058294, "balance_loss_clip": 1.04831553, "balance_loss_mlp": 1.03479767, "epoch": 0.10798136179167293, "flos": 17748705876480.0, "grad_norm": 2.842374039298248, "language_loss": 0.81653619, "learning_rate": 3.936548837795741e-06, "loss": 0.83836806, "num_input_tokens_seen": 38822395, "step": 1796, "time_per_iteration": 2.7549750804901123 }, { "auxiliary_loss_clip": 0.01140492, "auxiliary_loss_mlp": 0.01083966, "balance_loss_clip": 1.05246449, "balance_loss_mlp": 1.05721593, "epoch": 0.1080414850443409, "flos": 13589639948160.0, "grad_norm": 2.59635455269928, "language_loss": 0.74233043, "learning_rate": 3.936451478782111e-06, "loss": 0.764575, "num_input_tokens_seen": 38839865, "step": 1797, "time_per_iteration": 2.6396753787994385 }, { "auxiliary_loss_clip": 0.01160286, "auxiliary_loss_mlp": 0.01049954, "balance_loss_clip": 1.05505061, "balance_loss_mlp": 1.02874684, "epoch": 0.10810160829700886, "flos": 16253421580800.0, "grad_norm": 2.0852339617015025, "language_loss": 0.81855786, "learning_rate": 3.936354046338046e-06, "loss": 0.84066033, "num_input_tokens_seen": 38857300, "step": 1798, "time_per_iteration": 2.7105324268341064 }, { "auxiliary_loss_clip": 0.01142859, "auxiliary_loss_mlp": 0.01054502, "balance_loss_clip": 1.05379176, "balance_loss_mlp": 1.03117299, "epoch": 0.10816173154967684, "flos": 15158002464000.0, "grad_norm": 2.4443000829323687, "language_loss": 0.85516405, "learning_rate": 3.936256540467242e-06, "loss": 0.87713766, "num_input_tokens_seen": 38874960, "step": 1799, "time_per_iteration": 4.159978628158569 }, { "auxiliary_loss_clip": 0.01154352, "auxiliary_loss_mlp": 0.01062903, "balance_loss_clip": 1.05493283, "balance_loss_mlp": 1.04114687, "epoch": 0.10822185480234481, "flos": 17785334770560.0, "grad_norm": 2.7405734706827825, "language_loss": 0.77434146, "learning_rate": 3.9361589611733955e-06, "loss": 0.79651403, "num_input_tokens_seen": 38893610, "step": 1800, "time_per_iteration": 4.52047872543335 }, { "auxiliary_loss_clip": 0.01178634, "auxiliary_loss_mlp": 0.0104758, "balance_loss_clip": 1.05722904, "balance_loss_mlp": 1.02689719, "epoch": 0.10828197805501277, "flos": 25556654908800.0, "grad_norm": 1.582468034859118, "language_loss": 0.72897375, "learning_rate": 3.9360613084602075e-06, "loss": 0.75123584, "num_input_tokens_seen": 38913485, "step": 1801, "time_per_iteration": 4.291400909423828 }, { "auxiliary_loss_clip": 0.01190595, "auxiliary_loss_mlp": 0.01056056, "balance_loss_clip": 1.06095624, "balance_loss_mlp": 1.03478956, "epoch": 0.10834210130768075, "flos": 28984435845120.0, "grad_norm": 1.951139287607183, "language_loss": 0.6634692, "learning_rate": 3.935963582331381e-06, "loss": 0.68593562, "num_input_tokens_seen": 38935650, "step": 1802, "time_per_iteration": 2.722628355026245 }, { "auxiliary_loss_clip": 0.01155661, "auxiliary_loss_mlp": 0.01059375, "balance_loss_clip": 1.05326533, "balance_loss_mlp": 1.03695142, "epoch": 0.10840222456034872, "flos": 20264212166400.0, "grad_norm": 2.084551157592464, "language_loss": 0.81612957, "learning_rate": 3.935865782790621e-06, "loss": 0.8382799, "num_input_tokens_seen": 38954130, "step": 1803, "time_per_iteration": 4.239379167556763 }, { "auxiliary_loss_clip": 0.01163104, "auxiliary_loss_mlp": 0.01061781, "balance_loss_clip": 1.0567112, "balance_loss_mlp": 1.03921473, "epoch": 0.10846234781301668, "flos": 19863054097920.0, "grad_norm": 1.9102934552723363, "language_loss": 0.91127038, "learning_rate": 3.9357679098416365e-06, "loss": 0.93351918, "num_input_tokens_seen": 38972905, "step": 1804, "time_per_iteration": 2.5836737155914307 }, { "auxiliary_loss_clip": 0.01136188, "auxiliary_loss_mlp": 0.01060133, "balance_loss_clip": 1.05617714, "balance_loss_mlp": 1.03718543, "epoch": 0.10852247106568465, "flos": 26469037296000.0, "grad_norm": 2.5742522317806262, "language_loss": 0.76198906, "learning_rate": 3.935669963488139e-06, "loss": 0.78395224, "num_input_tokens_seen": 38993255, "step": 1805, "time_per_iteration": 2.783137321472168 }, { "auxiliary_loss_clip": 0.01149468, "auxiliary_loss_mlp": 0.01050946, "balance_loss_clip": 1.05419612, "balance_loss_mlp": 1.03050184, "epoch": 0.10858259431835263, "flos": 30081506987520.0, "grad_norm": 1.7049574807827799, "language_loss": 0.85876733, "learning_rate": 3.935571943733843e-06, "loss": 0.88077152, "num_input_tokens_seen": 39012610, "step": 1806, "time_per_iteration": 2.8148701190948486 }, { "auxiliary_loss_clip": 0.01168733, "auxiliary_loss_mlp": 0.00779888, "balance_loss_clip": 1.05462408, "balance_loss_mlp": 1.00006652, "epoch": 0.10864271757102059, "flos": 19063180085760.0, "grad_norm": 2.554050049117878, "language_loss": 0.8108198, "learning_rate": 3.9354738505824635e-06, "loss": 0.83030605, "num_input_tokens_seen": 39030120, "step": 1807, "time_per_iteration": 2.6275649070739746 }, { "auxiliary_loss_clip": 0.01139085, "auxiliary_loss_mlp": 0.01055438, "balance_loss_clip": 1.05193985, "balance_loss_mlp": 1.03522038, "epoch": 0.10870284082368856, "flos": 24715052271360.0, "grad_norm": 1.834914777588586, "language_loss": 0.78910971, "learning_rate": 3.9353756840377225e-06, "loss": 0.81105494, "num_input_tokens_seen": 39049875, "step": 1808, "time_per_iteration": 2.722910165786743 }, { "auxiliary_loss_clip": 0.01157997, "auxiliary_loss_mlp": 0.01056971, "balance_loss_clip": 1.05918014, "balance_loss_mlp": 1.03548992, "epoch": 0.10876296407635654, "flos": 20627663932800.0, "grad_norm": 1.6201371380093192, "language_loss": 0.79013431, "learning_rate": 3.935277444103342e-06, "loss": 0.81228393, "num_input_tokens_seen": 39068935, "step": 1809, "time_per_iteration": 2.7261481285095215 }, { "auxiliary_loss_clip": 0.01180468, "auxiliary_loss_mlp": 0.01057915, "balance_loss_clip": 1.0568099, "balance_loss_mlp": 1.03705359, "epoch": 0.1088230873290245, "flos": 21579835610880.0, "grad_norm": 1.9004896030263678, "language_loss": 0.85129547, "learning_rate": 3.935179130783046e-06, "loss": 0.87367928, "num_input_tokens_seen": 39087370, "step": 1810, "time_per_iteration": 2.672696828842163 }, { "auxiliary_loss_clip": 0.01124301, "auxiliary_loss_mlp": 0.01057363, "balance_loss_clip": 1.04580724, "balance_loss_mlp": 1.0335803, "epoch": 0.10888321058169247, "flos": 26469037296000.0, "grad_norm": 1.5993643379141278, "language_loss": 0.63822675, "learning_rate": 3.935080744080564e-06, "loss": 0.66004336, "num_input_tokens_seen": 39106635, "step": 1811, "time_per_iteration": 2.7731611728668213 }, { "auxiliary_loss_clip": 0.01151891, "auxiliary_loss_mlp": 0.01050225, "balance_loss_clip": 1.05335796, "balance_loss_mlp": 1.02836192, "epoch": 0.10894333383436045, "flos": 25848608653440.0, "grad_norm": 1.9284151803363307, "language_loss": 0.74238706, "learning_rate": 3.934982283999626e-06, "loss": 0.76440823, "num_input_tokens_seen": 39126335, "step": 1812, "time_per_iteration": 2.727743625640869 }, { "auxiliary_loss_clip": 0.01142498, "auxiliary_loss_mlp": 0.01057826, "balance_loss_clip": 1.05199611, "balance_loss_mlp": 1.03546214, "epoch": 0.10900345708702841, "flos": 19537093152000.0, "grad_norm": 1.5783196636767667, "language_loss": 0.72746086, "learning_rate": 3.934883750543966e-06, "loss": 0.74946409, "num_input_tokens_seen": 39144820, "step": 1813, "time_per_iteration": 2.798297166824341 }, { "auxiliary_loss_clip": 0.0113892, "auxiliary_loss_mlp": 0.01056639, "balance_loss_clip": 1.0511452, "balance_loss_mlp": 1.03515792, "epoch": 0.10906358033969638, "flos": 23623296341760.0, "grad_norm": 1.635228619121262, "language_loss": 0.82981038, "learning_rate": 3.93478514371732e-06, "loss": 0.85176599, "num_input_tokens_seen": 39165945, "step": 1814, "time_per_iteration": 2.7120048999786377 }, { "auxiliary_loss_clip": 0.01141958, "auxiliary_loss_mlp": 0.01058857, "balance_loss_clip": 1.0537864, "balance_loss_mlp": 1.03787625, "epoch": 0.10912370359236434, "flos": 21214731818880.0, "grad_norm": 1.9556743991494996, "language_loss": 0.84310579, "learning_rate": 3.934686463523429e-06, "loss": 0.86511397, "num_input_tokens_seen": 39183520, "step": 1815, "time_per_iteration": 2.788870096206665 }, { "auxiliary_loss_clip": 0.01146878, "auxiliary_loss_mlp": 0.01055141, "balance_loss_clip": 1.05443966, "balance_loss_mlp": 1.03182411, "epoch": 0.10918382684503232, "flos": 13553190622080.0, "grad_norm": 2.5374826422013195, "language_loss": 0.71670222, "learning_rate": 3.9345877099660315e-06, "loss": 0.73872244, "num_input_tokens_seen": 39201190, "step": 1816, "time_per_iteration": 2.8424103260040283 }, { "auxiliary_loss_clip": 0.01164173, "auxiliary_loss_mlp": 0.01064184, "balance_loss_clip": 1.05216932, "balance_loss_mlp": 1.04052126, "epoch": 0.10924395009770028, "flos": 27964321591680.0, "grad_norm": 2.016899555923086, "language_loss": 0.72880268, "learning_rate": 3.9344888830488744e-06, "loss": 0.75108624, "num_input_tokens_seen": 39221210, "step": 1817, "time_per_iteration": 2.7320947647094727 }, { "auxiliary_loss_clip": 0.01116915, "auxiliary_loss_mlp": 0.01057856, "balance_loss_clip": 1.05173278, "balance_loss_mlp": 1.03517008, "epoch": 0.10930407335036825, "flos": 25593750679680.0, "grad_norm": 1.5988628345308824, "language_loss": 0.67275256, "learning_rate": 3.934389982775706e-06, "loss": 0.69450033, "num_input_tokens_seen": 39242025, "step": 1818, "time_per_iteration": 2.8700790405273438 }, { "auxiliary_loss_clip": 0.01155804, "auxiliary_loss_mlp": 0.01065952, "balance_loss_clip": 1.05673873, "balance_loss_mlp": 1.04313517, "epoch": 0.10936419660303623, "flos": 18406194376320.0, "grad_norm": 3.593580913512793, "language_loss": 0.73149616, "learning_rate": 3.934291009150275e-06, "loss": 0.75371373, "num_input_tokens_seen": 39259870, "step": 1819, "time_per_iteration": 2.7091007232666016 }, { "auxiliary_loss_clip": 0.01142955, "auxiliary_loss_mlp": 0.00779155, "balance_loss_clip": 1.05341268, "balance_loss_mlp": 1.00027704, "epoch": 0.1094243198557042, "flos": 23840052963840.0, "grad_norm": 4.531598275817935, "language_loss": 0.73764241, "learning_rate": 3.934191962176335e-06, "loss": 0.75686359, "num_input_tokens_seen": 39278500, "step": 1820, "time_per_iteration": 2.6513099670410156 }, { "auxiliary_loss_clip": 0.01179358, "auxiliary_loss_mlp": 0.01056073, "balance_loss_clip": 1.05747604, "balance_loss_mlp": 1.03297031, "epoch": 0.10948444310837216, "flos": 14643940970880.0, "grad_norm": 2.2567103978329337, "language_loss": 0.82532805, "learning_rate": 3.934092841857642e-06, "loss": 0.84768236, "num_input_tokens_seen": 39294800, "step": 1821, "time_per_iteration": 2.5348384380340576 }, { "auxiliary_loss_clip": 0.01148016, "auxiliary_loss_mlp": 0.01052031, "balance_loss_clip": 1.05133605, "balance_loss_mlp": 1.03077567, "epoch": 0.10954456636104014, "flos": 27818811596160.0, "grad_norm": 2.0770330480401578, "language_loss": 0.76271641, "learning_rate": 3.933993648197955e-06, "loss": 0.7847169, "num_input_tokens_seen": 39314625, "step": 1822, "time_per_iteration": 2.730079174041748 }, { "auxiliary_loss_clip": 0.01142446, "auxiliary_loss_mlp": 0.01049259, "balance_loss_clip": 1.04849207, "balance_loss_mlp": 1.02856421, "epoch": 0.1096046896137081, "flos": 33620934372480.0, "grad_norm": 1.734419613996414, "language_loss": 0.79309607, "learning_rate": 3.933894381201034e-06, "loss": 0.81501311, "num_input_tokens_seen": 39336465, "step": 1823, "time_per_iteration": 2.756969928741455 }, { "auxiliary_loss_clip": 0.01148165, "auxiliary_loss_mlp": 0.01049595, "balance_loss_clip": 1.05160606, "balance_loss_mlp": 1.02745807, "epoch": 0.10966481286637607, "flos": 26980010219520.0, "grad_norm": 1.4318009514182364, "language_loss": 0.79590744, "learning_rate": 3.933795040870645e-06, "loss": 0.81788504, "num_input_tokens_seen": 39357930, "step": 1824, "time_per_iteration": 2.798168182373047 }, { "auxiliary_loss_clip": 0.01142146, "auxiliary_loss_mlp": 0.01055513, "balance_loss_clip": 1.05104232, "balance_loss_mlp": 1.03381693, "epoch": 0.10972493611904403, "flos": 23036551678080.0, "grad_norm": 2.127143421089703, "language_loss": 0.88138539, "learning_rate": 3.933695627210554e-06, "loss": 0.90336192, "num_input_tokens_seen": 39376380, "step": 1825, "time_per_iteration": 2.6804513931274414 }, { "auxiliary_loss_clip": 0.01128623, "auxiliary_loss_mlp": 0.01056127, "balance_loss_clip": 1.04586983, "balance_loss_mlp": 1.03439498, "epoch": 0.10978505937171201, "flos": 38104632443520.0, "grad_norm": 1.721192594935189, "language_loss": 0.76441038, "learning_rate": 3.933596140224532e-06, "loss": 0.78625786, "num_input_tokens_seen": 39399935, "step": 1826, "time_per_iteration": 2.8315086364746094 }, { "auxiliary_loss_clip": 0.01063155, "auxiliary_loss_mlp": 0.01016957, "balance_loss_clip": 1.02709544, "balance_loss_mlp": 1.01409554, "epoch": 0.10984518262437998, "flos": 59849694616320.0, "grad_norm": 0.8518463216820418, "language_loss": 0.54997343, "learning_rate": 3.93349657991635e-06, "loss": 0.57077461, "num_input_tokens_seen": 39460685, "step": 1827, "time_per_iteration": 3.1425766944885254 }, { "auxiliary_loss_clip": 0.01072651, "auxiliary_loss_mlp": 0.01010167, "balance_loss_clip": 1.02693772, "balance_loss_mlp": 1.00717473, "epoch": 0.10990530587704794, "flos": 66719837410560.0, "grad_norm": 0.7375455878808789, "language_loss": 0.55382878, "learning_rate": 3.933396946289784e-06, "loss": 0.57465696, "num_input_tokens_seen": 39524765, "step": 1828, "time_per_iteration": 3.168165922164917 }, { "auxiliary_loss_clip": 0.01156998, "auxiliary_loss_mlp": 0.01059335, "balance_loss_clip": 1.05407059, "balance_loss_mlp": 1.03618491, "epoch": 0.10996542912971592, "flos": 25447199189760.0, "grad_norm": 2.250827401167328, "language_loss": 0.84010404, "learning_rate": 3.933297239348612e-06, "loss": 0.86226743, "num_input_tokens_seen": 39543640, "step": 1829, "time_per_iteration": 2.7341628074645996 }, { "auxiliary_loss_clip": 0.01130747, "auxiliary_loss_mlp": 0.01053464, "balance_loss_clip": 1.0547024, "balance_loss_mlp": 1.03036165, "epoch": 0.11002555238238389, "flos": 44018186186880.0, "grad_norm": 2.342204785330024, "language_loss": 0.88880253, "learning_rate": 3.933197459096614e-06, "loss": 0.91064465, "num_input_tokens_seen": 39567525, "step": 1830, "time_per_iteration": 2.9093260765075684 }, { "auxiliary_loss_clip": 0.01049643, "auxiliary_loss_mlp": 0.01009685, "balance_loss_clip": 1.02618647, "balance_loss_mlp": 1.00681162, "epoch": 0.11008567563505185, "flos": 54065133590400.0, "grad_norm": 0.6882192363357665, "language_loss": 0.55566543, "learning_rate": 3.9330976055375756e-06, "loss": 0.57625872, "num_input_tokens_seen": 39628470, "step": 1831, "time_per_iteration": 3.1713974475860596 }, { "auxiliary_loss_clip": 0.01156783, "auxiliary_loss_mlp": 0.01073931, "balance_loss_clip": 1.05708003, "balance_loss_mlp": 1.04965997, "epoch": 0.11014579888771983, "flos": 24243150366720.0, "grad_norm": 2.4937725361201495, "language_loss": 0.90836191, "learning_rate": 3.932997678675282e-06, "loss": 0.93066907, "num_input_tokens_seen": 39646670, "step": 1832, "time_per_iteration": 2.6786489486694336 }, { "auxiliary_loss_clip": 0.0106111, "auxiliary_loss_mlp": 0.01010664, "balance_loss_clip": 1.02332854, "balance_loss_mlp": 1.00769615, "epoch": 0.1102059221403878, "flos": 57743965658880.0, "grad_norm": 0.7154576595208243, "language_loss": 0.59911001, "learning_rate": 3.932897678513523e-06, "loss": 0.61982775, "num_input_tokens_seen": 39712915, "step": 1833, "time_per_iteration": 3.1802401542663574 }, { "auxiliary_loss_clip": 0.01167201, "auxiliary_loss_mlp": 0.0105502, "balance_loss_clip": 1.05312014, "balance_loss_mlp": 1.03285873, "epoch": 0.11026604539305576, "flos": 16795923667200.0, "grad_norm": 2.6772934272606923, "language_loss": 0.80799395, "learning_rate": 3.93279760505609e-06, "loss": 0.83021617, "num_input_tokens_seen": 39730650, "step": 1834, "time_per_iteration": 2.591374635696411 }, { "auxiliary_loss_clip": 0.01141662, "auxiliary_loss_mlp": 0.01054827, "balance_loss_clip": 1.05557871, "balance_loss_mlp": 1.03004324, "epoch": 0.11032616864572373, "flos": 23988076911360.0, "grad_norm": 2.4853906687508247, "language_loss": 0.89856094, "learning_rate": 3.932697458306779e-06, "loss": 0.92052579, "num_input_tokens_seen": 39751065, "step": 1835, "time_per_iteration": 2.742330312728882 }, { "auxiliary_loss_clip": 0.01131787, "auxiliary_loss_mlp": 0.01063812, "balance_loss_clip": 1.0524013, "balance_loss_mlp": 1.03758645, "epoch": 0.1103862918983917, "flos": 19683141851520.0, "grad_norm": 2.2754442269720023, "language_loss": 0.63256055, "learning_rate": 3.932597238269386e-06, "loss": 0.65451658, "num_input_tokens_seen": 39769245, "step": 1836, "time_per_iteration": 2.6935038566589355 }, { "auxiliary_loss_clip": 0.01138919, "auxiliary_loss_mlp": 0.01061469, "balance_loss_clip": 1.05021358, "balance_loss_mlp": 1.03954661, "epoch": 0.11044641515105967, "flos": 32160878340480.0, "grad_norm": 1.6726289784191204, "language_loss": 0.72792488, "learning_rate": 3.932496944947711e-06, "loss": 0.74992871, "num_input_tokens_seen": 39790830, "step": 1837, "time_per_iteration": 2.7790510654449463 }, { "auxiliary_loss_clip": 0.01165472, "auxiliary_loss_mlp": 0.01057035, "balance_loss_clip": 1.05463088, "balance_loss_mlp": 1.03551781, "epoch": 0.11050653840372764, "flos": 16689233295360.0, "grad_norm": 2.027055787194766, "language_loss": 0.78489268, "learning_rate": 3.93239657834556e-06, "loss": 0.8071177, "num_input_tokens_seen": 39809475, "step": 1838, "time_per_iteration": 4.098532438278198 }, { "auxiliary_loss_clip": 0.01154042, "auxiliary_loss_mlp": 0.01062407, "balance_loss_clip": 1.05542612, "balance_loss_mlp": 1.03970969, "epoch": 0.11056666165639562, "flos": 21208877902080.0, "grad_norm": 2.046221888979386, "language_loss": 0.71451718, "learning_rate": 3.932296138466736e-06, "loss": 0.7366817, "num_input_tokens_seen": 39826355, "step": 1839, "time_per_iteration": 4.205714464187622 }, { "auxiliary_loss_clip": 0.01187588, "auxiliary_loss_mlp": 0.00781104, "balance_loss_clip": 1.06183171, "balance_loss_mlp": 1.00018013, "epoch": 0.11062678490906358, "flos": 19165488998400.0, "grad_norm": 2.623062836625425, "language_loss": 0.79027873, "learning_rate": 3.93219562531505e-06, "loss": 0.80996567, "num_input_tokens_seen": 39845335, "step": 1840, "time_per_iteration": 2.6023378372192383 }, { "auxiliary_loss_clip": 0.01156508, "auxiliary_loss_mlp": 0.01052512, "balance_loss_clip": 1.05206251, "balance_loss_mlp": 1.02887261, "epoch": 0.11068690816173155, "flos": 24895287740160.0, "grad_norm": 1.7551987843009527, "language_loss": 0.88083529, "learning_rate": 3.932095038894311e-06, "loss": 0.90292549, "num_input_tokens_seen": 39865065, "step": 1841, "time_per_iteration": 4.3361639976501465 }, { "auxiliary_loss_clip": 0.01130203, "auxiliary_loss_mlp": 0.01067683, "balance_loss_clip": 1.05036247, "balance_loss_mlp": 1.04453301, "epoch": 0.11074703141439952, "flos": 16472368932480.0, "grad_norm": 3.1603067125494126, "language_loss": 0.90521991, "learning_rate": 3.931994379208334e-06, "loss": 0.92719877, "num_input_tokens_seen": 39882780, "step": 1842, "time_per_iteration": 2.7086760997772217 }, { "auxiliary_loss_clip": 0.01152506, "auxiliary_loss_mlp": 0.01061227, "balance_loss_clip": 1.05065131, "balance_loss_mlp": 1.03982854, "epoch": 0.11080715466706749, "flos": 19172420323200.0, "grad_norm": 2.112801816568727, "language_loss": 0.85845053, "learning_rate": 3.931893646260937e-06, "loss": 0.88058788, "num_input_tokens_seen": 39900295, "step": 1843, "time_per_iteration": 4.263117790222168 }, { "auxiliary_loss_clip": 0.01119254, "auxiliary_loss_mlp": 0.00783076, "balance_loss_clip": 1.05050898, "balance_loss_mlp": 1.00012159, "epoch": 0.11086727791973545, "flos": 27704687109120.0, "grad_norm": 1.4511349711086798, "language_loss": 0.74735641, "learning_rate": 3.931792840055941e-06, "loss": 0.76637971, "num_input_tokens_seen": 39922075, "step": 1844, "time_per_iteration": 2.7999000549316406 }, { "auxiliary_loss_clip": 0.01180395, "auxiliary_loss_mlp": 0.01055824, "balance_loss_clip": 1.05662274, "balance_loss_mlp": 1.03238785, "epoch": 0.11092740117240343, "flos": 18514967736960.0, "grad_norm": 2.017286766878137, "language_loss": 0.7566812, "learning_rate": 3.931691960597165e-06, "loss": 0.77904338, "num_input_tokens_seen": 39940115, "step": 1845, "time_per_iteration": 2.5305535793304443 }, { "auxiliary_loss_clip": 0.01153403, "auxiliary_loss_mlp": 0.01058911, "balance_loss_clip": 1.05442989, "balance_loss_mlp": 1.03807366, "epoch": 0.1109875244250714, "flos": 20522446018560.0, "grad_norm": 1.9628359583393364, "language_loss": 0.75953126, "learning_rate": 3.9315910078884375e-06, "loss": 0.78165436, "num_input_tokens_seen": 39959920, "step": 1846, "time_per_iteration": 2.719325542449951 }, { "auxiliary_loss_clip": 0.01173899, "auxiliary_loss_mlp": 0.01059369, "balance_loss_clip": 1.05823123, "balance_loss_mlp": 1.03717244, "epoch": 0.11104764767773936, "flos": 14098601710080.0, "grad_norm": 2.612459533347621, "language_loss": 0.8620472, "learning_rate": 3.931489981933584e-06, "loss": 0.88437986, "num_input_tokens_seen": 39974755, "step": 1847, "time_per_iteration": 2.7705559730529785 }, { "auxiliary_loss_clip": 0.01181158, "auxiliary_loss_mlp": 0.01055145, "balance_loss_clip": 1.05562854, "balance_loss_mlp": 1.0322808, "epoch": 0.11110777093040733, "flos": 20594518657920.0, "grad_norm": 1.8452742714770096, "language_loss": 0.76981926, "learning_rate": 3.931388882736438e-06, "loss": 0.79218227, "num_input_tokens_seen": 39993355, "step": 1848, "time_per_iteration": 2.605933666229248 }, { "auxiliary_loss_clip": 0.01172398, "auxiliary_loss_mlp": 0.01056349, "balance_loss_clip": 1.06262445, "balance_loss_mlp": 1.03455794, "epoch": 0.11116789418307531, "flos": 21870065502720.0, "grad_norm": 1.6943193134392138, "language_loss": 0.77621841, "learning_rate": 3.931287710300832e-06, "loss": 0.7985059, "num_input_tokens_seen": 40012410, "step": 1849, "time_per_iteration": 2.678415536880493 }, { "auxiliary_loss_clip": 0.01138995, "auxiliary_loss_mlp": 0.00781122, "balance_loss_clip": 1.05277848, "balance_loss_mlp": 1.00010324, "epoch": 0.11122801743574327, "flos": 15523106256000.0, "grad_norm": 3.3234972538165066, "language_loss": 0.72098577, "learning_rate": 3.931186464630601e-06, "loss": 0.74018693, "num_input_tokens_seen": 40029315, "step": 1850, "time_per_iteration": 2.7763028144836426 }, { "auxiliary_loss_clip": 0.01170569, "auxiliary_loss_mlp": 0.01061108, "balance_loss_clip": 1.05759382, "balance_loss_mlp": 1.03874469, "epoch": 0.11128814068841124, "flos": 14392279307520.0, "grad_norm": 2.0638339407107873, "language_loss": 0.81499028, "learning_rate": 3.931085145729588e-06, "loss": 0.83730704, "num_input_tokens_seen": 40045765, "step": 1851, "time_per_iteration": 2.688854694366455 }, { "auxiliary_loss_clip": 0.01164692, "auxiliary_loss_mlp": 0.01061301, "balance_loss_clip": 1.05789042, "balance_loss_mlp": 1.04027295, "epoch": 0.11134826394107922, "flos": 16653933204480.0, "grad_norm": 2.365035468310974, "language_loss": 0.88270009, "learning_rate": 3.930983753601631e-06, "loss": 0.90496004, "num_input_tokens_seen": 40061660, "step": 1852, "time_per_iteration": 2.659914493560791 }, { "auxiliary_loss_clip": 0.01166772, "auxiliary_loss_mlp": 0.01060698, "balance_loss_clip": 1.05489326, "balance_loss_mlp": 1.03791702, "epoch": 0.11140838719374718, "flos": 16690993061760.0, "grad_norm": 2.1825610274136054, "language_loss": 0.72492862, "learning_rate": 3.930882288250578e-06, "loss": 0.74720335, "num_input_tokens_seen": 40080180, "step": 1853, "time_per_iteration": 2.7840964794158936 }, { "auxiliary_loss_clip": 0.01069898, "auxiliary_loss_mlp": 0.01019902, "balance_loss_clip": 1.02549517, "balance_loss_mlp": 1.01701725, "epoch": 0.11146851044641515, "flos": 60976355587200.0, "grad_norm": 0.772231443606995, "language_loss": 0.53664064, "learning_rate": 3.930780749680273e-06, "loss": 0.55753863, "num_input_tokens_seen": 40138910, "step": 1854, "time_per_iteration": 3.089354991912842 }, { "auxiliary_loss_clip": 0.01159576, "auxiliary_loss_mlp": 0.0105585, "balance_loss_clip": 1.05390525, "balance_loss_mlp": 1.03184092, "epoch": 0.11152863369908313, "flos": 22193835719040.0, "grad_norm": 1.863523240792578, "language_loss": 0.8468501, "learning_rate": 3.9306791378945705e-06, "loss": 0.86900431, "num_input_tokens_seen": 40157745, "step": 1855, "time_per_iteration": 2.7361156940460205 }, { "auxiliary_loss_clip": 0.01147504, "auxiliary_loss_mlp": 0.01064479, "balance_loss_clip": 1.05225825, "balance_loss_mlp": 1.0424726, "epoch": 0.11158875695175109, "flos": 19537524115200.0, "grad_norm": 2.1217067547931756, "language_loss": 0.81187081, "learning_rate": 3.9305774528973205e-06, "loss": 0.83399057, "num_input_tokens_seen": 40175375, "step": 1856, "time_per_iteration": 2.7158002853393555 }, { "auxiliary_loss_clip": 0.01168288, "auxiliary_loss_mlp": 0.01052259, "balance_loss_clip": 1.05843937, "balance_loss_mlp": 1.02957392, "epoch": 0.11164888020441906, "flos": 25442709989760.0, "grad_norm": 2.0555738298465314, "language_loss": 0.82761133, "learning_rate": 3.93047569469238e-06, "loss": 0.8498168, "num_input_tokens_seen": 40195715, "step": 1857, "time_per_iteration": 2.647184133529663 }, { "auxiliary_loss_clip": 0.01144196, "auxiliary_loss_mlp": 0.01044915, "balance_loss_clip": 1.05255508, "balance_loss_mlp": 1.02395833, "epoch": 0.11170900345708702, "flos": 15632741543040.0, "grad_norm": 2.3199985887988914, "language_loss": 0.83131742, "learning_rate": 3.930373863283608e-06, "loss": 0.85320854, "num_input_tokens_seen": 40213975, "step": 1858, "time_per_iteration": 2.726905107498169 }, { "auxiliary_loss_clip": 0.01134962, "auxiliary_loss_mlp": 0.01067658, "balance_loss_clip": 1.04900265, "balance_loss_mlp": 1.04350638, "epoch": 0.111769126709755, "flos": 23039424766080.0, "grad_norm": 2.0395414997027657, "language_loss": 0.9133389, "learning_rate": 3.930271958674866e-06, "loss": 0.93536508, "num_input_tokens_seen": 40233905, "step": 1859, "time_per_iteration": 3.0006766319274902 }, { "auxiliary_loss_clip": 0.01167289, "auxiliary_loss_mlp": 0.01049698, "balance_loss_clip": 1.05445409, "balance_loss_mlp": 1.02751315, "epoch": 0.11182924996242297, "flos": 20850705434880.0, "grad_norm": 2.048197345879043, "language_loss": 0.81528586, "learning_rate": 3.930169980870018e-06, "loss": 0.83745575, "num_input_tokens_seen": 40252810, "step": 1860, "time_per_iteration": 2.7216553688049316 }, { "auxiliary_loss_clip": 0.01154007, "auxiliary_loss_mlp": 0.01060885, "balance_loss_clip": 1.05737674, "balance_loss_mlp": 1.03920078, "epoch": 0.11188937321509093, "flos": 17455315587840.0, "grad_norm": 2.00330439318394, "language_loss": 0.75250578, "learning_rate": 3.930067929872931e-06, "loss": 0.77465475, "num_input_tokens_seen": 40272000, "step": 1861, "time_per_iteration": 2.6878490447998047 }, { "auxiliary_loss_clip": 0.01177651, "auxiliary_loss_mlp": 0.01054452, "balance_loss_clip": 1.0565964, "balance_loss_mlp": 1.03360212, "epoch": 0.11194949646775891, "flos": 24095916518400.0, "grad_norm": 1.9427039767358767, "language_loss": 0.88888168, "learning_rate": 3.929965805687474e-06, "loss": 0.91120267, "num_input_tokens_seen": 40290660, "step": 1862, "time_per_iteration": 2.615057945251465 }, { "auxiliary_loss_clip": 0.01164251, "auxiliary_loss_mlp": 0.01062894, "balance_loss_clip": 1.05994737, "balance_loss_mlp": 1.04086459, "epoch": 0.11200961972042688, "flos": 25153880728320.0, "grad_norm": 2.2273555113866847, "language_loss": 0.87719512, "learning_rate": 3.92986360831752e-06, "loss": 0.89946657, "num_input_tokens_seen": 40307820, "step": 1863, "time_per_iteration": 2.6778175830841064 }, { "auxiliary_loss_clip": 0.01158667, "auxiliary_loss_mlp": 0.01055299, "balance_loss_clip": 1.05455208, "balance_loss_mlp": 1.03071773, "epoch": 0.11206974297309484, "flos": 21288312829440.0, "grad_norm": 2.8013407816012226, "language_loss": 0.64245486, "learning_rate": 3.929761337766945e-06, "loss": 0.66459453, "num_input_tokens_seen": 40327430, "step": 1864, "time_per_iteration": 2.724076509475708 }, { "auxiliary_loss_clip": 0.01110154, "auxiliary_loss_mlp": 0.01047933, "balance_loss_clip": 1.04924703, "balance_loss_mlp": 1.02672601, "epoch": 0.11212986622576282, "flos": 18915982151040.0, "grad_norm": 2.0303098144917135, "language_loss": 0.74043733, "learning_rate": 3.929658994039627e-06, "loss": 0.7620182, "num_input_tokens_seen": 40344545, "step": 1865, "time_per_iteration": 2.8119356632232666 }, { "auxiliary_loss_clip": 0.01114683, "auxiliary_loss_mlp": 0.01070203, "balance_loss_clip": 1.05348182, "balance_loss_mlp": 1.04483545, "epoch": 0.11218998947843078, "flos": 22054754257920.0, "grad_norm": 2.7389427033573375, "language_loss": 0.84692436, "learning_rate": 3.929556577139446e-06, "loss": 0.86877316, "num_input_tokens_seen": 40362300, "step": 1866, "time_per_iteration": 2.8022067546844482 }, { "auxiliary_loss_clip": 0.01092364, "auxiliary_loss_mlp": 0.00781014, "balance_loss_clip": 1.04227424, "balance_loss_mlp": 1.00006938, "epoch": 0.11225011273109875, "flos": 24571697091840.0, "grad_norm": 1.704208120094955, "language_loss": 0.8104012, "learning_rate": 3.929454087070286e-06, "loss": 0.82913494, "num_input_tokens_seen": 40384720, "step": 1867, "time_per_iteration": 2.915989875793457 }, { "auxiliary_loss_clip": 0.01179505, "auxiliary_loss_mlp": 0.01060529, "balance_loss_clip": 1.05720687, "balance_loss_mlp": 1.03959608, "epoch": 0.11231023598376672, "flos": 28438665621120.0, "grad_norm": 2.0811636681692844, "language_loss": 0.86840278, "learning_rate": 3.929351523836035e-06, "loss": 0.8908031, "num_input_tokens_seen": 40404000, "step": 1868, "time_per_iteration": 2.6855647563934326 }, { "auxiliary_loss_clip": 0.01161412, "auxiliary_loss_mlp": 0.00779977, "balance_loss_clip": 1.06005311, "balance_loss_mlp": 1.00010097, "epoch": 0.1123703592364347, "flos": 14426466076800.0, "grad_norm": 2.1491178409138376, "language_loss": 0.68308532, "learning_rate": 3.9292488874405795e-06, "loss": 0.70249927, "num_input_tokens_seen": 40418665, "step": 1869, "time_per_iteration": 2.7404487133026123 }, { "auxiliary_loss_clip": 0.01133783, "auxiliary_loss_mlp": 0.01066188, "balance_loss_clip": 1.04932964, "balance_loss_mlp": 1.04225063, "epoch": 0.11243048248910266, "flos": 22236282616320.0, "grad_norm": 1.5255545896853626, "language_loss": 0.76943326, "learning_rate": 3.929146177887814e-06, "loss": 0.79143298, "num_input_tokens_seen": 40437870, "step": 1870, "time_per_iteration": 2.809734344482422 }, { "auxiliary_loss_clip": 0.01129358, "auxiliary_loss_mlp": 0.01056867, "balance_loss_clip": 1.0509038, "balance_loss_mlp": 1.03300166, "epoch": 0.11249060574177062, "flos": 18584167288320.0, "grad_norm": 1.8186132867503446, "language_loss": 0.76056099, "learning_rate": 3.929043395181631e-06, "loss": 0.78242326, "num_input_tokens_seen": 40455570, "step": 1871, "time_per_iteration": 2.727161169052124 }, { "auxiliary_loss_clip": 0.01105662, "auxiliary_loss_mlp": 0.01051114, "balance_loss_clip": 1.04993379, "balance_loss_mlp": 1.03026426, "epoch": 0.1125507289944386, "flos": 22856567604480.0, "grad_norm": 1.9425066802508644, "language_loss": 0.81811988, "learning_rate": 3.928940539325929e-06, "loss": 0.83968765, "num_input_tokens_seen": 40473600, "step": 1872, "time_per_iteration": 2.851868152618408 }, { "auxiliary_loss_clip": 0.01179923, "auxiliary_loss_mlp": 0.01055722, "balance_loss_clip": 1.05722499, "balance_loss_mlp": 1.03359652, "epoch": 0.11261085224710657, "flos": 19676390094720.0, "grad_norm": 2.186176467187071, "language_loss": 0.8361913, "learning_rate": 3.9288376103246095e-06, "loss": 0.85854775, "num_input_tokens_seen": 40490025, "step": 1873, "time_per_iteration": 2.6668763160705566 }, { "auxiliary_loss_clip": 0.01144862, "auxiliary_loss_mlp": 0.01054726, "balance_loss_clip": 1.0525465, "balance_loss_mlp": 1.03196871, "epoch": 0.11267097549977453, "flos": 26063246373120.0, "grad_norm": 1.8822875514234196, "language_loss": 0.92342389, "learning_rate": 3.928734608181575e-06, "loss": 0.94541967, "num_input_tokens_seen": 40511580, "step": 1874, "time_per_iteration": 2.700533866882324 }, { "auxiliary_loss_clip": 0.01140327, "auxiliary_loss_mlp": 0.01056402, "balance_loss_clip": 1.05100179, "balance_loss_mlp": 1.03509891, "epoch": 0.11273109875244251, "flos": 21068036674560.0, "grad_norm": 1.6564425098873434, "language_loss": 0.75359404, "learning_rate": 3.928631532900729e-06, "loss": 0.77556133, "num_input_tokens_seen": 40530155, "step": 1875, "time_per_iteration": 2.7642719745635986 }, { "auxiliary_loss_clip": 0.01167091, "auxiliary_loss_mlp": 0.01055271, "balance_loss_clip": 1.05893159, "balance_loss_mlp": 1.0348264, "epoch": 0.11279122200511048, "flos": 27088999061760.0, "grad_norm": 2.12758140825061, "language_loss": 0.71578634, "learning_rate": 3.928528384485984e-06, "loss": 0.73800993, "num_input_tokens_seen": 40549500, "step": 1876, "time_per_iteration": 2.8505096435546875 }, { "auxiliary_loss_clip": 0.01147417, "auxiliary_loss_mlp": 0.01054094, "balance_loss_clip": 1.05223966, "balance_loss_mlp": 1.03200495, "epoch": 0.11285134525777844, "flos": 20187901722240.0, "grad_norm": 1.8103612630164048, "language_loss": 0.76795971, "learning_rate": 3.9284251629412475e-06, "loss": 0.78997481, "num_input_tokens_seen": 40567475, "step": 1877, "time_per_iteration": 2.6972849369049072 }, { "auxiliary_loss_clip": 0.01168106, "auxiliary_loss_mlp": 0.01063056, "balance_loss_clip": 1.05518627, "balance_loss_mlp": 1.04026341, "epoch": 0.11291146851044641, "flos": 12458453863680.0, "grad_norm": 2.1601834607000368, "language_loss": 0.87843502, "learning_rate": 3.928321868270436e-06, "loss": 0.90074658, "num_input_tokens_seen": 40583280, "step": 1878, "time_per_iteration": 5.6992692947387695 }, { "auxiliary_loss_clip": 0.01140682, "auxiliary_loss_mlp": 0.01054902, "balance_loss_clip": 1.05420399, "balance_loss_mlp": 1.03333724, "epoch": 0.11297159176311439, "flos": 23842315520640.0, "grad_norm": 2.151084139284284, "language_loss": 0.81623232, "learning_rate": 3.928218500477466e-06, "loss": 0.83818817, "num_input_tokens_seen": 40603080, "step": 1879, "time_per_iteration": 2.8688366413116455 }, { "auxiliary_loss_clip": 0.01155904, "auxiliary_loss_mlp": 0.01059079, "balance_loss_clip": 1.05238748, "balance_loss_mlp": 1.03609526, "epoch": 0.11303171501578235, "flos": 29930538124800.0, "grad_norm": 1.941623939252122, "language_loss": 0.70234305, "learning_rate": 3.928115059566259e-06, "loss": 0.72449279, "num_input_tokens_seen": 40623255, "step": 1880, "time_per_iteration": 5.567574739456177 }, { "auxiliary_loss_clip": 0.01155691, "auxiliary_loss_mlp": 0.01052309, "balance_loss_clip": 1.05585837, "balance_loss_mlp": 1.0306015, "epoch": 0.11309183826845032, "flos": 16180558842240.0, "grad_norm": 1.6696082535169858, "language_loss": 0.72690225, "learning_rate": 3.928011545540734e-06, "loss": 0.74898225, "num_input_tokens_seen": 40641570, "step": 1881, "time_per_iteration": 2.792428493499756 }, { "auxiliary_loss_clip": 0.011425, "auxiliary_loss_mlp": 0.00781179, "balance_loss_clip": 1.05046606, "balance_loss_mlp": 1.00008667, "epoch": 0.1131519615211183, "flos": 12020702814720.0, "grad_norm": 2.2964043184115783, "language_loss": 0.74205768, "learning_rate": 3.927907958404819e-06, "loss": 0.76129448, "num_input_tokens_seen": 40658775, "step": 1882, "time_per_iteration": 4.414916515350342 }, { "auxiliary_loss_clip": 0.01177281, "auxiliary_loss_mlp": 0.01054815, "balance_loss_clip": 1.05680335, "balance_loss_mlp": 1.03203452, "epoch": 0.11321208477378626, "flos": 26250125857920.0, "grad_norm": 2.4326158086005965, "language_loss": 0.7923016, "learning_rate": 3.92780429816244e-06, "loss": 0.81462252, "num_input_tokens_seen": 40679555, "step": 1883, "time_per_iteration": 2.762615919113159 }, { "auxiliary_loss_clip": 0.01140926, "auxiliary_loss_mlp": 0.01058465, "balance_loss_clip": 1.05226314, "balance_loss_mlp": 1.03520727, "epoch": 0.11327220802645423, "flos": 13626376583040.0, "grad_norm": 2.2898863699254974, "language_loss": 0.77047318, "learning_rate": 3.927700564817529e-06, "loss": 0.79246712, "num_input_tokens_seen": 40697295, "step": 1884, "time_per_iteration": 2.835468292236328 }, { "auxiliary_loss_clip": 0.01074478, "auxiliary_loss_mlp": 0.01009476, "balance_loss_clip": 1.03993821, "balance_loss_mlp": 1.00620937, "epoch": 0.1133323312791222, "flos": 57191802814080.0, "grad_norm": 0.8138652948403053, "language_loss": 0.55151373, "learning_rate": 3.927596758374019e-06, "loss": 0.5723533, "num_input_tokens_seen": 40758095, "step": 1885, "time_per_iteration": 3.179532289505005 }, { "auxiliary_loss_clip": 0.01083888, "auxiliary_loss_mlp": 0.01050751, "balance_loss_clip": 1.04415166, "balance_loss_mlp": 1.02910316, "epoch": 0.11339245453179017, "flos": 24351708245760.0, "grad_norm": 1.9836288003076585, "language_loss": 0.90384823, "learning_rate": 3.927492878835848e-06, "loss": 0.92519462, "num_input_tokens_seen": 40777140, "step": 1886, "time_per_iteration": 3.038928747177124 }, { "auxiliary_loss_clip": 0.01116325, "auxiliary_loss_mlp": 0.01057697, "balance_loss_clip": 1.05137897, "balance_loss_mlp": 1.03634632, "epoch": 0.11345257778445814, "flos": 22670693700480.0, "grad_norm": 2.0132756022974023, "language_loss": 0.84852886, "learning_rate": 3.927388926206953e-06, "loss": 0.87026906, "num_input_tokens_seen": 40797505, "step": 1887, "time_per_iteration": 3.178863048553467 }, { "auxiliary_loss_clip": 0.01136567, "auxiliary_loss_mlp": 0.01056557, "balance_loss_clip": 1.05091035, "balance_loss_mlp": 1.03549314, "epoch": 0.11351270103712612, "flos": 20988242611200.0, "grad_norm": 2.847610033990257, "language_loss": 0.75826252, "learning_rate": 3.927284900491277e-06, "loss": 0.78019381, "num_input_tokens_seen": 40812970, "step": 1888, "time_per_iteration": 2.7349846363067627 }, { "auxiliary_loss_clip": 0.0113463, "auxiliary_loss_mlp": 0.01062359, "balance_loss_clip": 1.05614805, "balance_loss_mlp": 1.03892243, "epoch": 0.11357282428979408, "flos": 37347923600640.0, "grad_norm": 2.0598279187313624, "language_loss": 0.68104899, "learning_rate": 3.927180801692764e-06, "loss": 0.7030189, "num_input_tokens_seen": 40837745, "step": 1889, "time_per_iteration": 3.144444465637207 }, { "auxiliary_loss_clip": 0.01177206, "auxiliary_loss_mlp": 0.01049162, "balance_loss_clip": 1.05653095, "balance_loss_mlp": 1.02694094, "epoch": 0.11363294754246205, "flos": 21757018423680.0, "grad_norm": 1.7896678692754837, "language_loss": 0.83947051, "learning_rate": 3.927076629815362e-06, "loss": 0.86173415, "num_input_tokens_seen": 40856490, "step": 1890, "time_per_iteration": 2.73126482963562 }, { "auxiliary_loss_clip": 0.01145149, "auxiliary_loss_mlp": 0.01056017, "balance_loss_clip": 1.05039728, "balance_loss_mlp": 1.03395164, "epoch": 0.11369307079513001, "flos": 22601637803520.0, "grad_norm": 2.1678723202845256, "language_loss": 0.64663875, "learning_rate": 3.926972384863022e-06, "loss": 0.66865045, "num_input_tokens_seen": 40874070, "step": 1891, "time_per_iteration": 2.7474160194396973 }, { "auxiliary_loss_clip": 0.01145505, "auxiliary_loss_mlp": 0.01049015, "balance_loss_clip": 1.05395687, "balance_loss_mlp": 1.02773631, "epoch": 0.11375319404779799, "flos": 21944257044480.0, "grad_norm": 2.126575023047711, "language_loss": 0.87889415, "learning_rate": 3.9268680668396956e-06, "loss": 0.90083933, "num_input_tokens_seen": 40892425, "step": 1892, "time_per_iteration": 2.795269250869751 }, { "auxiliary_loss_clip": 0.01119535, "auxiliary_loss_mlp": 0.01079586, "balance_loss_clip": 1.05541015, "balance_loss_mlp": 1.05461168, "epoch": 0.11381331730046595, "flos": 26395456285440.0, "grad_norm": 3.1806920305576973, "language_loss": 0.72902197, "learning_rate": 3.926763675749339e-06, "loss": 0.75101316, "num_input_tokens_seen": 40912190, "step": 1893, "time_per_iteration": 2.890289306640625 }, { "auxiliary_loss_clip": 0.01175698, "auxiliary_loss_mlp": 0.0106591, "balance_loss_clip": 1.05438137, "balance_loss_mlp": 1.04290223, "epoch": 0.11387344055313392, "flos": 23804716959360.0, "grad_norm": 1.8842571229841023, "language_loss": 0.79247093, "learning_rate": 3.92665921159591e-06, "loss": 0.81488699, "num_input_tokens_seen": 40928395, "step": 1894, "time_per_iteration": 2.6820743083953857 }, { "auxiliary_loss_clip": 0.01150233, "auxiliary_loss_mlp": 0.01061956, "balance_loss_clip": 1.05356526, "balance_loss_mlp": 1.03944933, "epoch": 0.1139335638058019, "flos": 34522865902080.0, "grad_norm": 3.429237983174195, "language_loss": 0.79718482, "learning_rate": 3.926554674383371e-06, "loss": 0.81930667, "num_input_tokens_seen": 40946555, "step": 1895, "time_per_iteration": 2.829946994781494 }, { "auxiliary_loss_clip": 0.01075529, "auxiliary_loss_mlp": 0.01018518, "balance_loss_clip": 1.03062391, "balance_loss_mlp": 1.0155375, "epoch": 0.11399368705846986, "flos": 70587811520640.0, "grad_norm": 0.8041110638842961, "language_loss": 0.63357508, "learning_rate": 3.926450064115686e-06, "loss": 0.65451556, "num_input_tokens_seen": 41004910, "step": 1896, "time_per_iteration": 3.3087315559387207 }, { "auxiliary_loss_clip": 0.01147265, "auxiliary_loss_mlp": 0.0106086, "balance_loss_clip": 1.05560398, "balance_loss_mlp": 1.03663635, "epoch": 0.11405381031113783, "flos": 21324259365120.0, "grad_norm": 1.5952307342327186, "language_loss": 0.85055745, "learning_rate": 3.926345380796821e-06, "loss": 0.8726387, "num_input_tokens_seen": 41026385, "step": 1897, "time_per_iteration": 2.8522274494171143 }, { "auxiliary_loss_clip": 0.0117836, "auxiliary_loss_mlp": 0.00780276, "balance_loss_clip": 1.05591989, "balance_loss_mlp": 1.0001986, "epoch": 0.11411393356380581, "flos": 19719627091200.0, "grad_norm": 3.3624139627125587, "language_loss": 0.79675245, "learning_rate": 3.9262406244307465e-06, "loss": 0.81633884, "num_input_tokens_seen": 41045315, "step": 1898, "time_per_iteration": 2.760057210922241 }, { "auxiliary_loss_clip": 0.01115338, "auxiliary_loss_mlp": 0.01064417, "balance_loss_clip": 1.04594529, "balance_loss_mlp": 1.03965724, "epoch": 0.11417405681647377, "flos": 17530440883200.0, "grad_norm": 2.0191769665152903, "language_loss": 0.73251313, "learning_rate": 3.926135795021435e-06, "loss": 0.75431061, "num_input_tokens_seen": 41063390, "step": 1899, "time_per_iteration": 2.7363204956054688 }, { "auxiliary_loss_clip": 0.01042449, "auxiliary_loss_mlp": 0.01003313, "balance_loss_clip": 1.03643703, "balance_loss_mlp": 1.0003922, "epoch": 0.11423418006914174, "flos": 59674666619520.0, "grad_norm": 0.9089505356695228, "language_loss": 0.63434029, "learning_rate": 3.92603089257286e-06, "loss": 0.65479791, "num_input_tokens_seen": 41124180, "step": 1900, "time_per_iteration": 3.2045955657958984 }, { "auxiliary_loss_clip": 0.01113626, "auxiliary_loss_mlp": 0.01066815, "balance_loss_clip": 1.04929233, "balance_loss_mlp": 1.04378414, "epoch": 0.1142943033218097, "flos": 22963114321920.0, "grad_norm": 1.577500478750639, "language_loss": 0.77943742, "learning_rate": 3.925925917089001e-06, "loss": 0.80124187, "num_input_tokens_seen": 41143485, "step": 1901, "time_per_iteration": 2.745089530944824 }, { "auxiliary_loss_clip": 0.01171621, "auxiliary_loss_mlp": 0.01057834, "balance_loss_clip": 1.05803061, "balance_loss_mlp": 1.0359118, "epoch": 0.11435442657447768, "flos": 18256267008000.0, "grad_norm": 2.175933638179557, "language_loss": 0.84158623, "learning_rate": 3.925820868573839e-06, "loss": 0.86388075, "num_input_tokens_seen": 41161695, "step": 1902, "time_per_iteration": 2.6433799266815186 }, { "auxiliary_loss_clip": 0.01159941, "auxiliary_loss_mlp": 0.01056662, "balance_loss_clip": 1.05280399, "balance_loss_mlp": 1.03122306, "epoch": 0.11441454982714565, "flos": 24061191045120.0, "grad_norm": 1.7702735053047673, "language_loss": 0.77720451, "learning_rate": 3.925715747031356e-06, "loss": 0.79937053, "num_input_tokens_seen": 41181715, "step": 1903, "time_per_iteration": 2.6385905742645264 }, { "auxiliary_loss_clip": 0.01145143, "auxiliary_loss_mlp": 0.0104196, "balance_loss_clip": 1.05293322, "balance_loss_mlp": 1.02174175, "epoch": 0.11447467307981361, "flos": 25337707557120.0, "grad_norm": 2.212790565732917, "language_loss": 0.75751555, "learning_rate": 3.925610552465539e-06, "loss": 0.77938658, "num_input_tokens_seen": 41201770, "step": 1904, "time_per_iteration": 2.632152557373047 }, { "auxiliary_loss_clip": 0.01149375, "auxiliary_loss_mlp": 0.01056532, "balance_loss_clip": 1.05207586, "balance_loss_mlp": 1.03279781, "epoch": 0.11453479633248159, "flos": 21726063878400.0, "grad_norm": 2.4422699353972006, "language_loss": 0.91853034, "learning_rate": 3.9255052848803764e-06, "loss": 0.94058943, "num_input_tokens_seen": 41220590, "step": 1905, "time_per_iteration": 2.7421486377716064 }, { "auxiliary_loss_clip": 0.01161686, "auxiliary_loss_mlp": 0.01050264, "balance_loss_clip": 1.04978943, "balance_loss_mlp": 1.02612448, "epoch": 0.11459491958514956, "flos": 12969714096000.0, "grad_norm": 2.5117992419356066, "language_loss": 0.77484202, "learning_rate": 3.925399944279861e-06, "loss": 0.79696143, "num_input_tokens_seen": 41237250, "step": 1906, "time_per_iteration": 2.69333553314209 }, { "auxiliary_loss_clip": 0.0117911, "auxiliary_loss_mlp": 0.01055129, "balance_loss_clip": 1.05697322, "balance_loss_mlp": 1.03222847, "epoch": 0.11465504283781752, "flos": 22711273090560.0, "grad_norm": 2.0720467666322113, "language_loss": 0.81739306, "learning_rate": 3.925294530667986e-06, "loss": 0.83973539, "num_input_tokens_seen": 41256680, "step": 1907, "time_per_iteration": 2.6531317234039307 }, { "auxiliary_loss_clip": 0.0113647, "auxiliary_loss_mlp": 0.01065473, "balance_loss_clip": 1.05235374, "balance_loss_mlp": 1.04227471, "epoch": 0.1147151660904855, "flos": 23398387332480.0, "grad_norm": 2.1769364553121293, "language_loss": 0.84901214, "learning_rate": 3.92518904404875e-06, "loss": 0.87103164, "num_input_tokens_seen": 41270955, "step": 1908, "time_per_iteration": 2.8768258094787598 }, { "auxiliary_loss_clip": 0.01029536, "auxiliary_loss_mlp": 0.01020856, "balance_loss_clip": 1.02524137, "balance_loss_mlp": 1.01694632, "epoch": 0.11477528934315347, "flos": 63011843498880.0, "grad_norm": 0.9197306473097341, "language_loss": 0.61072773, "learning_rate": 3.925083484426153e-06, "loss": 0.63123173, "num_input_tokens_seen": 41319180, "step": 1909, "time_per_iteration": 3.0845727920532227 }, { "auxiliary_loss_clip": 0.01182744, "auxiliary_loss_mlp": 0.01054075, "balance_loss_clip": 1.06014562, "balance_loss_mlp": 1.03219986, "epoch": 0.11483541259582143, "flos": 16325601960960.0, "grad_norm": 7.319166590530674, "language_loss": 0.79170966, "learning_rate": 3.924977851804197e-06, "loss": 0.81407785, "num_input_tokens_seen": 41337480, "step": 1910, "time_per_iteration": 2.708704710006714 }, { "auxiliary_loss_clip": 0.01156489, "auxiliary_loss_mlp": 0.01052406, "balance_loss_clip": 1.0580864, "balance_loss_mlp": 1.03029275, "epoch": 0.1148955358484894, "flos": 21580410228480.0, "grad_norm": 2.117911712245717, "language_loss": 0.7702589, "learning_rate": 3.9248721461868875e-06, "loss": 0.79234779, "num_input_tokens_seen": 41354650, "step": 1911, "time_per_iteration": 2.7597720623016357 }, { "auxiliary_loss_clip": 0.01159986, "auxiliary_loss_mlp": 0.01054599, "balance_loss_clip": 1.05726957, "balance_loss_mlp": 1.03227139, "epoch": 0.11495565910115738, "flos": 27673696650240.0, "grad_norm": 1.677508784227342, "language_loss": 0.79177421, "learning_rate": 3.9247663675782336e-06, "loss": 0.81392002, "num_input_tokens_seen": 41376935, "step": 1912, "time_per_iteration": 2.8143310546875 }, { "auxiliary_loss_clip": 0.01183047, "auxiliary_loss_mlp": 0.00779659, "balance_loss_clip": 1.06065917, "balance_loss_mlp": 1.00014925, "epoch": 0.11501578235382534, "flos": 20632368614400.0, "grad_norm": 2.291252405113977, "language_loss": 0.77942276, "learning_rate": 3.924660515982246e-06, "loss": 0.79904979, "num_input_tokens_seen": 41396105, "step": 1913, "time_per_iteration": 2.696430206298828 }, { "auxiliary_loss_clip": 0.01166892, "auxiliary_loss_mlp": 0.01052769, "balance_loss_clip": 1.05442226, "balance_loss_mlp": 1.02953506, "epoch": 0.1150759056064933, "flos": 19829046896640.0, "grad_norm": 1.8145547055361753, "language_loss": 0.7003395, "learning_rate": 3.924554591402939e-06, "loss": 0.72253609, "num_input_tokens_seen": 41415600, "step": 1914, "time_per_iteration": 2.739251136779785 }, { "auxiliary_loss_clip": 0.01007182, "auxiliary_loss_mlp": 0.01004682, "balance_loss_clip": 1.02677619, "balance_loss_mlp": 1.00191641, "epoch": 0.11513602885916129, "flos": 70045776311040.0, "grad_norm": 0.7558771871458172, "language_loss": 0.61059874, "learning_rate": 3.92444859384433e-06, "loss": 0.6307174, "num_input_tokens_seen": 41478760, "step": 1915, "time_per_iteration": 3.56019926071167 }, { "auxiliary_loss_clip": 0.01166434, "auxiliary_loss_mlp": 0.01058573, "balance_loss_clip": 1.05994964, "balance_loss_mlp": 1.03595936, "epoch": 0.11519615211182925, "flos": 15741730385280.0, "grad_norm": 2.437201506258279, "language_loss": 0.93116963, "learning_rate": 3.924342523310436e-06, "loss": 0.95341969, "num_input_tokens_seen": 41495720, "step": 1916, "time_per_iteration": 3.244772434234619 }, { "auxiliary_loss_clip": 0.01161132, "auxiliary_loss_mlp": 0.01059827, "balance_loss_clip": 1.05798697, "balance_loss_mlp": 1.03470993, "epoch": 0.11525627536449722, "flos": 20667632791680.0, "grad_norm": 1.8909260082350545, "language_loss": 0.72560197, "learning_rate": 3.9242363798052806e-06, "loss": 0.74781156, "num_input_tokens_seen": 41513585, "step": 1917, "time_per_iteration": 4.502236843109131 }, { "auxiliary_loss_clip": 0.01138773, "auxiliary_loss_mlp": 0.0104964, "balance_loss_clip": 1.05739903, "balance_loss_mlp": 1.02700245, "epoch": 0.1153163986171652, "flos": 20303283185280.0, "grad_norm": 9.147356795176979, "language_loss": 0.74213129, "learning_rate": 3.92413016333289e-06, "loss": 0.76401544, "num_input_tokens_seen": 41533390, "step": 1918, "time_per_iteration": 4.344711065292358 }, { "auxiliary_loss_clip": 0.0114898, "auxiliary_loss_mlp": 0.010469, "balance_loss_clip": 1.05532503, "balance_loss_mlp": 1.02450073, "epoch": 0.11537652186983316, "flos": 17639321984640.0, "grad_norm": 3.182152136597976, "language_loss": 0.86367452, "learning_rate": 3.92402387389729e-06, "loss": 0.88563335, "num_input_tokens_seen": 41551015, "step": 1919, "time_per_iteration": 4.540036201477051 }, { "auxiliary_loss_clip": 0.01134044, "auxiliary_loss_mlp": 0.01067867, "balance_loss_clip": 1.0496366, "balance_loss_mlp": 1.04172444, "epoch": 0.11543664512250112, "flos": 21069401391360.0, "grad_norm": 1.93595243799445, "language_loss": 0.86735415, "learning_rate": 3.923917511502512e-06, "loss": 0.8893733, "num_input_tokens_seen": 41568055, "step": 1920, "time_per_iteration": 2.7719242572784424 }, { "auxiliary_loss_clip": 0.011686, "auxiliary_loss_mlp": 0.010528, "balance_loss_clip": 1.0593946, "balance_loss_mlp": 1.0302341, "epoch": 0.11549676837516909, "flos": 22747542848640.0, "grad_norm": 4.512761907267092, "language_loss": 0.79294932, "learning_rate": 3.923811076152589e-06, "loss": 0.81516337, "num_input_tokens_seen": 41587435, "step": 1921, "time_per_iteration": 2.798673629760742 }, { "auxiliary_loss_clip": 0.01174604, "auxiliary_loss_mlp": 0.01063526, "balance_loss_clip": 1.05685806, "balance_loss_mlp": 1.04007721, "epoch": 0.11555689162783707, "flos": 19168972617600.0, "grad_norm": 2.4057040360661484, "language_loss": 0.78464305, "learning_rate": 3.923704567851557e-06, "loss": 0.80702436, "num_input_tokens_seen": 41604975, "step": 1922, "time_per_iteration": 4.352341651916504 }, { "auxiliary_loss_clip": 0.01092284, "auxiliary_loss_mlp": 0.01064602, "balance_loss_clip": 1.04645681, "balance_loss_mlp": 1.04229808, "epoch": 0.11561701488050503, "flos": 24572056227840.0, "grad_norm": 1.8560991769949675, "language_loss": 0.84293079, "learning_rate": 3.923597986603456e-06, "loss": 0.86449969, "num_input_tokens_seen": 41626155, "step": 1923, "time_per_iteration": 3.2956740856170654 }, { "auxiliary_loss_clip": 0.01171957, "auxiliary_loss_mlp": 0.01056739, "balance_loss_clip": 1.0600003, "balance_loss_mlp": 1.03317094, "epoch": 0.115677138133173, "flos": 17092546179840.0, "grad_norm": 1.944851076041885, "language_loss": 0.80890471, "learning_rate": 3.9234913324123264e-06, "loss": 0.83119166, "num_input_tokens_seen": 41644805, "step": 1924, "time_per_iteration": 3.0939247608184814 }, { "auxiliary_loss_clip": 0.01055916, "auxiliary_loss_mlp": 0.01027131, "balance_loss_clip": 1.03045607, "balance_loss_mlp": 1.02436543, "epoch": 0.11573726138584098, "flos": 62703875266560.0, "grad_norm": 0.8171642061509322, "language_loss": 0.61196578, "learning_rate": 3.923384605282212e-06, "loss": 0.63279623, "num_input_tokens_seen": 41709345, "step": 1925, "time_per_iteration": 3.3765265941619873 }, { "auxiliary_loss_clip": 0.01155845, "auxiliary_loss_mlp": 0.01079328, "balance_loss_clip": 1.05374098, "balance_loss_mlp": 1.0549382, "epoch": 0.11579738463850894, "flos": 22601135013120.0, "grad_norm": 1.7772533553430212, "language_loss": 0.74766397, "learning_rate": 3.923277805217161e-06, "loss": 0.77001572, "num_input_tokens_seen": 41730210, "step": 1926, "time_per_iteration": 2.754974126815796 }, { "auxiliary_loss_clip": 0.01116228, "auxiliary_loss_mlp": 0.00781701, "balance_loss_clip": 1.04683304, "balance_loss_mlp": 1.00016665, "epoch": 0.11585750789117691, "flos": 21726135705600.0, "grad_norm": 4.731879086182685, "language_loss": 0.71978599, "learning_rate": 3.923170932221222e-06, "loss": 0.7387653, "num_input_tokens_seen": 41750270, "step": 1927, "time_per_iteration": 2.9454004764556885 }, { "auxiliary_loss_clip": 0.01137955, "auxiliary_loss_mlp": 0.01058796, "balance_loss_clip": 1.05250621, "balance_loss_mlp": 1.03572917, "epoch": 0.11591763114384489, "flos": 26287544851200.0, "grad_norm": 1.5938674022456252, "language_loss": 0.86854041, "learning_rate": 3.92306398629845e-06, "loss": 0.89050794, "num_input_tokens_seen": 41772975, "step": 1928, "time_per_iteration": 2.832750082015991 }, { "auxiliary_loss_clip": 0.01129041, "auxiliary_loss_mlp": 0.01060836, "balance_loss_clip": 1.05032003, "balance_loss_mlp": 1.03706551, "epoch": 0.11597775439651285, "flos": 23000461488000.0, "grad_norm": 1.6639520350020578, "language_loss": 0.77450585, "learning_rate": 3.922956967452898e-06, "loss": 0.79640466, "num_input_tokens_seen": 41791765, "step": 1929, "time_per_iteration": 2.7876811027526855 }, { "auxiliary_loss_clip": 0.01176887, "auxiliary_loss_mlp": 0.01063611, "balance_loss_clip": 1.05667901, "balance_loss_mlp": 1.0424509, "epoch": 0.11603787764918082, "flos": 31941715507200.0, "grad_norm": 1.8085677541874856, "language_loss": 0.76831949, "learning_rate": 3.922849875688626e-06, "loss": 0.79072452, "num_input_tokens_seen": 41815615, "step": 1930, "time_per_iteration": 2.819934844970703 }, { "auxiliary_loss_clip": 0.01145781, "auxiliary_loss_mlp": 0.01054046, "balance_loss_clip": 1.05066586, "balance_loss_mlp": 1.03165817, "epoch": 0.1160980009018488, "flos": 22271654534400.0, "grad_norm": 1.9434791543130712, "language_loss": 0.72291863, "learning_rate": 3.922742711009693e-06, "loss": 0.74491692, "num_input_tokens_seen": 41834810, "step": 1931, "time_per_iteration": 2.8078088760375977 }, { "auxiliary_loss_clip": 0.01146409, "auxiliary_loss_mlp": 0.01061336, "balance_loss_clip": 1.05090261, "balance_loss_mlp": 1.03575325, "epoch": 0.11615812415451676, "flos": 22783633038720.0, "grad_norm": 1.7378937044391531, "language_loss": 0.8222791, "learning_rate": 3.922635473420164e-06, "loss": 0.8443566, "num_input_tokens_seen": 41854975, "step": 1932, "time_per_iteration": 2.7495200634002686 }, { "auxiliary_loss_clip": 0.01030493, "auxiliary_loss_mlp": 0.01018834, "balance_loss_clip": 1.02184403, "balance_loss_mlp": 1.01556778, "epoch": 0.11621824740718473, "flos": 67146096107520.0, "grad_norm": 0.7669378012870447, "language_loss": 0.61050332, "learning_rate": 3.922528162924105e-06, "loss": 0.63099658, "num_input_tokens_seen": 41911105, "step": 1933, "time_per_iteration": 3.256678581237793 }, { "auxiliary_loss_clip": 0.01108577, "auxiliary_loss_mlp": 0.00780156, "balance_loss_clip": 1.04764509, "balance_loss_mlp": 1.00006175, "epoch": 0.11627837065985269, "flos": 20375930442240.0, "grad_norm": 2.830760437639296, "language_loss": 0.85790741, "learning_rate": 3.922420779525586e-06, "loss": 0.8767947, "num_input_tokens_seen": 41931750, "step": 1934, "time_per_iteration": 2.9144253730773926 }, { "auxiliary_loss_clip": 0.01117671, "auxiliary_loss_mlp": 0.01059839, "balance_loss_clip": 1.04929256, "balance_loss_mlp": 1.03453088, "epoch": 0.11633849391252067, "flos": 21725812483200.0, "grad_norm": 2.625764216143105, "language_loss": 0.66222906, "learning_rate": 3.9223133232286776e-06, "loss": 0.68400419, "num_input_tokens_seen": 41949400, "step": 1935, "time_per_iteration": 2.867152452468872 }, { "auxiliary_loss_clip": 0.01183991, "auxiliary_loss_mlp": 0.01052492, "balance_loss_clip": 1.05868936, "balance_loss_mlp": 1.03111792, "epoch": 0.11639861716518864, "flos": 18805341283200.0, "grad_norm": 2.025938843377603, "language_loss": 0.75678742, "learning_rate": 3.922205794037456e-06, "loss": 0.77915227, "num_input_tokens_seen": 41968100, "step": 1936, "time_per_iteration": 2.7282185554504395 }, { "auxiliary_loss_clip": 0.01179718, "auxiliary_loss_mlp": 0.01049532, "balance_loss_clip": 1.05632091, "balance_loss_mlp": 1.02639306, "epoch": 0.1164587404178566, "flos": 21214983214080.0, "grad_norm": 2.0032002399718905, "language_loss": 0.84086847, "learning_rate": 3.922098191955998e-06, "loss": 0.86316097, "num_input_tokens_seen": 41986375, "step": 1937, "time_per_iteration": 2.715386152267456 }, { "auxiliary_loss_clip": 0.01152084, "auxiliary_loss_mlp": 0.01048961, "balance_loss_clip": 1.05258632, "balance_loss_mlp": 1.0268234, "epoch": 0.11651886367052458, "flos": 27818632028160.0, "grad_norm": 3.0485930101216607, "language_loss": 0.7617709, "learning_rate": 3.921990516988384e-06, "loss": 0.78378135, "num_input_tokens_seen": 42006055, "step": 1938, "time_per_iteration": 2.7624804973602295 }, { "auxiliary_loss_clip": 0.01182576, "auxiliary_loss_mlp": 0.01055104, "balance_loss_clip": 1.05742419, "balance_loss_mlp": 1.03250146, "epoch": 0.11657898692319255, "flos": 22889569224960.0, "grad_norm": 1.7682499083089231, "language_loss": 0.79677606, "learning_rate": 3.921882769138696e-06, "loss": 0.81915289, "num_input_tokens_seen": 42024995, "step": 1939, "time_per_iteration": 2.71458101272583 }, { "auxiliary_loss_clip": 0.01148291, "auxiliary_loss_mlp": 0.01057951, "balance_loss_clip": 1.05209351, "balance_loss_mlp": 1.03508627, "epoch": 0.11663911017586051, "flos": 24315905364480.0, "grad_norm": 2.2281245193552475, "language_loss": 0.85916591, "learning_rate": 3.9217749484110215e-06, "loss": 0.88122833, "num_input_tokens_seen": 42042640, "step": 1940, "time_per_iteration": 2.7322728633880615 }, { "auxiliary_loss_clip": 0.01153746, "auxiliary_loss_mlp": 0.01056301, "balance_loss_clip": 1.05659437, "balance_loss_mlp": 1.03548717, "epoch": 0.11669923342852849, "flos": 42340152470400.0, "grad_norm": 1.4952807995381137, "language_loss": 0.75590646, "learning_rate": 3.921667054809449e-06, "loss": 0.77800703, "num_input_tokens_seen": 42067005, "step": 1941, "time_per_iteration": 2.9211390018463135 }, { "auxiliary_loss_clip": 0.01149585, "auxiliary_loss_mlp": 0.00780203, "balance_loss_clip": 1.05181897, "balance_loss_mlp": 1.00006557, "epoch": 0.11675935668119646, "flos": 14642288945280.0, "grad_norm": 2.277225749463833, "language_loss": 0.88847101, "learning_rate": 3.921559088338068e-06, "loss": 0.90776885, "num_input_tokens_seen": 42082295, "step": 1942, "time_per_iteration": 2.7145469188690186 }, { "auxiliary_loss_clip": 0.01165183, "auxiliary_loss_mlp": 0.01056257, "balance_loss_clip": 1.05553317, "balance_loss_mlp": 1.03552663, "epoch": 0.11681947993386442, "flos": 35116470063360.0, "grad_norm": 1.6547450593003057, "language_loss": 0.67979252, "learning_rate": 3.921451049000975e-06, "loss": 0.70200694, "num_input_tokens_seen": 42105295, "step": 1943, "time_per_iteration": 2.789701461791992 }, { "auxiliary_loss_clip": 0.01153022, "auxiliary_loss_mlp": 0.01047648, "balance_loss_clip": 1.05515063, "balance_loss_mlp": 1.02591634, "epoch": 0.11687960318653239, "flos": 38983259024640.0, "grad_norm": 1.9817763000300312, "language_loss": 0.69831288, "learning_rate": 3.921342936802265e-06, "loss": 0.72031963, "num_input_tokens_seen": 42125520, "step": 1944, "time_per_iteration": 2.827150583267212 }, { "auxiliary_loss_clip": 0.01155915, "auxiliary_loss_mlp": 0.01051888, "balance_loss_clip": 1.05038309, "balance_loss_mlp": 1.03158641, "epoch": 0.11693972643920036, "flos": 25994980575360.0, "grad_norm": 1.4963028532298175, "language_loss": 0.82662582, "learning_rate": 3.921234751746038e-06, "loss": 0.84870374, "num_input_tokens_seen": 42146335, "step": 1945, "time_per_iteration": 2.7190194129943848 }, { "auxiliary_loss_clip": 0.01137101, "auxiliary_loss_mlp": 0.01062082, "balance_loss_clip": 1.04682803, "balance_loss_mlp": 1.04005265, "epoch": 0.11699984969186833, "flos": 27272107618560.0, "grad_norm": 2.3643045784637735, "language_loss": 0.76298034, "learning_rate": 3.9211264938363975e-06, "loss": 0.78497219, "num_input_tokens_seen": 42165320, "step": 1946, "time_per_iteration": 2.792555093765259 }, { "auxiliary_loss_clip": 0.01134728, "auxiliary_loss_mlp": 0.01056112, "balance_loss_clip": 1.0507704, "balance_loss_mlp": 1.03536999, "epoch": 0.1170599729445363, "flos": 15267853232640.0, "grad_norm": 2.058923240355934, "language_loss": 0.69014907, "learning_rate": 3.921018163077448e-06, "loss": 0.71205747, "num_input_tokens_seen": 42182955, "step": 1947, "time_per_iteration": 2.643807888031006 }, { "auxiliary_loss_clip": 0.01154759, "auxiliary_loss_mlp": 0.01067767, "balance_loss_clip": 1.05707347, "balance_loss_mlp": 1.04604673, "epoch": 0.11712009619720427, "flos": 17164439251200.0, "grad_norm": 2.0690991629011615, "language_loss": 0.85044622, "learning_rate": 3.920909759473295e-06, "loss": 0.87267148, "num_input_tokens_seen": 42200760, "step": 1948, "time_per_iteration": 2.6399292945861816 }, { "auxiliary_loss_clip": 0.01051031, "auxiliary_loss_mlp": 0.0075782, "balance_loss_clip": 1.0245688, "balance_loss_mlp": 0.99997467, "epoch": 0.11718021944987224, "flos": 70940991997440.0, "grad_norm": 0.8206821069070506, "language_loss": 0.65139282, "learning_rate": 3.920801283028054e-06, "loss": 0.66948134, "num_input_tokens_seen": 42265745, "step": 1949, "time_per_iteration": 3.3030900955200195 }, { "auxiliary_loss_clip": 0.01159399, "auxiliary_loss_mlp": 0.01061163, "balance_loss_clip": 1.05735683, "balance_loss_mlp": 1.04054022, "epoch": 0.1172403427025402, "flos": 27453456408960.0, "grad_norm": 1.512876015443777, "language_loss": 0.71746683, "learning_rate": 3.920692733745835e-06, "loss": 0.73967248, "num_input_tokens_seen": 42286245, "step": 1950, "time_per_iteration": 2.739341974258423 }, { "auxiliary_loss_clip": 0.01175731, "auxiliary_loss_mlp": 0.01061149, "balance_loss_clip": 1.06152189, "balance_loss_mlp": 1.03907192, "epoch": 0.11730046595520818, "flos": 15668723992320.0, "grad_norm": 2.1258853115079996, "language_loss": 0.76671386, "learning_rate": 3.920584111630755e-06, "loss": 0.78908259, "num_input_tokens_seen": 42302710, "step": 1951, "time_per_iteration": 2.624788999557495 }, { "auxiliary_loss_clip": 0.01129104, "auxiliary_loss_mlp": 0.0106562, "balance_loss_clip": 1.05285251, "balance_loss_mlp": 1.04435349, "epoch": 0.11736058920787615, "flos": 25630164092160.0, "grad_norm": 1.7264952730121887, "language_loss": 0.75964963, "learning_rate": 3.9204754166869325e-06, "loss": 0.7815969, "num_input_tokens_seen": 42324115, "step": 1952, "time_per_iteration": 2.824826955795288 }, { "auxiliary_loss_clip": 0.01123677, "auxiliary_loss_mlp": 0.01065929, "balance_loss_clip": 1.04589534, "balance_loss_mlp": 1.04451907, "epoch": 0.11742071246054411, "flos": 21434289701760.0, "grad_norm": 2.2111022500713453, "language_loss": 0.72316217, "learning_rate": 3.920366648918491e-06, "loss": 0.74505818, "num_input_tokens_seen": 42342505, "step": 1953, "time_per_iteration": 2.7456531524658203 }, { "auxiliary_loss_clip": 0.01149214, "auxiliary_loss_mlp": 0.00781136, "balance_loss_clip": 1.0549686, "balance_loss_mlp": 1.0000577, "epoch": 0.11748083571321208, "flos": 15997845335040.0, "grad_norm": 2.1208802652878522, "language_loss": 0.79780388, "learning_rate": 3.920257808329552e-06, "loss": 0.81710744, "num_input_tokens_seen": 42360525, "step": 1954, "time_per_iteration": 2.653949737548828 }, { "auxiliary_loss_clip": 0.01112399, "auxiliary_loss_mlp": 0.01059787, "balance_loss_clip": 1.04880822, "balance_loss_mlp": 1.03763783, "epoch": 0.11754095896588006, "flos": 16180056051840.0, "grad_norm": 1.9673692595826442, "language_loss": 0.8553021, "learning_rate": 3.920148894924246e-06, "loss": 0.87702394, "num_input_tokens_seen": 42377045, "step": 1955, "time_per_iteration": 2.7987124919891357 }, { "auxiliary_loss_clip": 0.01163172, "auxiliary_loss_mlp": 0.00779783, "balance_loss_clip": 1.05209899, "balance_loss_mlp": 1.00016606, "epoch": 0.11760108221854802, "flos": 13261596013440.0, "grad_norm": 2.12926288831445, "language_loss": 0.78105426, "learning_rate": 3.920039908706701e-06, "loss": 0.80048382, "num_input_tokens_seen": 42393960, "step": 1956, "time_per_iteration": 2.6247944831848145 }, { "auxiliary_loss_clip": 0.01158287, "auxiliary_loss_mlp": 0.01058454, "balance_loss_clip": 1.05559933, "balance_loss_mlp": 1.03601909, "epoch": 0.11766120547121599, "flos": 24498439303680.0, "grad_norm": 2.264983200322237, "language_loss": 0.80487299, "learning_rate": 3.91993084968105e-06, "loss": 0.82704043, "num_input_tokens_seen": 42413160, "step": 1957, "time_per_iteration": 5.862411260604858 }, { "auxiliary_loss_clip": 0.01168294, "auxiliary_loss_mlp": 0.0105259, "balance_loss_clip": 1.05703866, "balance_loss_mlp": 1.0308696, "epoch": 0.11772132872388397, "flos": 17784005967360.0, "grad_norm": 4.8672025609093215, "language_loss": 0.77955222, "learning_rate": 3.919821717851428e-06, "loss": 0.80176103, "num_input_tokens_seen": 42432590, "step": 1958, "time_per_iteration": 4.4218549728393555 }, { "auxiliary_loss_clip": 0.01149976, "auxiliary_loss_mlp": 0.0105003, "balance_loss_clip": 1.05451894, "balance_loss_mlp": 1.02680755, "epoch": 0.11778145197655193, "flos": 13217030213760.0, "grad_norm": 1.7537692363765556, "language_loss": 0.77002251, "learning_rate": 3.919712513221976e-06, "loss": 0.79202259, "num_input_tokens_seen": 42450135, "step": 1959, "time_per_iteration": 2.674323558807373 }, { "auxiliary_loss_clip": 0.01162585, "auxiliary_loss_mlp": 0.01057019, "balance_loss_clip": 1.05857027, "balance_loss_mlp": 1.03484631, "epoch": 0.1178415752292199, "flos": 20230204965120.0, "grad_norm": 2.2026367524708927, "language_loss": 0.70078689, "learning_rate": 3.919603235796832e-06, "loss": 0.722983, "num_input_tokens_seen": 42470050, "step": 1960, "time_per_iteration": 2.7704508304595947 }, { "auxiliary_loss_clip": 0.01161089, "auxiliary_loss_mlp": 0.01055224, "balance_loss_clip": 1.05841374, "balance_loss_mlp": 1.03228831, "epoch": 0.11790169848188788, "flos": 13040134709760.0, "grad_norm": 2.663996374773888, "language_loss": 0.81045067, "learning_rate": 3.9194938855801406e-06, "loss": 0.83261371, "num_input_tokens_seen": 42484335, "step": 1961, "time_per_iteration": 4.67006778717041 }, { "auxiliary_loss_clip": 0.01163817, "auxiliary_loss_mlp": 0.00779643, "balance_loss_clip": 1.05658793, "balance_loss_mlp": 1.00009537, "epoch": 0.11796182173455584, "flos": 22265728790400.0, "grad_norm": 1.71345119244153, "language_loss": 0.92273545, "learning_rate": 3.919384462576049e-06, "loss": 0.94217002, "num_input_tokens_seen": 42502720, "step": 1962, "time_per_iteration": 2.6559524536132812 }, { "auxiliary_loss_clip": 0.01139826, "auxiliary_loss_mlp": 0.01058964, "balance_loss_clip": 1.05222392, "balance_loss_mlp": 1.03704107, "epoch": 0.1180219449872238, "flos": 10635017892480.0, "grad_norm": 2.157203116008796, "language_loss": 0.87635934, "learning_rate": 3.919274966788707e-06, "loss": 0.8983472, "num_input_tokens_seen": 42519460, "step": 1963, "time_per_iteration": 2.710042715072632 }, { "auxiliary_loss_clip": 0.0115823, "auxiliary_loss_mlp": 0.00779391, "balance_loss_clip": 1.05600929, "balance_loss_mlp": 1.00011134, "epoch": 0.11808206823989177, "flos": 20923532259840.0, "grad_norm": 2.8331529324994333, "language_loss": 0.83879703, "learning_rate": 3.919165398222265e-06, "loss": 0.85817325, "num_input_tokens_seen": 42539420, "step": 1964, "time_per_iteration": 2.734941244125366 }, { "auxiliary_loss_clip": 0.01122529, "auxiliary_loss_mlp": 0.01069054, "balance_loss_clip": 1.05171156, "balance_loss_mlp": 1.04628491, "epoch": 0.11814219149255975, "flos": 20777770869120.0, "grad_norm": 3.9132941826799543, "language_loss": 0.8313272, "learning_rate": 3.919055756880879e-06, "loss": 0.85324299, "num_input_tokens_seen": 42558225, "step": 1965, "time_per_iteration": 2.7427306175231934 }, { "auxiliary_loss_clip": 0.01178673, "auxiliary_loss_mlp": 0.01053338, "balance_loss_clip": 1.05815279, "balance_loss_mlp": 1.03163004, "epoch": 0.11820231474522772, "flos": 48759938542080.0, "grad_norm": 1.6720023918141877, "language_loss": 0.74227381, "learning_rate": 3.918946042768707e-06, "loss": 0.76459396, "num_input_tokens_seen": 42580790, "step": 1966, "time_per_iteration": 2.8265397548675537 }, { "auxiliary_loss_clip": 0.01163407, "auxiliary_loss_mlp": 0.0106081, "balance_loss_clip": 1.06309748, "balance_loss_mlp": 1.03836274, "epoch": 0.11826243799789568, "flos": 16690598012160.0, "grad_norm": 2.5628488285375397, "language_loss": 0.73137337, "learning_rate": 3.918836255889908e-06, "loss": 0.7536155, "num_input_tokens_seen": 42597355, "step": 1967, "time_per_iteration": 2.706193685531616 }, { "auxiliary_loss_clip": 0.01167052, "auxiliary_loss_mlp": 0.01053471, "balance_loss_clip": 1.05852592, "balance_loss_mlp": 1.03141701, "epoch": 0.11832256125056366, "flos": 16909868586240.0, "grad_norm": 5.332816815546028, "language_loss": 0.8831054, "learning_rate": 3.9187263962486456e-06, "loss": 0.90531063, "num_input_tokens_seen": 42616060, "step": 1968, "time_per_iteration": 2.6308343410491943 }, { "auxiliary_loss_clip": 0.01168356, "auxiliary_loss_mlp": 0.01051817, "balance_loss_clip": 1.06406927, "balance_loss_mlp": 1.0294776, "epoch": 0.11838268450323162, "flos": 22820405587200.0, "grad_norm": 2.252087054693662, "language_loss": 0.67010254, "learning_rate": 3.918616463849087e-06, "loss": 0.69230425, "num_input_tokens_seen": 42636285, "step": 1969, "time_per_iteration": 2.662480592727661 }, { "auxiliary_loss_clip": 0.01130071, "auxiliary_loss_mlp": 0.0106143, "balance_loss_clip": 1.05177045, "balance_loss_mlp": 1.03774357, "epoch": 0.11844280775589959, "flos": 33545844990720.0, "grad_norm": 2.153814675458072, "language_loss": 0.80455101, "learning_rate": 3.918506458695399e-06, "loss": 0.82646602, "num_input_tokens_seen": 42658320, "step": 1970, "time_per_iteration": 2.798050880432129 }, { "auxiliary_loss_clip": 0.01060284, "auxiliary_loss_mlp": 0.01021383, "balance_loss_clip": 1.02553701, "balance_loss_mlp": 1.01892686, "epoch": 0.11850293100856757, "flos": 66350998604160.0, "grad_norm": 0.8165911228106061, "language_loss": 0.66192186, "learning_rate": 3.918396380791754e-06, "loss": 0.68273854, "num_input_tokens_seen": 42721500, "step": 1971, "time_per_iteration": 3.167018413543701 }, { "auxiliary_loss_clip": 0.01151504, "auxiliary_loss_mlp": 0.0105629, "balance_loss_clip": 1.05294323, "balance_loss_mlp": 1.03422379, "epoch": 0.11856305426123553, "flos": 24681045070080.0, "grad_norm": 2.1839859106137554, "language_loss": 0.79782552, "learning_rate": 3.918286230142327e-06, "loss": 0.81990343, "num_input_tokens_seen": 42739825, "step": 1972, "time_per_iteration": 2.6908793449401855 }, { "auxiliary_loss_clip": 0.01133219, "auxiliary_loss_mlp": 0.00778766, "balance_loss_clip": 1.05341005, "balance_loss_mlp": 1.00005877, "epoch": 0.1186231775139035, "flos": 24280102483200.0, "grad_norm": 2.0473813607633384, "language_loss": 0.72843599, "learning_rate": 3.918176006751292e-06, "loss": 0.74755585, "num_input_tokens_seen": 42758695, "step": 1973, "time_per_iteration": 2.7801859378814697 }, { "auxiliary_loss_clip": 0.01138022, "auxiliary_loss_mlp": 0.01049764, "balance_loss_clip": 1.05580497, "balance_loss_mlp": 1.02707887, "epoch": 0.11868330076657148, "flos": 21757413473280.0, "grad_norm": 1.6449677647733996, "language_loss": 0.72019619, "learning_rate": 3.918065710622832e-06, "loss": 0.74207413, "num_input_tokens_seen": 42778510, "step": 1974, "time_per_iteration": 2.7337663173675537 }, { "auxiliary_loss_clip": 0.01129602, "auxiliary_loss_mlp": 0.01043161, "balance_loss_clip": 1.05265522, "balance_loss_mlp": 1.02086854, "epoch": 0.11874342401923944, "flos": 17193274894080.0, "grad_norm": 2.017372400194955, "language_loss": 0.77409399, "learning_rate": 3.917955341761128e-06, "loss": 0.79582161, "num_input_tokens_seen": 42793995, "step": 1975, "time_per_iteration": 2.669546604156494 }, { "auxiliary_loss_clip": 0.01131477, "auxiliary_loss_mlp": 0.01059968, "balance_loss_clip": 1.05880177, "balance_loss_mlp": 1.03908277, "epoch": 0.11880354727190741, "flos": 15229572312960.0, "grad_norm": 2.3842578575289, "language_loss": 0.75110453, "learning_rate": 3.917844900170364e-06, "loss": 0.77301902, "num_input_tokens_seen": 42809000, "step": 1976, "time_per_iteration": 2.8439090251922607 }, { "auxiliary_loss_clip": 0.0116819, "auxiliary_loss_mlp": 0.01049523, "balance_loss_clip": 1.05999744, "balance_loss_mlp": 1.02835166, "epoch": 0.11886367052457537, "flos": 27309706179840.0, "grad_norm": 1.8674311015318124, "language_loss": 0.74877423, "learning_rate": 3.91773438585473e-06, "loss": 0.77095133, "num_input_tokens_seen": 42831585, "step": 1977, "time_per_iteration": 2.6747169494628906 }, { "auxiliary_loss_clip": 0.01182095, "auxiliary_loss_mlp": 0.01059621, "balance_loss_clip": 1.05954552, "balance_loss_mlp": 1.03805614, "epoch": 0.11892379377724335, "flos": 21798280172160.0, "grad_norm": 2.1793079873879604, "language_loss": 0.74207634, "learning_rate": 3.9176237988184165e-06, "loss": 0.76449353, "num_input_tokens_seen": 42848420, "step": 1978, "time_per_iteration": 2.631664514541626 }, { "auxiliary_loss_clip": 0.01142323, "auxiliary_loss_mlp": 0.01050585, "balance_loss_clip": 1.06037045, "balance_loss_mlp": 1.0289247, "epoch": 0.11898391702991132, "flos": 13991013498240.0, "grad_norm": 1.7170872786869797, "language_loss": 0.73256385, "learning_rate": 3.917513139065616e-06, "loss": 0.754493, "num_input_tokens_seen": 42866645, "step": 1979, "time_per_iteration": 2.7442541122436523 }, { "auxiliary_loss_clip": 0.01137516, "auxiliary_loss_mlp": 0.01051378, "balance_loss_clip": 1.0566175, "balance_loss_mlp": 1.02968168, "epoch": 0.11904404028257928, "flos": 32234567091840.0, "grad_norm": 1.876224505386343, "language_loss": 0.98293436, "learning_rate": 3.917402406600525e-06, "loss": 1.00482333, "num_input_tokens_seen": 42888515, "step": 1980, "time_per_iteration": 2.787667989730835 }, { "auxiliary_loss_clip": 0.01153629, "auxiliary_loss_mlp": 0.01053612, "balance_loss_clip": 1.05595791, "balance_loss_mlp": 1.03077161, "epoch": 0.11910416353524726, "flos": 23586272398080.0, "grad_norm": 1.7507584506289393, "language_loss": 0.86265099, "learning_rate": 3.917291601427342e-06, "loss": 0.88472342, "num_input_tokens_seen": 42909035, "step": 1981, "time_per_iteration": 2.6680359840393066 }, { "auxiliary_loss_clip": 0.01158736, "auxiliary_loss_mlp": 0.01064978, "balance_loss_clip": 1.06144083, "balance_loss_mlp": 1.04214907, "epoch": 0.11916428678791523, "flos": 25333038789120.0, "grad_norm": 1.8908045276276995, "language_loss": 0.85375237, "learning_rate": 3.91718072355027e-06, "loss": 0.87598956, "num_input_tokens_seen": 42927555, "step": 1982, "time_per_iteration": 2.732797861099243 }, { "auxiliary_loss_clip": 0.01146432, "auxiliary_loss_mlp": 0.01050259, "balance_loss_clip": 1.05539966, "balance_loss_mlp": 1.02843213, "epoch": 0.11922441004058319, "flos": 19788431592960.0, "grad_norm": 2.3856086229742877, "language_loss": 0.85202634, "learning_rate": 3.917069772973513e-06, "loss": 0.87399322, "num_input_tokens_seen": 42945300, "step": 1983, "time_per_iteration": 2.6839804649353027 }, { "auxiliary_loss_clip": 0.01126589, "auxiliary_loss_mlp": 0.01056051, "balance_loss_clip": 1.05602145, "balance_loss_mlp": 1.03399742, "epoch": 0.11928453329325117, "flos": 21536347219200.0, "grad_norm": 3.6641824085676022, "language_loss": 0.7693429, "learning_rate": 3.916958749701277e-06, "loss": 0.79116929, "num_input_tokens_seen": 42961295, "step": 1984, "time_per_iteration": 2.7008767127990723 }, { "auxiliary_loss_clip": 0.01161623, "auxiliary_loss_mlp": 0.01055251, "balance_loss_clip": 1.05752373, "balance_loss_mlp": 1.0334003, "epoch": 0.11934465654591914, "flos": 20815010294400.0, "grad_norm": 1.917528093726237, "language_loss": 0.83058321, "learning_rate": 3.9168476537377745e-06, "loss": 0.85275191, "num_input_tokens_seen": 42980330, "step": 1985, "time_per_iteration": 2.6692728996276855 }, { "auxiliary_loss_clip": 0.01151831, "auxiliary_loss_mlp": 0.01050086, "balance_loss_clip": 1.0541923, "balance_loss_mlp": 1.02835393, "epoch": 0.1194047797985871, "flos": 19060486565760.0, "grad_norm": 1.8732848573733223, "language_loss": 0.74398553, "learning_rate": 3.916736485087216e-06, "loss": 0.76600474, "num_input_tokens_seen": 42996125, "step": 1986, "time_per_iteration": 2.722013473510742 }, { "auxiliary_loss_clip": 0.01146125, "auxiliary_loss_mlp": 0.01059008, "balance_loss_clip": 1.05472732, "balance_loss_mlp": 1.03791952, "epoch": 0.11946490305125507, "flos": 27190805184000.0, "grad_norm": 2.4724436343771083, "language_loss": 0.72123617, "learning_rate": 3.916625243753819e-06, "loss": 0.74328756, "num_input_tokens_seen": 43014180, "step": 1987, "time_per_iteration": 2.814481258392334 }, { "auxiliary_loss_clip": 0.01156854, "auxiliary_loss_mlp": 0.01054644, "balance_loss_clip": 1.05747938, "balance_loss_mlp": 1.03138638, "epoch": 0.11952502630392305, "flos": 21140791672320.0, "grad_norm": 1.9246234449532542, "language_loss": 0.72007513, "learning_rate": 3.916513929741799e-06, "loss": 0.74219012, "num_input_tokens_seen": 43032120, "step": 1988, "time_per_iteration": 2.7242019176483154 }, { "auxiliary_loss_clip": 0.0116348, "auxiliary_loss_mlp": 0.01062102, "balance_loss_clip": 1.05559146, "balance_loss_mlp": 1.03913057, "epoch": 0.11958514955659101, "flos": 22124241118080.0, "grad_norm": 1.7561483239324645, "language_loss": 0.81144297, "learning_rate": 3.91640254305538e-06, "loss": 0.83369875, "num_input_tokens_seen": 43052215, "step": 1989, "time_per_iteration": 2.6259546279907227 }, { "auxiliary_loss_clip": 0.01135956, "auxiliary_loss_mlp": 0.01057689, "balance_loss_clip": 1.05254042, "balance_loss_mlp": 1.03325129, "epoch": 0.11964527280925898, "flos": 17421452040960.0, "grad_norm": 2.5516320258539795, "language_loss": 0.75881672, "learning_rate": 3.916291083698784e-06, "loss": 0.7807532, "num_input_tokens_seen": 43069720, "step": 1990, "time_per_iteration": 2.6779251098632812 }, { "auxiliary_loss_clip": 0.0105322, "auxiliary_loss_mlp": 0.01019112, "balance_loss_clip": 1.02816892, "balance_loss_mlp": 1.01647794, "epoch": 0.11970539606192696, "flos": 70679741402880.0, "grad_norm": 0.8628582727639288, "language_loss": 0.55184531, "learning_rate": 3.916179551676238e-06, "loss": 0.57256866, "num_input_tokens_seen": 43123130, "step": 1991, "time_per_iteration": 3.3713693618774414 }, { "auxiliary_loss_clip": 0.01136423, "auxiliary_loss_mlp": 0.01053959, "balance_loss_clip": 1.05748868, "balance_loss_mlp": 1.03326464, "epoch": 0.11976551931459492, "flos": 21215019127680.0, "grad_norm": 2.286300891386994, "language_loss": 0.78371406, "learning_rate": 3.916067946991971e-06, "loss": 0.80561793, "num_input_tokens_seen": 43140015, "step": 1992, "time_per_iteration": 2.6797914505004883 }, { "auxiliary_loss_clip": 0.0117949, "auxiliary_loss_mlp": 0.01056635, "balance_loss_clip": 1.05811, "balance_loss_mlp": 1.03453374, "epoch": 0.11982564256726289, "flos": 25989306226560.0, "grad_norm": 1.8481811043026504, "language_loss": 0.78911144, "learning_rate": 3.915956269650216e-06, "loss": 0.81147265, "num_input_tokens_seen": 43160105, "step": 1993, "time_per_iteration": 2.691301107406616 }, { "auxiliary_loss_clip": 0.01126423, "auxiliary_loss_mlp": 0.0106217, "balance_loss_clip": 1.05012226, "balance_loss_mlp": 1.04081941, "epoch": 0.11988576581993086, "flos": 21650866755840.0, "grad_norm": 1.644866568705103, "language_loss": 0.82088816, "learning_rate": 3.915844519655208e-06, "loss": 0.84277415, "num_input_tokens_seen": 43179835, "step": 1994, "time_per_iteration": 2.772905111312866 }, { "auxiliary_loss_clip": 0.0115068, "auxiliary_loss_mlp": 0.01063961, "balance_loss_clip": 1.05523098, "balance_loss_mlp": 1.0433259, "epoch": 0.11994588907259883, "flos": 17857407409920.0, "grad_norm": 2.0065598513575247, "language_loss": 0.88392794, "learning_rate": 3.915732697011183e-06, "loss": 0.9060744, "num_input_tokens_seen": 43197210, "step": 1995, "time_per_iteration": 4.206532716751099 }, { "auxiliary_loss_clip": 0.01153482, "auxiliary_loss_mlp": 0.01066415, "balance_loss_clip": 1.06005812, "balance_loss_mlp": 1.0441823, "epoch": 0.1200060123252668, "flos": 24462744163200.0, "grad_norm": 1.8775058007239456, "language_loss": 0.73949909, "learning_rate": 3.9156208017223825e-06, "loss": 0.76169801, "num_input_tokens_seen": 43215050, "step": 1996, "time_per_iteration": 2.7263944149017334 }, { "auxiliary_loss_clip": 0.01141484, "auxiliary_loss_mlp": 0.01060112, "balance_loss_clip": 1.05754757, "balance_loss_mlp": 1.03808212, "epoch": 0.12006613557793476, "flos": 18732191235840.0, "grad_norm": 1.976051865072764, "language_loss": 0.88125587, "learning_rate": 3.915508833793048e-06, "loss": 0.90327179, "num_input_tokens_seen": 43233900, "step": 1997, "time_per_iteration": 4.29426383972168 }, { "auxiliary_loss_clip": 0.01165634, "auxiliary_loss_mlp": 0.00779568, "balance_loss_clip": 1.05701697, "balance_loss_mlp": 1.00001049, "epoch": 0.12012625883060274, "flos": 22267739952000.0, "grad_norm": 2.1091392562336018, "language_loss": 0.79031086, "learning_rate": 3.915396793227428e-06, "loss": 0.80976284, "num_input_tokens_seen": 43252105, "step": 1998, "time_per_iteration": 4.330955266952515 }, { "auxiliary_loss_clip": 0.0116661, "auxiliary_loss_mlp": 0.00779642, "balance_loss_clip": 1.0576719, "balance_loss_mlp": 1.00002396, "epoch": 0.1201863820832707, "flos": 21758885930880.0, "grad_norm": 1.799585336659533, "language_loss": 0.73583078, "learning_rate": 3.915284680029769e-06, "loss": 0.75529337, "num_input_tokens_seen": 43270315, "step": 1999, "time_per_iteration": 2.754770040512085 }, { "auxiliary_loss_clip": 0.01178966, "auxiliary_loss_mlp": 0.01073097, "balance_loss_clip": 1.0602119, "balance_loss_mlp": 1.05115068, "epoch": 0.12024650533593867, "flos": 21907987286400.0, "grad_norm": 2.916355473014409, "language_loss": 0.74854898, "learning_rate": 3.915172494204323e-06, "loss": 0.77106953, "num_input_tokens_seen": 43289935, "step": 2000, "time_per_iteration": 4.3900322914123535 }, { "auxiliary_loss_clip": 0.01149374, "auxiliary_loss_mlp": 0.01069735, "balance_loss_clip": 1.05375695, "balance_loss_mlp": 1.04763341, "epoch": 0.12030662858860665, "flos": 21689219502720.0, "grad_norm": 1.5203973891597686, "language_loss": 0.8496564, "learning_rate": 3.915060235755344e-06, "loss": 0.87184751, "num_input_tokens_seen": 43309325, "step": 2001, "time_per_iteration": 2.6912643909454346 }, { "auxiliary_loss_clip": 0.01154057, "auxiliary_loss_mlp": 0.01063637, "balance_loss_clip": 1.05600786, "balance_loss_mlp": 1.04265642, "epoch": 0.12036675184127461, "flos": 12933228856320.0, "grad_norm": 2.932264271186656, "language_loss": 0.74711967, "learning_rate": 3.91494790468709e-06, "loss": 0.76929653, "num_input_tokens_seen": 43327010, "step": 2002, "time_per_iteration": 2.6991024017333984 }, { "auxiliary_loss_clip": 0.01129169, "auxiliary_loss_mlp": 0.01066705, "balance_loss_clip": 1.05340302, "balance_loss_mlp": 1.0429939, "epoch": 0.12042687509394258, "flos": 20851028657280.0, "grad_norm": 2.117271428042382, "language_loss": 0.78029454, "learning_rate": 3.9148355010038185e-06, "loss": 0.80225325, "num_input_tokens_seen": 43345650, "step": 2003, "time_per_iteration": 2.731381416320801 }, { "auxiliary_loss_clip": 0.01163252, "auxiliary_loss_mlp": 0.01062886, "balance_loss_clip": 1.05728662, "balance_loss_mlp": 1.04073668, "epoch": 0.12048699834661056, "flos": 23878513451520.0, "grad_norm": 1.585850552088038, "language_loss": 0.72205627, "learning_rate": 3.914723024709793e-06, "loss": 0.74431765, "num_input_tokens_seen": 43365555, "step": 2004, "time_per_iteration": 2.725092649459839 }, { "auxiliary_loss_clip": 0.01160616, "auxiliary_loss_mlp": 0.01069457, "balance_loss_clip": 1.05870187, "balance_loss_mlp": 1.04645014, "epoch": 0.12054712159927852, "flos": 19756363726080.0, "grad_norm": 1.9357732467170252, "language_loss": 0.78415942, "learning_rate": 3.914610475809279e-06, "loss": 0.8064602, "num_input_tokens_seen": 43384990, "step": 2005, "time_per_iteration": 2.7232437133789062 }, { "auxiliary_loss_clip": 0.01073016, "auxiliary_loss_mlp": 0.00758901, "balance_loss_clip": 1.02995479, "balance_loss_mlp": 1.00011683, "epoch": 0.12060724485194649, "flos": 51672763123200.0, "grad_norm": 0.9264315537536937, "language_loss": 0.58087146, "learning_rate": 3.914497854306543e-06, "loss": 0.59919059, "num_input_tokens_seen": 43436335, "step": 2006, "time_per_iteration": 2.9570157527923584 }, { "auxiliary_loss_clip": 0.01155081, "auxiliary_loss_mlp": 0.01053472, "balance_loss_clip": 1.05803597, "balance_loss_mlp": 1.03299201, "epoch": 0.12066736810461445, "flos": 18990425088000.0, "grad_norm": 1.6109316320484448, "language_loss": 0.76524282, "learning_rate": 3.9143851602058575e-06, "loss": 0.78732836, "num_input_tokens_seen": 43456495, "step": 2007, "time_per_iteration": 2.763380289077759 }, { "auxiliary_loss_clip": 0.01147254, "auxiliary_loss_mlp": 0.01064209, "balance_loss_clip": 1.05931091, "balance_loss_mlp": 1.04177368, "epoch": 0.12072749135728243, "flos": 16471973882880.0, "grad_norm": 2.449779851562752, "language_loss": 0.83023942, "learning_rate": 3.914272393511494e-06, "loss": 0.85235405, "num_input_tokens_seen": 43473085, "step": 2008, "time_per_iteration": 2.7693119049072266 }, { "auxiliary_loss_clip": 0.01176157, "auxiliary_loss_mlp": 0.01052894, "balance_loss_clip": 1.0584172, "balance_loss_mlp": 1.03135288, "epoch": 0.1207876146099504, "flos": 18077108947200.0, "grad_norm": 2.203355340521787, "language_loss": 0.83835697, "learning_rate": 3.91415955422773e-06, "loss": 0.86064744, "num_input_tokens_seen": 43491135, "step": 2009, "time_per_iteration": 2.640944242477417 }, { "auxiliary_loss_clip": 0.01180076, "auxiliary_loss_mlp": 0.01053549, "balance_loss_clip": 1.06196725, "balance_loss_mlp": 1.02994514, "epoch": 0.12084773786261836, "flos": 21871573873920.0, "grad_norm": 1.6799099601218046, "language_loss": 0.83870012, "learning_rate": 3.914046642358844e-06, "loss": 0.8610363, "num_input_tokens_seen": 43510440, "step": 2010, "time_per_iteration": 2.716127634048462 }, { "auxiliary_loss_clip": 0.01145261, "auxiliary_loss_mlp": 0.00780804, "balance_loss_clip": 1.05555713, "balance_loss_mlp": 1.0000627, "epoch": 0.12090786111528634, "flos": 18333044328960.0, "grad_norm": 1.8933604390076018, "language_loss": 0.84194541, "learning_rate": 3.9139336579091174e-06, "loss": 0.86120605, "num_input_tokens_seen": 43530145, "step": 2011, "time_per_iteration": 2.73793625831604 }, { "auxiliary_loss_clip": 0.01148418, "auxiliary_loss_mlp": 0.01060974, "balance_loss_clip": 1.05480969, "balance_loss_mlp": 1.03905129, "epoch": 0.1209679843679543, "flos": 21105850717440.0, "grad_norm": 2.0524904800028154, "language_loss": 0.96236968, "learning_rate": 3.913820600882834e-06, "loss": 0.98446357, "num_input_tokens_seen": 43549315, "step": 2012, "time_per_iteration": 2.7269980907440186 }, { "auxiliary_loss_clip": 0.01146369, "auxiliary_loss_mlp": 0.01051396, "balance_loss_clip": 1.05808425, "balance_loss_mlp": 1.0289607, "epoch": 0.12102810762062227, "flos": 29241053585280.0, "grad_norm": 1.853151366811655, "language_loss": 0.80903435, "learning_rate": 3.913707471284283e-06, "loss": 0.83101201, "num_input_tokens_seen": 43569240, "step": 2013, "time_per_iteration": 2.740489959716797 }, { "auxiliary_loss_clip": 0.01124703, "auxiliary_loss_mlp": 0.0105341, "balance_loss_clip": 1.05300117, "balance_loss_mlp": 1.02962804, "epoch": 0.12108823087329025, "flos": 17930701111680.0, "grad_norm": 5.099975898232357, "language_loss": 0.77255923, "learning_rate": 3.9135942691177515e-06, "loss": 0.79434031, "num_input_tokens_seen": 43587710, "step": 2014, "time_per_iteration": 2.7361485958099365 }, { "auxiliary_loss_clip": 0.0116607, "auxiliary_loss_mlp": 0.01051056, "balance_loss_clip": 1.05832791, "balance_loss_mlp": 1.02791715, "epoch": 0.12114835412595822, "flos": 22091850028800.0, "grad_norm": 5.8570343294144465, "language_loss": 0.87169874, "learning_rate": 3.913480994387535e-06, "loss": 0.89387, "num_input_tokens_seen": 43606000, "step": 2015, "time_per_iteration": 2.6881515979766846 }, { "auxiliary_loss_clip": 0.01170382, "auxiliary_loss_mlp": 0.01051162, "balance_loss_clip": 1.05500197, "balance_loss_mlp": 1.0289886, "epoch": 0.12120847737862618, "flos": 20412343854720.0, "grad_norm": 2.087765239068409, "language_loss": 0.69146478, "learning_rate": 3.913367647097926e-06, "loss": 0.71368027, "num_input_tokens_seen": 43624815, "step": 2016, "time_per_iteration": 2.7096211910247803 }, { "auxiliary_loss_clip": 0.01152563, "auxiliary_loss_mlp": 0.0104714, "balance_loss_clip": 1.05737591, "balance_loss_mlp": 1.02390599, "epoch": 0.12126860063129415, "flos": 22309037614080.0, "grad_norm": 2.8043603396252865, "language_loss": 0.79858959, "learning_rate": 3.913254227253225e-06, "loss": 0.82058656, "num_input_tokens_seen": 43643960, "step": 2017, "time_per_iteration": 2.7042336463928223 }, { "auxiliary_loss_clip": 0.01156022, "auxiliary_loss_mlp": 0.0105052, "balance_loss_clip": 1.05479789, "balance_loss_mlp": 1.02740538, "epoch": 0.12132872388396213, "flos": 13699275235200.0, "grad_norm": 2.8700241463026654, "language_loss": 0.68828821, "learning_rate": 3.913140734857731e-06, "loss": 0.71035373, "num_input_tokens_seen": 43662650, "step": 2018, "time_per_iteration": 2.7015058994293213 }, { "auxiliary_loss_clip": 0.01136376, "auxiliary_loss_mlp": 0.01050749, "balance_loss_clip": 1.05524123, "balance_loss_mlp": 1.02873111, "epoch": 0.12138884713663009, "flos": 26466954307200.0, "grad_norm": 1.6132330771570709, "language_loss": 0.72476816, "learning_rate": 3.91302716991575e-06, "loss": 0.74663943, "num_input_tokens_seen": 43684205, "step": 2019, "time_per_iteration": 2.8956947326660156 }, { "auxiliary_loss_clip": 0.01107167, "auxiliary_loss_mlp": 0.01057916, "balance_loss_clip": 1.05286384, "balance_loss_mlp": 1.03482556, "epoch": 0.12144897038929806, "flos": 26141603892480.0, "grad_norm": 1.853626515444831, "language_loss": 0.92125106, "learning_rate": 3.912913532431586e-06, "loss": 0.94290185, "num_input_tokens_seen": 43706320, "step": 2020, "time_per_iteration": 2.9980764389038086 }, { "auxiliary_loss_clip": 0.0114145, "auxiliary_loss_mlp": 0.01055455, "balance_loss_clip": 1.05289125, "balance_loss_mlp": 1.03360391, "epoch": 0.12150909364196603, "flos": 24717530309760.0, "grad_norm": 1.9227427415613194, "language_loss": 0.7772885, "learning_rate": 3.912799822409549e-06, "loss": 0.79925752, "num_input_tokens_seen": 43724805, "step": 2021, "time_per_iteration": 3.01798939704895 }, { "auxiliary_loss_clip": 0.0117749, "auxiliary_loss_mlp": 0.01049007, "balance_loss_clip": 1.0610733, "balance_loss_mlp": 1.0277164, "epoch": 0.121569216894634, "flos": 25186990089600.0, "grad_norm": 2.054228820960504, "language_loss": 0.80712306, "learning_rate": 3.912686039853952e-06, "loss": 0.82938808, "num_input_tokens_seen": 43742320, "step": 2022, "time_per_iteration": 2.684309244155884 }, { "auxiliary_loss_clip": 0.01144749, "auxiliary_loss_mlp": 0.0106163, "balance_loss_clip": 1.055619, "balance_loss_mlp": 1.03697765, "epoch": 0.12162934014730196, "flos": 13444094039040.0, "grad_norm": 1.734031517866852, "language_loss": 0.84842217, "learning_rate": 3.912572184769108e-06, "loss": 0.87048596, "num_input_tokens_seen": 43760665, "step": 2023, "time_per_iteration": 2.6886441707611084 }, { "auxiliary_loss_clip": 0.01139348, "auxiliary_loss_mlp": 0.01053043, "balance_loss_clip": 1.05162323, "balance_loss_mlp": 1.03081048, "epoch": 0.12168946339996994, "flos": 16946138344320.0, "grad_norm": 2.3397199529221546, "language_loss": 0.85514021, "learning_rate": 3.912458257159335e-06, "loss": 0.87706411, "num_input_tokens_seen": 43779020, "step": 2024, "time_per_iteration": 2.8043718338012695 }, { "auxiliary_loss_clip": 0.01169767, "auxiliary_loss_mlp": 0.01055534, "balance_loss_clip": 1.05277538, "balance_loss_mlp": 1.03389716, "epoch": 0.12174958665263791, "flos": 29821585196160.0, "grad_norm": 1.8432491304976684, "language_loss": 0.72088945, "learning_rate": 3.912344257028954e-06, "loss": 0.74314243, "num_input_tokens_seen": 43798850, "step": 2025, "time_per_iteration": 2.704876184463501 }, { "auxiliary_loss_clip": 0.01148564, "auxiliary_loss_mlp": 0.01047618, "balance_loss_clip": 1.05486572, "balance_loss_mlp": 1.02555275, "epoch": 0.12180970990530587, "flos": 24641902224000.0, "grad_norm": 1.4969552271445652, "language_loss": 0.76075011, "learning_rate": 3.912230184382286e-06, "loss": 0.78271192, "num_input_tokens_seen": 43820130, "step": 2026, "time_per_iteration": 2.6957921981811523 }, { "auxiliary_loss_clip": 0.01147374, "auxiliary_loss_mlp": 0.01046261, "balance_loss_clip": 1.05086374, "balance_loss_mlp": 1.02474427, "epoch": 0.12186983315797385, "flos": 20521691832960.0, "grad_norm": 2.2064263994277478, "language_loss": 0.88769746, "learning_rate": 3.912116039223659e-06, "loss": 0.90963376, "num_input_tokens_seen": 43838485, "step": 2027, "time_per_iteration": 2.6847639083862305 }, { "auxiliary_loss_clip": 0.01143778, "auxiliary_loss_mlp": 0.01056715, "balance_loss_clip": 1.05258501, "balance_loss_mlp": 1.03667617, "epoch": 0.12192995641064182, "flos": 27818344719360.0, "grad_norm": 1.5725885574076592, "language_loss": 0.75544459, "learning_rate": 3.912001821557399e-06, "loss": 0.77744961, "num_input_tokens_seen": 43859080, "step": 2028, "time_per_iteration": 2.7706027030944824 }, { "auxiliary_loss_clip": 0.01123185, "auxiliary_loss_mlp": 0.01057136, "balance_loss_clip": 1.0518471, "balance_loss_mlp": 1.03554714, "epoch": 0.12199007966330978, "flos": 22017119783040.0, "grad_norm": 2.0550419223931193, "language_loss": 0.76802504, "learning_rate": 3.911887531387839e-06, "loss": 0.78982824, "num_input_tokens_seen": 43879030, "step": 2029, "time_per_iteration": 2.732637405395508 }, { "auxiliary_loss_clip": 0.01156591, "auxiliary_loss_mlp": 0.01052355, "balance_loss_clip": 1.05253625, "balance_loss_mlp": 1.03107572, "epoch": 0.12205020291597775, "flos": 23295216493440.0, "grad_norm": 1.707195979328818, "language_loss": 0.79164296, "learning_rate": 3.911773168719313e-06, "loss": 0.81373239, "num_input_tokens_seen": 43898505, "step": 2030, "time_per_iteration": 2.7254061698913574 }, { "auxiliary_loss_clip": 0.0116997, "auxiliary_loss_mlp": 0.01051357, "balance_loss_clip": 1.05618095, "balance_loss_mlp": 1.02930319, "epoch": 0.12211032616864573, "flos": 26031609469440.0, "grad_norm": 3.038077546298312, "language_loss": 0.74411637, "learning_rate": 3.911658733556155e-06, "loss": 0.76632965, "num_input_tokens_seen": 43917945, "step": 2031, "time_per_iteration": 2.6711080074310303 }, { "auxiliary_loss_clip": 0.01174332, "auxiliary_loss_mlp": 0.01045812, "balance_loss_clip": 1.05888343, "balance_loss_mlp": 1.02545118, "epoch": 0.12217044942131369, "flos": 20410943224320.0, "grad_norm": 1.7636188348969384, "language_loss": 0.75230348, "learning_rate": 3.911544225902707e-06, "loss": 0.7745049, "num_input_tokens_seen": 43937385, "step": 2032, "time_per_iteration": 2.7134530544281006 }, { "auxiliary_loss_clip": 0.01152363, "auxiliary_loss_mlp": 0.01045735, "balance_loss_clip": 1.05129802, "balance_loss_mlp": 1.02538586, "epoch": 0.12223057267398166, "flos": 22857142222080.0, "grad_norm": 1.5809359138264147, "language_loss": 0.89502287, "learning_rate": 3.911429645763311e-06, "loss": 0.91700387, "num_input_tokens_seen": 43958130, "step": 2033, "time_per_iteration": 2.7105965614318848 }, { "auxiliary_loss_clip": 0.01155694, "auxiliary_loss_mlp": 0.01051169, "balance_loss_clip": 1.05740523, "balance_loss_mlp": 1.03005767, "epoch": 0.12229069592664964, "flos": 20047563285120.0, "grad_norm": 1.9580868921695649, "language_loss": 0.65195286, "learning_rate": 3.911314993142311e-06, "loss": 0.67402148, "num_input_tokens_seen": 43976800, "step": 2034, "time_per_iteration": 4.222668886184692 }, { "auxiliary_loss_clip": 0.01152239, "auxiliary_loss_mlp": 0.01055659, "balance_loss_clip": 1.05550218, "balance_loss_mlp": 1.0327704, "epoch": 0.1223508191793176, "flos": 22274240313600.0, "grad_norm": 1.6376942269871653, "language_loss": 0.76459455, "learning_rate": 3.911200268044055e-06, "loss": 0.78667355, "num_input_tokens_seen": 43996620, "step": 2035, "time_per_iteration": 2.7306556701660156 }, { "auxiliary_loss_clip": 0.01176703, "auxiliary_loss_mlp": 0.01050008, "balance_loss_clip": 1.0577215, "balance_loss_mlp": 1.02798975, "epoch": 0.12241094243198557, "flos": 21285978445440.0, "grad_norm": 1.8460180606974623, "language_loss": 0.71294892, "learning_rate": 3.911085470472892e-06, "loss": 0.73521602, "num_input_tokens_seen": 44016175, "step": 2036, "time_per_iteration": 2.7327258586883545 }, { "auxiliary_loss_clip": 0.01144473, "auxiliary_loss_mlp": 0.01058389, "balance_loss_clip": 1.05778408, "balance_loss_mlp": 1.03623962, "epoch": 0.12247106568465355, "flos": 17382381022080.0, "grad_norm": 1.5772021569883852, "language_loss": 0.83130831, "learning_rate": 3.910970600433178e-06, "loss": 0.85333693, "num_input_tokens_seen": 44035060, "step": 2037, "time_per_iteration": 4.248440742492676 }, { "auxiliary_loss_clip": 0.01153641, "auxiliary_loss_mlp": 0.01060257, "balance_loss_clip": 1.0556947, "balance_loss_mlp": 1.0366174, "epoch": 0.12253118893732151, "flos": 27045438842880.0, "grad_norm": 2.676780030246967, "language_loss": 0.79765236, "learning_rate": 3.910855657929267e-06, "loss": 0.81979132, "num_input_tokens_seen": 44053330, "step": 2038, "time_per_iteration": 2.7321341037750244 }, { "auxiliary_loss_clip": 0.010642, "auxiliary_loss_mlp": 0.00759248, "balance_loss_clip": 1.02961969, "balance_loss_mlp": 1.00006962, "epoch": 0.12259131218998948, "flos": 53861518368000.0, "grad_norm": 0.8248048644272604, "language_loss": 0.58659601, "learning_rate": 3.910740642965518e-06, "loss": 0.6048305, "num_input_tokens_seen": 44107575, "step": 2039, "time_per_iteration": 4.739040851593018 }, { "auxiliary_loss_clip": 0.01128, "auxiliary_loss_mlp": 0.01064411, "balance_loss_clip": 1.05292714, "balance_loss_mlp": 1.03912663, "epoch": 0.12265143544265744, "flos": 17891917401600.0, "grad_norm": 2.1548467753138136, "language_loss": 0.80099291, "learning_rate": 3.910625555546292e-06, "loss": 0.82291704, "num_input_tokens_seen": 44126075, "step": 2040, "time_per_iteration": 2.723247766494751 }, { "auxiliary_loss_clip": 0.01149343, "auxiliary_loss_mlp": 0.01058534, "balance_loss_clip": 1.05517352, "balance_loss_mlp": 1.03673029, "epoch": 0.12271155869532542, "flos": 21799932197760.0, "grad_norm": 1.8247690225218605, "language_loss": 0.82841176, "learning_rate": 3.910510395675953e-06, "loss": 0.85049051, "num_input_tokens_seen": 44145605, "step": 2041, "time_per_iteration": 2.699110984802246 }, { "auxiliary_loss_clip": 0.01136001, "auxiliary_loss_mlp": 0.01053451, "balance_loss_clip": 1.05120957, "balance_loss_mlp": 1.03061032, "epoch": 0.12277168194799339, "flos": 19828759587840.0, "grad_norm": 1.9386136063873771, "language_loss": 0.67272276, "learning_rate": 3.9103951633588694e-06, "loss": 0.69461727, "num_input_tokens_seen": 44164770, "step": 2042, "time_per_iteration": 2.7042133808135986 }, { "auxiliary_loss_clip": 0.01133115, "auxiliary_loss_mlp": 0.01056941, "balance_loss_clip": 1.05079007, "balance_loss_mlp": 1.03517294, "epoch": 0.12283180520066135, "flos": 23221024951680.0, "grad_norm": 1.912164915278887, "language_loss": 0.81765604, "learning_rate": 3.910279858599409e-06, "loss": 0.83955657, "num_input_tokens_seen": 44184025, "step": 2043, "time_per_iteration": 2.6942050457000732 }, { "auxiliary_loss_clip": 0.01146416, "auxiliary_loss_mlp": 0.01052365, "balance_loss_clip": 1.05161905, "balance_loss_mlp": 1.03040695, "epoch": 0.12289192845332933, "flos": 18588476920320.0, "grad_norm": 1.7894844734354058, "language_loss": 0.80192459, "learning_rate": 3.910164481401946e-06, "loss": 0.82391244, "num_input_tokens_seen": 44202950, "step": 2044, "time_per_iteration": 2.6227192878723145 }, { "auxiliary_loss_clip": 0.01116285, "auxiliary_loss_mlp": 0.01052013, "balance_loss_clip": 1.05284619, "balance_loss_mlp": 1.03055525, "epoch": 0.1229520517059973, "flos": 25769532862080.0, "grad_norm": 1.7152742607840916, "language_loss": 0.7794897, "learning_rate": 3.910049031770853e-06, "loss": 0.80117267, "num_input_tokens_seen": 44221115, "step": 2045, "time_per_iteration": 2.769017219543457 }, { "auxiliary_loss_clip": 0.01163545, "auxiliary_loss_mlp": 0.01060468, "balance_loss_clip": 1.05796146, "balance_loss_mlp": 1.03827095, "epoch": 0.12301217495866526, "flos": 20887154760960.0, "grad_norm": 1.852572781372854, "language_loss": 0.67284262, "learning_rate": 3.90993350971051e-06, "loss": 0.69508278, "num_input_tokens_seen": 44240575, "step": 2046, "time_per_iteration": 2.6377944946289062 }, { "auxiliary_loss_clip": 0.01173803, "auxiliary_loss_mlp": 0.01053755, "balance_loss_clip": 1.06010675, "balance_loss_mlp": 1.03202295, "epoch": 0.12307229821133324, "flos": 22378811783040.0, "grad_norm": 4.982373490718116, "language_loss": 0.72730684, "learning_rate": 3.909817915225297e-06, "loss": 0.74958241, "num_input_tokens_seen": 44257145, "step": 2047, "time_per_iteration": 2.5791239738464355 }, { "auxiliary_loss_clip": 0.01155159, "auxiliary_loss_mlp": 0.01060632, "balance_loss_clip": 1.05398846, "balance_loss_mlp": 1.03817296, "epoch": 0.1231324214640012, "flos": 23367396873600.0, "grad_norm": 1.8194194024321948, "language_loss": 0.76583183, "learning_rate": 3.909702248319597e-06, "loss": 0.78798974, "num_input_tokens_seen": 44278035, "step": 2048, "time_per_iteration": 2.6997592449188232 }, { "auxiliary_loss_clip": 0.01146796, "auxiliary_loss_mlp": 0.01047309, "balance_loss_clip": 1.05524468, "balance_loss_mlp": 1.02798486, "epoch": 0.12319254471666917, "flos": 23767154311680.0, "grad_norm": 1.8097490634569602, "language_loss": 0.85359102, "learning_rate": 3.909586508997797e-06, "loss": 0.87553203, "num_input_tokens_seen": 44296980, "step": 2049, "time_per_iteration": 2.739617109298706 }, { "auxiliary_loss_clip": 0.01120276, "auxiliary_loss_mlp": 0.01050145, "balance_loss_clip": 1.0533725, "balance_loss_mlp": 1.02887857, "epoch": 0.12325266796933713, "flos": 23550146294400.0, "grad_norm": 2.6582136339172724, "language_loss": 0.75563407, "learning_rate": 3.909470697264285e-06, "loss": 0.77733827, "num_input_tokens_seen": 44318005, "step": 2050, "time_per_iteration": 2.7814078330993652 }, { "auxiliary_loss_clip": 0.01138568, "auxiliary_loss_mlp": 0.01057939, "balance_loss_clip": 1.05428278, "balance_loss_mlp": 1.03608823, "epoch": 0.12331279122200511, "flos": 24423996366720.0, "grad_norm": 1.81408967902731, "language_loss": 0.81166679, "learning_rate": 3.909354813123452e-06, "loss": 0.83363187, "num_input_tokens_seen": 44335260, "step": 2051, "time_per_iteration": 2.7555224895477295 }, { "auxiliary_loss_clip": 0.01171646, "auxiliary_loss_mlp": 0.00779218, "balance_loss_clip": 1.05882978, "balance_loss_mlp": 0.99996465, "epoch": 0.12337291447467308, "flos": 25484294960640.0, "grad_norm": 1.8885516327307212, "language_loss": 0.80445349, "learning_rate": 3.909238856579693e-06, "loss": 0.82396215, "num_input_tokens_seen": 44355315, "step": 2052, "time_per_iteration": 2.7676405906677246 }, { "auxiliary_loss_clip": 0.01165489, "auxiliary_loss_mlp": 0.010569, "balance_loss_clip": 1.0581975, "balance_loss_mlp": 1.03537059, "epoch": 0.12343303772734104, "flos": 23550002640000.0, "grad_norm": 2.171205541070781, "language_loss": 0.73676848, "learning_rate": 3.909122827637406e-06, "loss": 0.75899243, "num_input_tokens_seen": 44373020, "step": 2053, "time_per_iteration": 2.648609161376953 }, { "auxiliary_loss_clip": 0.01168883, "auxiliary_loss_mlp": 0.00778478, "balance_loss_clip": 1.05302441, "balance_loss_mlp": 0.99995315, "epoch": 0.12349316098000902, "flos": 47557074867840.0, "grad_norm": 1.5051513438882418, "language_loss": 0.7413671, "learning_rate": 3.909006726300991e-06, "loss": 0.76084077, "num_input_tokens_seen": 44397525, "step": 2054, "time_per_iteration": 2.871469020843506 }, { "auxiliary_loss_clip": 0.01147607, "auxiliary_loss_mlp": 0.01044612, "balance_loss_clip": 1.05402803, "balance_loss_mlp": 1.02482307, "epoch": 0.12355328423267699, "flos": 25045969294080.0, "grad_norm": 4.50189877271012, "language_loss": 0.85417157, "learning_rate": 3.908890552574849e-06, "loss": 0.8760938, "num_input_tokens_seen": 44415890, "step": 2055, "time_per_iteration": 2.7136077880859375 }, { "auxiliary_loss_clip": 0.01133829, "auxiliary_loss_mlp": 0.01047458, "balance_loss_clip": 1.05999517, "balance_loss_mlp": 1.02802706, "epoch": 0.12361340748534495, "flos": 27709140395520.0, "grad_norm": 2.0629908776416688, "language_loss": 0.77506042, "learning_rate": 3.908774306463384e-06, "loss": 0.79687333, "num_input_tokens_seen": 44436625, "step": 2056, "time_per_iteration": 2.83107852935791 }, { "auxiliary_loss_clip": 0.01158234, "auxiliary_loss_mlp": 0.01055, "balance_loss_clip": 1.05444396, "balance_loss_mlp": 1.03405499, "epoch": 0.12367353073801293, "flos": 26140598311680.0, "grad_norm": 1.9893743253373262, "language_loss": 0.83361745, "learning_rate": 3.908657987971009e-06, "loss": 0.85574985, "num_input_tokens_seen": 44455265, "step": 2057, "time_per_iteration": 2.6987085342407227 }, { "auxiliary_loss_clip": 0.01141319, "auxiliary_loss_mlp": 0.01051708, "balance_loss_clip": 1.05057144, "balance_loss_mlp": 1.02991605, "epoch": 0.1237336539906809, "flos": 25156035544320.0, "grad_norm": 1.4905135493793764, "language_loss": 0.77818203, "learning_rate": 3.90854159710213e-06, "loss": 0.80011231, "num_input_tokens_seen": 44475815, "step": 2058, "time_per_iteration": 2.7149016857147217 }, { "auxiliary_loss_clip": 0.01138087, "auxiliary_loss_mlp": 0.01058134, "balance_loss_clip": 1.05117273, "balance_loss_mlp": 1.03482866, "epoch": 0.12379377724334886, "flos": 15304589867520.0, "grad_norm": 1.8387803476985631, "language_loss": 0.8342883, "learning_rate": 3.9084251338611624e-06, "loss": 0.85625052, "num_input_tokens_seen": 44494045, "step": 2059, "time_per_iteration": 2.7030091285705566 }, { "auxiliary_loss_clip": 0.01133517, "auxiliary_loss_mlp": 0.01057399, "balance_loss_clip": 1.05123472, "balance_loss_mlp": 1.03445077, "epoch": 0.12385390049601683, "flos": 21316717509120.0, "grad_norm": 2.7478129466394217, "language_loss": 0.81420219, "learning_rate": 3.908308598252523e-06, "loss": 0.83611137, "num_input_tokens_seen": 44509120, "step": 2060, "time_per_iteration": 2.738499402999878 }, { "auxiliary_loss_clip": 0.01150334, "auxiliary_loss_mlp": 0.01054424, "balance_loss_clip": 1.05367386, "balance_loss_mlp": 1.0315125, "epoch": 0.1239140237486848, "flos": 15116309752320.0, "grad_norm": 1.8699548955873522, "language_loss": 0.86224365, "learning_rate": 3.9081919902806306e-06, "loss": 0.88429129, "num_input_tokens_seen": 44525780, "step": 2061, "time_per_iteration": 2.6492960453033447 }, { "auxiliary_loss_clip": 0.0115523, "auxiliary_loss_mlp": 0.01050307, "balance_loss_clip": 1.05506253, "balance_loss_mlp": 1.03031528, "epoch": 0.12397414700135277, "flos": 21976791788160.0, "grad_norm": 2.006361909654615, "language_loss": 0.84949362, "learning_rate": 3.908075309949906e-06, "loss": 0.87154901, "num_input_tokens_seen": 44543125, "step": 2062, "time_per_iteration": 2.5925393104553223 }, { "auxiliary_loss_clip": 0.01124676, "auxiliary_loss_mlp": 0.01058304, "balance_loss_clip": 1.05198252, "balance_loss_mlp": 1.03498697, "epoch": 0.12403427025402074, "flos": 13400892956160.0, "grad_norm": 1.6181471799462952, "language_loss": 0.78765064, "learning_rate": 3.907958557264774e-06, "loss": 0.80948043, "num_input_tokens_seen": 44560275, "step": 2063, "time_per_iteration": 2.7551674842834473 }, { "auxiliary_loss_clip": 0.01124369, "auxiliary_loss_mlp": 0.01057465, "balance_loss_clip": 1.05492854, "balance_loss_mlp": 1.03450513, "epoch": 0.12409439350668872, "flos": 15304374385920.0, "grad_norm": 2.9315517002695017, "language_loss": 0.79452097, "learning_rate": 3.907841732229663e-06, "loss": 0.81633931, "num_input_tokens_seen": 44577640, "step": 2064, "time_per_iteration": 2.699711322784424 }, { "auxiliary_loss_clip": 0.01144709, "auxiliary_loss_mlp": 0.01058768, "balance_loss_clip": 1.05316699, "balance_loss_mlp": 1.03847849, "epoch": 0.12415451675935668, "flos": 25009376313600.0, "grad_norm": 2.5611248351266016, "language_loss": 0.92676973, "learning_rate": 3.907724834849002e-06, "loss": 0.9488045, "num_input_tokens_seen": 44594860, "step": 2065, "time_per_iteration": 2.7114996910095215 }, { "auxiliary_loss_clip": 0.01147841, "auxiliary_loss_mlp": 0.01052058, "balance_loss_clip": 1.05113554, "balance_loss_mlp": 1.02943158, "epoch": 0.12421464001202465, "flos": 23659673840640.0, "grad_norm": 1.7498294279318665, "language_loss": 0.80540735, "learning_rate": 3.907607865127225e-06, "loss": 0.82740629, "num_input_tokens_seen": 44614780, "step": 2066, "time_per_iteration": 2.6958389282226562 }, { "auxiliary_loss_clip": 0.01030831, "auxiliary_loss_mlp": 0.01051436, "balance_loss_clip": 1.02768898, "balance_loss_mlp": 1.04884958, "epoch": 0.12427476326469263, "flos": 65732904345600.0, "grad_norm": 0.8715885531008962, "language_loss": 0.63299954, "learning_rate": 3.907490823068766e-06, "loss": 0.6538223, "num_input_tokens_seen": 44671240, "step": 2067, "time_per_iteration": 3.200000762939453 }, { "auxiliary_loss_clip": 0.01117858, "auxiliary_loss_mlp": 0.01057985, "balance_loss_clip": 1.04878855, "balance_loss_mlp": 1.0344646, "epoch": 0.12433488651736059, "flos": 24535427333760.0, "grad_norm": 1.9218217735084064, "language_loss": 0.93783462, "learning_rate": 3.907373708678063e-06, "loss": 0.959593, "num_input_tokens_seen": 44691050, "step": 2068, "time_per_iteration": 2.7631025314331055 }, { "auxiliary_loss_clip": 0.01166393, "auxiliary_loss_mlp": 0.0105657, "balance_loss_clip": 1.05994427, "balance_loss_mlp": 1.03697169, "epoch": 0.12439500977002856, "flos": 21031659175680.0, "grad_norm": 1.8717926968048342, "language_loss": 0.80861229, "learning_rate": 3.9072565219595596e-06, "loss": 0.83084196, "num_input_tokens_seen": 44709850, "step": 2069, "time_per_iteration": 2.6630098819732666 }, { "auxiliary_loss_clip": 0.01113262, "auxiliary_loss_mlp": 0.01062592, "balance_loss_clip": 1.04863238, "balance_loss_mlp": 1.03963184, "epoch": 0.12445513302269653, "flos": 26830621555200.0, "grad_norm": 1.5649570979854035, "language_loss": 0.777978, "learning_rate": 3.907139262917696e-06, "loss": 0.79973656, "num_input_tokens_seen": 44731475, "step": 2070, "time_per_iteration": 2.7750463485717773 }, { "auxiliary_loss_clip": 0.01156875, "auxiliary_loss_mlp": 0.01052509, "balance_loss_clip": 1.05520415, "balance_loss_mlp": 1.03055048, "epoch": 0.1245152562753645, "flos": 18368919037440.0, "grad_norm": 2.2051981544638166, "language_loss": 0.80743957, "learning_rate": 3.907021931556922e-06, "loss": 0.8295334, "num_input_tokens_seen": 44749685, "step": 2071, "time_per_iteration": 2.654171943664551 }, { "auxiliary_loss_clip": 0.01154683, "auxiliary_loss_mlp": 0.01055767, "balance_loss_clip": 1.05492425, "balance_loss_mlp": 1.03405952, "epoch": 0.12457537952803246, "flos": 33107986200960.0, "grad_norm": 2.118828414072521, "language_loss": 0.78278041, "learning_rate": 3.906904527881684e-06, "loss": 0.80488491, "num_input_tokens_seen": 44772165, "step": 2072, "time_per_iteration": 2.753159284591675 }, { "auxiliary_loss_clip": 0.0114568, "auxiliary_loss_mlp": 0.01055287, "balance_loss_clip": 1.05651307, "balance_loss_mlp": 1.03381729, "epoch": 0.12463550278070043, "flos": 22270217990400.0, "grad_norm": 7.360489773093417, "language_loss": 0.752267, "learning_rate": 3.9067870518964355e-06, "loss": 0.77427667, "num_input_tokens_seen": 44790580, "step": 2073, "time_per_iteration": 2.6561899185180664 }, { "auxiliary_loss_clip": 0.01096485, "auxiliary_loss_mlp": 0.01053193, "balance_loss_clip": 1.04471385, "balance_loss_mlp": 1.03086543, "epoch": 0.12469562603336841, "flos": 14679025580160.0, "grad_norm": 1.9234955386089483, "language_loss": 0.90560025, "learning_rate": 3.906669503605631e-06, "loss": 0.92709696, "num_input_tokens_seen": 44806730, "step": 2074, "time_per_iteration": 2.7846343517303467 }, { "auxiliary_loss_clip": 0.01105332, "auxiliary_loss_mlp": 0.01056651, "balance_loss_clip": 1.04977274, "balance_loss_mlp": 1.03346491, "epoch": 0.12475574928603637, "flos": 24644775312000.0, "grad_norm": 2.8321626325497493, "language_loss": 0.83836985, "learning_rate": 3.906551883013728e-06, "loss": 0.8599897, "num_input_tokens_seen": 44825550, "step": 2075, "time_per_iteration": 4.412928342819214 }, { "auxiliary_loss_clip": 0.01107078, "auxiliary_loss_mlp": 0.01062819, "balance_loss_clip": 1.04380202, "balance_loss_mlp": 1.03972864, "epoch": 0.12481587253870434, "flos": 21762980081280.0, "grad_norm": 2.042892519020311, "language_loss": 0.73648787, "learning_rate": 3.9064341901251865e-06, "loss": 0.75818682, "num_input_tokens_seen": 44844155, "step": 2076, "time_per_iteration": 5.925223112106323 }, { "auxiliary_loss_clip": 0.01101731, "auxiliary_loss_mlp": 0.01048176, "balance_loss_clip": 1.04774427, "balance_loss_mlp": 1.02751708, "epoch": 0.12487599579137232, "flos": 21432529935360.0, "grad_norm": 1.8779339700875872, "language_loss": 0.7622484, "learning_rate": 3.906316424944469e-06, "loss": 0.78374755, "num_input_tokens_seen": 44863780, "step": 2077, "time_per_iteration": 2.70566987991333 }, { "auxiliary_loss_clip": 0.01156274, "auxiliary_loss_mlp": 0.01062042, "balance_loss_clip": 1.05365288, "balance_loss_mlp": 1.04001164, "epoch": 0.12493611904404028, "flos": 16107624276480.0, "grad_norm": 2.022280968605665, "language_loss": 0.82290226, "learning_rate": 3.906198587476043e-06, "loss": 0.84508544, "num_input_tokens_seen": 44881480, "step": 2078, "time_per_iteration": 4.302385568618774 }, { "auxiliary_loss_clip": 0.01144821, "auxiliary_loss_mlp": 0.01050482, "balance_loss_clip": 1.05281842, "balance_loss_mlp": 1.02855957, "epoch": 0.12499624229670825, "flos": 21580266574080.0, "grad_norm": 1.6413520418295044, "language_loss": 0.75195324, "learning_rate": 3.906080677724374e-06, "loss": 0.77390629, "num_input_tokens_seen": 44900390, "step": 2079, "time_per_iteration": 2.6915946006774902 }, { "auxiliary_loss_clip": 0.01166758, "auxiliary_loss_mlp": 0.01058474, "balance_loss_clip": 1.05881989, "balance_loss_mlp": 1.03696847, "epoch": 0.1250563655493762, "flos": 25699040421120.0, "grad_norm": 6.733284446627088, "language_loss": 0.83874094, "learning_rate": 3.905962695693935e-06, "loss": 0.86099327, "num_input_tokens_seen": 44920375, "step": 2080, "time_per_iteration": 2.7467572689056396 }, { "auxiliary_loss_clip": 0.01156163, "auxiliary_loss_mlp": 0.01059409, "balance_loss_clip": 1.05525088, "balance_loss_mlp": 1.03885686, "epoch": 0.12511648880204418, "flos": 16909509450240.0, "grad_norm": 1.8581885454518776, "language_loss": 0.84644079, "learning_rate": 3.9058446413892e-06, "loss": 0.86859655, "num_input_tokens_seen": 44938415, "step": 2081, "time_per_iteration": 2.685875654220581 }, { "auxiliary_loss_clip": 0.01156835, "auxiliary_loss_mlp": 0.01046398, "balance_loss_clip": 1.05375946, "balance_loss_mlp": 1.02594149, "epoch": 0.12517661205471217, "flos": 17567500740480.0, "grad_norm": 1.8191819349610059, "language_loss": 0.76739037, "learning_rate": 3.905726514814646e-06, "loss": 0.78942269, "num_input_tokens_seen": 44957135, "step": 2082, "time_per_iteration": 2.6133053302764893 }, { "auxiliary_loss_clip": 0.01152911, "auxiliary_loss_mlp": 0.0104632, "balance_loss_clip": 1.05701911, "balance_loss_mlp": 1.02463615, "epoch": 0.12523673530738014, "flos": 16033791870720.0, "grad_norm": 2.5415589476696265, "language_loss": 0.79044539, "learning_rate": 3.9056083159747495e-06, "loss": 0.81243765, "num_input_tokens_seen": 44974480, "step": 2083, "time_per_iteration": 2.6963307857513428 }, { "auxiliary_loss_clip": 0.01147874, "auxiliary_loss_mlp": 0.01047351, "balance_loss_clip": 1.05509973, "balance_loss_mlp": 1.02421284, "epoch": 0.1252968585600481, "flos": 18807747494400.0, "grad_norm": 2.1696249857299, "language_loss": 0.89831448, "learning_rate": 3.9054900448739966e-06, "loss": 0.92026675, "num_input_tokens_seen": 44990310, "step": 2084, "time_per_iteration": 2.6770403385162354 }, { "auxiliary_loss_clip": 0.01131068, "auxiliary_loss_mlp": 0.01048299, "balance_loss_clip": 1.05299771, "balance_loss_mlp": 1.02729464, "epoch": 0.12535698181271607, "flos": 27271568914560.0, "grad_norm": 1.8896331095253402, "language_loss": 0.80354226, "learning_rate": 3.905371701516869e-06, "loss": 0.82533598, "num_input_tokens_seen": 45010720, "step": 2085, "time_per_iteration": 2.749783515930176 }, { "auxiliary_loss_clip": 0.01170318, "auxiliary_loss_mlp": 0.01051018, "balance_loss_clip": 1.05725896, "balance_loss_mlp": 1.03001356, "epoch": 0.12541710506538403, "flos": 22054107813120.0, "grad_norm": 1.8300316094254767, "language_loss": 0.88228154, "learning_rate": 3.905253285907856e-06, "loss": 0.90449488, "num_input_tokens_seen": 45030360, "step": 2086, "time_per_iteration": 2.603515148162842 }, { "auxiliary_loss_clip": 0.01134598, "auxiliary_loss_mlp": 0.01044925, "balance_loss_clip": 1.05278981, "balance_loss_mlp": 1.02522027, "epoch": 0.125477228318052, "flos": 12603173760000.0, "grad_norm": 2.0471238132540344, "language_loss": 0.86819696, "learning_rate": 3.905134798051447e-06, "loss": 0.88999224, "num_input_tokens_seen": 45045085, "step": 2087, "time_per_iteration": 2.6265859603881836 }, { "auxiliary_loss_clip": 0.01146999, "auxiliary_loss_mlp": 0.01058875, "balance_loss_clip": 1.05599046, "balance_loss_mlp": 1.03651142, "epoch": 0.12553735157071996, "flos": 23878549365120.0, "grad_norm": 2.3362397674907758, "language_loss": 0.73027468, "learning_rate": 3.905016237952136e-06, "loss": 0.75233346, "num_input_tokens_seen": 45065145, "step": 2088, "time_per_iteration": 2.65324330329895 }, { "auxiliary_loss_clip": 0.01062529, "auxiliary_loss_mlp": 0.01013405, "balance_loss_clip": 1.02985716, "balance_loss_mlp": 1.01079392, "epoch": 0.12559747482338796, "flos": 69920841830400.0, "grad_norm": 0.7742255614948045, "language_loss": 0.61767036, "learning_rate": 3.904897605614418e-06, "loss": 0.6384297, "num_input_tokens_seen": 45126230, "step": 2089, "time_per_iteration": 3.1219804286956787 }, { "auxiliary_loss_clip": 0.01149606, "auxiliary_loss_mlp": 0.01060841, "balance_loss_clip": 1.05670094, "balance_loss_mlp": 1.0388943, "epoch": 0.12565759807605592, "flos": 24279563779200.0, "grad_norm": 1.817095421446176, "language_loss": 0.7781918, "learning_rate": 3.904778901042793e-06, "loss": 0.80029625, "num_input_tokens_seen": 45145545, "step": 2090, "time_per_iteration": 2.700425863265991 }, { "auxiliary_loss_clip": 0.01046946, "auxiliary_loss_mlp": 0.01013884, "balance_loss_clip": 1.03125095, "balance_loss_mlp": 1.01101136, "epoch": 0.12571772132872389, "flos": 56451180286080.0, "grad_norm": 0.760599485634597, "language_loss": 0.59434772, "learning_rate": 3.90466012424176e-06, "loss": 0.61495602, "num_input_tokens_seen": 45206845, "step": 2091, "time_per_iteration": 3.0814294815063477 }, { "auxiliary_loss_clip": 0.01159814, "auxiliary_loss_mlp": 0.01060546, "balance_loss_clip": 1.05760789, "balance_loss_mlp": 1.041067, "epoch": 0.12577784458139185, "flos": 41245846675200.0, "grad_norm": 1.6552462178493936, "language_loss": 0.62916517, "learning_rate": 3.904541275215825e-06, "loss": 0.6513688, "num_input_tokens_seen": 45228495, "step": 2092, "time_per_iteration": 2.7813880443573 }, { "auxiliary_loss_clip": 0.01147016, "auxiliary_loss_mlp": 0.01061963, "balance_loss_clip": 1.05395663, "balance_loss_mlp": 1.04069614, "epoch": 0.12583796783405982, "flos": 19755501799680.0, "grad_norm": 2.279616692029291, "language_loss": 0.80507946, "learning_rate": 3.904422353969493e-06, "loss": 0.82716924, "num_input_tokens_seen": 45245720, "step": 2093, "time_per_iteration": 2.6768014430999756 }, { "auxiliary_loss_clip": 0.01146976, "auxiliary_loss_mlp": 0.01075616, "balance_loss_clip": 1.0524025, "balance_loss_mlp": 1.05380058, "epoch": 0.12589809108672778, "flos": 22602104680320.0, "grad_norm": 1.7347385846840702, "language_loss": 0.76003867, "learning_rate": 3.904303360507276e-06, "loss": 0.78226459, "num_input_tokens_seen": 45265650, "step": 2094, "time_per_iteration": 2.6730611324310303 }, { "auxiliary_loss_clip": 0.01117887, "auxiliary_loss_mlp": 0.01069309, "balance_loss_clip": 1.0500071, "balance_loss_mlp": 1.04892457, "epoch": 0.12595821433939577, "flos": 45222845541120.0, "grad_norm": 1.5703706409155747, "language_loss": 0.76664734, "learning_rate": 3.9041842948336835e-06, "loss": 0.78851926, "num_input_tokens_seen": 45287790, "step": 2095, "time_per_iteration": 2.958367109298706 }, { "auxiliary_loss_clip": 0.01147751, "auxiliary_loss_mlp": 0.01058477, "balance_loss_clip": 1.05202031, "balance_loss_mlp": 1.03782988, "epoch": 0.12601833759206374, "flos": 14319811618560.0, "grad_norm": 2.2556524892449326, "language_loss": 0.83266854, "learning_rate": 3.904065156953232e-06, "loss": 0.85473078, "num_input_tokens_seen": 45305720, "step": 2096, "time_per_iteration": 2.7097342014312744 }, { "auxiliary_loss_clip": 0.01163652, "auxiliary_loss_mlp": 0.01056552, "balance_loss_clip": 1.05806553, "balance_loss_mlp": 1.03577375, "epoch": 0.1260784608447317, "flos": 21288241002240.0, "grad_norm": 1.7589400475615893, "language_loss": 0.75478256, "learning_rate": 3.903945946870439e-06, "loss": 0.77698463, "num_input_tokens_seen": 45325290, "step": 2097, "time_per_iteration": 2.642056703567505 }, { "auxiliary_loss_clip": 0.01156719, "auxiliary_loss_mlp": 0.01063976, "balance_loss_clip": 1.05648863, "balance_loss_mlp": 1.04527175, "epoch": 0.12613858409739967, "flos": 26251311006720.0, "grad_norm": 1.8828235460619742, "language_loss": 0.87110066, "learning_rate": 3.9038266645898246e-06, "loss": 0.89330757, "num_input_tokens_seen": 45344465, "step": 2098, "time_per_iteration": 2.63826584815979 }, { "auxiliary_loss_clip": 0.01117414, "auxiliary_loss_mlp": 0.01058025, "balance_loss_clip": 1.04983974, "balance_loss_mlp": 1.03475559, "epoch": 0.12619870735006763, "flos": 21579979265280.0, "grad_norm": 1.8855647331078333, "language_loss": 0.69494271, "learning_rate": 3.903707310115912e-06, "loss": 0.7166971, "num_input_tokens_seen": 45362465, "step": 2099, "time_per_iteration": 2.7813057899475098 }, { "auxiliary_loss_clip": 0.01142696, "auxiliary_loss_mlp": 0.01061431, "balance_loss_clip": 1.04979372, "balance_loss_mlp": 1.03923464, "epoch": 0.1262588306027356, "flos": 23367037737600.0, "grad_norm": 2.0457253500590498, "language_loss": 0.81949925, "learning_rate": 3.903587883453228e-06, "loss": 0.84154058, "num_input_tokens_seen": 45382700, "step": 2100, "time_per_iteration": 2.704871416091919 }, { "auxiliary_loss_clip": 0.01159613, "auxiliary_loss_mlp": 0.01055067, "balance_loss_clip": 1.0620985, "balance_loss_mlp": 1.03408623, "epoch": 0.12631895385540357, "flos": 23949185460480.0, "grad_norm": 1.7810176086536167, "language_loss": 0.80399859, "learning_rate": 3.903468384606302e-06, "loss": 0.82614541, "num_input_tokens_seen": 45401005, "step": 2101, "time_per_iteration": 2.7071452140808105 }, { "auxiliary_loss_clip": 0.0106985, "auxiliary_loss_mlp": 0.01010859, "balance_loss_clip": 1.02823138, "balance_loss_mlp": 1.00803375, "epoch": 0.12637907710807156, "flos": 70282138780800.0, "grad_norm": 0.7128618749962091, "language_loss": 0.57087427, "learning_rate": 3.903348813579662e-06, "loss": 0.59168136, "num_input_tokens_seen": 45466555, "step": 2102, "time_per_iteration": 3.20320987701416 }, { "auxiliary_loss_clip": 0.01140495, "auxiliary_loss_mlp": 0.01056574, "balance_loss_clip": 1.053671, "balance_loss_mlp": 1.03661788, "epoch": 0.12643920036073952, "flos": 18915084311040.0, "grad_norm": 2.0306165352193988, "language_loss": 0.93653679, "learning_rate": 3.903229170377845e-06, "loss": 0.95850742, "num_input_tokens_seen": 45485165, "step": 2103, "time_per_iteration": 2.6628894805908203 }, { "auxiliary_loss_clip": 0.01144405, "auxiliary_loss_mlp": 0.01040745, "balance_loss_clip": 1.04991472, "balance_loss_mlp": 1.02174282, "epoch": 0.1264993236134075, "flos": 27782470010880.0, "grad_norm": 1.5962316578756222, "language_loss": 0.7804662, "learning_rate": 3.903109455005387e-06, "loss": 0.80231774, "num_input_tokens_seen": 45504630, "step": 2104, "time_per_iteration": 2.6215474605560303 }, { "auxiliary_loss_clip": 0.01135927, "auxiliary_loss_mlp": 0.01056343, "balance_loss_clip": 1.05414486, "balance_loss_mlp": 1.03683996, "epoch": 0.12655944686607545, "flos": 24754697907840.0, "grad_norm": 1.7362499149688688, "language_loss": 0.80728614, "learning_rate": 3.902989667466828e-06, "loss": 0.82920885, "num_input_tokens_seen": 45524885, "step": 2105, "time_per_iteration": 2.74128794670105 }, { "auxiliary_loss_clip": 0.01162904, "auxiliary_loss_mlp": 0.01056367, "balance_loss_clip": 1.05482686, "balance_loss_mlp": 1.03514743, "epoch": 0.12661957011874342, "flos": 24133048202880.0, "grad_norm": 1.9810187943106816, "language_loss": 0.83402872, "learning_rate": 3.90286980776671e-06, "loss": 0.85622144, "num_input_tokens_seen": 45545000, "step": 2106, "time_per_iteration": 2.676694631576538 }, { "auxiliary_loss_clip": 0.01126632, "auxiliary_loss_mlp": 0.01052067, "balance_loss_clip": 1.05697966, "balance_loss_mlp": 1.03147984, "epoch": 0.12667969337141138, "flos": 24569614103040.0, "grad_norm": 1.6951691508845637, "language_loss": 0.73469931, "learning_rate": 3.902749875909578e-06, "loss": 0.7564863, "num_input_tokens_seen": 45564210, "step": 2107, "time_per_iteration": 2.7506372928619385 }, { "auxiliary_loss_clip": 0.01162931, "auxiliary_loss_mlp": 0.01044317, "balance_loss_clip": 1.05320692, "balance_loss_mlp": 1.02599406, "epoch": 0.12673981662407935, "flos": 22961677777920.0, "grad_norm": 2.0116792159666477, "language_loss": 0.79395336, "learning_rate": 3.90262987189998e-06, "loss": 0.81602579, "num_input_tokens_seen": 45583030, "step": 2108, "time_per_iteration": 2.6611146926879883 }, { "auxiliary_loss_clip": 0.01168073, "auxiliary_loss_mlp": 0.01049192, "balance_loss_clip": 1.05300844, "balance_loss_mlp": 1.02945089, "epoch": 0.12679993987674734, "flos": 17274864637440.0, "grad_norm": 1.9298328790617403, "language_loss": 0.7561394, "learning_rate": 3.902509795742467e-06, "loss": 0.77831209, "num_input_tokens_seen": 45602265, "step": 2109, "time_per_iteration": 2.5963573455810547 }, { "auxiliary_loss_clip": 0.01111025, "auxiliary_loss_mlp": 0.01053822, "balance_loss_clip": 1.04636049, "balance_loss_mlp": 1.0335331, "epoch": 0.1268600631294153, "flos": 17275080119040.0, "grad_norm": 1.6171901700648081, "language_loss": 0.82806516, "learning_rate": 3.902389647441592e-06, "loss": 0.84971368, "num_input_tokens_seen": 45620595, "step": 2110, "time_per_iteration": 2.6745550632476807 }, { "auxiliary_loss_clip": 0.01145969, "auxiliary_loss_mlp": 0.00778071, "balance_loss_clip": 1.05419564, "balance_loss_mlp": 0.99996144, "epoch": 0.12692018638208327, "flos": 24061047390720.0, "grad_norm": 1.6765217216011241, "language_loss": 0.78092968, "learning_rate": 3.90226942700191e-06, "loss": 0.80017006, "num_input_tokens_seen": 45641140, "step": 2111, "time_per_iteration": 2.65983510017395 }, { "auxiliary_loss_clip": 0.01130932, "auxiliary_loss_mlp": 0.01076547, "balance_loss_clip": 1.05490458, "balance_loss_mlp": 1.05352807, "epoch": 0.12698030963475124, "flos": 31831900652160.0, "grad_norm": 2.15738266202174, "language_loss": 0.77103376, "learning_rate": 3.902149134427982e-06, "loss": 0.79310858, "num_input_tokens_seen": 45662315, "step": 2112, "time_per_iteration": 2.870299816131592 }, { "auxiliary_loss_clip": 0.01129438, "auxiliary_loss_mlp": 0.01074863, "balance_loss_clip": 1.05213726, "balance_loss_mlp": 1.05427516, "epoch": 0.1270404328874192, "flos": 25187744275200.0, "grad_norm": 1.9191529425470424, "language_loss": 0.85806453, "learning_rate": 3.902028769724367e-06, "loss": 0.88010758, "num_input_tokens_seen": 45680335, "step": 2113, "time_per_iteration": 4.26338267326355 }, { "auxiliary_loss_clip": 0.01137468, "auxiliary_loss_mlp": 0.01078067, "balance_loss_clip": 1.05511892, "balance_loss_mlp": 1.05670488, "epoch": 0.12710055614008717, "flos": 15997342544640.0, "grad_norm": 1.9721234476704599, "language_loss": 0.74027002, "learning_rate": 3.9019083328956315e-06, "loss": 0.7624253, "num_input_tokens_seen": 45696240, "step": 2114, "time_per_iteration": 2.7573230266571045 }, { "auxiliary_loss_clip": 0.01156713, "auxiliary_loss_mlp": 0.01060574, "balance_loss_clip": 1.05770111, "balance_loss_mlp": 1.03924704, "epoch": 0.12716067939275516, "flos": 15085642515840.0, "grad_norm": 1.7921743813213327, "language_loss": 0.83240676, "learning_rate": 3.901787823946341e-06, "loss": 0.85457963, "num_input_tokens_seen": 45713695, "step": 2115, "time_per_iteration": 4.1369829177856445 }, { "auxiliary_loss_clip": 0.01154653, "auxiliary_loss_mlp": 0.01065557, "balance_loss_clip": 1.05875492, "balance_loss_mlp": 1.04476702, "epoch": 0.12722080264542313, "flos": 28366736636160.0, "grad_norm": 1.4840591347809418, "language_loss": 0.87010503, "learning_rate": 3.901667242881065e-06, "loss": 0.89230716, "num_input_tokens_seen": 45736655, "step": 2116, "time_per_iteration": 2.73896861076355 }, { "auxiliary_loss_clip": 0.01139498, "auxiliary_loss_mlp": 0.00777066, "balance_loss_clip": 1.05413389, "balance_loss_mlp": 0.99995339, "epoch": 0.1272809258980911, "flos": 32379897519360.0, "grad_norm": 1.753205985010591, "language_loss": 0.70374918, "learning_rate": 3.9015465897043775e-06, "loss": 0.72291481, "num_input_tokens_seen": 45758195, "step": 2117, "time_per_iteration": 2.783156156539917 }, { "auxiliary_loss_clip": 0.01127455, "auxiliary_loss_mlp": 0.0106424, "balance_loss_clip": 1.04978406, "balance_loss_mlp": 1.04068434, "epoch": 0.12734104915075906, "flos": 16034402401920.0, "grad_norm": 1.9957647698478755, "language_loss": 0.86237884, "learning_rate": 3.901425864420852e-06, "loss": 0.8842957, "num_input_tokens_seen": 45774280, "step": 2118, "time_per_iteration": 4.322036266326904 }, { "auxiliary_loss_clip": 0.01161417, "auxiliary_loss_mlp": 0.01049008, "balance_loss_clip": 1.05827069, "balance_loss_mlp": 1.02951694, "epoch": 0.12740117240342702, "flos": 18260325244800.0, "grad_norm": 1.705293179953873, "language_loss": 0.87577266, "learning_rate": 3.901305067035068e-06, "loss": 0.89787692, "num_input_tokens_seen": 45792760, "step": 2119, "time_per_iteration": 2.6559741497039795 }, { "auxiliary_loss_clip": 0.01145426, "auxiliary_loss_mlp": 0.0077754, "balance_loss_clip": 1.05233431, "balance_loss_mlp": 0.99984539, "epoch": 0.127461295656095, "flos": 12121790664960.0, "grad_norm": 2.05013605026053, "language_loss": 0.87824571, "learning_rate": 3.901184197551605e-06, "loss": 0.89747536, "num_input_tokens_seen": 45804300, "step": 2120, "time_per_iteration": 2.6154048442840576 }, { "auxiliary_loss_clip": 0.01170497, "auxiliary_loss_mlp": 0.01046075, "balance_loss_clip": 1.05822706, "balance_loss_mlp": 1.02626204, "epoch": 0.12752141890876295, "flos": 23149095966720.0, "grad_norm": 1.9784951602308867, "language_loss": 0.75584805, "learning_rate": 3.901063255975046e-06, "loss": 0.77801377, "num_input_tokens_seen": 45823780, "step": 2121, "time_per_iteration": 2.579265832901001 }, { "auxiliary_loss_clip": 0.0111249, "auxiliary_loss_mlp": 0.01047949, "balance_loss_clip": 1.04741263, "balance_loss_mlp": 1.02727842, "epoch": 0.12758154216143094, "flos": 21615997628160.0, "grad_norm": 2.0293629108662405, "language_loss": 0.82732606, "learning_rate": 3.900942242309978e-06, "loss": 0.84893048, "num_input_tokens_seen": 45840495, "step": 2122, "time_per_iteration": 2.793870210647583 }, { "auxiliary_loss_clip": 0.01151713, "auxiliary_loss_mlp": 0.01049724, "balance_loss_clip": 1.05901408, "balance_loss_mlp": 1.02983987, "epoch": 0.1276416654140989, "flos": 15924874855680.0, "grad_norm": 1.7660235451894624, "language_loss": 0.78699338, "learning_rate": 3.90082115656099e-06, "loss": 0.80900776, "num_input_tokens_seen": 45857735, "step": 2123, "time_per_iteration": 2.70546293258667 }, { "auxiliary_loss_clip": 0.01172823, "auxiliary_loss_mlp": 0.01055328, "balance_loss_clip": 1.05931985, "balance_loss_mlp": 1.03478789, "epoch": 0.12770178866676687, "flos": 22382690451840.0, "grad_norm": 1.5643885422181942, "language_loss": 0.78931451, "learning_rate": 3.900699998732673e-06, "loss": 0.81159604, "num_input_tokens_seen": 45876485, "step": 2124, "time_per_iteration": 2.661712408065796 }, { "auxiliary_loss_clip": 0.01160474, "auxiliary_loss_mlp": 0.00776885, "balance_loss_clip": 1.05457389, "balance_loss_mlp": 0.99987447, "epoch": 0.12776191191943484, "flos": 21652482867840.0, "grad_norm": 1.9695028631977674, "language_loss": 0.75605726, "learning_rate": 3.900578768829623e-06, "loss": 0.7754308, "num_input_tokens_seen": 45894645, "step": 2125, "time_per_iteration": 2.696021556854248 }, { "auxiliary_loss_clip": 0.01158163, "auxiliary_loss_mlp": 0.00777059, "balance_loss_clip": 1.05398965, "balance_loss_mlp": 1.00002348, "epoch": 0.1278220351721028, "flos": 25735561574400.0, "grad_norm": 3.019802885219414, "language_loss": 0.78016824, "learning_rate": 3.900457466856434e-06, "loss": 0.79952049, "num_input_tokens_seen": 45913755, "step": 2126, "time_per_iteration": 2.721435308456421 }, { "auxiliary_loss_clip": 0.01124637, "auxiliary_loss_mlp": 0.010537, "balance_loss_clip": 1.05406642, "balance_loss_mlp": 1.03504348, "epoch": 0.12788215842477077, "flos": 41243224982400.0, "grad_norm": 1.3825945270792501, "language_loss": 0.6927852, "learning_rate": 3.9003360928177085e-06, "loss": 0.71456861, "num_input_tokens_seen": 45936095, "step": 2127, "time_per_iteration": 2.902101993560791 }, { "auxiliary_loss_clip": 0.01030231, "auxiliary_loss_mlp": 0.00759051, "balance_loss_clip": 1.02830005, "balance_loss_mlp": 1.00050259, "epoch": 0.12794228167743876, "flos": 70877430881280.0, "grad_norm": 0.853491438999862, "language_loss": 0.62831402, "learning_rate": 3.900214646718047e-06, "loss": 0.64620686, "num_input_tokens_seen": 46004655, "step": 2128, "time_per_iteration": 3.3387396335601807 }, { "auxiliary_loss_clip": 0.01145823, "auxiliary_loss_mlp": 0.01047815, "balance_loss_clip": 1.05080712, "balance_loss_mlp": 1.02599955, "epoch": 0.12800240493010673, "flos": 16289727252480.0, "grad_norm": 2.066959353069841, "language_loss": 0.77626479, "learning_rate": 3.900093128562056e-06, "loss": 0.7982012, "num_input_tokens_seen": 46023610, "step": 2129, "time_per_iteration": 2.611309766769409 }, { "auxiliary_loss_clip": 0.01122914, "auxiliary_loss_mlp": 0.01052577, "balance_loss_clip": 1.05058527, "balance_loss_mlp": 1.03029668, "epoch": 0.1280625281827747, "flos": 20631542601600.0, "grad_norm": 2.1214737401843893, "language_loss": 0.79263359, "learning_rate": 3.899971538354343e-06, "loss": 0.81438851, "num_input_tokens_seen": 46041725, "step": 2130, "time_per_iteration": 2.753243923187256 }, { "auxiliary_loss_clip": 0.01139626, "auxiliary_loss_mlp": 0.01052453, "balance_loss_clip": 1.05133748, "balance_loss_mlp": 1.03147244, "epoch": 0.12812265143544266, "flos": 22638230784000.0, "grad_norm": 1.7780274650921335, "language_loss": 0.70945668, "learning_rate": 3.899849876099518e-06, "loss": 0.73137754, "num_input_tokens_seen": 46061095, "step": 2131, "time_per_iteration": 2.6809306144714355 }, { "auxiliary_loss_clip": 0.01102824, "auxiliary_loss_mlp": 0.01052393, "balance_loss_clip": 1.04982638, "balance_loss_mlp": 1.03163886, "epoch": 0.12818277468811062, "flos": 34714701463680.0, "grad_norm": 2.2916674504462655, "language_loss": 0.72298968, "learning_rate": 3.899728141802197e-06, "loss": 0.74454176, "num_input_tokens_seen": 46082670, "step": 2132, "time_per_iteration": 2.8769233226776123 }, { "auxiliary_loss_clip": 0.01102594, "auxiliary_loss_mlp": 0.01055993, "balance_loss_clip": 1.04384947, "balance_loss_mlp": 1.03348672, "epoch": 0.1282428979407786, "flos": 23112107936640.0, "grad_norm": 2.0316054281953155, "language_loss": 0.82128644, "learning_rate": 3.8996063354669935e-06, "loss": 0.84287226, "num_input_tokens_seen": 46102410, "step": 2133, "time_per_iteration": 2.766897678375244 }, { "auxiliary_loss_clip": 0.01163396, "auxiliary_loss_mlp": 0.01057069, "balance_loss_clip": 1.05397773, "balance_loss_mlp": 1.03458595, "epoch": 0.12830302119344655, "flos": 20886508316160.0, "grad_norm": 3.232115826630309, "language_loss": 0.80001891, "learning_rate": 3.899484457098528e-06, "loss": 0.82222354, "num_input_tokens_seen": 46121145, "step": 2134, "time_per_iteration": 2.6347672939300537 }, { "auxiliary_loss_clip": 0.01159056, "auxiliary_loss_mlp": 0.01046209, "balance_loss_clip": 1.05907345, "balance_loss_mlp": 1.02614641, "epoch": 0.12836314444611455, "flos": 21397768548480.0, "grad_norm": 1.731952504909339, "language_loss": 0.82657921, "learning_rate": 3.899362506701421e-06, "loss": 0.84863198, "num_input_tokens_seen": 46140740, "step": 2135, "time_per_iteration": 2.6393656730651855 }, { "auxiliary_loss_clip": 0.0114208, "auxiliary_loss_mlp": 0.0105553, "balance_loss_clip": 1.05345035, "balance_loss_mlp": 1.03411996, "epoch": 0.1284232676987825, "flos": 13662466773120.0, "grad_norm": 2.1083924470752278, "language_loss": 0.7764526, "learning_rate": 3.899240484280298e-06, "loss": 0.79842871, "num_input_tokens_seen": 46156805, "step": 2136, "time_per_iteration": 2.7195920944213867 }, { "auxiliary_loss_clip": 0.01020946, "auxiliary_loss_mlp": 0.01003991, "balance_loss_clip": 1.01967573, "balance_loss_mlp": 1.00096273, "epoch": 0.12848339095145048, "flos": 59994737735040.0, "grad_norm": 0.8964253308146478, "language_loss": 0.59152198, "learning_rate": 3.899118389839785e-06, "loss": 0.61177135, "num_input_tokens_seen": 46222085, "step": 2137, "time_per_iteration": 3.416015625 }, { "auxiliary_loss_clip": 0.01153694, "auxiliary_loss_mlp": 0.01054623, "balance_loss_clip": 1.05178177, "balance_loss_mlp": 1.03483438, "epoch": 0.12854351420411844, "flos": 13881378211200.0, "grad_norm": 3.244493357011547, "language_loss": 0.82344306, "learning_rate": 3.898996223384512e-06, "loss": 0.84552622, "num_input_tokens_seen": 46239970, "step": 2138, "time_per_iteration": 2.65515398979187 }, { "auxiliary_loss_clip": 0.01159586, "auxiliary_loss_mlp": 0.01049293, "balance_loss_clip": 1.05592752, "balance_loss_mlp": 1.02665496, "epoch": 0.1286036374567864, "flos": 22637943475200.0, "grad_norm": 2.5417837252920323, "language_loss": 0.78691363, "learning_rate": 3.898873984919113e-06, "loss": 0.8090024, "num_input_tokens_seen": 46257740, "step": 2139, "time_per_iteration": 2.651132345199585 }, { "auxiliary_loss_clip": 0.01136892, "auxiliary_loss_mlp": 0.01045928, "balance_loss_clip": 1.05267286, "balance_loss_mlp": 1.02582908, "epoch": 0.12866376070945437, "flos": 16324775948160.0, "grad_norm": 1.9541049485452633, "language_loss": 0.85289955, "learning_rate": 3.8987516744482215e-06, "loss": 0.87472773, "num_input_tokens_seen": 46275445, "step": 2140, "time_per_iteration": 2.730156183242798 }, { "auxiliary_loss_clip": 0.01143134, "auxiliary_loss_mlp": 0.01044337, "balance_loss_clip": 1.05203128, "balance_loss_mlp": 1.02482224, "epoch": 0.12872388396212234, "flos": 11874546374400.0, "grad_norm": 1.8185491602156885, "language_loss": 0.86268306, "learning_rate": 3.898629291976476e-06, "loss": 0.88455778, "num_input_tokens_seen": 46291710, "step": 2141, "time_per_iteration": 2.62223482131958 }, { "auxiliary_loss_clip": 0.01146971, "auxiliary_loss_mlp": 0.01045813, "balance_loss_clip": 1.0528295, "balance_loss_mlp": 1.02548814, "epoch": 0.12878400721479033, "flos": 28366700722560.0, "grad_norm": 3.1267362471736684, "language_loss": 0.68282312, "learning_rate": 3.898506837508518e-06, "loss": 0.70475101, "num_input_tokens_seen": 46311335, "step": 2142, "time_per_iteration": 2.71232271194458 }, { "auxiliary_loss_clip": 0.01165678, "auxiliary_loss_mlp": 0.0077895, "balance_loss_clip": 1.05764627, "balance_loss_mlp": 0.99990749, "epoch": 0.1288441304674583, "flos": 25885632597120.0, "grad_norm": 2.373838274123079, "language_loss": 0.83479214, "learning_rate": 3.89838431104899e-06, "loss": 0.85423845, "num_input_tokens_seen": 46330985, "step": 2143, "time_per_iteration": 2.677692174911499 }, { "auxiliary_loss_clip": 0.01175134, "auxiliary_loss_mlp": 0.00777405, "balance_loss_clip": 1.0598439, "balance_loss_mlp": 0.99994075, "epoch": 0.12890425372012626, "flos": 20813789232000.0, "grad_norm": 1.5662270309624111, "language_loss": 0.81703234, "learning_rate": 3.898261712602539e-06, "loss": 0.83655775, "num_input_tokens_seen": 46351295, "step": 2144, "time_per_iteration": 2.712620496749878 }, { "auxiliary_loss_clip": 0.01130321, "auxiliary_loss_mlp": 0.01053521, "balance_loss_clip": 1.04658103, "balance_loss_mlp": 1.03145528, "epoch": 0.12896437697279423, "flos": 22565870835840.0, "grad_norm": 1.8026346290528672, "language_loss": 0.78304374, "learning_rate": 3.898139042173813e-06, "loss": 0.80488217, "num_input_tokens_seen": 46368600, "step": 2145, "time_per_iteration": 2.6766605377197266 }, { "auxiliary_loss_clip": 0.01170585, "auxiliary_loss_mlp": 0.01047893, "balance_loss_clip": 1.0543592, "balance_loss_mlp": 1.02662635, "epoch": 0.1290245002254622, "flos": 17493776075520.0, "grad_norm": 2.147087506474235, "language_loss": 0.82865375, "learning_rate": 3.898016299767465e-06, "loss": 0.85083848, "num_input_tokens_seen": 46387370, "step": 2146, "time_per_iteration": 2.5860395431518555 }, { "auxiliary_loss_clip": 0.01141916, "auxiliary_loss_mlp": 0.0105138, "balance_loss_clip": 1.05367482, "balance_loss_mlp": 1.03062606, "epoch": 0.12908462347813016, "flos": 36315957859200.0, "grad_norm": 2.344626501147968, "language_loss": 0.71275079, "learning_rate": 3.897893485388149e-06, "loss": 0.73468375, "num_input_tokens_seen": 46409570, "step": 2147, "time_per_iteration": 2.7870359420776367 }, { "auxiliary_loss_clip": 0.01147238, "auxiliary_loss_mlp": 0.01052291, "balance_loss_clip": 1.05527067, "balance_loss_mlp": 1.03297925, "epoch": 0.12914474673079815, "flos": 22528703237760.0, "grad_norm": 2.120275205230366, "language_loss": 0.71432978, "learning_rate": 3.897770599040521e-06, "loss": 0.73632509, "num_input_tokens_seen": 46429320, "step": 2148, "time_per_iteration": 2.6865081787109375 }, { "auxiliary_loss_clip": 0.01168479, "auxiliary_loss_mlp": 0.01049575, "balance_loss_clip": 1.05762172, "balance_loss_mlp": 1.03016782, "epoch": 0.12920486998346611, "flos": 21471888263040.0, "grad_norm": 1.6388902851592406, "language_loss": 0.79064089, "learning_rate": 3.897647640729242e-06, "loss": 0.81282145, "num_input_tokens_seen": 46450155, "step": 2149, "time_per_iteration": 2.6041862964630127 }, { "auxiliary_loss_clip": 0.01159527, "auxiliary_loss_mlp": 0.01046069, "balance_loss_clip": 1.05377793, "balance_loss_mlp": 1.02531469, "epoch": 0.12926499323613408, "flos": 27308556944640.0, "grad_norm": 2.034796374339078, "language_loss": 0.75976646, "learning_rate": 3.897524610458975e-06, "loss": 0.78182244, "num_input_tokens_seen": 46470280, "step": 2150, "time_per_iteration": 2.647224187850952 }, { "auxiliary_loss_clip": 0.01155787, "auxiliary_loss_mlp": 0.01055192, "balance_loss_clip": 1.05445433, "balance_loss_mlp": 1.03491461, "epoch": 0.12932511648880204, "flos": 22091131756800.0, "grad_norm": 2.3830500835005592, "language_loss": 0.70986372, "learning_rate": 3.8974015082343835e-06, "loss": 0.73197353, "num_input_tokens_seen": 46487605, "step": 2151, "time_per_iteration": 2.7008492946624756 }, { "auxiliary_loss_clip": 0.01167835, "auxiliary_loss_mlp": 0.0104951, "balance_loss_clip": 1.05603719, "balance_loss_mlp": 1.03017378, "epoch": 0.12938523974147, "flos": 20302780394880.0, "grad_norm": 2.058334480733051, "language_loss": 0.83964819, "learning_rate": 3.897278334060137e-06, "loss": 0.86182165, "num_input_tokens_seen": 46505100, "step": 2152, "time_per_iteration": 2.6467373371124268 }, { "auxiliary_loss_clip": 0.01158553, "auxiliary_loss_mlp": 0.01058416, "balance_loss_clip": 1.05283821, "balance_loss_mlp": 1.03888893, "epoch": 0.12944536299413797, "flos": 19499961467520.0, "grad_norm": 1.5624811365269535, "language_loss": 0.78585124, "learning_rate": 3.897155087940906e-06, "loss": 0.80802095, "num_input_tokens_seen": 46524020, "step": 2153, "time_per_iteration": 4.286921262741089 }, { "auxiliary_loss_clip": 0.01113716, "auxiliary_loss_mlp": 0.00777812, "balance_loss_clip": 1.04707122, "balance_loss_mlp": 0.99989671, "epoch": 0.12950548624680594, "flos": 27707919333120.0, "grad_norm": 1.6189787343362376, "language_loss": 0.80253434, "learning_rate": 3.897031769881364e-06, "loss": 0.82144964, "num_input_tokens_seen": 46544640, "step": 2154, "time_per_iteration": 2.7602338790893555 }, { "auxiliary_loss_clip": 0.01149958, "auxiliary_loss_mlp": 0.0105188, "balance_loss_clip": 1.05262971, "balance_loss_mlp": 1.03099442, "epoch": 0.12956560949947393, "flos": 17565740974080.0, "grad_norm": 1.8080432584650143, "language_loss": 0.83717728, "learning_rate": 3.896908379886188e-06, "loss": 0.85919571, "num_input_tokens_seen": 46561395, "step": 2155, "time_per_iteration": 5.696707010269165 }, { "auxiliary_loss_clip": 0.01161999, "auxiliary_loss_mlp": 0.01056273, "balance_loss_clip": 1.05426383, "balance_loss_mlp": 1.03611445, "epoch": 0.1296257327521419, "flos": 20740711011840.0, "grad_norm": 2.4972858828122666, "language_loss": 0.76114857, "learning_rate": 3.896784917960055e-06, "loss": 0.78333133, "num_input_tokens_seen": 46579395, "step": 2156, "time_per_iteration": 2.6279313564300537 }, { "auxiliary_loss_clip": 0.01105089, "auxiliary_loss_mlp": 0.01056603, "balance_loss_clip": 1.0510118, "balance_loss_mlp": 1.03679013, "epoch": 0.12968585600480986, "flos": 16395735265920.0, "grad_norm": 1.6652476704410177, "language_loss": 0.86493659, "learning_rate": 3.896661384107648e-06, "loss": 0.88655347, "num_input_tokens_seen": 46597090, "step": 2157, "time_per_iteration": 4.4089202880859375 }, { "auxiliary_loss_clip": 0.01170107, "auxiliary_loss_mlp": 0.01055814, "balance_loss_clip": 1.05253935, "balance_loss_mlp": 1.0349642, "epoch": 0.12974597925747783, "flos": 28329533124480.0, "grad_norm": 2.5240136552338956, "language_loss": 0.80393612, "learning_rate": 3.896537778333651e-06, "loss": 0.8261953, "num_input_tokens_seen": 46617355, "step": 2158, "time_per_iteration": 2.702765703201294 }, { "auxiliary_loss_clip": 0.01177017, "auxiliary_loss_mlp": 0.01060365, "balance_loss_clip": 1.05905974, "balance_loss_mlp": 1.04050517, "epoch": 0.1298061025101458, "flos": 9683025782400.0, "grad_norm": 2.5307604694159607, "language_loss": 0.74881256, "learning_rate": 3.896414100642752e-06, "loss": 0.77118635, "num_input_tokens_seen": 46633130, "step": 2159, "time_per_iteration": 2.534163475036621 }, { "auxiliary_loss_clip": 0.01122909, "auxiliary_loss_mlp": 0.01058309, "balance_loss_clip": 1.04594469, "balance_loss_mlp": 1.03471708, "epoch": 0.12986622576281376, "flos": 27709535445120.0, "grad_norm": 1.954419432637739, "language_loss": 0.8259204, "learning_rate": 3.89629035103964e-06, "loss": 0.84773254, "num_input_tokens_seen": 46650575, "step": 2160, "time_per_iteration": 2.7358646392822266 }, { "auxiliary_loss_clip": 0.01154348, "auxiliary_loss_mlp": 0.01047243, "balance_loss_clip": 1.05873609, "balance_loss_mlp": 1.02732301, "epoch": 0.12992634901548175, "flos": 18802719590400.0, "grad_norm": 1.7252123805741888, "language_loss": 0.82310414, "learning_rate": 3.896166529529008e-06, "loss": 0.84512007, "num_input_tokens_seen": 46668780, "step": 2161, "time_per_iteration": 2.7029623985290527 }, { "auxiliary_loss_clip": 0.01145886, "auxiliary_loss_mlp": 0.01060381, "balance_loss_clip": 1.05145073, "balance_loss_mlp": 1.03911448, "epoch": 0.12998647226814972, "flos": 29127575543040.0, "grad_norm": 2.0780374068601253, "language_loss": 0.82668459, "learning_rate": 3.896042636115551e-06, "loss": 0.84874725, "num_input_tokens_seen": 46687550, "step": 2162, "time_per_iteration": 2.674825668334961 }, { "auxiliary_loss_clip": 0.0113921, "auxiliary_loss_mlp": 0.0105953, "balance_loss_clip": 1.05468941, "balance_loss_mlp": 1.03957474, "epoch": 0.13004659552081768, "flos": 19573686132480.0, "grad_norm": 3.928222506771022, "language_loss": 0.72579277, "learning_rate": 3.895918670803968e-06, "loss": 0.7477802, "num_input_tokens_seen": 46706730, "step": 2163, "time_per_iteration": 2.678394079208374 }, { "auxiliary_loss_clip": 0.01173873, "auxiliary_loss_mlp": 0.00778662, "balance_loss_clip": 1.05635965, "balance_loss_mlp": 0.99994016, "epoch": 0.13010671877348565, "flos": 22490709626880.0, "grad_norm": 2.0196348424542827, "language_loss": 0.81330699, "learning_rate": 3.895794633598958e-06, "loss": 0.83283234, "num_input_tokens_seen": 46724250, "step": 2164, "time_per_iteration": 2.6116931438446045 }, { "auxiliary_loss_clip": 0.01119834, "auxiliary_loss_mlp": 0.01050661, "balance_loss_clip": 1.04808033, "balance_loss_mlp": 1.03061032, "epoch": 0.1301668420261536, "flos": 23878226142720.0, "grad_norm": 2.274563635903502, "language_loss": 0.72262049, "learning_rate": 3.8956705245052256e-06, "loss": 0.74432552, "num_input_tokens_seen": 46744105, "step": 2165, "time_per_iteration": 2.7646515369415283 }, { "auxiliary_loss_clip": 0.01109832, "auxiliary_loss_mlp": 0.01048351, "balance_loss_clip": 1.05059505, "balance_loss_mlp": 1.02707219, "epoch": 0.13022696527882158, "flos": 23150065633920.0, "grad_norm": 2.8383873988269217, "language_loss": 0.74749964, "learning_rate": 3.8955463435274765e-06, "loss": 0.76908153, "num_input_tokens_seen": 46764250, "step": 2166, "time_per_iteration": 2.7939398288726807 }, { "auxiliary_loss_clip": 0.01170298, "auxiliary_loss_mlp": 0.01048037, "balance_loss_clip": 1.05364752, "balance_loss_mlp": 1.02827251, "epoch": 0.13028708853148954, "flos": 26908548111360.0, "grad_norm": 1.5379857106114436, "language_loss": 0.83098066, "learning_rate": 3.895422090670421e-06, "loss": 0.85316396, "num_input_tokens_seen": 46786865, "step": 2167, "time_per_iteration": 2.700505495071411 }, { "auxiliary_loss_clip": 0.01108628, "auxiliary_loss_mlp": 0.01059921, "balance_loss_clip": 1.04567361, "balance_loss_mlp": 1.03841531, "epoch": 0.13034721178415754, "flos": 21251468453760.0, "grad_norm": 1.6054044551173634, "language_loss": 0.83578718, "learning_rate": 3.89529776593877e-06, "loss": 0.85747266, "num_input_tokens_seen": 46807030, "step": 2168, "time_per_iteration": 2.839285135269165 }, { "auxiliary_loss_clip": 0.01079188, "auxiliary_loss_mlp": 0.01063413, "balance_loss_clip": 1.04247975, "balance_loss_mlp": 1.03861713, "epoch": 0.1304073350368255, "flos": 18767239931520.0, "grad_norm": 1.950315007602454, "language_loss": 0.79910588, "learning_rate": 3.8951733693372375e-06, "loss": 0.8205319, "num_input_tokens_seen": 46826280, "step": 2169, "time_per_iteration": 2.8150076866149902 }, { "auxiliary_loss_clip": 0.01174566, "auxiliary_loss_mlp": 0.01044893, "balance_loss_clip": 1.05822575, "balance_loss_mlp": 1.02339983, "epoch": 0.13046745828949347, "flos": 28364653647360.0, "grad_norm": 2.4117618540057766, "language_loss": 0.66804767, "learning_rate": 3.8950489008705406e-06, "loss": 0.69024229, "num_input_tokens_seen": 46846505, "step": 2170, "time_per_iteration": 2.722769021987915 }, { "auxiliary_loss_clip": 0.0114216, "auxiliary_loss_mlp": 0.01046684, "balance_loss_clip": 1.05424142, "balance_loss_mlp": 1.02637053, "epoch": 0.13052758154216143, "flos": 29605044055680.0, "grad_norm": 1.9089846415842238, "language_loss": 0.66768706, "learning_rate": 3.8949243605434e-06, "loss": 0.68957549, "num_input_tokens_seen": 46867380, "step": 2171, "time_per_iteration": 2.7474682331085205 }, { "auxiliary_loss_clip": 0.01157431, "auxiliary_loss_mlp": 0.01049079, "balance_loss_clip": 1.05283058, "balance_loss_mlp": 1.02701378, "epoch": 0.1305877047948294, "flos": 19390864884480.0, "grad_norm": 2.103440896006443, "language_loss": 0.72157478, "learning_rate": 3.894799748360537e-06, "loss": 0.74363995, "num_input_tokens_seen": 46886810, "step": 2172, "time_per_iteration": 2.8062691688537598 }, { "auxiliary_loss_clip": 0.01131178, "auxiliary_loss_mlp": 0.01045812, "balance_loss_clip": 1.05676126, "balance_loss_mlp": 1.0248909, "epoch": 0.13064782804749736, "flos": 16873527000960.0, "grad_norm": 1.8662964619330822, "language_loss": 0.75331408, "learning_rate": 3.894675064326678e-06, "loss": 0.77508402, "num_input_tokens_seen": 46905620, "step": 2173, "time_per_iteration": 2.749630928039551 }, { "auxiliary_loss_clip": 0.01132129, "auxiliary_loss_mlp": 0.01056024, "balance_loss_clip": 1.05241716, "balance_loss_mlp": 1.03388715, "epoch": 0.13070795130016533, "flos": 24499085748480.0, "grad_norm": 2.8034072456055426, "language_loss": 0.70175481, "learning_rate": 3.894550308446551e-06, "loss": 0.72363639, "num_input_tokens_seen": 46925120, "step": 2174, "time_per_iteration": 2.723314046859741 }, { "auxiliary_loss_clip": 0.01047643, "auxiliary_loss_mlp": 0.01015006, "balance_loss_clip": 1.02629197, "balance_loss_mlp": 1.01260972, "epoch": 0.13076807455283332, "flos": 71054505953280.0, "grad_norm": 0.7998489021914615, "language_loss": 0.59026134, "learning_rate": 3.894425480724886e-06, "loss": 0.61088777, "num_input_tokens_seen": 46988195, "step": 2175, "time_per_iteration": 3.318049192428589 }, { "auxiliary_loss_clip": 0.01159762, "auxiliary_loss_mlp": 0.01053929, "balance_loss_clip": 1.05441868, "balance_loss_mlp": 1.03342521, "epoch": 0.13082819780550128, "flos": 20264499475200.0, "grad_norm": 2.2309284705459707, "language_loss": 0.80365628, "learning_rate": 3.894300581166417e-06, "loss": 0.82579315, "num_input_tokens_seen": 47004720, "step": 2176, "time_per_iteration": 2.631732702255249 }, { "auxiliary_loss_clip": 0.01169648, "auxiliary_loss_mlp": 0.01047517, "balance_loss_clip": 1.05513525, "balance_loss_mlp": 1.02529645, "epoch": 0.13088832105816925, "flos": 34203441231360.0, "grad_norm": 1.6906214681317566, "language_loss": 0.74661696, "learning_rate": 3.894175609775881e-06, "loss": 0.76878858, "num_input_tokens_seen": 47024255, "step": 2177, "time_per_iteration": 2.701422691345215 }, { "auxiliary_loss_clip": 0.01131124, "auxiliary_loss_mlp": 0.0105144, "balance_loss_clip": 1.051373, "balance_loss_mlp": 1.02905297, "epoch": 0.13094844431083721, "flos": 17894970057600.0, "grad_norm": 1.8043513019060269, "language_loss": 0.82266748, "learning_rate": 3.894050566558015e-06, "loss": 0.84449303, "num_input_tokens_seen": 47042465, "step": 2178, "time_per_iteration": 2.6934497356414795 }, { "auxiliary_loss_clip": 0.01170524, "auxiliary_loss_mlp": 0.01047895, "balance_loss_clip": 1.05729508, "balance_loss_mlp": 1.02705729, "epoch": 0.13100856756350518, "flos": 17311313963520.0, "grad_norm": 2.9251611149508276, "language_loss": 0.74291968, "learning_rate": 3.893925451517562e-06, "loss": 0.76510382, "num_input_tokens_seen": 47060370, "step": 2179, "time_per_iteration": 2.6111502647399902 }, { "auxiliary_loss_clip": 0.01128297, "auxiliary_loss_mlp": 0.01052407, "balance_loss_clip": 1.04917574, "balance_loss_mlp": 1.03184354, "epoch": 0.13106869081617314, "flos": 22200551562240.0, "grad_norm": 1.9805514150688242, "language_loss": 0.84366202, "learning_rate": 3.893800264659266e-06, "loss": 0.8654691, "num_input_tokens_seen": 47081415, "step": 2180, "time_per_iteration": 2.731229543685913 }, { "auxiliary_loss_clip": 0.01162028, "auxiliary_loss_mlp": 0.0105845, "balance_loss_clip": 1.05875921, "balance_loss_mlp": 1.03757644, "epoch": 0.13112881406884114, "flos": 21763123735680.0, "grad_norm": 1.8389866248015785, "language_loss": 0.89840436, "learning_rate": 3.8936750059878746e-06, "loss": 0.92060918, "num_input_tokens_seen": 47099860, "step": 2181, "time_per_iteration": 2.643890380859375 }, { "auxiliary_loss_clip": 0.01153771, "auxiliary_loss_mlp": 0.01051982, "balance_loss_clip": 1.05222976, "balance_loss_mlp": 1.03126323, "epoch": 0.1311889373215091, "flos": 23331091201920.0, "grad_norm": 2.117586475019142, "language_loss": 0.68813586, "learning_rate": 3.893549675508137e-06, "loss": 0.7101934, "num_input_tokens_seen": 47118540, "step": 2182, "time_per_iteration": 2.6198863983154297 }, { "auxiliary_loss_clip": 0.01123039, "auxiliary_loss_mlp": 0.01051411, "balance_loss_clip": 1.0502702, "balance_loss_mlp": 1.0292381, "epoch": 0.13124906057417707, "flos": 21467363149440.0, "grad_norm": 1.787500136217105, "language_loss": 0.78694725, "learning_rate": 3.893424273224806e-06, "loss": 0.8086918, "num_input_tokens_seen": 47136710, "step": 2183, "time_per_iteration": 2.715517520904541 }, { "auxiliary_loss_clip": 0.01169106, "auxiliary_loss_mlp": 0.01047098, "balance_loss_clip": 1.05452895, "balance_loss_mlp": 1.02586675, "epoch": 0.13130918382684503, "flos": 23255319461760.0, "grad_norm": 26.753588494231124, "language_loss": 0.85792655, "learning_rate": 3.893298799142636e-06, "loss": 0.88008863, "num_input_tokens_seen": 47157155, "step": 2184, "time_per_iteration": 2.632539987564087 }, { "auxiliary_loss_clip": 0.01138714, "auxiliary_loss_mlp": 0.01054657, "balance_loss_clip": 1.05349112, "balance_loss_mlp": 1.03230524, "epoch": 0.131369307079513, "flos": 20850274471680.0, "grad_norm": 2.50466124454056, "language_loss": 0.82703435, "learning_rate": 3.893173253266387e-06, "loss": 0.84896809, "num_input_tokens_seen": 47176820, "step": 2185, "time_per_iteration": 2.6809136867523193 }, { "auxiliary_loss_clip": 0.01144077, "auxiliary_loss_mlp": 0.01054121, "balance_loss_clip": 1.05262399, "balance_loss_mlp": 1.03236496, "epoch": 0.13142943033218096, "flos": 17858341163520.0, "grad_norm": 1.8949462712827352, "language_loss": 0.72956109, "learning_rate": 3.893047635600818e-06, "loss": 0.75154305, "num_input_tokens_seen": 47195855, "step": 2186, "time_per_iteration": 2.628096342086792 }, { "auxiliary_loss_clip": 0.01157778, "auxiliary_loss_mlp": 0.01050695, "balance_loss_clip": 1.05436552, "balance_loss_mlp": 1.02783096, "epoch": 0.13148955358484893, "flos": 20996035862400.0, "grad_norm": 1.9822444068613732, "language_loss": 0.80363685, "learning_rate": 3.892921946150693e-06, "loss": 0.82572162, "num_input_tokens_seen": 47214535, "step": 2187, "time_per_iteration": 2.762223720550537 }, { "auxiliary_loss_clip": 0.01027324, "auxiliary_loss_mlp": 0.0101023, "balance_loss_clip": 1.02364707, "balance_loss_mlp": 1.00792885, "epoch": 0.13154967683751692, "flos": 70172467580160.0, "grad_norm": 0.8471850380496847, "language_loss": 0.59082437, "learning_rate": 3.892796184920778e-06, "loss": 0.61119986, "num_input_tokens_seen": 47270300, "step": 2188, "time_per_iteration": 3.302457571029663 }, { "auxiliary_loss_clip": 0.01095126, "auxiliary_loss_mlp": 0.01059346, "balance_loss_clip": 1.04827487, "balance_loss_mlp": 1.03676724, "epoch": 0.1316098000901849, "flos": 20376145923840.0, "grad_norm": 1.7340345041340466, "language_loss": 0.74211109, "learning_rate": 3.892670351915842e-06, "loss": 0.76365584, "num_input_tokens_seen": 47290720, "step": 2189, "time_per_iteration": 2.7990496158599854 }, { "auxiliary_loss_clip": 0.01160124, "auxiliary_loss_mlp": 0.01049098, "balance_loss_clip": 1.05551052, "balance_loss_mlp": 1.02799821, "epoch": 0.13166992334285285, "flos": 23221132692480.0, "grad_norm": 1.8160574809616576, "language_loss": 0.73152113, "learning_rate": 3.892544447140657e-06, "loss": 0.75361335, "num_input_tokens_seen": 47311820, "step": 2190, "time_per_iteration": 2.6485326290130615 }, { "auxiliary_loss_clip": 0.01160351, "auxiliary_loss_mlp": 0.01058461, "balance_loss_clip": 1.05671644, "balance_loss_mlp": 1.03811169, "epoch": 0.13173004659552082, "flos": 23330947547520.0, "grad_norm": 1.8825588242208007, "language_loss": 0.74617779, "learning_rate": 3.892418470599996e-06, "loss": 0.76836598, "num_input_tokens_seen": 47331605, "step": 2191, "time_per_iteration": 2.644484281539917 }, { "auxiliary_loss_clip": 0.0112783, "auxiliary_loss_mlp": 0.01054712, "balance_loss_clip": 1.05129039, "balance_loss_mlp": 1.03356445, "epoch": 0.13179016984818878, "flos": 21251504367360.0, "grad_norm": 1.8823393822145031, "language_loss": 0.79093283, "learning_rate": 3.892292422298637e-06, "loss": 0.81275827, "num_input_tokens_seen": 47350455, "step": 2192, "time_per_iteration": 2.735225200653076 }, { "auxiliary_loss_clip": 0.0111282, "auxiliary_loss_mlp": 0.01051113, "balance_loss_clip": 1.04457211, "balance_loss_mlp": 1.02936912, "epoch": 0.13185029310085675, "flos": 17778690754560.0, "grad_norm": 1.7242105632860862, "language_loss": 0.85350716, "learning_rate": 3.892166302241361e-06, "loss": 0.87514639, "num_input_tokens_seen": 47368225, "step": 2193, "time_per_iteration": 4.262877941131592 }, { "auxiliary_loss_clip": 0.0104173, "auxiliary_loss_mlp": 0.01015651, "balance_loss_clip": 1.02609122, "balance_loss_mlp": 1.01280212, "epoch": 0.1319104163535247, "flos": 69851785933440.0, "grad_norm": 0.7746813180799224, "language_loss": 0.54112649, "learning_rate": 3.8920401104329475e-06, "loss": 0.56170022, "num_input_tokens_seen": 47427125, "step": 2194, "time_per_iteration": 6.223008394241333 }, { "auxiliary_loss_clip": 0.01168022, "auxiliary_loss_mlp": 0.01048581, "balance_loss_clip": 1.05420566, "balance_loss_mlp": 1.02828002, "epoch": 0.1319705396061927, "flos": 25193095401600.0, "grad_norm": 2.1079865649821925, "language_loss": 0.72433972, "learning_rate": 3.891913846878185e-06, "loss": 0.74650574, "num_input_tokens_seen": 47450275, "step": 2195, "time_per_iteration": 2.6357345581054688 }, { "auxiliary_loss_clip": 0.01136503, "auxiliary_loss_mlp": 0.00778731, "balance_loss_clip": 1.05176425, "balance_loss_mlp": 0.99996454, "epoch": 0.13203066285886067, "flos": 20740459616640.0, "grad_norm": 1.5737174748369949, "language_loss": 0.78126895, "learning_rate": 3.891787511581859e-06, "loss": 0.8004213, "num_input_tokens_seen": 47469155, "step": 2196, "time_per_iteration": 2.7118594646453857 }, { "auxiliary_loss_clip": 0.01162447, "auxiliary_loss_mlp": 0.010526, "balance_loss_clip": 1.05453539, "balance_loss_mlp": 1.03210831, "epoch": 0.13209078611152864, "flos": 22054395121920.0, "grad_norm": 1.9385650447291836, "language_loss": 0.74632496, "learning_rate": 3.89166110454876e-06, "loss": 0.76847541, "num_input_tokens_seen": 47488405, "step": 2197, "time_per_iteration": 4.270530939102173 }, { "auxiliary_loss_clip": 0.01173786, "auxiliary_loss_mlp": 0.01050846, "balance_loss_clip": 1.05440533, "balance_loss_mlp": 1.02947164, "epoch": 0.1321509093641966, "flos": 16284950743680.0, "grad_norm": 1.785688190112577, "language_loss": 0.79566747, "learning_rate": 3.891534625783685e-06, "loss": 0.81791383, "num_input_tokens_seen": 47505650, "step": 2198, "time_per_iteration": 2.6145474910736084 }, { "auxiliary_loss_clip": 0.01170264, "auxiliary_loss_mlp": 0.01057159, "balance_loss_clip": 1.05536175, "balance_loss_mlp": 1.03647637, "epoch": 0.13221103261686457, "flos": 16983018633600.0, "grad_norm": 2.56313218775589, "language_loss": 0.82932216, "learning_rate": 3.891408075291425e-06, "loss": 0.85159647, "num_input_tokens_seen": 47521540, "step": 2199, "time_per_iteration": 2.5715503692626953 }, { "auxiliary_loss_clip": 0.01122554, "auxiliary_loss_mlp": 0.01052148, "balance_loss_clip": 1.05047798, "balance_loss_mlp": 1.03045249, "epoch": 0.13227115586953253, "flos": 34233605677440.0, "grad_norm": 1.8710902505917797, "language_loss": 0.69579422, "learning_rate": 3.8912814530767826e-06, "loss": 0.71754128, "num_input_tokens_seen": 47543625, "step": 2200, "time_per_iteration": 2.8001365661621094 }, { "auxiliary_loss_clip": 0.01167798, "auxiliary_loss_mlp": 0.01058155, "balance_loss_clip": 1.05345917, "balance_loss_mlp": 1.03618431, "epoch": 0.13233127912220052, "flos": 20704656735360.0, "grad_norm": 1.647659287704997, "language_loss": 0.84624702, "learning_rate": 3.891154759144557e-06, "loss": 0.86850655, "num_input_tokens_seen": 47563740, "step": 2201, "time_per_iteration": 2.6485981941223145 }, { "auxiliary_loss_clip": 0.0117188, "auxiliary_loss_mlp": 0.01055627, "balance_loss_clip": 1.05427861, "balance_loss_mlp": 1.03431273, "epoch": 0.1323914023748685, "flos": 25805048434560.0, "grad_norm": 1.7446392584198542, "language_loss": 0.87088037, "learning_rate": 3.891027993499554e-06, "loss": 0.8931554, "num_input_tokens_seen": 47582655, "step": 2202, "time_per_iteration": 2.5921456813812256 }, { "auxiliary_loss_clip": 0.01139991, "auxiliary_loss_mlp": 0.01053413, "balance_loss_clip": 1.05299544, "balance_loss_mlp": 1.03267026, "epoch": 0.13245152562753645, "flos": 21251540280960.0, "grad_norm": 2.405254380671628, "language_loss": 0.72801507, "learning_rate": 3.89090115614658e-06, "loss": 0.7499491, "num_input_tokens_seen": 47600875, "step": 2203, "time_per_iteration": 2.6257405281066895 }, { "auxiliary_loss_clip": 0.01124508, "auxiliary_loss_mlp": 0.0105959, "balance_loss_clip": 1.05080879, "balance_loss_mlp": 1.03916979, "epoch": 0.13251164888020442, "flos": 26610955931520.0, "grad_norm": 2.044348475010678, "language_loss": 0.73170948, "learning_rate": 3.890774247090444e-06, "loss": 0.75355047, "num_input_tokens_seen": 47619250, "step": 2204, "time_per_iteration": 2.753830909729004 }, { "auxiliary_loss_clip": 0.01160826, "auxiliary_loss_mlp": 0.01054406, "balance_loss_clip": 1.05474758, "balance_loss_mlp": 1.03225708, "epoch": 0.13257177213287238, "flos": 29826541272960.0, "grad_norm": 2.094172729236468, "language_loss": 0.78377104, "learning_rate": 3.89064726633596e-06, "loss": 0.80592328, "num_input_tokens_seen": 47639445, "step": 2205, "time_per_iteration": 2.730682134628296 }, { "auxiliary_loss_clip": 0.01125154, "auxiliary_loss_mlp": 0.01048818, "balance_loss_clip": 1.04975629, "balance_loss_mlp": 1.02782559, "epoch": 0.13263189538554035, "flos": 21288456483840.0, "grad_norm": 1.8609089802832188, "language_loss": 0.78638101, "learning_rate": 3.890520213887941e-06, "loss": 0.80812073, "num_input_tokens_seen": 47658740, "step": 2206, "time_per_iteration": 2.691962718963623 }, { "auxiliary_loss_clip": 0.01124965, "auxiliary_loss_mlp": 0.01045957, "balance_loss_clip": 1.04958403, "balance_loss_mlp": 1.02649069, "epoch": 0.13269201863820831, "flos": 16874101618560.0, "grad_norm": 2.2777192787220066, "language_loss": 0.74672282, "learning_rate": 3.890393089751208e-06, "loss": 0.76843208, "num_input_tokens_seen": 47676880, "step": 2207, "time_per_iteration": 2.7062454223632812 }, { "auxiliary_loss_clip": 0.01143208, "auxiliary_loss_mlp": 0.01047941, "balance_loss_clip": 1.05257845, "balance_loss_mlp": 1.02672219, "epoch": 0.1327521418908763, "flos": 23768914078080.0, "grad_norm": 1.692212064021935, "language_loss": 0.84061795, "learning_rate": 3.890265893930578e-06, "loss": 0.8625294, "num_input_tokens_seen": 47696635, "step": 2208, "time_per_iteration": 2.687717914581299 }, { "auxiliary_loss_clip": 0.01152573, "auxiliary_loss_mlp": 0.0105274, "balance_loss_clip": 1.05847478, "balance_loss_mlp": 1.03411973, "epoch": 0.13281226514354427, "flos": 26505594362880.0, "grad_norm": 1.7032258459750478, "language_loss": 0.85587811, "learning_rate": 3.890138626430876e-06, "loss": 0.8779313, "num_input_tokens_seen": 47717760, "step": 2209, "time_per_iteration": 2.646015167236328 }, { "auxiliary_loss_clip": 0.01138084, "auxiliary_loss_mlp": 0.00778828, "balance_loss_clip": 1.05316806, "balance_loss_mlp": 1.00002563, "epoch": 0.13287238839621224, "flos": 24498762526080.0, "grad_norm": 2.237247968175465, "language_loss": 0.81797457, "learning_rate": 3.890011287256929e-06, "loss": 0.83714366, "num_input_tokens_seen": 47737685, "step": 2210, "time_per_iteration": 2.676262378692627 }, { "auxiliary_loss_clip": 0.0104445, "auxiliary_loss_mlp": 0.00757817, "balance_loss_clip": 1.03801322, "balance_loss_mlp": 1.00007725, "epoch": 0.1329325116488802, "flos": 67694344369920.0, "grad_norm": 0.7515252652740232, "language_loss": 0.58031559, "learning_rate": 3.889883876413563e-06, "loss": 0.59833825, "num_input_tokens_seen": 47802415, "step": 2211, "time_per_iteration": 3.3914146423339844 }, { "auxiliary_loss_clip": 0.01064712, "auxiliary_loss_mlp": 0.01012978, "balance_loss_clip": 1.04205871, "balance_loss_mlp": 1.01083231, "epoch": 0.13299263490154817, "flos": 72261894741120.0, "grad_norm": 0.8012428422082742, "language_loss": 0.55299425, "learning_rate": 3.889756393905611e-06, "loss": 0.57377112, "num_input_tokens_seen": 47871485, "step": 2212, "time_per_iteration": 3.2910914421081543 }, { "auxiliary_loss_clip": 0.01132433, "auxiliary_loss_mlp": 0.01054299, "balance_loss_clip": 1.05107963, "balance_loss_mlp": 1.0331986, "epoch": 0.13305275815421613, "flos": 17931275729280.0, "grad_norm": 2.484635795733661, "language_loss": 0.74228692, "learning_rate": 3.889628839737908e-06, "loss": 0.7641542, "num_input_tokens_seen": 47888315, "step": 2213, "time_per_iteration": 2.755777597427368 }, { "auxiliary_loss_clip": 0.01114671, "auxiliary_loss_mlp": 0.01051459, "balance_loss_clip": 1.04682255, "balance_loss_mlp": 1.03231359, "epoch": 0.13311288140688413, "flos": 22340889999360.0, "grad_norm": 1.850943077435394, "language_loss": 0.79699469, "learning_rate": 3.889501213915291e-06, "loss": 0.81865597, "num_input_tokens_seen": 47906600, "step": 2214, "time_per_iteration": 2.702603340148926 }, { "auxiliary_loss_clip": 0.01143494, "auxiliary_loss_mlp": 0.01052411, "balance_loss_clip": 1.05555344, "balance_loss_mlp": 1.03171659, "epoch": 0.1331730046595521, "flos": 31868888682240.0, "grad_norm": 1.8782588426913054, "language_loss": 0.69341159, "learning_rate": 3.889373516442597e-06, "loss": 0.71537066, "num_input_tokens_seen": 47927630, "step": 2215, "time_per_iteration": 2.769237518310547 }, { "auxiliary_loss_clip": 0.01167307, "auxiliary_loss_mlp": 0.01051423, "balance_loss_clip": 1.06098068, "balance_loss_mlp": 1.03132463, "epoch": 0.13323312791222006, "flos": 22566589107840.0, "grad_norm": 1.884566493826098, "language_loss": 0.81262428, "learning_rate": 3.889245747324671e-06, "loss": 0.83481157, "num_input_tokens_seen": 47947935, "step": 2216, "time_per_iteration": 2.7427120208740234 }, { "auxiliary_loss_clip": 0.01163681, "auxiliary_loss_mlp": 0.01056545, "balance_loss_clip": 1.06198788, "balance_loss_mlp": 1.03631544, "epoch": 0.13329325116488802, "flos": 15085319293440.0, "grad_norm": 3.783334161704178, "language_loss": 0.87299347, "learning_rate": 3.889117906566356e-06, "loss": 0.89519572, "num_input_tokens_seen": 47965515, "step": 2217, "time_per_iteration": 2.709527015686035 }, { "auxiliary_loss_clip": 0.01152703, "auxiliary_loss_mlp": 0.01056364, "balance_loss_clip": 1.06054497, "balance_loss_mlp": 1.0343225, "epoch": 0.133353374417556, "flos": 27453671890560.0, "grad_norm": 4.412823416345162, "language_loss": 0.73105222, "learning_rate": 3.888989994172501e-06, "loss": 0.75314289, "num_input_tokens_seen": 47985675, "step": 2218, "time_per_iteration": 2.697733163833618 }, { "auxiliary_loss_clip": 0.01129106, "auxiliary_loss_mlp": 0.01051151, "balance_loss_clip": 1.0535965, "balance_loss_mlp": 1.02993202, "epoch": 0.13341349767022395, "flos": 24094695456000.0, "grad_norm": 1.7935349411013712, "language_loss": 0.86911142, "learning_rate": 3.8888620101479565e-06, "loss": 0.89091408, "num_input_tokens_seen": 48004985, "step": 2219, "time_per_iteration": 2.7641642093658447 }, { "auxiliary_loss_clip": 0.01141172, "auxiliary_loss_mlp": 0.0106326, "balance_loss_clip": 1.05751657, "balance_loss_mlp": 1.04406714, "epoch": 0.13347362092289192, "flos": 24133335511680.0, "grad_norm": 1.8604531362737113, "language_loss": 0.77244747, "learning_rate": 3.888733954497574e-06, "loss": 0.79449183, "num_input_tokens_seen": 48024965, "step": 2220, "time_per_iteration": 2.732160806655884 }, { "auxiliary_loss_clip": 0.01146487, "auxiliary_loss_mlp": 0.01048662, "balance_loss_clip": 1.05399704, "balance_loss_mlp": 1.03001785, "epoch": 0.1335337441755599, "flos": 18436538390400.0, "grad_norm": 2.3004113327688955, "language_loss": 0.79467338, "learning_rate": 3.888605827226212e-06, "loss": 0.81662482, "num_input_tokens_seen": 48040890, "step": 2221, "time_per_iteration": 2.685612440109253 }, { "auxiliary_loss_clip": 0.01062777, "auxiliary_loss_mlp": 0.01021711, "balance_loss_clip": 1.03293467, "balance_loss_mlp": 1.0194701, "epoch": 0.13359386742822787, "flos": 50611997652480.0, "grad_norm": 0.9755051104211709, "language_loss": 0.68938822, "learning_rate": 3.8884776283387275e-06, "loss": 0.71023309, "num_input_tokens_seen": 48091855, "step": 2222, "time_per_iteration": 3.0336835384368896 }, { "auxiliary_loss_clip": 0.01130152, "auxiliary_loss_mlp": 0.01058574, "balance_loss_clip": 1.05544209, "balance_loss_mlp": 1.03940475, "epoch": 0.13365399068089584, "flos": 22778569221120.0, "grad_norm": 2.1295993667823416, "language_loss": 0.67389107, "learning_rate": 3.888349357839982e-06, "loss": 0.69577825, "num_input_tokens_seen": 48111350, "step": 2223, "time_per_iteration": 2.7134146690368652 }, { "auxiliary_loss_clip": 0.01161386, "auxiliary_loss_mlp": 0.01060571, "balance_loss_clip": 1.05785358, "balance_loss_mlp": 1.04010296, "epoch": 0.1337141139335638, "flos": 12531603911040.0, "grad_norm": 4.277142483609355, "language_loss": 0.82505226, "learning_rate": 3.88822101573484e-06, "loss": 0.84727186, "num_input_tokens_seen": 48129840, "step": 2224, "time_per_iteration": 2.608372926712036 }, { "auxiliary_loss_clip": 0.01173412, "auxiliary_loss_mlp": 0.01050086, "balance_loss_clip": 1.0573926, "balance_loss_mlp": 1.0290221, "epoch": 0.13377423718623177, "flos": 23038957889280.0, "grad_norm": 1.9890294619132924, "language_loss": 0.66270435, "learning_rate": 3.888092602028167e-06, "loss": 0.68493932, "num_input_tokens_seen": 48149240, "step": 2225, "time_per_iteration": 2.6304945945739746 }, { "auxiliary_loss_clip": 0.01153626, "auxiliary_loss_mlp": 0.01051637, "balance_loss_clip": 1.05233717, "balance_loss_mlp": 1.03180075, "epoch": 0.13383436043889974, "flos": 16216397637120.0, "grad_norm": 2.2915668787246997, "language_loss": 0.89469218, "learning_rate": 3.887964116724835e-06, "loss": 0.91674477, "num_input_tokens_seen": 48166330, "step": 2226, "time_per_iteration": 2.6002328395843506 }, { "auxiliary_loss_clip": 0.01150395, "auxiliary_loss_mlp": 0.01054296, "balance_loss_clip": 1.0549798, "balance_loss_mlp": 1.03423262, "epoch": 0.1338944836915677, "flos": 24279671520000.0, "grad_norm": 1.7271512115821777, "language_loss": 0.73209751, "learning_rate": 3.887835559829712e-06, "loss": 0.75414443, "num_input_tokens_seen": 48187600, "step": 2227, "time_per_iteration": 2.706193447113037 }, { "auxiliary_loss_clip": 0.01157707, "auxiliary_loss_mlp": 0.01047387, "balance_loss_clip": 1.05518484, "balance_loss_mlp": 1.02683568, "epoch": 0.1339546069442357, "flos": 17598742594560.0, "grad_norm": 2.848999829625599, "language_loss": 0.85160232, "learning_rate": 3.8877069313476764e-06, "loss": 0.87365323, "num_input_tokens_seen": 48204400, "step": 2228, "time_per_iteration": 2.689209222793579 }, { "auxiliary_loss_clip": 0.01132803, "auxiliary_loss_mlp": 0.01052829, "balance_loss_clip": 1.04935181, "balance_loss_mlp": 1.03126431, "epoch": 0.13401473019690366, "flos": 18990065952000.0, "grad_norm": 1.909679794697233, "language_loss": 0.81460214, "learning_rate": 3.8875782312836054e-06, "loss": 0.83645844, "num_input_tokens_seen": 48222180, "step": 2229, "time_per_iteration": 2.6380228996276855 }, { "auxiliary_loss_clip": 0.0110557, "auxiliary_loss_mlp": 0.01052684, "balance_loss_clip": 1.04774594, "balance_loss_mlp": 1.03233457, "epoch": 0.13407485344957162, "flos": 26943812288640.0, "grad_norm": 1.7464076089691416, "language_loss": 0.73822236, "learning_rate": 3.887449459642378e-06, "loss": 0.7598049, "num_input_tokens_seen": 48243245, "step": 2230, "time_per_iteration": 2.7332983016967773 }, { "auxiliary_loss_clip": 0.01125236, "auxiliary_loss_mlp": 0.01058977, "balance_loss_clip": 1.05213606, "balance_loss_mlp": 1.03890252, "epoch": 0.1341349767022396, "flos": 20339373375360.0, "grad_norm": 1.6827882777998602, "language_loss": 0.80133682, "learning_rate": 3.8873206164288785e-06, "loss": 0.82317901, "num_input_tokens_seen": 48262600, "step": 2231, "time_per_iteration": 2.6759045124053955 }, { "auxiliary_loss_clip": 0.01111387, "auxiliary_loss_mlp": 0.01057582, "balance_loss_clip": 1.04997492, "balance_loss_mlp": 1.03499198, "epoch": 0.13419509995490755, "flos": 29862020931840.0, "grad_norm": 1.746756846769887, "language_loss": 0.72152746, "learning_rate": 3.887191701647992e-06, "loss": 0.74321723, "num_input_tokens_seen": 48285075, "step": 2232, "time_per_iteration": 4.391890048980713 }, { "auxiliary_loss_clip": 0.0112104, "auxiliary_loss_mlp": 0.01051805, "balance_loss_clip": 1.0481019, "balance_loss_mlp": 1.03039551, "epoch": 0.13425522320757552, "flos": 26942986275840.0, "grad_norm": 2.4719586176391686, "language_loss": 0.65116024, "learning_rate": 3.8870627153046066e-06, "loss": 0.67288864, "num_input_tokens_seen": 48301285, "step": 2233, "time_per_iteration": 4.234508037567139 }, { "auxiliary_loss_clip": 0.01167005, "auxiliary_loss_mlp": 0.0104461, "balance_loss_clip": 1.05189967, "balance_loss_mlp": 1.02421367, "epoch": 0.1343153464602435, "flos": 15777281871360.0, "grad_norm": 2.4864430088666656, "language_loss": 0.80878961, "learning_rate": 3.886933657403615e-06, "loss": 0.8309058, "num_input_tokens_seen": 48317835, "step": 2234, "time_per_iteration": 4.175215005874634 }, { "auxiliary_loss_clip": 0.01140761, "auxiliary_loss_mlp": 0.01054039, "balance_loss_clip": 1.05052733, "balance_loss_mlp": 1.03268874, "epoch": 0.13437546971291148, "flos": 24314756129280.0, "grad_norm": 2.0569321713284827, "language_loss": 0.82114553, "learning_rate": 3.886804527949909e-06, "loss": 0.84309351, "num_input_tokens_seen": 48335670, "step": 2235, "time_per_iteration": 2.6588025093078613 }, { "auxiliary_loss_clip": 0.01149093, "auxiliary_loss_mlp": 0.01052015, "balance_loss_clip": 1.05040097, "balance_loss_mlp": 1.02983022, "epoch": 0.13443559296557944, "flos": 26650673395200.0, "grad_norm": 1.6363146905087136, "language_loss": 0.86092007, "learning_rate": 3.8866753269483864e-06, "loss": 0.88293117, "num_input_tokens_seen": 48357805, "step": 2236, "time_per_iteration": 4.349383592605591 }, { "auxiliary_loss_clip": 0.01166751, "auxiliary_loss_mlp": 0.01047925, "balance_loss_clip": 1.05288053, "balance_loss_mlp": 1.02724242, "epoch": 0.1344957162182474, "flos": 21796197183360.0, "grad_norm": 1.82135056053112, "language_loss": 0.77258497, "learning_rate": 3.886546054403946e-06, "loss": 0.79473174, "num_input_tokens_seen": 48377845, "step": 2237, "time_per_iteration": 2.6398766040802 }, { "auxiliary_loss_clip": 0.01145425, "auxiliary_loss_mlp": 0.01051006, "balance_loss_clip": 1.05016851, "balance_loss_mlp": 1.02919102, "epoch": 0.13455583947091537, "flos": 19865568049920.0, "grad_norm": 2.440947698046141, "language_loss": 0.78772336, "learning_rate": 3.886416710321491e-06, "loss": 0.80968761, "num_input_tokens_seen": 48394735, "step": 2238, "time_per_iteration": 2.6556923389434814 }, { "auxiliary_loss_clip": 0.01141594, "auxiliary_loss_mlp": 0.01050085, "balance_loss_clip": 1.05123293, "balance_loss_mlp": 1.02878201, "epoch": 0.13461596272358334, "flos": 30846835094400.0, "grad_norm": 2.9136729194949735, "language_loss": 0.68486369, "learning_rate": 3.886287294705924e-06, "loss": 0.70678043, "num_input_tokens_seen": 48414200, "step": 2239, "time_per_iteration": 2.6778814792633057 }, { "auxiliary_loss_clip": 0.01147129, "auxiliary_loss_mlp": 0.01052633, "balance_loss_clip": 1.0515976, "balance_loss_mlp": 1.03197384, "epoch": 0.1346760859762513, "flos": 12494436312960.0, "grad_norm": 2.3763106012672925, "language_loss": 0.81277847, "learning_rate": 3.8861578075621555e-06, "loss": 0.8347761, "num_input_tokens_seen": 48431065, "step": 2240, "time_per_iteration": 2.5920939445495605 }, { "auxiliary_loss_clip": 0.01107793, "auxiliary_loss_mlp": 0.01049909, "balance_loss_clip": 1.04459488, "balance_loss_mlp": 1.02884459, "epoch": 0.1347362092289193, "flos": 21836022387840.0, "grad_norm": 1.7269080191231387, "language_loss": 0.77183759, "learning_rate": 3.886028248895093e-06, "loss": 0.79341465, "num_input_tokens_seen": 48450335, "step": 2241, "time_per_iteration": 2.7224419116973877 }, { "auxiliary_loss_clip": 0.0116331, "auxiliary_loss_mlp": 0.01041419, "balance_loss_clip": 1.05439126, "balance_loss_mlp": 1.02324009, "epoch": 0.13479633248158726, "flos": 23509459163520.0, "grad_norm": 2.0305903786470743, "language_loss": 0.83062387, "learning_rate": 3.88589861870965e-06, "loss": 0.85267115, "num_input_tokens_seen": 48468555, "step": 2242, "time_per_iteration": 2.5794169902801514 }, { "auxiliary_loss_clip": 0.01170048, "auxiliary_loss_mlp": 0.01056609, "balance_loss_clip": 1.05504107, "balance_loss_mlp": 1.03469825, "epoch": 0.13485645573425523, "flos": 29344332165120.0, "grad_norm": 2.465549548535016, "language_loss": 0.6498239, "learning_rate": 3.885768917010744e-06, "loss": 0.67209053, "num_input_tokens_seen": 48488515, "step": 2243, "time_per_iteration": 2.6709110736846924 }, { "auxiliary_loss_clip": 0.01125086, "auxiliary_loss_mlp": 0.01046786, "balance_loss_clip": 1.04593956, "balance_loss_mlp": 1.02618706, "epoch": 0.1349165789869232, "flos": 28037112503040.0, "grad_norm": 1.7770524512670738, "language_loss": 0.72633034, "learning_rate": 3.8856391438032895e-06, "loss": 0.74804902, "num_input_tokens_seen": 48510515, "step": 2244, "time_per_iteration": 2.713803768157959 }, { "auxiliary_loss_clip": 0.0115377, "auxiliary_loss_mlp": 0.0105148, "balance_loss_clip": 1.05312431, "balance_loss_mlp": 1.03209639, "epoch": 0.13497670223959116, "flos": 22853730430080.0, "grad_norm": 1.7564166456764931, "language_loss": 0.86023217, "learning_rate": 3.88550929909221e-06, "loss": 0.88228464, "num_input_tokens_seen": 48529940, "step": 2245, "time_per_iteration": 2.626560926437378 }, { "auxiliary_loss_clip": 0.01149467, "auxiliary_loss_mlp": 0.0105327, "balance_loss_clip": 1.05035663, "balance_loss_mlp": 1.03346968, "epoch": 0.13503682549225912, "flos": 16504580453760.0, "grad_norm": 1.7861449859595755, "language_loss": 0.78912753, "learning_rate": 3.88537938288243e-06, "loss": 0.8111549, "num_input_tokens_seen": 48548190, "step": 2246, "time_per_iteration": 2.6543703079223633 }, { "auxiliary_loss_clip": 0.010304, "auxiliary_loss_mlp": 0.01015407, "balance_loss_clip": 1.03666449, "balance_loss_mlp": 1.01285601, "epoch": 0.1350969487449271, "flos": 70756303242240.0, "grad_norm": 0.7509256694227144, "language_loss": 0.6054731, "learning_rate": 3.885249395178874e-06, "loss": 0.62593114, "num_input_tokens_seen": 48613165, "step": 2247, "time_per_iteration": 3.3349809646606445 }, { "auxiliary_loss_clip": 0.01162017, "auxiliary_loss_mlp": 0.01056869, "balance_loss_clip": 1.05492628, "balance_loss_mlp": 1.03470767, "epoch": 0.13515707199759508, "flos": 23075981832960.0, "grad_norm": 2.562042993856578, "language_loss": 0.80841738, "learning_rate": 3.885119335986473e-06, "loss": 0.83060622, "num_input_tokens_seen": 48631705, "step": 2248, "time_per_iteration": 2.6279287338256836 }, { "auxiliary_loss_clip": 0.0114073, "auxiliary_loss_mlp": 0.01049128, "balance_loss_clip": 1.05086231, "balance_loss_mlp": 1.03054309, "epoch": 0.13521719525026304, "flos": 23186371305600.0, "grad_norm": 1.9247838227480492, "language_loss": 0.77108699, "learning_rate": 3.884989205310157e-06, "loss": 0.79298556, "num_input_tokens_seen": 48649740, "step": 2249, "time_per_iteration": 2.7100210189819336 }, { "auxiliary_loss_clip": 0.0112733, "auxiliary_loss_mlp": 0.01057649, "balance_loss_clip": 1.05325472, "balance_loss_mlp": 1.03863478, "epoch": 0.135277318502931, "flos": 24790931752320.0, "grad_norm": 1.7403695434994237, "language_loss": 0.84457541, "learning_rate": 3.884859003154862e-06, "loss": 0.86642522, "num_input_tokens_seen": 48671565, "step": 2250, "time_per_iteration": 2.789350986480713 }, { "auxiliary_loss_clip": 0.01155547, "auxiliary_loss_mlp": 0.0105348, "balance_loss_clip": 1.05310512, "balance_loss_mlp": 1.03243995, "epoch": 0.13533744175559898, "flos": 21908525990400.0, "grad_norm": 3.018154510939524, "language_loss": 0.81796515, "learning_rate": 3.884728729525524e-06, "loss": 0.84005541, "num_input_tokens_seen": 48690425, "step": 2251, "time_per_iteration": 2.685617208480835 }, { "auxiliary_loss_clip": 0.01165433, "auxiliary_loss_mlp": 0.01060257, "balance_loss_clip": 1.05235004, "balance_loss_mlp": 1.03888273, "epoch": 0.13539756500826694, "flos": 21211643249280.0, "grad_norm": 1.7680273527580506, "language_loss": 0.86173487, "learning_rate": 3.884598384427084e-06, "loss": 0.88399172, "num_input_tokens_seen": 48707505, "step": 2252, "time_per_iteration": 2.597219467163086 }, { "auxiliary_loss_clip": 0.01052296, "auxiliary_loss_mlp": 0.01018557, "balance_loss_clip": 1.02446079, "balance_loss_mlp": 1.01632786, "epoch": 0.1354576882609349, "flos": 63242103634560.0, "grad_norm": 0.8028920055572067, "language_loss": 0.61837333, "learning_rate": 3.884467967864485e-06, "loss": 0.6390819, "num_input_tokens_seen": 48775895, "step": 2253, "time_per_iteration": 3.25115704536438 }, { "auxiliary_loss_clip": 0.01155107, "auxiliary_loss_mlp": 0.01055639, "balance_loss_clip": 1.0539906, "balance_loss_mlp": 1.03587449, "epoch": 0.1355178115136029, "flos": 25483037984640.0, "grad_norm": 1.6376691715964824, "language_loss": 0.89441288, "learning_rate": 3.884337479842671e-06, "loss": 0.91652036, "num_input_tokens_seen": 48798370, "step": 2254, "time_per_iteration": 2.6803932189941406 }, { "auxiliary_loss_clip": 0.01131786, "auxiliary_loss_mlp": 0.01063066, "balance_loss_clip": 1.04506016, "balance_loss_mlp": 1.03872383, "epoch": 0.13557793476627086, "flos": 21616967295360.0, "grad_norm": 2.1104776784573787, "language_loss": 0.84626925, "learning_rate": 3.884206920366591e-06, "loss": 0.86821771, "num_input_tokens_seen": 48817955, "step": 2255, "time_per_iteration": 2.7074074745178223 }, { "auxiliary_loss_clip": 0.01165481, "auxiliary_loss_mlp": 0.01058458, "balance_loss_clip": 1.05211091, "balance_loss_mlp": 1.03767991, "epoch": 0.13563805801893883, "flos": 24928253447040.0, "grad_norm": 4.791676738707355, "language_loss": 0.74684238, "learning_rate": 3.884076289441196e-06, "loss": 0.76908177, "num_input_tokens_seen": 48836330, "step": 2256, "time_per_iteration": 2.590178966522217 }, { "auxiliary_loss_clip": 0.01127027, "auxiliary_loss_mlp": 0.01054317, "balance_loss_clip": 1.04977024, "balance_loss_mlp": 1.03338361, "epoch": 0.1356981812716068, "flos": 14750272206720.0, "grad_norm": 5.890843360804152, "language_loss": 0.8309083, "learning_rate": 3.88394558707144e-06, "loss": 0.85272169, "num_input_tokens_seen": 48851890, "step": 2257, "time_per_iteration": 2.642096519470215 }, { "auxiliary_loss_clip": 0.0114984, "auxiliary_loss_mlp": 0.00780177, "balance_loss_clip": 1.05128407, "balance_loss_mlp": 1.00013828, "epoch": 0.13575830452427476, "flos": 11108571822720.0, "grad_norm": 2.1957250492246505, "language_loss": 0.82045269, "learning_rate": 3.883814813262277e-06, "loss": 0.83975297, "num_input_tokens_seen": 48865510, "step": 2258, "time_per_iteration": 2.6279473304748535 }, { "auxiliary_loss_clip": 0.01155515, "auxiliary_loss_mlp": 0.01054519, "balance_loss_clip": 1.05172098, "balance_loss_mlp": 1.03152323, "epoch": 0.13581842777694272, "flos": 17960290940160.0, "grad_norm": 2.6364031487830464, "language_loss": 0.82694167, "learning_rate": 3.883683968018669e-06, "loss": 0.849042, "num_input_tokens_seen": 48882360, "step": 2259, "time_per_iteration": 2.677804708480835 }, { "auxiliary_loss_clip": 0.01127201, "auxiliary_loss_mlp": 0.01054646, "balance_loss_clip": 1.0495683, "balance_loss_mlp": 1.03547728, "epoch": 0.1358785510296107, "flos": 22857142222080.0, "grad_norm": 2.0790748617118853, "language_loss": 0.73916006, "learning_rate": 3.8835530513455755e-06, "loss": 0.76097858, "num_input_tokens_seen": 48902700, "step": 2260, "time_per_iteration": 2.7416799068450928 }, { "auxiliary_loss_clip": 0.01144177, "auxiliary_loss_mlp": 0.01056881, "balance_loss_clip": 1.05196047, "balance_loss_mlp": 1.03691387, "epoch": 0.13593867428227868, "flos": 25739404329600.0, "grad_norm": 3.546593987683097, "language_loss": 0.74799728, "learning_rate": 3.883422063247961e-06, "loss": 0.77000785, "num_input_tokens_seen": 48922525, "step": 2261, "time_per_iteration": 2.675342559814453 }, { "auxiliary_loss_clip": 0.01170469, "auxiliary_loss_mlp": 0.01050986, "balance_loss_clip": 1.05486035, "balance_loss_mlp": 1.03043413, "epoch": 0.13599879753494665, "flos": 31249214225280.0, "grad_norm": 2.967396076139427, "language_loss": 0.63602281, "learning_rate": 3.883291003730794e-06, "loss": 0.65823734, "num_input_tokens_seen": 48942510, "step": 2262, "time_per_iteration": 2.660538911819458 }, { "auxiliary_loss_clip": 0.01148004, "auxiliary_loss_mlp": 0.01052118, "balance_loss_clip": 1.0516696, "balance_loss_mlp": 1.03216195, "epoch": 0.1360589207876146, "flos": 23915034604800.0, "grad_norm": 2.301949377353301, "language_loss": 0.81810403, "learning_rate": 3.883159872799043e-06, "loss": 0.84010524, "num_input_tokens_seen": 48962625, "step": 2263, "time_per_iteration": 2.840043783187866 }, { "auxiliary_loss_clip": 0.01098888, "auxiliary_loss_mlp": 0.01064302, "balance_loss_clip": 1.04875195, "balance_loss_mlp": 1.0410558, "epoch": 0.13611904404028258, "flos": 19974197756160.0, "grad_norm": 1.7561035968690553, "language_loss": 0.87737143, "learning_rate": 3.8830286704576815e-06, "loss": 0.89900339, "num_input_tokens_seen": 48982525, "step": 2264, "time_per_iteration": 2.784648895263672 }, { "auxiliary_loss_clip": 0.01157618, "auxiliary_loss_mlp": 0.01049521, "balance_loss_clip": 1.05161715, "balance_loss_mlp": 1.02709746, "epoch": 0.13617916729295054, "flos": 15340644144000.0, "grad_norm": 3.151792845640157, "language_loss": 0.7115528, "learning_rate": 3.882897396711683e-06, "loss": 0.7336241, "num_input_tokens_seen": 48997605, "step": 2265, "time_per_iteration": 2.6108245849609375 }, { "auxiliary_loss_clip": 0.01111831, "auxiliary_loss_mlp": 0.01042545, "balance_loss_clip": 1.05199265, "balance_loss_mlp": 1.02256525, "epoch": 0.1362392905456185, "flos": 27451445247360.0, "grad_norm": 4.918827494175735, "language_loss": 0.6671263, "learning_rate": 3.882766051566027e-06, "loss": 0.68867004, "num_input_tokens_seen": 49018535, "step": 2266, "time_per_iteration": 2.7810373306274414 }, { "auxiliary_loss_clip": 0.01127539, "auxiliary_loss_mlp": 0.01057589, "balance_loss_clip": 1.05683684, "balance_loss_mlp": 1.03739524, "epoch": 0.1362994137982865, "flos": 25009017177600.0, "grad_norm": 1.707924588861666, "language_loss": 0.7634865, "learning_rate": 3.882634635025694e-06, "loss": 0.78533769, "num_input_tokens_seen": 49038865, "step": 2267, "time_per_iteration": 2.7682721614837646 }, { "auxiliary_loss_clip": 0.01133448, "auxiliary_loss_mlp": 0.01048207, "balance_loss_clip": 1.04668903, "balance_loss_mlp": 1.02641535, "epoch": 0.13635953705095447, "flos": 20303031790080.0, "grad_norm": 2.9531688260339934, "language_loss": 0.81653506, "learning_rate": 3.882503147095667e-06, "loss": 0.83835161, "num_input_tokens_seen": 49058010, "step": 2268, "time_per_iteration": 2.645081043243408 }, { "auxiliary_loss_clip": 0.01155147, "auxiliary_loss_mlp": 0.01048448, "balance_loss_clip": 1.05424881, "balance_loss_mlp": 1.02738333, "epoch": 0.13641966030362243, "flos": 31358418549120.0, "grad_norm": 1.9923150848418427, "language_loss": 0.75975174, "learning_rate": 3.882371587780931e-06, "loss": 0.78178769, "num_input_tokens_seen": 49080330, "step": 2269, "time_per_iteration": 2.6764814853668213 }, { "auxiliary_loss_clip": 0.0113465, "auxiliary_loss_mlp": 0.01049702, "balance_loss_clip": 1.04941857, "balance_loss_mlp": 1.02844727, "epoch": 0.1364797835562904, "flos": 20478095700480.0, "grad_norm": 2.1475090354855473, "language_loss": 0.81328762, "learning_rate": 3.882239957086477e-06, "loss": 0.83513117, "num_input_tokens_seen": 49097035, "step": 2270, "time_per_iteration": 2.6801655292510986 }, { "auxiliary_loss_clip": 0.01142111, "auxiliary_loss_mlp": 0.010594, "balance_loss_clip": 1.04989171, "balance_loss_mlp": 1.03773928, "epoch": 0.13653990680895836, "flos": 13078343802240.0, "grad_norm": 3.2227070482893976, "language_loss": 0.75812757, "learning_rate": 3.882108255017295e-06, "loss": 0.78014266, "num_input_tokens_seen": 49113945, "step": 2271, "time_per_iteration": 4.197805166244507 }, { "auxiliary_loss_clip": 0.01156913, "auxiliary_loss_mlp": 0.01061846, "balance_loss_clip": 1.05097795, "balance_loss_mlp": 1.03921962, "epoch": 0.13660003006162633, "flos": 16946712961920.0, "grad_norm": 2.2800716885469754, "language_loss": 0.80251753, "learning_rate": 3.881976481578379e-06, "loss": 0.82470512, "num_input_tokens_seen": 49132855, "step": 2272, "time_per_iteration": 4.1461029052734375 }, { "auxiliary_loss_clip": 0.01055091, "auxiliary_loss_mlp": 0.01042701, "balance_loss_clip": 1.02539539, "balance_loss_mlp": 1.04001904, "epoch": 0.1366601533142943, "flos": 68682749892480.0, "grad_norm": 0.7097054685047118, "language_loss": 0.60739923, "learning_rate": 3.8818446367747255e-06, "loss": 0.62837708, "num_input_tokens_seen": 49198310, "step": 2273, "time_per_iteration": 4.731219530105591 }, { "auxiliary_loss_clip": 0.01165514, "auxiliary_loss_mlp": 0.00780474, "balance_loss_clip": 1.0523783, "balance_loss_mlp": 1.00008452, "epoch": 0.13672027656696228, "flos": 19244241567360.0, "grad_norm": 2.4844725334882583, "language_loss": 0.77506429, "learning_rate": 3.881712720611336e-06, "loss": 0.79452413, "num_input_tokens_seen": 49217250, "step": 2274, "time_per_iteration": 2.7122738361358643 }, { "auxiliary_loss_clip": 0.01154937, "auxiliary_loss_mlp": 0.01054542, "balance_loss_clip": 1.05082417, "balance_loss_mlp": 1.03271496, "epoch": 0.13678039981963025, "flos": 24534924543360.0, "grad_norm": 2.391437383339344, "language_loss": 0.78256011, "learning_rate": 3.881580733093211e-06, "loss": 0.8046549, "num_input_tokens_seen": 49236615, "step": 2275, "time_per_iteration": 2.6674444675445557 }, { "auxiliary_loss_clip": 0.01154585, "auxiliary_loss_mlp": 0.01044634, "balance_loss_clip": 1.05220842, "balance_loss_mlp": 1.02449977, "epoch": 0.13684052307229821, "flos": 15669334523520.0, "grad_norm": 2.271072834476717, "language_loss": 0.81682789, "learning_rate": 3.881448674225356e-06, "loss": 0.83882004, "num_input_tokens_seen": 49253935, "step": 2276, "time_per_iteration": 4.202202558517456 }, { "auxiliary_loss_clip": 0.01164941, "auxiliary_loss_mlp": 0.01060078, "balance_loss_clip": 1.05228245, "balance_loss_mlp": 1.03604531, "epoch": 0.13690064632496618, "flos": 28364689560960.0, "grad_norm": 5.063053962589045, "language_loss": 0.69948691, "learning_rate": 3.881316544012779e-06, "loss": 0.72173715, "num_input_tokens_seen": 49273605, "step": 2277, "time_per_iteration": 2.708591938018799 }, { "auxiliary_loss_clip": 0.01160044, "auxiliary_loss_mlp": 0.00780297, "balance_loss_clip": 1.05169702, "balance_loss_mlp": 1.00017083, "epoch": 0.13696076957763414, "flos": 23404779953280.0, "grad_norm": 2.062701620585305, "language_loss": 0.80197465, "learning_rate": 3.88118434246049e-06, "loss": 0.82137805, "num_input_tokens_seen": 49291785, "step": 2278, "time_per_iteration": 2.6916158199310303 }, { "auxiliary_loss_clip": 0.01159146, "auxiliary_loss_mlp": 0.01060686, "balance_loss_clip": 1.05954766, "balance_loss_mlp": 1.03925228, "epoch": 0.1370208928303021, "flos": 37196595601920.0, "grad_norm": 7.088344486179519, "language_loss": 0.75048816, "learning_rate": 3.881052069573502e-06, "loss": 0.77268648, "num_input_tokens_seen": 49311405, "step": 2279, "time_per_iteration": 2.7316977977752686 }, { "auxiliary_loss_clip": 0.01101952, "auxiliary_loss_mlp": 0.01066685, "balance_loss_clip": 1.04605758, "balance_loss_mlp": 1.04485774, "epoch": 0.13708101608297008, "flos": 26976311118720.0, "grad_norm": 2.5293116992138223, "language_loss": 0.76743513, "learning_rate": 3.880919725356831e-06, "loss": 0.78912151, "num_input_tokens_seen": 49331835, "step": 2280, "time_per_iteration": 2.813720941543579 }, { "auxiliary_loss_clip": 0.01108594, "auxiliary_loss_mlp": 0.01060805, "balance_loss_clip": 1.04457331, "balance_loss_mlp": 1.04022956, "epoch": 0.13714113933563807, "flos": 32556864850560.0, "grad_norm": 2.0597640944890325, "language_loss": 0.79657966, "learning_rate": 3.880787309815496e-06, "loss": 0.81827366, "num_input_tokens_seen": 49352290, "step": 2281, "time_per_iteration": 2.8325345516204834 }, { "auxiliary_loss_clip": 0.0117656, "auxiliary_loss_mlp": 0.0107773, "balance_loss_clip": 1.05715084, "balance_loss_mlp": 1.05671358, "epoch": 0.13720126258830603, "flos": 16101267569280.0, "grad_norm": 2.0769142230572877, "language_loss": 0.83383757, "learning_rate": 3.880654822954518e-06, "loss": 0.85638046, "num_input_tokens_seen": 49370285, "step": 2282, "time_per_iteration": 2.5988755226135254 }, { "auxiliary_loss_clip": 0.01142098, "auxiliary_loss_mlp": 0.01075909, "balance_loss_clip": 1.04898703, "balance_loss_mlp": 1.05583453, "epoch": 0.137261385840974, "flos": 18953544798720.0, "grad_norm": 1.5269487193470777, "language_loss": 0.73526621, "learning_rate": 3.8805222647789195e-06, "loss": 0.75744629, "num_input_tokens_seen": 49389610, "step": 2283, "time_per_iteration": 2.7099714279174805 }, { "auxiliary_loss_clip": 0.01160178, "auxiliary_loss_mlp": 0.01062577, "balance_loss_clip": 1.05577087, "balance_loss_mlp": 1.04173923, "epoch": 0.13732150909364196, "flos": 23295360147840.0, "grad_norm": 2.2306012559941455, "language_loss": 0.83934438, "learning_rate": 3.880389635293729e-06, "loss": 0.86157191, "num_input_tokens_seen": 49408390, "step": 2284, "time_per_iteration": 2.7315831184387207 }, { "auxiliary_loss_clip": 0.01151427, "auxiliary_loss_mlp": 0.01070288, "balance_loss_clip": 1.05204272, "balance_loss_mlp": 1.04779351, "epoch": 0.13738163234630993, "flos": 29351263489920.0, "grad_norm": 2.0900141273659223, "language_loss": 0.7557056, "learning_rate": 3.880256934503974e-06, "loss": 0.77792281, "num_input_tokens_seen": 49427725, "step": 2285, "time_per_iteration": 2.7257747650146484 }, { "auxiliary_loss_clip": 0.01144078, "auxiliary_loss_mlp": 0.01064539, "balance_loss_clip": 1.05233073, "balance_loss_mlp": 1.04392731, "epoch": 0.1374417555989779, "flos": 26651319840000.0, "grad_norm": 2.727019945657865, "language_loss": 0.74521589, "learning_rate": 3.880124162414689e-06, "loss": 0.76730204, "num_input_tokens_seen": 49449000, "step": 2286, "time_per_iteration": 2.742582082748413 }, { "auxiliary_loss_clip": 0.0112541, "auxiliary_loss_mlp": 0.01059198, "balance_loss_clip": 1.04906356, "balance_loss_mlp": 1.03659606, "epoch": 0.1375018788516459, "flos": 28403401443840.0, "grad_norm": 2.2168449035378357, "language_loss": 0.86683542, "learning_rate": 3.879991319030908e-06, "loss": 0.88868147, "num_input_tokens_seen": 49468360, "step": 2287, "time_per_iteration": 2.802088499069214 }, { "auxiliary_loss_clip": 0.01124712, "auxiliary_loss_mlp": 0.01064517, "balance_loss_clip": 1.04803944, "balance_loss_mlp": 1.04207003, "epoch": 0.13756200210431385, "flos": 37413783187200.0, "grad_norm": 2.0592152854463106, "language_loss": 0.68410838, "learning_rate": 3.879858404357666e-06, "loss": 0.70600063, "num_input_tokens_seen": 49493450, "step": 2288, "time_per_iteration": 2.861175537109375 }, { "auxiliary_loss_clip": 0.01112106, "auxiliary_loss_mlp": 0.01071262, "balance_loss_clip": 1.05062151, "balance_loss_mlp": 1.04666936, "epoch": 0.13762212535698182, "flos": 22711021695360.0, "grad_norm": 2.3933568244149357, "language_loss": 0.87090456, "learning_rate": 3.879725418400005e-06, "loss": 0.89273822, "num_input_tokens_seen": 49511220, "step": 2289, "time_per_iteration": 2.7185773849487305 }, { "auxiliary_loss_clip": 0.01130193, "auxiliary_loss_mlp": 0.00781167, "balance_loss_clip": 1.0480957, "balance_loss_mlp": 1.00019848, "epoch": 0.13768224860964978, "flos": 23952130375680.0, "grad_norm": 1.8106848287624444, "language_loss": 0.74668044, "learning_rate": 3.879592361162969e-06, "loss": 0.76579404, "num_input_tokens_seen": 49529820, "step": 2290, "time_per_iteration": 2.6751222610473633 }, { "auxiliary_loss_clip": 0.01039657, "auxiliary_loss_mlp": 0.01081332, "balance_loss_clip": 1.03094769, "balance_loss_mlp": 1.07881641, "epoch": 0.13774237186231775, "flos": 63590438753280.0, "grad_norm": 0.7179159366671727, "language_loss": 0.51597112, "learning_rate": 3.8794592326516015e-06, "loss": 0.53718102, "num_input_tokens_seen": 49595325, "step": 2291, "time_per_iteration": 3.2823359966278076 }, { "auxiliary_loss_clip": 0.01157406, "auxiliary_loss_mlp": 0.01052846, "balance_loss_clip": 1.05224037, "balance_loss_mlp": 1.03123331, "epoch": 0.1378024951149857, "flos": 24279456038400.0, "grad_norm": 1.9326408617769533, "language_loss": 0.71273667, "learning_rate": 3.879326032870952e-06, "loss": 0.7348392, "num_input_tokens_seen": 49615850, "step": 2292, "time_per_iteration": 2.74045729637146 }, { "auxiliary_loss_clip": 0.01156871, "auxiliary_loss_mlp": 0.01049315, "balance_loss_clip": 1.05427122, "balance_loss_mlp": 1.02931166, "epoch": 0.13786261836765368, "flos": 14021537080320.0, "grad_norm": 6.592759889378346, "language_loss": 0.8047784, "learning_rate": 3.879192761826071e-06, "loss": 0.82684022, "num_input_tokens_seen": 49631860, "step": 2293, "time_per_iteration": 2.587576389312744 }, { "auxiliary_loss_clip": 0.0115787, "auxiliary_loss_mlp": 0.0104972, "balance_loss_clip": 1.0554558, "balance_loss_mlp": 1.02921653, "epoch": 0.13792274162032167, "flos": 28878679226880.0, "grad_norm": 1.9082895606463517, "language_loss": 0.78440171, "learning_rate": 3.879059419522011e-06, "loss": 0.80647767, "num_input_tokens_seen": 49652145, "step": 2294, "time_per_iteration": 2.7152793407440186 }, { "auxiliary_loss_clip": 0.01126374, "auxiliary_loss_mlp": 0.01050648, "balance_loss_clip": 1.05281758, "balance_loss_mlp": 1.03104973, "epoch": 0.13798286487298964, "flos": 21141150808320.0, "grad_norm": 1.991103290125302, "language_loss": 0.80339509, "learning_rate": 3.878926005963831e-06, "loss": 0.82516527, "num_input_tokens_seen": 49669880, "step": 2295, "time_per_iteration": 2.7026021480560303 }, { "auxiliary_loss_clip": 0.01154693, "auxiliary_loss_mlp": 0.01052186, "balance_loss_clip": 1.05239046, "balance_loss_mlp": 1.03102624, "epoch": 0.1380429881256576, "flos": 22487477402880.0, "grad_norm": 1.7450624966187134, "language_loss": 0.78661883, "learning_rate": 3.878792521156588e-06, "loss": 0.80868757, "num_input_tokens_seen": 49687255, "step": 2296, "time_per_iteration": 2.566929340362549 }, { "auxiliary_loss_clip": 0.01153425, "auxiliary_loss_mlp": 0.01069343, "balance_loss_clip": 1.05437231, "balance_loss_mlp": 1.04811132, "epoch": 0.13810311137832557, "flos": 21393674398080.0, "grad_norm": 1.7434096141785573, "language_loss": 0.78663194, "learning_rate": 3.8786589651053446e-06, "loss": 0.80885959, "num_input_tokens_seen": 49706650, "step": 2297, "time_per_iteration": 2.6254489421844482 }, { "auxiliary_loss_clip": 0.01110905, "auxiliary_loss_mlp": 0.01059754, "balance_loss_clip": 1.05296302, "balance_loss_mlp": 1.03871369, "epoch": 0.13816323463099353, "flos": 25989844930560.0, "grad_norm": 1.929043788877404, "language_loss": 0.69199705, "learning_rate": 3.878525337815164e-06, "loss": 0.71370363, "num_input_tokens_seen": 49725715, "step": 2298, "time_per_iteration": 2.791301965713501 }, { "auxiliary_loss_clip": 0.01137772, "auxiliary_loss_mlp": 0.01061768, "balance_loss_clip": 1.0517292, "balance_loss_mlp": 1.04059684, "epoch": 0.1382233578836615, "flos": 19244313394560.0, "grad_norm": 1.7910922430646712, "language_loss": 0.86382294, "learning_rate": 3.878391639291116e-06, "loss": 0.88581836, "num_input_tokens_seen": 49744710, "step": 2299, "time_per_iteration": 2.6075453758239746 }, { "auxiliary_loss_clip": 0.01166817, "auxiliary_loss_mlp": 0.01054863, "balance_loss_clip": 1.05378175, "balance_loss_mlp": 1.03292871, "epoch": 0.1382834811363295, "flos": 25666290195840.0, "grad_norm": 2.2378660690879606, "language_loss": 0.75468475, "learning_rate": 3.878257869538267e-06, "loss": 0.77690154, "num_input_tokens_seen": 49764300, "step": 2300, "time_per_iteration": 2.663328170776367 }, { "auxiliary_loss_clip": 0.01130608, "auxiliary_loss_mlp": 0.01047248, "balance_loss_clip": 1.05274105, "balance_loss_mlp": 1.02664876, "epoch": 0.13834360438899745, "flos": 19784193788160.0, "grad_norm": 2.5571861214345963, "language_loss": 0.82463622, "learning_rate": 3.878124028561692e-06, "loss": 0.8464148, "num_input_tokens_seen": 49778380, "step": 2301, "time_per_iteration": 2.6705129146575928 }, { "auxiliary_loss_clip": 0.0113862, "auxiliary_loss_mlp": 0.00777879, "balance_loss_clip": 1.05323792, "balance_loss_mlp": 1.00021625, "epoch": 0.13840372764166542, "flos": 26651858544000.0, "grad_norm": 1.9612043619218924, "language_loss": 0.85957694, "learning_rate": 3.877990116366466e-06, "loss": 0.87874192, "num_input_tokens_seen": 49797460, "step": 2302, "time_per_iteration": 2.679797410964966 }, { "auxiliary_loss_clip": 0.01059341, "auxiliary_loss_mlp": 0.01025212, "balance_loss_clip": 1.03226125, "balance_loss_mlp": 1.02244604, "epoch": 0.13846385089433338, "flos": 70510998286080.0, "grad_norm": 0.7598813547967705, "language_loss": 0.65591633, "learning_rate": 3.877856132957667e-06, "loss": 0.67676187, "num_input_tokens_seen": 49868005, "step": 2303, "time_per_iteration": 3.3249399662017822 }, { "auxiliary_loss_clip": 0.01151443, "auxiliary_loss_mlp": 0.01046478, "balance_loss_clip": 1.05337632, "balance_loss_mlp": 1.02655792, "epoch": 0.13852397414700135, "flos": 17348732956800.0, "grad_norm": 3.141207945865242, "language_loss": 0.78663635, "learning_rate": 3.877722078340374e-06, "loss": 0.80861557, "num_input_tokens_seen": 49885825, "step": 2304, "time_per_iteration": 2.7364001274108887 }, { "auxiliary_loss_clip": 0.01157514, "auxiliary_loss_mlp": 0.01043253, "balance_loss_clip": 1.05566275, "balance_loss_mlp": 1.02385736, "epoch": 0.13858409739966931, "flos": 21543781334400.0, "grad_norm": 1.7487365854034607, "language_loss": 0.77559888, "learning_rate": 3.877587952519672e-06, "loss": 0.79760659, "num_input_tokens_seen": 49905975, "step": 2305, "time_per_iteration": 2.7814202308654785 }, { "auxiliary_loss_clip": 0.01074766, "auxiliary_loss_mlp": 0.01055718, "balance_loss_clip": 1.04160607, "balance_loss_mlp": 1.03473723, "epoch": 0.13864422065233728, "flos": 21579907438080.0, "grad_norm": 1.8207477060355044, "language_loss": 0.87737936, "learning_rate": 3.877453755500647e-06, "loss": 0.89868426, "num_input_tokens_seen": 49925800, "step": 2306, "time_per_iteration": 2.917616605758667 }, { "auxiliary_loss_clip": 0.01064826, "auxiliary_loss_mlp": 0.0101208, "balance_loss_clip": 1.02692199, "balance_loss_mlp": 1.0094099, "epoch": 0.13870434390500527, "flos": 53371156872960.0, "grad_norm": 0.8728538231155298, "language_loss": 0.59008431, "learning_rate": 3.877319487288387e-06, "loss": 0.61085337, "num_input_tokens_seen": 49977620, "step": 2307, "time_per_iteration": 3.4345149993896484 }, { "auxiliary_loss_clip": 0.01169624, "auxiliary_loss_mlp": 0.00778134, "balance_loss_clip": 1.05528641, "balance_loss_mlp": 1.00021303, "epoch": 0.13876446715767324, "flos": 22565906749440.0, "grad_norm": 1.8467673932802395, "language_loss": 0.79483795, "learning_rate": 3.877185147887984e-06, "loss": 0.81431556, "num_input_tokens_seen": 49996650, "step": 2308, "time_per_iteration": 2.7137296199798584 }, { "auxiliary_loss_clip": 0.01131024, "auxiliary_loss_mlp": 0.01050332, "balance_loss_clip": 1.05118585, "balance_loss_mlp": 1.03054297, "epoch": 0.1388245904103412, "flos": 20705231352960.0, "grad_norm": 2.352128383160346, "language_loss": 0.78101134, "learning_rate": 3.877050737304533e-06, "loss": 0.80282485, "num_input_tokens_seen": 50015640, "step": 2309, "time_per_iteration": 2.9259471893310547 }, { "auxiliary_loss_clip": 0.01128109, "auxiliary_loss_mlp": 0.01057348, "balance_loss_clip": 1.04979932, "balance_loss_mlp": 1.03620028, "epoch": 0.13888471366300917, "flos": 20554729367040.0, "grad_norm": 3.914796791761399, "language_loss": 0.68133545, "learning_rate": 3.876916255543129e-06, "loss": 0.70318997, "num_input_tokens_seen": 50033500, "step": 2310, "time_per_iteration": 4.27877140045166 }, { "auxiliary_loss_clip": 0.01164985, "auxiliary_loss_mlp": 0.01062516, "balance_loss_clip": 1.05356944, "balance_loss_mlp": 1.04021168, "epoch": 0.13894483691567713, "flos": 13838033473920.0, "grad_norm": 1.934954545600412, "language_loss": 0.84295756, "learning_rate": 3.8767817026088725e-06, "loss": 0.86523259, "num_input_tokens_seen": 50050075, "step": 2311, "time_per_iteration": 2.5612359046936035 }, { "auxiliary_loss_clip": 0.01173749, "auxiliary_loss_mlp": 0.01055474, "balance_loss_clip": 1.05752683, "balance_loss_mlp": 1.0350771, "epoch": 0.1390049601683451, "flos": 28031186759040.0, "grad_norm": 2.9213009430481143, "language_loss": 0.82358992, "learning_rate": 3.876647078506866e-06, "loss": 0.84588212, "num_input_tokens_seen": 50070080, "step": 2312, "time_per_iteration": 5.737139701843262 }, { "auxiliary_loss_clip": 0.01129781, "auxiliary_loss_mlp": 0.00778347, "balance_loss_clip": 1.05464363, "balance_loss_mlp": 1.00023031, "epoch": 0.13906508342101306, "flos": 26756860976640.0, "grad_norm": 2.109799495913242, "language_loss": 0.86732674, "learning_rate": 3.876512383242215e-06, "loss": 0.88640809, "num_input_tokens_seen": 50090040, "step": 2313, "time_per_iteration": 2.8402304649353027 }, { "auxiliary_loss_clip": 0.01168088, "auxiliary_loss_mlp": 0.01061738, "balance_loss_clip": 1.05670547, "balance_loss_mlp": 1.04115057, "epoch": 0.13912520667368106, "flos": 24535104111360.0, "grad_norm": 1.784990717237318, "language_loss": 0.79935932, "learning_rate": 3.876377616820024e-06, "loss": 0.8216576, "num_input_tokens_seen": 50110595, "step": 2314, "time_per_iteration": 2.683448076248169 }, { "auxiliary_loss_clip": 0.01124732, "auxiliary_loss_mlp": 0.01061041, "balance_loss_clip": 1.04845023, "balance_loss_mlp": 1.04103708, "epoch": 0.13918532992634902, "flos": 19383215287680.0, "grad_norm": 2.585875079553688, "language_loss": 0.85367405, "learning_rate": 3.876242779245409e-06, "loss": 0.87553179, "num_input_tokens_seen": 50125430, "step": 2315, "time_per_iteration": 4.332594394683838 }, { "auxiliary_loss_clip": 0.01156122, "auxiliary_loss_mlp": 0.01058532, "balance_loss_clip": 1.05397022, "balance_loss_mlp": 1.0372889, "epoch": 0.139245453179017, "flos": 21323756574720.0, "grad_norm": 2.333331492160627, "language_loss": 0.77170396, "learning_rate": 3.876107870523477e-06, "loss": 0.79385042, "num_input_tokens_seen": 50144120, "step": 2316, "time_per_iteration": 2.654604911804199 }, { "auxiliary_loss_clip": 0.01163967, "auxiliary_loss_mlp": 0.00780027, "balance_loss_clip": 1.05353916, "balance_loss_mlp": 1.00024533, "epoch": 0.13930557643168495, "flos": 19500607912320.0, "grad_norm": 2.1485284032262086, "language_loss": 0.76820493, "learning_rate": 3.875972890659349e-06, "loss": 0.78764486, "num_input_tokens_seen": 50162500, "step": 2317, "time_per_iteration": 2.6501235961914062 }, { "auxiliary_loss_clip": 0.01144052, "auxiliary_loss_mlp": 0.01061042, "balance_loss_clip": 1.05156648, "balance_loss_mlp": 1.04074025, "epoch": 0.13936569968435292, "flos": 25410821690880.0, "grad_norm": 1.7797832869421444, "language_loss": 0.80185997, "learning_rate": 3.875837839658139e-06, "loss": 0.82391089, "num_input_tokens_seen": 50182415, "step": 2318, "time_per_iteration": 2.7097995281219482 }, { "auxiliary_loss_clip": 0.01049096, "auxiliary_loss_mlp": 0.01048478, "balance_loss_clip": 1.03358936, "balance_loss_mlp": 1.04518783, "epoch": 0.13942582293702088, "flos": 70771063731840.0, "grad_norm": 0.854553938374386, "language_loss": 0.59004617, "learning_rate": 3.87570271752497e-06, "loss": 0.61102188, "num_input_tokens_seen": 50245160, "step": 2319, "time_per_iteration": 3.2631640434265137 }, { "auxiliary_loss_clip": 0.0111484, "auxiliary_loss_mlp": 0.01055367, "balance_loss_clip": 1.04508984, "balance_loss_mlp": 1.03437412, "epoch": 0.13948594618968888, "flos": 35590885920000.0, "grad_norm": 2.3313836691947722, "language_loss": 0.64993447, "learning_rate": 3.875567524264967e-06, "loss": 0.67163646, "num_input_tokens_seen": 50268215, "step": 2320, "time_per_iteration": 2.8668782711029053 }, { "auxiliary_loss_clip": 0.01096421, "auxiliary_loss_mlp": 0.01056652, "balance_loss_clip": 1.04400086, "balance_loss_mlp": 1.03521848, "epoch": 0.13954606944235684, "flos": 21105204272640.0, "grad_norm": 2.285151015895421, "language_loss": 0.70708811, "learning_rate": 3.875432259883256e-06, "loss": 0.72861886, "num_input_tokens_seen": 50288575, "step": 2321, "time_per_iteration": 2.8273603916168213 }, { "auxiliary_loss_clip": 0.01117698, "auxiliary_loss_mlp": 0.01061754, "balance_loss_clip": 1.04603076, "balance_loss_mlp": 1.03698206, "epoch": 0.1396061926950248, "flos": 25044425009280.0, "grad_norm": 1.7926270181208543, "language_loss": 0.85931206, "learning_rate": 3.875296924384965e-06, "loss": 0.88110662, "num_input_tokens_seen": 50308735, "step": 2322, "time_per_iteration": 2.833807945251465 }, { "auxiliary_loss_clip": 0.01120545, "auxiliary_loss_mlp": 0.01055036, "balance_loss_clip": 1.04616976, "balance_loss_mlp": 1.03568828, "epoch": 0.13966631594769277, "flos": 37634023428480.0, "grad_norm": 1.5963293576391182, "language_loss": 0.67159557, "learning_rate": 3.875161517775226e-06, "loss": 0.69335139, "num_input_tokens_seen": 50331025, "step": 2323, "time_per_iteration": 2.875265121459961 }, { "auxiliary_loss_clip": 0.01127992, "auxiliary_loss_mlp": 0.01055173, "balance_loss_clip": 1.04900301, "balance_loss_mlp": 1.03432369, "epoch": 0.13972643920036074, "flos": 16690993061760.0, "grad_norm": 2.0757452253793485, "language_loss": 0.88878977, "learning_rate": 3.875026040059175e-06, "loss": 0.9106214, "num_input_tokens_seen": 50349725, "step": 2324, "time_per_iteration": 2.6841063499450684 }, { "auxiliary_loss_clip": 0.01154799, "auxiliary_loss_mlp": 0.01056834, "balance_loss_clip": 1.05145955, "balance_loss_mlp": 1.03541231, "epoch": 0.1397865624530287, "flos": 23331055288320.0, "grad_norm": 2.8450589371660526, "language_loss": 0.70621002, "learning_rate": 3.8748904912419485e-06, "loss": 0.72832638, "num_input_tokens_seen": 50367965, "step": 2325, "time_per_iteration": 2.694218397140503 }, { "auxiliary_loss_clip": 0.01134393, "auxiliary_loss_mlp": 0.00778751, "balance_loss_clip": 1.05273592, "balance_loss_mlp": 1.00028229, "epoch": 0.13984668570569667, "flos": 22778317825920.0, "grad_norm": 2.230299294128946, "language_loss": 0.81657004, "learning_rate": 3.874754871328688e-06, "loss": 0.83570141, "num_input_tokens_seen": 50385605, "step": 2326, "time_per_iteration": 2.715306282043457 }, { "auxiliary_loss_clip": 0.01151297, "auxiliary_loss_mlp": 0.01045813, "balance_loss_clip": 1.05490732, "balance_loss_mlp": 1.02745473, "epoch": 0.13990680895836466, "flos": 19464553635840.0, "grad_norm": 1.729713540462037, "language_loss": 0.89241689, "learning_rate": 3.874619180324534e-06, "loss": 0.91438794, "num_input_tokens_seen": 50403985, "step": 2327, "time_per_iteration": 2.679626941680908 }, { "auxiliary_loss_clip": 0.01119996, "auxiliary_loss_mlp": 0.01057397, "balance_loss_clip": 1.04873121, "balance_loss_mlp": 1.0352242, "epoch": 0.13996693221103262, "flos": 20303283185280.0, "grad_norm": 2.9217951598838363, "language_loss": 0.84760427, "learning_rate": 3.874483418234632e-06, "loss": 0.86937821, "num_input_tokens_seen": 50421590, "step": 2328, "time_per_iteration": 2.7277352809906006 }, { "auxiliary_loss_clip": 0.01151775, "auxiliary_loss_mlp": 0.0104443, "balance_loss_clip": 1.05300856, "balance_loss_mlp": 1.02421176, "epoch": 0.1400270554637006, "flos": 26617707688320.0, "grad_norm": 1.6116398320348613, "language_loss": 0.73835862, "learning_rate": 3.874347585064131e-06, "loss": 0.76032066, "num_input_tokens_seen": 50443945, "step": 2329, "time_per_iteration": 2.6911025047302246 }, { "auxiliary_loss_clip": 0.01153137, "auxiliary_loss_mlp": 0.01046755, "balance_loss_clip": 1.05254042, "balance_loss_mlp": 1.02644169, "epoch": 0.14008717871636855, "flos": 19391475415680.0, "grad_norm": 2.565670250114109, "language_loss": 0.78373277, "learning_rate": 3.874211680818183e-06, "loss": 0.80573165, "num_input_tokens_seen": 50462065, "step": 2330, "time_per_iteration": 2.703225612640381 }, { "auxiliary_loss_clip": 0.01144455, "auxiliary_loss_mlp": 0.01046085, "balance_loss_clip": 1.05247569, "balance_loss_mlp": 1.02692819, "epoch": 0.14014730196903652, "flos": 15304266645120.0, "grad_norm": 2.2215524337864143, "language_loss": 0.72115719, "learning_rate": 3.87407570550194e-06, "loss": 0.74306256, "num_input_tokens_seen": 50479565, "step": 2331, "time_per_iteration": 2.7044217586517334 }, { "auxiliary_loss_clip": 0.01159691, "auxiliary_loss_mlp": 0.01051771, "balance_loss_clip": 1.0558939, "balance_loss_mlp": 1.03234017, "epoch": 0.14020742522170448, "flos": 14939701557120.0, "grad_norm": 1.5806705357110964, "language_loss": 0.72634697, "learning_rate": 3.873939659120557e-06, "loss": 0.7484616, "num_input_tokens_seen": 50497305, "step": 2332, "time_per_iteration": 2.647564649581909 }, { "auxiliary_loss_clip": 0.01063058, "auxiliary_loss_mlp": 0.01022564, "balance_loss_clip": 1.03391051, "balance_loss_mlp": 1.01944101, "epoch": 0.14026754847437245, "flos": 48824580044160.0, "grad_norm": 0.8445516092095569, "language_loss": 0.56185365, "learning_rate": 3.873803541679196e-06, "loss": 0.58270991, "num_input_tokens_seen": 50549735, "step": 2333, "time_per_iteration": 3.038390636444092 }, { "auxiliary_loss_clip": 0.01127793, "auxiliary_loss_mlp": 0.01045888, "balance_loss_clip": 1.05246043, "balance_loss_mlp": 1.02587318, "epoch": 0.14032767172704044, "flos": 25773267876480.0, "grad_norm": 1.7702774265545234, "language_loss": 0.82728767, "learning_rate": 3.873667353183016e-06, "loss": 0.84902453, "num_input_tokens_seen": 50570100, "step": 2334, "time_per_iteration": 2.7205803394317627 }, { "auxiliary_loss_clip": 0.01129244, "auxiliary_loss_mlp": 0.01044663, "balance_loss_clip": 1.05110407, "balance_loss_mlp": 1.02593565, "epoch": 0.1403877949797084, "flos": 21216312017280.0, "grad_norm": 1.7790720657464538, "language_loss": 0.80958998, "learning_rate": 3.8735310936371825e-06, "loss": 0.83132899, "num_input_tokens_seen": 50589185, "step": 2335, "time_per_iteration": 2.7844314575195312 }, { "auxiliary_loss_clip": 0.01108373, "auxiliary_loss_mlp": 0.0104374, "balance_loss_clip": 1.04802513, "balance_loss_mlp": 1.02160311, "epoch": 0.14044791823237637, "flos": 22747973811840.0, "grad_norm": 1.739505291070366, "language_loss": 0.81987065, "learning_rate": 3.873394763046862e-06, "loss": 0.84139174, "num_input_tokens_seen": 50609645, "step": 2336, "time_per_iteration": 2.7787351608276367 }, { "auxiliary_loss_clip": 0.01150445, "auxiliary_loss_mlp": 0.01046319, "balance_loss_clip": 1.05603921, "balance_loss_mlp": 1.02709103, "epoch": 0.14050804148504434, "flos": 22964443125120.0, "grad_norm": 1.7584048007565314, "language_loss": 0.80606967, "learning_rate": 3.873258361417225e-06, "loss": 0.82803738, "num_input_tokens_seen": 50628385, "step": 2337, "time_per_iteration": 2.6119275093078613 }, { "auxiliary_loss_clip": 0.01150898, "auxiliary_loss_mlp": 0.01051074, "balance_loss_clip": 1.05363941, "balance_loss_mlp": 1.03202438, "epoch": 0.1405681647377123, "flos": 22200336080640.0, "grad_norm": 2.383737065589604, "language_loss": 0.78994334, "learning_rate": 3.873121888753442e-06, "loss": 0.81196302, "num_input_tokens_seen": 50647260, "step": 2338, "time_per_iteration": 2.672427177429199 }, { "auxiliary_loss_clip": 0.01158377, "auxiliary_loss_mlp": 0.01050168, "balance_loss_clip": 1.05894089, "balance_loss_mlp": 1.02919865, "epoch": 0.14062828799038027, "flos": 23732787974400.0, "grad_norm": 2.117725014058833, "language_loss": 0.79766536, "learning_rate": 3.87298534506069e-06, "loss": 0.81975079, "num_input_tokens_seen": 50666130, "step": 2339, "time_per_iteration": 2.68635892868042 }, { "auxiliary_loss_clip": 0.01097095, "auxiliary_loss_mlp": 0.01065327, "balance_loss_clip": 1.04686952, "balance_loss_mlp": 1.04463232, "epoch": 0.14068841124304826, "flos": 39202493685120.0, "grad_norm": 2.0269377249156793, "language_loss": 0.65632963, "learning_rate": 3.872848730344146e-06, "loss": 0.67795384, "num_input_tokens_seen": 50687440, "step": 2340, "time_per_iteration": 2.9426286220550537 }, { "auxiliary_loss_clip": 0.0114865, "auxiliary_loss_mlp": 0.01050723, "balance_loss_clip": 1.05418086, "balance_loss_mlp": 1.0310297, "epoch": 0.14074853449571623, "flos": 20192283181440.0, "grad_norm": 2.8518792803213917, "language_loss": 0.78760445, "learning_rate": 3.87271204460899e-06, "loss": 0.80959821, "num_input_tokens_seen": 50704030, "step": 2341, "time_per_iteration": 2.8814899921417236 }, { "auxiliary_loss_clip": 0.01162758, "auxiliary_loss_mlp": 0.01057334, "balance_loss_clip": 1.0554986, "balance_loss_mlp": 1.03876162, "epoch": 0.1408086577483842, "flos": 18405871153920.0, "grad_norm": 2.2693198584224454, "language_loss": 0.80322361, "learning_rate": 3.8725752878604066e-06, "loss": 0.82542449, "num_input_tokens_seen": 50723305, "step": 2342, "time_per_iteration": 2.604814291000366 }, { "auxiliary_loss_clip": 0.01152048, "auxiliary_loss_mlp": 0.01056552, "balance_loss_clip": 1.05776191, "balance_loss_mlp": 1.03858757, "epoch": 0.14086878100105216, "flos": 25264593423360.0, "grad_norm": 2.4727499245104343, "language_loss": 0.77686632, "learning_rate": 3.87243846010358e-06, "loss": 0.79895234, "num_input_tokens_seen": 50743270, "step": 2343, "time_per_iteration": 2.676823854446411 }, { "auxiliary_loss_clip": 0.0105659, "auxiliary_loss_mlp": 0.01037584, "balance_loss_clip": 1.03650093, "balance_loss_mlp": 1.03438878, "epoch": 0.14092890425372012, "flos": 65978388869760.0, "grad_norm": 0.8521752699932517, "language_loss": 0.61553669, "learning_rate": 3.872301561343699e-06, "loss": 0.63647842, "num_input_tokens_seen": 50802710, "step": 2344, "time_per_iteration": 3.156792402267456 }, { "auxiliary_loss_clip": 0.01147637, "auxiliary_loss_mlp": 0.01049362, "balance_loss_clip": 1.05167484, "balance_loss_mlp": 1.03121877, "epoch": 0.1409890275063881, "flos": 23694973931520.0, "grad_norm": 1.558783678159347, "language_loss": 0.64331692, "learning_rate": 3.872164591585956e-06, "loss": 0.6652869, "num_input_tokens_seen": 50822625, "step": 2345, "time_per_iteration": 2.654100179672241 }, { "auxiliary_loss_clip": 0.01154879, "auxiliary_loss_mlp": 0.0104633, "balance_loss_clip": 1.05009735, "balance_loss_mlp": 1.02562308, "epoch": 0.14104915075905605, "flos": 23623152687360.0, "grad_norm": 2.26337760563351, "language_loss": 0.73892581, "learning_rate": 3.8720275508355435e-06, "loss": 0.76093793, "num_input_tokens_seen": 50842330, "step": 2346, "time_per_iteration": 2.7032830715179443 }, { "auxiliary_loss_clip": 0.0115447, "auxiliary_loss_mlp": 0.01048793, "balance_loss_clip": 1.0572027, "balance_loss_mlp": 1.02929008, "epoch": 0.14110927401172405, "flos": 20595165102720.0, "grad_norm": 1.7675181118684058, "language_loss": 0.7727294, "learning_rate": 3.8718904390976585e-06, "loss": 0.79476202, "num_input_tokens_seen": 50861035, "step": 2347, "time_per_iteration": 2.678647518157959 }, { "auxiliary_loss_clip": 0.01164131, "auxiliary_loss_mlp": 0.01052088, "balance_loss_clip": 1.05490732, "balance_loss_mlp": 1.03370619, "epoch": 0.141169397264392, "flos": 28548049512960.0, "grad_norm": 2.592464695784388, "language_loss": 0.76753062, "learning_rate": 3.8717532563775e-06, "loss": 0.78969282, "num_input_tokens_seen": 50880105, "step": 2348, "time_per_iteration": 2.7450597286224365 }, { "auxiliary_loss_clip": 0.01147264, "auxiliary_loss_mlp": 0.01042525, "balance_loss_clip": 1.05267334, "balance_loss_mlp": 1.02295136, "epoch": 0.14122952051705998, "flos": 17092258871040.0, "grad_norm": 1.8617784303344698, "language_loss": 0.86794335, "learning_rate": 3.871616002680272e-06, "loss": 0.8898412, "num_input_tokens_seen": 50897720, "step": 2349, "time_per_iteration": 2.662508964538574 }, { "auxiliary_loss_clip": 0.01150971, "auxiliary_loss_mlp": 0.01048616, "balance_loss_clip": 1.05632985, "balance_loss_mlp": 1.02897048, "epoch": 0.14128964376972794, "flos": 28946801370240.0, "grad_norm": 2.650060051711467, "language_loss": 0.88758218, "learning_rate": 3.871478678011177e-06, "loss": 0.90957808, "num_input_tokens_seen": 50918385, "step": 2350, "time_per_iteration": 4.1697962284088135 }, { "auxiliary_loss_clip": 0.01142704, "auxiliary_loss_mlp": 0.01045134, "balance_loss_clip": 1.05369377, "balance_loss_mlp": 1.02442729, "epoch": 0.1413497670223959, "flos": 18989778643200.0, "grad_norm": 1.801090232061166, "language_loss": 0.8094542, "learning_rate": 3.871341282375423e-06, "loss": 0.83133256, "num_input_tokens_seen": 50938270, "step": 2351, "time_per_iteration": 2.6769907474517822 }, { "auxiliary_loss_clip": 0.01149546, "auxiliary_loss_mlp": 0.01040141, "balance_loss_clip": 1.05100775, "balance_loss_mlp": 1.02096045, "epoch": 0.14140989027506387, "flos": 29862236413440.0, "grad_norm": 2.590933181784672, "language_loss": 0.82796198, "learning_rate": 3.871203815778219e-06, "loss": 0.84985888, "num_input_tokens_seen": 50958155, "step": 2352, "time_per_iteration": 5.713203430175781 }, { "auxiliary_loss_clip": 0.01063742, "auxiliary_loss_mlp": 0.01009803, "balance_loss_clip": 1.03462291, "balance_loss_mlp": 1.0060122, "epoch": 0.14147001352773186, "flos": 62079532041600.0, "grad_norm": 0.9118003008214054, "language_loss": 0.61876011, "learning_rate": 3.87106627822478e-06, "loss": 0.63949555, "num_input_tokens_seen": 51020705, "step": 2353, "time_per_iteration": 3.1698319911956787 }, { "auxiliary_loss_clip": 0.01134069, "auxiliary_loss_mlp": 0.01049094, "balance_loss_clip": 1.0536828, "balance_loss_mlp": 1.03039002, "epoch": 0.14153013678039983, "flos": 22017514832640.0, "grad_norm": 1.5909284402791886, "language_loss": 0.87075388, "learning_rate": 3.8709286697203196e-06, "loss": 0.89258552, "num_input_tokens_seen": 51039995, "step": 2354, "time_per_iteration": 2.6781272888183594 }, { "auxiliary_loss_clip": 0.01124592, "auxiliary_loss_mlp": 0.0104583, "balance_loss_clip": 1.0527302, "balance_loss_mlp": 1.02562428, "epoch": 0.1415902600330678, "flos": 19720093968000.0, "grad_norm": 2.035812967878614, "language_loss": 0.74701214, "learning_rate": 3.870790990270057e-06, "loss": 0.76871634, "num_input_tokens_seen": 51059075, "step": 2355, "time_per_iteration": 4.464852571487427 }, { "auxiliary_loss_clip": 0.01062228, "auxiliary_loss_mlp": 0.01003337, "balance_loss_clip": 1.03320074, "balance_loss_mlp": 0.99947417, "epoch": 0.14165038328573576, "flos": 65900929190400.0, "grad_norm": 0.6801443738216844, "language_loss": 0.51819825, "learning_rate": 3.870653239879212e-06, "loss": 0.53885388, "num_input_tokens_seen": 51120380, "step": 2356, "time_per_iteration": 3.094026803970337 }, { "auxiliary_loss_clip": 0.01165635, "auxiliary_loss_mlp": 0.01057535, "balance_loss_clip": 1.05662966, "balance_loss_mlp": 1.0379492, "epoch": 0.14171050653840372, "flos": 12130158533760.0, "grad_norm": 1.9928903491175036, "language_loss": 0.70598352, "learning_rate": 3.8705154185530095e-06, "loss": 0.72821522, "num_input_tokens_seen": 51136950, "step": 2357, "time_per_iteration": 2.569486141204834 }, { "auxiliary_loss_clip": 0.01117022, "auxiliary_loss_mlp": 0.01054948, "balance_loss_clip": 1.04706419, "balance_loss_mlp": 1.0355413, "epoch": 0.1417706297910717, "flos": 20412487509120.0, "grad_norm": 2.1046358800035234, "language_loss": 0.82020235, "learning_rate": 3.870377526296674e-06, "loss": 0.84192204, "num_input_tokens_seen": 51155175, "step": 2358, "time_per_iteration": 2.719344139099121 }, { "auxiliary_loss_clip": 0.01145283, "auxiliary_loss_mlp": 0.01050239, "balance_loss_clip": 1.05257189, "balance_loss_mlp": 1.02932954, "epoch": 0.14183075304373965, "flos": 22380607463040.0, "grad_norm": 2.2336131404929787, "language_loss": 0.71575904, "learning_rate": 3.870239563115436e-06, "loss": 0.73771417, "num_input_tokens_seen": 51174500, "step": 2359, "time_per_iteration": 2.6914820671081543 }, { "auxiliary_loss_clip": 0.0111529, "auxiliary_loss_mlp": 0.007787, "balance_loss_clip": 1.0526464, "balance_loss_mlp": 1.00033379, "epoch": 0.14189087629640765, "flos": 21580913018880.0, "grad_norm": 2.4314273775499906, "language_loss": 0.7541784, "learning_rate": 3.870101529014526e-06, "loss": 0.77311832, "num_input_tokens_seen": 51194270, "step": 2360, "time_per_iteration": 2.803493022918701 }, { "auxiliary_loss_clip": 0.01108644, "auxiliary_loss_mlp": 0.01053684, "balance_loss_clip": 1.0491271, "balance_loss_mlp": 1.03136814, "epoch": 0.1419509995490756, "flos": 20008564093440.0, "grad_norm": 2.374719540518049, "language_loss": 0.81920552, "learning_rate": 3.869963423999178e-06, "loss": 0.84082878, "num_input_tokens_seen": 51211850, "step": 2361, "time_per_iteration": 2.8039920330047607 }, { "auxiliary_loss_clip": 0.0115065, "auxiliary_loss_mlp": 0.01057946, "balance_loss_clip": 1.05230403, "balance_loss_mlp": 1.03802609, "epoch": 0.14201112280174358, "flos": 31941464112000.0, "grad_norm": 1.9397979109407166, "language_loss": 0.74081504, "learning_rate": 3.86982524807463e-06, "loss": 0.76290095, "num_input_tokens_seen": 51233545, "step": 2362, "time_per_iteration": 2.7272114753723145 }, { "auxiliary_loss_clip": 0.0115354, "auxiliary_loss_mlp": 0.01048321, "balance_loss_clip": 1.05355787, "balance_loss_mlp": 1.02861547, "epoch": 0.14207124605441154, "flos": 41464147582080.0, "grad_norm": 1.7489521991344694, "language_loss": 0.74221587, "learning_rate": 3.869687001246122e-06, "loss": 0.76423442, "num_input_tokens_seen": 51257615, "step": 2363, "time_per_iteration": 2.789802312850952 }, { "auxiliary_loss_clip": 0.01128802, "auxiliary_loss_mlp": 0.0105205, "balance_loss_clip": 1.04769099, "balance_loss_mlp": 1.03180885, "epoch": 0.1421313693070795, "flos": 31905086613120.0, "grad_norm": 1.7832713632097879, "language_loss": 0.73034167, "learning_rate": 3.8695486835188946e-06, "loss": 0.75215018, "num_input_tokens_seen": 51279645, "step": 2364, "time_per_iteration": 2.8508312702178955 }, { "auxiliary_loss_clip": 0.01142769, "auxiliary_loss_mlp": 0.01049829, "balance_loss_clip": 1.05160844, "balance_loss_mlp": 1.03207827, "epoch": 0.14219149255974747, "flos": 26871165031680.0, "grad_norm": 1.875477198706701, "language_loss": 0.90395916, "learning_rate": 3.869410294898195e-06, "loss": 0.92588514, "num_input_tokens_seen": 51299775, "step": 2365, "time_per_iteration": 2.6807806491851807 }, { "auxiliary_loss_clip": 0.01127252, "auxiliary_loss_mlp": 0.01054912, "balance_loss_clip": 1.04759967, "balance_loss_mlp": 1.03394318, "epoch": 0.14225161581241544, "flos": 27454426076160.0, "grad_norm": 1.719218863067841, "language_loss": 0.65305161, "learning_rate": 3.869271835389268e-06, "loss": 0.67487329, "num_input_tokens_seen": 51319430, "step": 2366, "time_per_iteration": 2.7293641567230225 }, { "auxiliary_loss_clip": 0.01143576, "auxiliary_loss_mlp": 0.01051629, "balance_loss_clip": 1.05218709, "balance_loss_mlp": 1.03058839, "epoch": 0.14231173906508343, "flos": 10561436881920.0, "grad_norm": 2.3740196514966256, "language_loss": 0.80331928, "learning_rate": 3.8691333049973665e-06, "loss": 0.82527137, "num_input_tokens_seen": 51336045, "step": 2367, "time_per_iteration": 2.67529296875 }, { "auxiliary_loss_clip": 0.01138517, "auxiliary_loss_mlp": 0.01062653, "balance_loss_clip": 1.05117869, "balance_loss_mlp": 1.0402534, "epoch": 0.1423718623177514, "flos": 28360882719360.0, "grad_norm": 2.0081973718426283, "language_loss": 0.82346755, "learning_rate": 3.868994703727742e-06, "loss": 0.84547925, "num_input_tokens_seen": 51357030, "step": 2368, "time_per_iteration": 2.7447288036346436 }, { "auxiliary_loss_clip": 0.01122755, "auxiliary_loss_mlp": 0.01052229, "balance_loss_clip": 1.05180073, "balance_loss_mlp": 1.03065228, "epoch": 0.14243198557041936, "flos": 19354235990400.0, "grad_norm": 2.6586279461428144, "language_loss": 0.8711772, "learning_rate": 3.868856031585652e-06, "loss": 0.89292705, "num_input_tokens_seen": 51374890, "step": 2369, "time_per_iteration": 2.736872673034668 }, { "auxiliary_loss_clip": 0.01127301, "auxiliary_loss_mlp": 0.0104182, "balance_loss_clip": 1.05011857, "balance_loss_mlp": 1.02170992, "epoch": 0.14249210882308733, "flos": 28806857982720.0, "grad_norm": 1.7900856007188275, "language_loss": 0.75828248, "learning_rate": 3.868717288576354e-06, "loss": 0.77997375, "num_input_tokens_seen": 51398100, "step": 2370, "time_per_iteration": 2.762603998184204 }, { "auxiliary_loss_clip": 0.01158195, "auxiliary_loss_mlp": 0.00781098, "balance_loss_clip": 1.05268764, "balance_loss_mlp": 1.00028419, "epoch": 0.1425522320757553, "flos": 21835016807040.0, "grad_norm": 1.7770434161065212, "language_loss": 0.82934797, "learning_rate": 3.868578474705109e-06, "loss": 0.84874088, "num_input_tokens_seen": 51418745, "step": 2371, "time_per_iteration": 2.6224656105041504 }, { "auxiliary_loss_clip": 0.01173447, "auxiliary_loss_mlp": 0.0105718, "balance_loss_clip": 1.05837953, "balance_loss_mlp": 1.03638947, "epoch": 0.14261235532842326, "flos": 17311457617920.0, "grad_norm": 2.0431625041319825, "language_loss": 0.82982123, "learning_rate": 3.868439589977181e-06, "loss": 0.85212755, "num_input_tokens_seen": 51437455, "step": 2372, "time_per_iteration": 2.575690269470215 }, { "auxiliary_loss_clip": 0.01172196, "auxiliary_loss_mlp": 0.0105022, "balance_loss_clip": 1.0581125, "balance_loss_mlp": 1.0285356, "epoch": 0.14267247858109125, "flos": 18806741913600.0, "grad_norm": 3.3704326167450582, "language_loss": 0.8438468, "learning_rate": 3.868300634397836e-06, "loss": 0.86607099, "num_input_tokens_seen": 51455710, "step": 2373, "time_per_iteration": 2.7160356044769287 }, { "auxiliary_loss_clip": 0.01141742, "auxiliary_loss_mlp": 0.01055295, "balance_loss_clip": 1.05160809, "balance_loss_mlp": 1.03598261, "epoch": 0.14273260183375922, "flos": 11358904682880.0, "grad_norm": 3.5035356392631836, "language_loss": 0.86027539, "learning_rate": 3.8681616079723445e-06, "loss": 0.88224572, "num_input_tokens_seen": 51471270, "step": 2374, "time_per_iteration": 2.6845595836639404 }, { "auxiliary_loss_clip": 0.01164623, "auxiliary_loss_mlp": 0.01061957, "balance_loss_clip": 1.05515146, "balance_loss_mlp": 1.03996301, "epoch": 0.14279272508642718, "flos": 27567688636800.0, "grad_norm": 1.6059368749673757, "language_loss": 0.79169822, "learning_rate": 3.868022510705977e-06, "loss": 0.81396401, "num_input_tokens_seen": 51492705, "step": 2375, "time_per_iteration": 2.738156795501709 }, { "auxiliary_loss_clip": 0.01163115, "auxiliary_loss_mlp": 0.01058224, "balance_loss_clip": 1.05641222, "balance_loss_mlp": 1.0368259, "epoch": 0.14285284833909515, "flos": 16252559654400.0, "grad_norm": 2.559097553272684, "language_loss": 0.76907504, "learning_rate": 3.867883342604009e-06, "loss": 0.79128844, "num_input_tokens_seen": 51510780, "step": 2376, "time_per_iteration": 2.751178741455078 }, { "auxiliary_loss_clip": 0.01160115, "auxiliary_loss_mlp": 0.0105168, "balance_loss_clip": 1.054515, "balance_loss_mlp": 1.03040111, "epoch": 0.1429129715917631, "flos": 19755609540480.0, "grad_norm": 2.7331999261828592, "language_loss": 0.92795181, "learning_rate": 3.867744103671717e-06, "loss": 0.95006979, "num_input_tokens_seen": 51531400, "step": 2377, "time_per_iteration": 2.6584725379943848 }, { "auxiliary_loss_clip": 0.01147246, "auxiliary_loss_mlp": 0.01061419, "balance_loss_clip": 1.05362535, "balance_loss_mlp": 1.03793442, "epoch": 0.14297309484443108, "flos": 21137092571520.0, "grad_norm": 2.9252003733204894, "language_loss": 0.91754365, "learning_rate": 3.867604793914382e-06, "loss": 0.93963027, "num_input_tokens_seen": 51548215, "step": 2378, "time_per_iteration": 2.8107075691223145 }, { "auxiliary_loss_clip": 0.01164153, "auxiliary_loss_mlp": 0.0105303, "balance_loss_clip": 1.05712187, "balance_loss_mlp": 1.03092849, "epoch": 0.14303321809709904, "flos": 23586667447680.0, "grad_norm": 2.1292902842232966, "language_loss": 0.73961306, "learning_rate": 3.8674654133372864e-06, "loss": 0.76178491, "num_input_tokens_seen": 51566820, "step": 2379, "time_per_iteration": 2.7029881477355957 }, { "auxiliary_loss_clip": 0.01137551, "auxiliary_loss_mlp": 0.01055012, "balance_loss_clip": 1.05204058, "balance_loss_mlp": 1.0330174, "epoch": 0.14309334134976703, "flos": 15888281875200.0, "grad_norm": 2.1898245228218784, "language_loss": 0.78818595, "learning_rate": 3.867325961945714e-06, "loss": 0.81011152, "num_input_tokens_seen": 51585075, "step": 2380, "time_per_iteration": 2.7213294506073 }, { "auxiliary_loss_clip": 0.01126442, "auxiliary_loss_mlp": 0.01057409, "balance_loss_clip": 1.05457354, "balance_loss_mlp": 1.03580785, "epoch": 0.143153464602435, "flos": 16325601960960.0, "grad_norm": 4.699041640805274, "language_loss": 0.87895483, "learning_rate": 3.867186439744955e-06, "loss": 0.90079331, "num_input_tokens_seen": 51603185, "step": 2381, "time_per_iteration": 2.7144110202789307 }, { "auxiliary_loss_clip": 0.01141327, "auxiliary_loss_mlp": 0.01052708, "balance_loss_clip": 1.05200005, "balance_loss_mlp": 1.03088117, "epoch": 0.14321358785510296, "flos": 17092079303040.0, "grad_norm": 2.47508592106904, "language_loss": 0.76396096, "learning_rate": 3.867046846740299e-06, "loss": 0.78590137, "num_input_tokens_seen": 51620880, "step": 2382, "time_per_iteration": 2.6185953617095947 }, { "auxiliary_loss_clip": 0.01132222, "auxiliary_loss_mlp": 0.01054019, "balance_loss_clip": 1.05162048, "balance_loss_mlp": 1.03319359, "epoch": 0.14327371110777093, "flos": 26322916769280.0, "grad_norm": 4.3017095308344375, "language_loss": 0.76636785, "learning_rate": 3.866907182937039e-06, "loss": 0.7882303, "num_input_tokens_seen": 51640170, "step": 2383, "time_per_iteration": 2.7408525943756104 }, { "auxiliary_loss_clip": 0.01139698, "auxiliary_loss_mlp": 0.01052888, "balance_loss_clip": 1.05078864, "balance_loss_mlp": 1.02926064, "epoch": 0.1433338343604389, "flos": 18076462502400.0, "grad_norm": 2.3526544982502284, "language_loss": 0.87649417, "learning_rate": 3.866767448340471e-06, "loss": 0.8984201, "num_input_tokens_seen": 51656580, "step": 2384, "time_per_iteration": 2.6798789501190186 }, { "auxiliary_loss_clip": 0.01164805, "auxiliary_loss_mlp": 0.01053206, "balance_loss_clip": 1.05644679, "balance_loss_mlp": 1.02985239, "epoch": 0.14339395761310686, "flos": 15522783033600.0, "grad_norm": 2.6134761315069284, "language_loss": 0.79340684, "learning_rate": 3.866627642955895e-06, "loss": 0.81558692, "num_input_tokens_seen": 51674645, "step": 2385, "time_per_iteration": 2.5856544971466064 }, { "auxiliary_loss_clip": 0.01156607, "auxiliary_loss_mlp": 0.01042784, "balance_loss_clip": 1.05148256, "balance_loss_mlp": 1.02182722, "epoch": 0.14345408086577485, "flos": 28548767784960.0, "grad_norm": 2.6990187663653247, "language_loss": 0.74960196, "learning_rate": 3.866487766788612e-06, "loss": 0.77159584, "num_input_tokens_seen": 51695770, "step": 2386, "time_per_iteration": 2.6670751571655273 }, { "auxiliary_loss_clip": 0.01171639, "auxiliary_loss_mlp": 0.01048096, "balance_loss_clip": 1.05699563, "balance_loss_mlp": 1.02733016, "epoch": 0.14351420411844282, "flos": 20230061310720.0, "grad_norm": 2.299870083842227, "language_loss": 0.78659731, "learning_rate": 3.866347819843925e-06, "loss": 0.80879462, "num_input_tokens_seen": 51714165, "step": 2387, "time_per_iteration": 2.5805532932281494 }, { "auxiliary_loss_clip": 0.01140581, "auxiliary_loss_mlp": 0.01055299, "balance_loss_clip": 1.05355716, "balance_loss_mlp": 1.03317428, "epoch": 0.14357432737111078, "flos": 19865029345920.0, "grad_norm": 6.554164509194222, "language_loss": 0.82492924, "learning_rate": 3.866207802127143e-06, "loss": 0.84688807, "num_input_tokens_seen": 51734440, "step": 2388, "time_per_iteration": 2.656609058380127 }, { "auxiliary_loss_clip": 0.01155007, "auxiliary_loss_mlp": 0.01047154, "balance_loss_clip": 1.0537287, "balance_loss_mlp": 1.02674508, "epoch": 0.14363445062377875, "flos": 28256814040320.0, "grad_norm": 2.5973624291758655, "language_loss": 0.82025754, "learning_rate": 3.866067713643573e-06, "loss": 0.84227914, "num_input_tokens_seen": 51753730, "step": 2389, "time_per_iteration": 4.21793794631958 }, { "auxiliary_loss_clip": 0.01145665, "auxiliary_loss_mlp": 0.01046852, "balance_loss_clip": 1.05107975, "balance_loss_mlp": 1.02513266, "epoch": 0.1436945738764467, "flos": 18186672407040.0, "grad_norm": 3.7970835440683097, "language_loss": 0.83056784, "learning_rate": 3.8659275543985285e-06, "loss": 0.85249299, "num_input_tokens_seen": 51771195, "step": 2390, "time_per_iteration": 2.6859514713287354 }, { "auxiliary_loss_clip": 0.01152608, "auxiliary_loss_mlp": 0.01054404, "balance_loss_clip": 1.05400729, "balance_loss_mlp": 1.0334475, "epoch": 0.14375469712911468, "flos": 27307910499840.0, "grad_norm": 1.8176612067028404, "language_loss": 0.75018179, "learning_rate": 3.865787324397324e-06, "loss": 0.77225184, "num_input_tokens_seen": 51792290, "step": 2391, "time_per_iteration": 5.726900577545166 }, { "auxiliary_loss_clip": 0.01045505, "auxiliary_loss_mlp": 0.01033342, "balance_loss_clip": 1.03226101, "balance_loss_mlp": 1.0303973, "epoch": 0.14381482038178264, "flos": 56891445287040.0, "grad_norm": 0.8787809928903102, "language_loss": 0.61848003, "learning_rate": 3.865647023645277e-06, "loss": 0.63926852, "num_input_tokens_seen": 51843675, "step": 2392, "time_per_iteration": 3.113558053970337 }, { "auxiliary_loss_clip": 0.01158698, "auxiliary_loss_mlp": 0.01058807, "balance_loss_clip": 1.05467868, "balance_loss_mlp": 1.03608608, "epoch": 0.14387494363445064, "flos": 14282177143680.0, "grad_norm": 2.718376715006273, "language_loss": 0.77346605, "learning_rate": 3.865506652147709e-06, "loss": 0.79564106, "num_input_tokens_seen": 51860285, "step": 2393, "time_per_iteration": 2.6578521728515625 }, { "auxiliary_loss_clip": 0.0116951, "auxiliary_loss_mlp": 0.01052986, "balance_loss_clip": 1.05671048, "balance_loss_mlp": 1.03287578, "epoch": 0.1439350668871186, "flos": 26761493831040.0, "grad_norm": 5.715284956255472, "language_loss": 0.76301813, "learning_rate": 3.865366209909941e-06, "loss": 0.78524309, "num_input_tokens_seen": 51880105, "step": 2394, "time_per_iteration": 4.345217943191528 }, { "auxiliary_loss_clip": 0.01165266, "auxiliary_loss_mlp": 0.01053501, "balance_loss_clip": 1.05325842, "balance_loss_mlp": 1.03365326, "epoch": 0.14399519013978657, "flos": 40700040537600.0, "grad_norm": 2.2496244390836893, "language_loss": 0.85859704, "learning_rate": 3.8652256969372994e-06, "loss": 0.88078463, "num_input_tokens_seen": 51905175, "step": 2395, "time_per_iteration": 2.739717483520508 }, { "auxiliary_loss_clip": 0.0112523, "auxiliary_loss_mlp": 0.01051092, "balance_loss_clip": 1.04946184, "balance_loss_mlp": 1.028669, "epoch": 0.14405531339245453, "flos": 20557530627840.0, "grad_norm": 4.117082508421602, "language_loss": 0.82894099, "learning_rate": 3.865085113235113e-06, "loss": 0.85070425, "num_input_tokens_seen": 51924490, "step": 2396, "time_per_iteration": 2.686732053756714 }, { "auxiliary_loss_clip": 0.01126754, "auxiliary_loss_mlp": 0.00779833, "balance_loss_clip": 1.04752374, "balance_loss_mlp": 1.00036597, "epoch": 0.1441154366451225, "flos": 19572931946880.0, "grad_norm": 6.956399779275871, "language_loss": 0.82801461, "learning_rate": 3.864944458808712e-06, "loss": 0.84708053, "num_input_tokens_seen": 51940490, "step": 2397, "time_per_iteration": 2.742809534072876 }, { "auxiliary_loss_clip": 0.01168871, "auxiliary_loss_mlp": 0.0104994, "balance_loss_clip": 1.05485702, "balance_loss_mlp": 1.02892387, "epoch": 0.14417555989779046, "flos": 18515721922560.0, "grad_norm": 8.355198005975433, "language_loss": 0.8001197, "learning_rate": 3.86480373366343e-06, "loss": 0.82230783, "num_input_tokens_seen": 51957910, "step": 2398, "time_per_iteration": 2.573267936706543 }, { "auxiliary_loss_clip": 0.01152449, "auxiliary_loss_mlp": 0.01053407, "balance_loss_clip": 1.05287588, "balance_loss_mlp": 1.03336823, "epoch": 0.14423568315045843, "flos": 26031681296640.0, "grad_norm": 3.294581575970509, "language_loss": 0.64690518, "learning_rate": 3.864662937804603e-06, "loss": 0.66896379, "num_input_tokens_seen": 51978010, "step": 2399, "time_per_iteration": 2.6831774711608887 }, { "auxiliary_loss_clip": 0.01134916, "auxiliary_loss_mlp": 0.01052493, "balance_loss_clip": 1.04998159, "balance_loss_mlp": 1.03119016, "epoch": 0.14429580640312642, "flos": 21288743792640.0, "grad_norm": 3.586256880371596, "language_loss": 0.82207137, "learning_rate": 3.864522071237571e-06, "loss": 0.84394544, "num_input_tokens_seen": 51998515, "step": 2400, "time_per_iteration": 2.6812663078308105 }, { "auxiliary_loss_clip": 0.01149983, "auxiliary_loss_mlp": 0.01051884, "balance_loss_clip": 1.0567503, "balance_loss_mlp": 1.02954376, "epoch": 0.14435592965579438, "flos": 25627865621760.0, "grad_norm": 2.3908005596579165, "language_loss": 0.74217784, "learning_rate": 3.864381133967676e-06, "loss": 0.76419652, "num_input_tokens_seen": 52019270, "step": 2401, "time_per_iteration": 2.773838520050049 }, { "auxiliary_loss_clip": 0.01137207, "auxiliary_loss_mlp": 0.01047592, "balance_loss_clip": 1.05065656, "balance_loss_mlp": 1.02671885, "epoch": 0.14441605290846235, "flos": 22965053656320.0, "grad_norm": 2.616063077702737, "language_loss": 0.80771816, "learning_rate": 3.86424012600026e-06, "loss": 0.82956612, "num_input_tokens_seen": 52039315, "step": 2402, "time_per_iteration": 2.786031723022461 }, { "auxiliary_loss_clip": 0.01120897, "auxiliary_loss_mlp": 0.01052115, "balance_loss_clip": 1.04718328, "balance_loss_mlp": 1.02988231, "epoch": 0.14447617616113032, "flos": 17347655548800.0, "grad_norm": 2.397935571801219, "language_loss": 0.84159613, "learning_rate": 3.864099047340673e-06, "loss": 0.86332625, "num_input_tokens_seen": 52056555, "step": 2403, "time_per_iteration": 2.8113911151885986 }, { "auxiliary_loss_clip": 0.01129082, "auxiliary_loss_mlp": 0.00783127, "balance_loss_clip": 1.04854488, "balance_loss_mlp": 1.00030184, "epoch": 0.14453629941379828, "flos": 24060185464320.0, "grad_norm": 2.224282169770823, "language_loss": 0.70142806, "learning_rate": 3.863957897994262e-06, "loss": 0.72055018, "num_input_tokens_seen": 52075800, "step": 2404, "time_per_iteration": 2.7748003005981445 }, { "auxiliary_loss_clip": 0.01144289, "auxiliary_loss_mlp": 0.01051404, "balance_loss_clip": 1.05279732, "balance_loss_mlp": 1.03099549, "epoch": 0.14459642266646625, "flos": 14429554646400.0, "grad_norm": 2.429117427076043, "language_loss": 0.73179376, "learning_rate": 3.863816677966381e-06, "loss": 0.75375068, "num_input_tokens_seen": 52092585, "step": 2405, "time_per_iteration": 2.7927868366241455 }, { "auxiliary_loss_clip": 0.01108387, "auxiliary_loss_mlp": 0.01054584, "balance_loss_clip": 1.04661417, "balance_loss_mlp": 1.0326612, "epoch": 0.14465654591913424, "flos": 9867032179200.0, "grad_norm": 7.089523066959408, "language_loss": 0.73039794, "learning_rate": 3.863675387262386e-06, "loss": 0.75202763, "num_input_tokens_seen": 52108990, "step": 2406, "time_per_iteration": 2.742253303527832 }, { "auxiliary_loss_clip": 0.01157268, "auxiliary_loss_mlp": 0.01054465, "balance_loss_clip": 1.05420268, "balance_loss_mlp": 1.03198171, "epoch": 0.1447166691718022, "flos": 24972926987520.0, "grad_norm": 5.383630788916188, "language_loss": 0.75570732, "learning_rate": 3.8635340258876325e-06, "loss": 0.77782464, "num_input_tokens_seen": 52125385, "step": 2407, "time_per_iteration": 2.654636859893799 }, { "auxiliary_loss_clip": 0.0116674, "auxiliary_loss_mlp": 0.01054642, "balance_loss_clip": 1.05440819, "balance_loss_mlp": 1.03392315, "epoch": 0.14477679242447017, "flos": 21908023200000.0, "grad_norm": 2.0240540465866146, "language_loss": 0.79426706, "learning_rate": 3.8633925938474826e-06, "loss": 0.81648088, "num_input_tokens_seen": 52144985, "step": 2408, "time_per_iteration": 2.663611650466919 }, { "auxiliary_loss_clip": 0.01155332, "auxiliary_loss_mlp": 0.01053557, "balance_loss_clip": 1.05411625, "balance_loss_mlp": 1.03107429, "epoch": 0.14483691567713813, "flos": 20740746925440.0, "grad_norm": 2.249858190268702, "language_loss": 0.82188261, "learning_rate": 3.863251091147299e-06, "loss": 0.84397143, "num_input_tokens_seen": 52163885, "step": 2409, "time_per_iteration": 2.6218342781066895 }, { "auxiliary_loss_clip": 0.01116852, "auxiliary_loss_mlp": 0.01065498, "balance_loss_clip": 1.04859877, "balance_loss_mlp": 1.04340839, "epoch": 0.1448970389298061, "flos": 35407705536000.0, "grad_norm": 3.918408886138166, "language_loss": 0.74477464, "learning_rate": 3.863109517792446e-06, "loss": 0.76659817, "num_input_tokens_seen": 52184325, "step": 2410, "time_per_iteration": 2.8525002002716064 }, { "auxiliary_loss_clip": 0.01166422, "auxiliary_loss_mlp": 0.0105028, "balance_loss_clip": 1.05447876, "balance_loss_mlp": 1.0300622, "epoch": 0.14495716218247406, "flos": 15414368808960.0, "grad_norm": 2.976325973684052, "language_loss": 0.81616414, "learning_rate": 3.8629678737882945e-06, "loss": 0.8383311, "num_input_tokens_seen": 52202740, "step": 2411, "time_per_iteration": 2.580059051513672 }, { "auxiliary_loss_clip": 0.01143671, "auxiliary_loss_mlp": 0.01055066, "balance_loss_clip": 1.05553794, "balance_loss_mlp": 1.03366852, "epoch": 0.14501728543514203, "flos": 33693222493440.0, "grad_norm": 2.049708152728223, "language_loss": 0.69947547, "learning_rate": 3.862826159140214e-06, "loss": 0.72146285, "num_input_tokens_seen": 52223100, "step": 2412, "time_per_iteration": 2.792389392852783 }, { "auxiliary_loss_clip": 0.01153861, "auxiliary_loss_mlp": 0.01047504, "balance_loss_clip": 1.05600309, "balance_loss_mlp": 1.02669024, "epoch": 0.14507740868781002, "flos": 15596112648960.0, "grad_norm": 1.9741671649406984, "language_loss": 0.76655865, "learning_rate": 3.862684373853579e-06, "loss": 0.78857231, "num_input_tokens_seen": 52239690, "step": 2413, "time_per_iteration": 2.6535370349884033 }, { "auxiliary_loss_clip": 0.01072879, "auxiliary_loss_mlp": 0.01028499, "balance_loss_clip": 1.04041791, "balance_loss_mlp": 1.0252564, "epoch": 0.145137531940478, "flos": 66675343438080.0, "grad_norm": 0.9047547971056389, "language_loss": 0.58883119, "learning_rate": 3.8625425179337656e-06, "loss": 0.60984492, "num_input_tokens_seen": 52296705, "step": 2414, "time_per_iteration": 3.1230342388153076 }, { "auxiliary_loss_clip": 0.01059489, "auxiliary_loss_mlp": 0.01009718, "balance_loss_clip": 1.03874373, "balance_loss_mlp": 1.00692892, "epoch": 0.14519765519314595, "flos": 67521578929920.0, "grad_norm": 0.8422279258983576, "language_loss": 0.62171185, "learning_rate": 3.862400591386154e-06, "loss": 0.64240396, "num_input_tokens_seen": 52361830, "step": 2415, "time_per_iteration": 3.1932270526885986 }, { "auxiliary_loss_clip": 0.01151643, "auxiliary_loss_mlp": 0.01046675, "balance_loss_clip": 1.05383611, "balance_loss_mlp": 1.02500319, "epoch": 0.14525777844581392, "flos": 17198913329280.0, "grad_norm": 2.2913061581681036, "language_loss": 0.71468806, "learning_rate": 3.8622585942161245e-06, "loss": 0.73667121, "num_input_tokens_seen": 52379420, "step": 2416, "time_per_iteration": 2.5892374515533447 }, { "auxiliary_loss_clip": 0.01050816, "auxiliary_loss_mlp": 0.010049, "balance_loss_clip": 1.03675056, "balance_loss_mlp": 1.00211036, "epoch": 0.14531790169848188, "flos": 65404609015680.0, "grad_norm": 0.7147623603004897, "language_loss": 0.6037569, "learning_rate": 3.8621165264290635e-06, "loss": 0.62431407, "num_input_tokens_seen": 52446290, "step": 2417, "time_per_iteration": 3.3065359592437744 }, { "auxiliary_loss_clip": 0.01168766, "auxiliary_loss_mlp": 0.01053548, "balance_loss_clip": 1.05357766, "balance_loss_mlp": 1.03275824, "epoch": 0.14537802495114985, "flos": 32562467372160.0, "grad_norm": 3.7032433533234346, "language_loss": 0.78014368, "learning_rate": 3.861974388030356e-06, "loss": 0.80236679, "num_input_tokens_seen": 52467295, "step": 2418, "time_per_iteration": 2.887986183166504 }, { "auxiliary_loss_clip": 0.01114137, "auxiliary_loss_mlp": 0.01049779, "balance_loss_clip": 1.04354823, "balance_loss_mlp": 1.02911985, "epoch": 0.1454381482038178, "flos": 20226685432320.0, "grad_norm": 2.096300480609688, "language_loss": 0.71208847, "learning_rate": 3.861832179025394e-06, "loss": 0.73372757, "num_input_tokens_seen": 52487295, "step": 2419, "time_per_iteration": 2.764268636703491 }, { "auxiliary_loss_clip": 0.01142427, "auxiliary_loss_mlp": 0.01054976, "balance_loss_clip": 1.05351484, "balance_loss_mlp": 1.03300607, "epoch": 0.1454982714564858, "flos": 22893124671360.0, "grad_norm": 2.414673655978061, "language_loss": 0.89847761, "learning_rate": 3.861689899419569e-06, "loss": 0.92045164, "num_input_tokens_seen": 52504220, "step": 2420, "time_per_iteration": 2.7500016689300537 }, { "auxiliary_loss_clip": 0.01155004, "auxiliary_loss_mlp": 0.01060929, "balance_loss_clip": 1.05202007, "balance_loss_mlp": 1.04072309, "epoch": 0.14555839470915377, "flos": 20229845829120.0, "grad_norm": 2.0953123539002383, "language_loss": 0.82278717, "learning_rate": 3.861547549218276e-06, "loss": 0.8449465, "num_input_tokens_seen": 52521900, "step": 2421, "time_per_iteration": 2.672722816467285 }, { "auxiliary_loss_clip": 0.01099277, "auxiliary_loss_mlp": 0.01056793, "balance_loss_clip": 1.04282439, "balance_loss_mlp": 1.03507352, "epoch": 0.14561851796182174, "flos": 22236282616320.0, "grad_norm": 1.667429152986229, "language_loss": 0.81741488, "learning_rate": 3.861405128426914e-06, "loss": 0.83897555, "num_input_tokens_seen": 52540495, "step": 2422, "time_per_iteration": 2.739992141723633 }, { "auxiliary_loss_clip": 0.01031842, "auxiliary_loss_mlp": 0.00760413, "balance_loss_clip": 1.0271318, "balance_loss_mlp": 1.00019872, "epoch": 0.1456786412144897, "flos": 52636786289280.0, "grad_norm": 0.9102961670465963, "language_loss": 0.63342595, "learning_rate": 3.861262637050883e-06, "loss": 0.65134847, "num_input_tokens_seen": 52603305, "step": 2423, "time_per_iteration": 3.2704036235809326 }, { "auxiliary_loss_clip": 0.01112855, "auxiliary_loss_mlp": 0.00780065, "balance_loss_clip": 1.05457556, "balance_loss_mlp": 1.00038898, "epoch": 0.14573876446715767, "flos": 23221671396480.0, "grad_norm": 2.2239460229896206, "language_loss": 0.82163274, "learning_rate": 3.861120075095585e-06, "loss": 0.84056193, "num_input_tokens_seen": 52623435, "step": 2424, "time_per_iteration": 2.7993249893188477 }, { "auxiliary_loss_clip": 0.01141208, "auxiliary_loss_mlp": 0.01069468, "balance_loss_clip": 1.0535512, "balance_loss_mlp": 1.0496788, "epoch": 0.14579888771982563, "flos": 18114384286080.0, "grad_norm": 2.769336045727131, "language_loss": 0.78602695, "learning_rate": 3.860977442566429e-06, "loss": 0.80813372, "num_input_tokens_seen": 52642255, "step": 2425, "time_per_iteration": 2.698594093322754 }, { "auxiliary_loss_clip": 0.01156078, "auxiliary_loss_mlp": 0.01062133, "balance_loss_clip": 1.05603778, "balance_loss_mlp": 1.04148602, "epoch": 0.14585901097249362, "flos": 23001107932800.0, "grad_norm": 50.77412231982301, "language_loss": 0.83184898, "learning_rate": 3.860834739468821e-06, "loss": 0.85403109, "num_input_tokens_seen": 52658700, "step": 2426, "time_per_iteration": 2.6948676109313965 }, { "auxiliary_loss_clip": 0.01166642, "auxiliary_loss_mlp": 0.01060596, "balance_loss_clip": 1.05706, "balance_loss_mlp": 1.04040194, "epoch": 0.1459191342251616, "flos": 21908669644800.0, "grad_norm": 3.7420612082917475, "language_loss": 0.87215799, "learning_rate": 3.860691965808173e-06, "loss": 0.8944304, "num_input_tokens_seen": 52678140, "step": 2427, "time_per_iteration": 2.6479666233062744 }, { "auxiliary_loss_clip": 0.01128634, "auxiliary_loss_mlp": 0.01064346, "balance_loss_clip": 1.04835391, "balance_loss_mlp": 1.0405997, "epoch": 0.14597925747782955, "flos": 14975504438400.0, "grad_norm": 1.9221483903926033, "language_loss": 0.66815829, "learning_rate": 3.8605491215899e-06, "loss": 0.69008809, "num_input_tokens_seen": 52696825, "step": 2428, "time_per_iteration": 2.6971306800842285 }, { "auxiliary_loss_clip": 0.01155557, "auxiliary_loss_mlp": 0.01059343, "balance_loss_clip": 1.05335426, "balance_loss_mlp": 1.03842235, "epoch": 0.14603938073049752, "flos": 21068898600960.0, "grad_norm": 2.0918238083564242, "language_loss": 0.83231717, "learning_rate": 3.860406206819417e-06, "loss": 0.8544662, "num_input_tokens_seen": 52715125, "step": 2429, "time_per_iteration": 4.283279895782471 }, { "auxiliary_loss_clip": 0.01120809, "auxiliary_loss_mlp": 0.01053505, "balance_loss_clip": 1.04625869, "balance_loss_mlp": 1.03446746, "epoch": 0.14609950398316549, "flos": 19864777950720.0, "grad_norm": 2.4559042296603746, "language_loss": 0.79087842, "learning_rate": 3.860263221502145e-06, "loss": 0.81262159, "num_input_tokens_seen": 52734015, "step": 2430, "time_per_iteration": 4.197890758514404 }, { "auxiliary_loss_clip": 0.01170782, "auxiliary_loss_mlp": 0.01061965, "balance_loss_clip": 1.05820751, "balance_loss_mlp": 1.04179525, "epoch": 0.14615962723583345, "flos": 22418852469120.0, "grad_norm": 2.4376691278662506, "language_loss": 0.82910693, "learning_rate": 3.860120165643504e-06, "loss": 0.85143435, "num_input_tokens_seen": 52753025, "step": 2431, "time_per_iteration": 4.162708282470703 }, { "auxiliary_loss_clip": 0.011607, "auxiliary_loss_mlp": 0.01060112, "balance_loss_clip": 1.05553937, "balance_loss_mlp": 1.03853524, "epoch": 0.14621975048850142, "flos": 22346241125760.0, "grad_norm": 2.881661839068268, "language_loss": 0.78330141, "learning_rate": 3.859977039248921e-06, "loss": 0.80550951, "num_input_tokens_seen": 52773420, "step": 2432, "time_per_iteration": 2.6907777786254883 }, { "auxiliary_loss_clip": 0.01165399, "auxiliary_loss_mlp": 0.00782861, "balance_loss_clip": 1.05517077, "balance_loss_mlp": 1.00040507, "epoch": 0.1462798737411694, "flos": 24389163152640.0, "grad_norm": 2.3488382544651887, "language_loss": 0.79515982, "learning_rate": 3.859833842323822e-06, "loss": 0.81464243, "num_input_tokens_seen": 52792870, "step": 2433, "time_per_iteration": 2.719841241836548 }, { "auxiliary_loss_clip": 0.01124303, "auxiliary_loss_mlp": 0.01055776, "balance_loss_clip": 1.05385411, "balance_loss_mlp": 1.03484273, "epoch": 0.14633999699383737, "flos": 19244672530560.0, "grad_norm": 2.0782880949269926, "language_loss": 0.77905983, "learning_rate": 3.859690574873638e-06, "loss": 0.80086064, "num_input_tokens_seen": 52811615, "step": 2434, "time_per_iteration": 4.371506929397583 }, { "auxiliary_loss_clip": 0.01066282, "auxiliary_loss_mlp": 0.01033141, "balance_loss_clip": 1.05327988, "balance_loss_mlp": 1.03022039, "epoch": 0.14640012024650534, "flos": 62660638270080.0, "grad_norm": 0.8566726319617045, "language_loss": 0.58453119, "learning_rate": 3.8595472369038e-06, "loss": 0.60552537, "num_input_tokens_seen": 52873230, "step": 2435, "time_per_iteration": 3.229882001876831 }, { "auxiliary_loss_clip": 0.01160087, "auxiliary_loss_mlp": 0.01045043, "balance_loss_clip": 1.05263698, "balance_loss_mlp": 1.0257076, "epoch": 0.1464602434991733, "flos": 12276243146880.0, "grad_norm": 3.775553645712452, "language_loss": 0.88436592, "learning_rate": 3.859403828419744e-06, "loss": 0.90641725, "num_input_tokens_seen": 52889325, "step": 2436, "time_per_iteration": 2.568624973297119 }, { "auxiliary_loss_clip": 0.011561, "auxiliary_loss_mlp": 0.00780257, "balance_loss_clip": 1.05587268, "balance_loss_mlp": 1.00041819, "epoch": 0.14652036675184127, "flos": 20922311197440.0, "grad_norm": 2.028718201913856, "language_loss": 0.74904168, "learning_rate": 3.85926034942691e-06, "loss": 0.7684052, "num_input_tokens_seen": 52909705, "step": 2437, "time_per_iteration": 2.6361188888549805 }, { "auxiliary_loss_clip": 0.01165187, "auxiliary_loss_mlp": 0.01050068, "balance_loss_clip": 1.05295086, "balance_loss_mlp": 1.02729869, "epoch": 0.14658049000450923, "flos": 27703681528320.0, "grad_norm": 3.0822234004311033, "language_loss": 0.73914421, "learning_rate": 3.859116799930736e-06, "loss": 0.76129669, "num_input_tokens_seen": 52930300, "step": 2438, "time_per_iteration": 2.7590928077697754 }, { "auxiliary_loss_clip": 0.01154571, "auxiliary_loss_mlp": 0.01046509, "balance_loss_clip": 1.05747688, "balance_loss_mlp": 1.02708936, "epoch": 0.14664061325717723, "flos": 24936513575040.0, "grad_norm": 4.476318678757457, "language_loss": 0.74410725, "learning_rate": 3.858973179936668e-06, "loss": 0.76611805, "num_input_tokens_seen": 52949955, "step": 2439, "time_per_iteration": 2.627037763595581 }, { "auxiliary_loss_clip": 0.01152452, "auxiliary_loss_mlp": 0.01051294, "balance_loss_clip": 1.05477583, "balance_loss_mlp": 1.0309453, "epoch": 0.1467007365098452, "flos": 40297661406720.0, "grad_norm": 2.1583973700525343, "language_loss": 0.74123728, "learning_rate": 3.85882948945015e-06, "loss": 0.76327467, "num_input_tokens_seen": 52972905, "step": 2440, "time_per_iteration": 2.79715633392334 }, { "auxiliary_loss_clip": 0.01160843, "auxiliary_loss_mlp": 0.01044034, "balance_loss_clip": 1.05471611, "balance_loss_mlp": 1.02493691, "epoch": 0.14676085976251316, "flos": 26541074021760.0, "grad_norm": 1.9756103236146798, "language_loss": 0.82730794, "learning_rate": 3.85868572847663e-06, "loss": 0.84935671, "num_input_tokens_seen": 52994850, "step": 2441, "time_per_iteration": 2.6505653858184814 }, { "auxiliary_loss_clip": 0.01152605, "auxiliary_loss_mlp": 0.01049175, "balance_loss_clip": 1.05408478, "balance_loss_mlp": 1.02796757, "epoch": 0.14682098301518112, "flos": 23550110380800.0, "grad_norm": 2.582118236216862, "language_loss": 0.71455544, "learning_rate": 3.858541897021563e-06, "loss": 0.73657322, "num_input_tokens_seen": 53014740, "step": 2442, "time_per_iteration": 2.772648572921753 }, { "auxiliary_loss_clip": 0.0113053, "auxiliary_loss_mlp": 0.0104246, "balance_loss_clip": 1.05283213, "balance_loss_mlp": 1.02224207, "epoch": 0.1468811062678491, "flos": 11651073909120.0, "grad_norm": 3.6780587187273155, "language_loss": 0.81992352, "learning_rate": 3.8583979950904e-06, "loss": 0.84165335, "num_input_tokens_seen": 53029780, "step": 2443, "time_per_iteration": 2.6979780197143555 }, { "auxiliary_loss_clip": 0.01147138, "auxiliary_loss_mlp": 0.0105693, "balance_loss_clip": 1.05402422, "balance_loss_mlp": 1.03474557, "epoch": 0.14694122952051705, "flos": 23002616304000.0, "grad_norm": 3.190851099873364, "language_loss": 0.83093917, "learning_rate": 3.858254022688599e-06, "loss": 0.85297978, "num_input_tokens_seen": 53048620, "step": 2444, "time_per_iteration": 2.7177255153656006 }, { "auxiliary_loss_clip": 0.01134628, "auxiliary_loss_mlp": 0.01051986, "balance_loss_clip": 1.05385137, "balance_loss_mlp": 1.03213811, "epoch": 0.14700135277318502, "flos": 26502972670080.0, "grad_norm": 3.1425569240832414, "language_loss": 0.71183646, "learning_rate": 3.85810997982162e-06, "loss": 0.7337026, "num_input_tokens_seen": 53070055, "step": 2445, "time_per_iteration": 2.735361099243164 }, { "auxiliary_loss_clip": 0.01095177, "auxiliary_loss_mlp": 0.01023118, "balance_loss_clip": 1.05335557, "balance_loss_mlp": 1.01999438, "epoch": 0.147061476025853, "flos": 59449434387840.0, "grad_norm": 0.824990401786658, "language_loss": 0.63083708, "learning_rate": 3.857965866494923e-06, "loss": 0.65202004, "num_input_tokens_seen": 53126945, "step": 2446, "time_per_iteration": 3.0853025913238525 }, { "auxiliary_loss_clip": 0.01120664, "auxiliary_loss_mlp": 0.01045249, "balance_loss_clip": 1.05621576, "balance_loss_mlp": 1.02491164, "epoch": 0.14712159927852098, "flos": 28330897841280.0, "grad_norm": 2.813052009295296, "language_loss": 0.74895924, "learning_rate": 3.857821682713975e-06, "loss": 0.77061838, "num_input_tokens_seen": 53149130, "step": 2447, "time_per_iteration": 2.858643054962158 }, { "auxiliary_loss_clip": 0.01168929, "auxiliary_loss_mlp": 0.01042907, "balance_loss_clip": 1.0604012, "balance_loss_mlp": 1.02383327, "epoch": 0.14718172253118894, "flos": 27089825074560.0, "grad_norm": 2.2427639286159367, "language_loss": 0.8528471, "learning_rate": 3.857677428484242e-06, "loss": 0.87496543, "num_input_tokens_seen": 53167120, "step": 2448, "time_per_iteration": 2.699781894683838 }, { "auxiliary_loss_clip": 0.01092169, "auxiliary_loss_mlp": 0.01019616, "balance_loss_clip": 1.05051064, "balance_loss_mlp": 1.01654005, "epoch": 0.1472418457838569, "flos": 66706764860160.0, "grad_norm": 0.7683837313264128, "language_loss": 0.56829578, "learning_rate": 3.857533103811195e-06, "loss": 0.58941364, "num_input_tokens_seen": 53227945, "step": 2449, "time_per_iteration": 3.1478211879730225 }, { "auxiliary_loss_clip": 0.01135016, "auxiliary_loss_mlp": 0.01050801, "balance_loss_clip": 1.05464292, "balance_loss_mlp": 1.03023791, "epoch": 0.14730196903652487, "flos": 19573578391680.0, "grad_norm": 1.9048653074507311, "language_loss": 0.85067344, "learning_rate": 3.857388708700307e-06, "loss": 0.87253165, "num_input_tokens_seen": 53244615, "step": 2450, "time_per_iteration": 2.726008653640747 }, { "auxiliary_loss_clip": 0.01158708, "auxiliary_loss_mlp": 0.01049735, "balance_loss_clip": 1.05984712, "balance_loss_mlp": 1.02994645, "epoch": 0.14736209228919284, "flos": 16071031296000.0, "grad_norm": 2.306043539040143, "language_loss": 0.74523091, "learning_rate": 3.857244243157052e-06, "loss": 0.76731533, "num_input_tokens_seen": 53262205, "step": 2451, "time_per_iteration": 2.641082286834717 }, { "auxiliary_loss_clip": 0.01133915, "auxiliary_loss_mlp": 0.01038458, "balance_loss_clip": 1.05399728, "balance_loss_mlp": 1.02031422, "epoch": 0.1474222155418608, "flos": 23039460679680.0, "grad_norm": 1.8026547738986978, "language_loss": 0.82384264, "learning_rate": 3.85709970718691e-06, "loss": 0.84556639, "num_input_tokens_seen": 53282445, "step": 2452, "time_per_iteration": 2.7810096740722656 }, { "auxiliary_loss_clip": 0.01101553, "auxiliary_loss_mlp": 0.01041864, "balance_loss_clip": 1.05924153, "balance_loss_mlp": 1.0238874, "epoch": 0.1474823387945288, "flos": 17018641946880.0, "grad_norm": 1.6675065143572472, "language_loss": 0.74075705, "learning_rate": 3.856955100795361e-06, "loss": 0.76219124, "num_input_tokens_seen": 53299060, "step": 2453, "time_per_iteration": 2.7913167476654053 }, { "auxiliary_loss_clip": 0.01141798, "auxiliary_loss_mlp": 0.0104607, "balance_loss_clip": 1.05557632, "balance_loss_mlp": 1.026353, "epoch": 0.14754246204719676, "flos": 17895041884800.0, "grad_norm": 1.9958141581621542, "language_loss": 0.7558704, "learning_rate": 3.856810423987889e-06, "loss": 0.77774906, "num_input_tokens_seen": 53315970, "step": 2454, "time_per_iteration": 2.7199089527130127 }, { "auxiliary_loss_clip": 0.01147348, "auxiliary_loss_mlp": 0.01038134, "balance_loss_clip": 1.05733335, "balance_loss_mlp": 1.01864362, "epoch": 0.14760258529986472, "flos": 13079097987840.0, "grad_norm": 2.0858167958418674, "language_loss": 0.83077228, "learning_rate": 3.856665676769979e-06, "loss": 0.85262716, "num_input_tokens_seen": 53332940, "step": 2455, "time_per_iteration": 2.75616192817688 }, { "auxiliary_loss_clip": 0.01130504, "auxiliary_loss_mlp": 0.01042951, "balance_loss_clip": 1.05704689, "balance_loss_mlp": 1.02452159, "epoch": 0.1476627085525327, "flos": 30806399358720.0, "grad_norm": 2.3702229998953976, "language_loss": 0.83881497, "learning_rate": 3.85652085914712e-06, "loss": 0.86054951, "num_input_tokens_seen": 53353295, "step": 2456, "time_per_iteration": 2.7914254665374756 }, { "auxiliary_loss_clip": 0.01154014, "auxiliary_loss_mlp": 0.01043715, "balance_loss_clip": 1.05863023, "balance_loss_mlp": 1.02514231, "epoch": 0.14772283180520066, "flos": 21689434984320.0, "grad_norm": 2.4172359629848996, "language_loss": 0.84154665, "learning_rate": 3.856375971124805e-06, "loss": 0.86352402, "num_input_tokens_seen": 53373410, "step": 2457, "time_per_iteration": 2.688265323638916 }, { "auxiliary_loss_clip": 0.01155788, "auxiliary_loss_mlp": 0.01042903, "balance_loss_clip": 1.06250155, "balance_loss_mlp": 1.02529585, "epoch": 0.14778295505786862, "flos": 18770400328320.0, "grad_norm": 6.310680797376285, "language_loss": 0.75692672, "learning_rate": 3.856231012708527e-06, "loss": 0.77891362, "num_input_tokens_seen": 53391430, "step": 2458, "time_per_iteration": 2.698697805404663 }, { "auxiliary_loss_clip": 0.01117404, "auxiliary_loss_mlp": 0.01047753, "balance_loss_clip": 1.05451179, "balance_loss_mlp": 1.02718902, "epoch": 0.1478430783105366, "flos": 22893555634560.0, "grad_norm": 3.1268711361266393, "language_loss": 0.83348328, "learning_rate": 3.856085983903782e-06, "loss": 0.85513484, "num_input_tokens_seen": 53409960, "step": 2459, "time_per_iteration": 2.790552854537964 }, { "auxiliary_loss_clip": 0.01126767, "auxiliary_loss_mlp": 0.01042293, "balance_loss_clip": 1.05070424, "balance_loss_mlp": 1.02435231, "epoch": 0.14790320156320458, "flos": 15085319293440.0, "grad_norm": 3.1203941208753534, "language_loss": 0.7554391, "learning_rate": 3.855940884716071e-06, "loss": 0.77712965, "num_input_tokens_seen": 53426160, "step": 2460, "time_per_iteration": 2.815455675125122 }, { "auxiliary_loss_clip": 0.01134117, "auxiliary_loss_mlp": 0.01056838, "balance_loss_clip": 1.05845904, "balance_loss_mlp": 1.03770471, "epoch": 0.14796332481587254, "flos": 26504768350080.0, "grad_norm": 3.59241393994, "language_loss": 0.81227219, "learning_rate": 3.855795715150896e-06, "loss": 0.83418173, "num_input_tokens_seen": 53448530, "step": 2461, "time_per_iteration": 2.785569190979004 }, { "auxiliary_loss_clip": 0.01156748, "auxiliary_loss_mlp": 0.01051178, "balance_loss_clip": 1.05812359, "balance_loss_mlp": 1.03044713, "epoch": 0.1480234480685405, "flos": 17563191108480.0, "grad_norm": 3.2910626990147183, "language_loss": 0.66117477, "learning_rate": 3.855650475213761e-06, "loss": 0.683254, "num_input_tokens_seen": 53465915, "step": 2462, "time_per_iteration": 2.7222983837127686 }, { "auxiliary_loss_clip": 0.01136035, "auxiliary_loss_mlp": 0.01049537, "balance_loss_clip": 1.05622339, "balance_loss_mlp": 1.02965331, "epoch": 0.14808357132120847, "flos": 53582203232640.0, "grad_norm": 1.8120706772856114, "language_loss": 0.67226064, "learning_rate": 3.8555051649101745e-06, "loss": 0.69411635, "num_input_tokens_seen": 53496055, "step": 2463, "time_per_iteration": 3.0344398021698 }, { "auxiliary_loss_clip": 0.01153077, "auxiliary_loss_mlp": 0.01050435, "balance_loss_clip": 1.05550933, "balance_loss_mlp": 1.0307889, "epoch": 0.14814369457387644, "flos": 19829190551040.0, "grad_norm": 1.9881580745750587, "language_loss": 0.76870739, "learning_rate": 3.855359784245646e-06, "loss": 0.79074258, "num_input_tokens_seen": 53513790, "step": 2464, "time_per_iteration": 2.69480037689209 }, { "auxiliary_loss_clip": 0.01133748, "auxiliary_loss_mlp": 0.01057139, "balance_loss_clip": 1.05392432, "balance_loss_mlp": 1.03769565, "epoch": 0.1482038178265444, "flos": 23914962777600.0, "grad_norm": 1.8401367705559406, "language_loss": 0.79628456, "learning_rate": 3.855214333225688e-06, "loss": 0.81819344, "num_input_tokens_seen": 53533410, "step": 2465, "time_per_iteration": 2.6989939212799072 }, { "auxiliary_loss_clip": 0.01170385, "auxiliary_loss_mlp": 0.01054925, "balance_loss_clip": 1.06119514, "balance_loss_mlp": 1.03568494, "epoch": 0.1482639410792124, "flos": 24170503109760.0, "grad_norm": 2.005541134809237, "language_loss": 0.76272273, "learning_rate": 3.855068811855817e-06, "loss": 0.78497583, "num_input_tokens_seen": 53554775, "step": 2466, "time_per_iteration": 2.646245002746582 }, { "auxiliary_loss_clip": 0.01018939, "auxiliary_loss_mlp": 0.0114331, "balance_loss_clip": 1.03313899, "balance_loss_mlp": 1.14004362, "epoch": 0.14832406433188036, "flos": 66191051341440.0, "grad_norm": 0.8320983618395327, "language_loss": 0.6004858, "learning_rate": 3.854923220141551e-06, "loss": 0.62210834, "num_input_tokens_seen": 53609675, "step": 2467, "time_per_iteration": 3.33776593208313 }, { "auxiliary_loss_clip": 0.01141854, "auxiliary_loss_mlp": 0.01044026, "balance_loss_clip": 1.05437851, "balance_loss_mlp": 1.02509522, "epoch": 0.14838418758454833, "flos": 25411252654080.0, "grad_norm": 2.92694776694492, "language_loss": 0.87666196, "learning_rate": 3.85477755808841e-06, "loss": 0.89852077, "num_input_tokens_seen": 53626950, "step": 2468, "time_per_iteration": 4.266207456588745 }, { "auxiliary_loss_clip": 0.01130189, "auxiliary_loss_mlp": 0.01048186, "balance_loss_clip": 1.05255163, "balance_loss_mlp": 1.02782488, "epoch": 0.1484443108372163, "flos": 23289901280640.0, "grad_norm": 2.2284173124426223, "language_loss": 0.7598694, "learning_rate": 3.854631825701919e-06, "loss": 0.78165317, "num_input_tokens_seen": 53644200, "step": 2469, "time_per_iteration": 4.217481851577759 }, { "auxiliary_loss_clip": 0.01126269, "auxiliary_loss_mlp": 0.0104139, "balance_loss_clip": 1.05208421, "balance_loss_mlp": 1.02251911, "epoch": 0.14850443408988426, "flos": 14647675985280.0, "grad_norm": 6.591244267451795, "language_loss": 0.75895017, "learning_rate": 3.854486022987603e-06, "loss": 0.78062677, "num_input_tokens_seen": 53659650, "step": 2470, "time_per_iteration": 2.7157187461853027 }, { "auxiliary_loss_clip": 0.01161157, "auxiliary_loss_mlp": 0.01044729, "balance_loss_clip": 1.05831027, "balance_loss_mlp": 1.02571499, "epoch": 0.14856455734255222, "flos": 23548314700800.0, "grad_norm": 1.8610043660805562, "language_loss": 0.7215873, "learning_rate": 3.8543401499509905e-06, "loss": 0.74364614, "num_input_tokens_seen": 53680275, "step": 2471, "time_per_iteration": 4.162387132644653 }, { "auxiliary_loss_clip": 0.01135244, "auxiliary_loss_mlp": 0.01047611, "balance_loss_clip": 1.05438995, "balance_loss_mlp": 1.02717888, "epoch": 0.1486246805952202, "flos": 18077288515200.0, "grad_norm": 1.979025280241548, "language_loss": 0.89558828, "learning_rate": 3.854194206597615e-06, "loss": 0.91741687, "num_input_tokens_seen": 53698270, "step": 2472, "time_per_iteration": 2.739457607269287 }, { "auxiliary_loss_clip": 0.01134625, "auxiliary_loss_mlp": 0.01049109, "balance_loss_clip": 1.06334805, "balance_loss_mlp": 1.02964163, "epoch": 0.14868480384788818, "flos": 19353625459200.0, "grad_norm": 2.6029609251362764, "language_loss": 0.80801564, "learning_rate": 3.854048192933008e-06, "loss": 0.82985294, "num_input_tokens_seen": 53716845, "step": 2473, "time_per_iteration": 4.412883758544922 }, { "auxiliary_loss_clip": 0.01161034, "auxiliary_loss_mlp": 0.01051306, "balance_loss_clip": 1.0626657, "balance_loss_mlp": 1.03267312, "epoch": 0.14874492710055615, "flos": 22200192426240.0, "grad_norm": 3.426519274325147, "language_loss": 0.77372944, "learning_rate": 3.853902108962709e-06, "loss": 0.79585278, "num_input_tokens_seen": 53734970, "step": 2474, "time_per_iteration": 2.6879520416259766 }, { "auxiliary_loss_clip": 0.01124216, "auxiliary_loss_mlp": 0.01059785, "balance_loss_clip": 1.05597806, "balance_loss_mlp": 1.04041362, "epoch": 0.1488050503532241, "flos": 21103444506240.0, "grad_norm": 2.4771626433268734, "language_loss": 0.82151824, "learning_rate": 3.853755954692255e-06, "loss": 0.84335828, "num_input_tokens_seen": 53753415, "step": 2475, "time_per_iteration": 2.7828469276428223 }, { "auxiliary_loss_clip": 0.01115855, "auxiliary_loss_mlp": 0.01052322, "balance_loss_clip": 1.0614953, "balance_loss_mlp": 1.03341544, "epoch": 0.14886517360589208, "flos": 12786569625600.0, "grad_norm": 1.9349243252831771, "language_loss": 0.80917645, "learning_rate": 3.85360973012719e-06, "loss": 0.83085823, "num_input_tokens_seen": 53770305, "step": 2476, "time_per_iteration": 2.7227590084075928 }, { "auxiliary_loss_clip": 0.01156019, "auxiliary_loss_mlp": 0.0105036, "balance_loss_clip": 1.06338036, "balance_loss_mlp": 1.03216898, "epoch": 0.14892529685856004, "flos": 29022860419200.0, "grad_norm": 2.0032169897498346, "language_loss": 0.77659523, "learning_rate": 3.853463435273058e-06, "loss": 0.79865897, "num_input_tokens_seen": 53788895, "step": 2477, "time_per_iteration": 2.740241765975952 }, { "auxiliary_loss_clip": 0.0110234, "auxiliary_loss_mlp": 0.01092005, "balance_loss_clip": 1.07879949, "balance_loss_mlp": 1.08730817, "epoch": 0.148985420111228, "flos": 61926121054080.0, "grad_norm": 0.8188153224748298, "language_loss": 0.60153681, "learning_rate": 3.853317070135407e-06, "loss": 0.62348026, "num_input_tokens_seen": 53850260, "step": 2478, "time_per_iteration": 3.2467947006225586 }, { "auxiliary_loss_clip": 0.01107417, "auxiliary_loss_mlp": 0.01048452, "balance_loss_clip": 1.0516423, "balance_loss_mlp": 1.03041577, "epoch": 0.149045543363896, "flos": 23915106432000.0, "grad_norm": 2.666109649137694, "language_loss": 0.7139731, "learning_rate": 3.853170634719787e-06, "loss": 0.73553181, "num_input_tokens_seen": 53867520, "step": 2479, "time_per_iteration": 2.7973475456237793 }, { "auxiliary_loss_clip": 0.01140551, "auxiliary_loss_mlp": 0.01043104, "balance_loss_clip": 1.05563831, "balance_loss_mlp": 1.02407789, "epoch": 0.14910566661656396, "flos": 23654394541440.0, "grad_norm": 1.7687137634424535, "language_loss": 0.80758464, "learning_rate": 3.853024129031751e-06, "loss": 0.82942122, "num_input_tokens_seen": 53886620, "step": 2480, "time_per_iteration": 2.7238829135894775 }, { "auxiliary_loss_clip": 0.01138106, "auxiliary_loss_mlp": 0.0104537, "balance_loss_clip": 1.0584991, "balance_loss_mlp": 1.02627277, "epoch": 0.14916578986923193, "flos": 20515299212160.0, "grad_norm": 4.65741826395702, "language_loss": 0.84375542, "learning_rate": 3.852877553076854e-06, "loss": 0.86559021, "num_input_tokens_seen": 53902230, "step": 2481, "time_per_iteration": 2.791550874710083 }, { "auxiliary_loss_clip": 0.01149484, "auxiliary_loss_mlp": 0.01050268, "balance_loss_clip": 1.05772805, "balance_loss_mlp": 1.02948999, "epoch": 0.1492259131218999, "flos": 22491822948480.0, "grad_norm": 8.035113387353048, "language_loss": 0.77703977, "learning_rate": 3.8527309068606546e-06, "loss": 0.79903734, "num_input_tokens_seen": 53919475, "step": 2482, "time_per_iteration": 2.7310593128204346 }, { "auxiliary_loss_clip": 0.01133163, "auxiliary_loss_mlp": 0.01040426, "balance_loss_clip": 1.05452228, "balance_loss_mlp": 1.02032781, "epoch": 0.14928603637456786, "flos": 23185868515200.0, "grad_norm": 2.207731010812049, "language_loss": 0.78967929, "learning_rate": 3.852584190388713e-06, "loss": 0.81141514, "num_input_tokens_seen": 53939150, "step": 2483, "time_per_iteration": 2.749671220779419 }, { "auxiliary_loss_clip": 0.01154122, "auxiliary_loss_mlp": 0.00776708, "balance_loss_clip": 1.06144214, "balance_loss_mlp": 1.00029397, "epoch": 0.14934615962723582, "flos": 21653237053440.0, "grad_norm": 2.020127706544282, "language_loss": 0.70361555, "learning_rate": 3.852437403666595e-06, "loss": 0.72292387, "num_input_tokens_seen": 53958735, "step": 2484, "time_per_iteration": 2.737781524658203 }, { "auxiliary_loss_clip": 0.01141919, "auxiliary_loss_mlp": 0.00778215, "balance_loss_clip": 1.05718136, "balance_loss_mlp": 1.00030363, "epoch": 0.1494062828799038, "flos": 27010066924800.0, "grad_norm": 2.165877689982274, "language_loss": 0.84666765, "learning_rate": 3.852290546699863e-06, "loss": 0.86586899, "num_input_tokens_seen": 53975065, "step": 2485, "time_per_iteration": 2.697976589202881 }, { "auxiliary_loss_clip": 0.01145272, "auxiliary_loss_mlp": 0.0104224, "balance_loss_clip": 1.05639958, "balance_loss_mlp": 1.02257001, "epoch": 0.14946640613257178, "flos": 21214947300480.0, "grad_norm": 2.5229241908443023, "language_loss": 0.8476423, "learning_rate": 3.8521436194940894e-06, "loss": 0.86951739, "num_input_tokens_seen": 53993330, "step": 2486, "time_per_iteration": 2.6799628734588623 }, { "auxiliary_loss_clip": 0.01149031, "auxiliary_loss_mlp": 0.01039312, "balance_loss_clip": 1.05667424, "balance_loss_mlp": 1.0230875, "epoch": 0.14952652938523975, "flos": 13370872164480.0, "grad_norm": 2.1822908802725203, "language_loss": 0.74762607, "learning_rate": 3.851996622054842e-06, "loss": 0.76950949, "num_input_tokens_seen": 54010515, "step": 2487, "time_per_iteration": 2.8037290573120117 }, { "auxiliary_loss_clip": 0.01153097, "auxiliary_loss_mlp": 0.01044274, "balance_loss_clip": 1.05934322, "balance_loss_mlp": 1.02611899, "epoch": 0.1495866526379077, "flos": 35517699959040.0, "grad_norm": 16.320028017118723, "language_loss": 0.72210175, "learning_rate": 3.8518495543877e-06, "loss": 0.74407548, "num_input_tokens_seen": 54031315, "step": 2488, "time_per_iteration": 2.8031094074249268 }, { "auxiliary_loss_clip": 0.01137536, "auxiliary_loss_mlp": 0.01054916, "balance_loss_clip": 1.05569518, "balance_loss_mlp": 1.03636682, "epoch": 0.14964677589057568, "flos": 17632749795840.0, "grad_norm": 3.2458980886023143, "language_loss": 0.71352434, "learning_rate": 3.851702416498235e-06, "loss": 0.73544884, "num_input_tokens_seen": 54045965, "step": 2489, "time_per_iteration": 2.648883819580078 }, { "auxiliary_loss_clip": 0.0113767, "auxiliary_loss_mlp": 0.01052603, "balance_loss_clip": 1.05376494, "balance_loss_mlp": 1.03357768, "epoch": 0.14970689914324364, "flos": 20185280029440.0, "grad_norm": 3.893198448080141, "language_loss": 0.81559736, "learning_rate": 3.8515552083920295e-06, "loss": 0.8375001, "num_input_tokens_seen": 54059960, "step": 2490, "time_per_iteration": 2.702808380126953 }, { "auxiliary_loss_clip": 0.01125097, "auxiliary_loss_mlp": 0.01055928, "balance_loss_clip": 1.05606139, "balance_loss_mlp": 1.03803492, "epoch": 0.1497670223959116, "flos": 37228699382400.0, "grad_norm": 1.9071281232744548, "language_loss": 0.80057055, "learning_rate": 3.851407930074666e-06, "loss": 0.82238084, "num_input_tokens_seen": 54079330, "step": 2491, "time_per_iteration": 2.833272933959961 }, { "auxiliary_loss_clip": 0.01143407, "auxiliary_loss_mlp": 0.01052558, "balance_loss_clip": 1.05301452, "balance_loss_mlp": 1.03195894, "epoch": 0.1498271456485796, "flos": 24455848752000.0, "grad_norm": 2.3105790695512294, "language_loss": 0.90820229, "learning_rate": 3.851260581551727e-06, "loss": 0.93016195, "num_input_tokens_seen": 54097555, "step": 2492, "time_per_iteration": 2.684178352355957 }, { "auxiliary_loss_clip": 0.01152331, "auxiliary_loss_mlp": 0.01063543, "balance_loss_clip": 1.05835843, "balance_loss_mlp": 1.04508913, "epoch": 0.14988726890124757, "flos": 16253601148800.0, "grad_norm": 6.881290297472923, "language_loss": 0.79406559, "learning_rate": 3.851113162828802e-06, "loss": 0.81622434, "num_input_tokens_seen": 54115600, "step": 2493, "time_per_iteration": 2.6558918952941895 }, { "auxiliary_loss_clip": 0.0114858, "auxiliary_loss_mlp": 0.01052018, "balance_loss_clip": 1.05345511, "balance_loss_mlp": 1.03258693, "epoch": 0.14994739215391553, "flos": 20666555383680.0, "grad_norm": 2.3431247769189967, "language_loss": 0.79894584, "learning_rate": 3.85096567391148e-06, "loss": 0.82095182, "num_input_tokens_seen": 54135220, "step": 2494, "time_per_iteration": 2.6774168014526367 }, { "auxiliary_loss_clip": 0.01137216, "auxiliary_loss_mlp": 0.01050857, "balance_loss_clip": 1.05474579, "balance_loss_mlp": 1.03212965, "epoch": 0.1500075154065835, "flos": 70652375239680.0, "grad_norm": 1.928941284350508, "language_loss": 0.66480517, "learning_rate": 3.850818114805354e-06, "loss": 0.68668592, "num_input_tokens_seen": 54161065, "step": 2495, "time_per_iteration": 3.1090729236602783 }, { "auxiliary_loss_clip": 0.01103374, "auxiliary_loss_mlp": 0.01038654, "balance_loss_clip": 1.06896818, "balance_loss_mlp": 1.03560257, "epoch": 0.15006763865925146, "flos": 68011937447040.0, "grad_norm": 0.9030283421527312, "language_loss": 0.59524739, "learning_rate": 3.850670485516019e-06, "loss": 0.61666763, "num_input_tokens_seen": 54225095, "step": 2496, "time_per_iteration": 3.2250726222991943 }, { "auxiliary_loss_clip": 0.01163934, "auxiliary_loss_mlp": 0.01055725, "balance_loss_clip": 1.05690169, "balance_loss_mlp": 1.0360074, "epoch": 0.15012776191191943, "flos": 18916269459840.0, "grad_norm": 3.063784198565679, "language_loss": 0.65276247, "learning_rate": 3.850522786049075e-06, "loss": 0.67495906, "num_input_tokens_seen": 54243750, "step": 2497, "time_per_iteration": 2.619946002960205 }, { "auxiliary_loss_clip": 0.01125657, "auxiliary_loss_mlp": 0.01054091, "balance_loss_clip": 1.05308235, "balance_loss_mlp": 1.03316998, "epoch": 0.1501878851645874, "flos": 23701330638720.0, "grad_norm": 1.5552670947231086, "language_loss": 0.75182658, "learning_rate": 3.850375016410121e-06, "loss": 0.77362406, "num_input_tokens_seen": 54266185, "step": 2498, "time_per_iteration": 2.778163433074951 }, { "auxiliary_loss_clip": 0.01132738, "auxiliary_loss_mlp": 0.01046919, "balance_loss_clip": 1.05919099, "balance_loss_mlp": 1.02701163, "epoch": 0.15024800841725539, "flos": 20412523422720.0, "grad_norm": 3.357364003851319, "language_loss": 0.71821117, "learning_rate": 3.850227176604761e-06, "loss": 0.74000776, "num_input_tokens_seen": 54283940, "step": 2499, "time_per_iteration": 2.6929259300231934 }, { "auxiliary_loss_clip": 0.01134239, "auxiliary_loss_mlp": 0.01051817, "balance_loss_clip": 1.0547812, "balance_loss_mlp": 1.03236222, "epoch": 0.15030813166992335, "flos": 31831002812160.0, "grad_norm": 2.1406696998963652, "language_loss": 0.7206136, "learning_rate": 3.850079266638601e-06, "loss": 0.7424742, "num_input_tokens_seen": 54304830, "step": 2500, "time_per_iteration": 2.769988536834717 }, { "auxiliary_loss_clip": 0.01134021, "auxiliary_loss_mlp": 0.0105021, "balance_loss_clip": 1.06063724, "balance_loss_mlp": 1.03181624, "epoch": 0.15036825492259132, "flos": 35657822914560.0, "grad_norm": 2.0251881980439306, "language_loss": 0.65127194, "learning_rate": 3.849931286517249e-06, "loss": 0.6731143, "num_input_tokens_seen": 54325595, "step": 2501, "time_per_iteration": 2.810945510864258 }, { "auxiliary_loss_clip": 0.01137877, "auxiliary_loss_mlp": 0.01055223, "balance_loss_clip": 1.0541079, "balance_loss_mlp": 1.03511274, "epoch": 0.15042837817525928, "flos": 18838163335680.0, "grad_norm": 2.209666371186328, "language_loss": 0.83401144, "learning_rate": 3.849783236246318e-06, "loss": 0.85594243, "num_input_tokens_seen": 54342180, "step": 2502, "time_per_iteration": 2.6780545711517334 }, { "auxiliary_loss_clip": 0.01122961, "auxiliary_loss_mlp": 0.01049887, "balance_loss_clip": 1.05318308, "balance_loss_mlp": 1.0323875, "epoch": 0.15048850142792725, "flos": 19535548867200.0, "grad_norm": 2.0319272128830947, "language_loss": 0.77134645, "learning_rate": 3.849635115831421e-06, "loss": 0.79307491, "num_input_tokens_seen": 54360255, "step": 2503, "time_per_iteration": 2.7579123973846436 }, { "auxiliary_loss_clip": 0.01159116, "auxiliary_loss_mlp": 0.01044094, "balance_loss_clip": 1.05766046, "balance_loss_mlp": 1.02692807, "epoch": 0.1505486246805952, "flos": 22017550746240.0, "grad_norm": 1.9852139459946199, "language_loss": 0.85514295, "learning_rate": 3.849486925278176e-06, "loss": 0.87717503, "num_input_tokens_seen": 54378260, "step": 2504, "time_per_iteration": 2.631882905960083 }, { "auxiliary_loss_clip": 0.01146113, "auxiliary_loss_mlp": 0.01048035, "balance_loss_clip": 1.05622697, "balance_loss_mlp": 1.03098798, "epoch": 0.15060874793326318, "flos": 20743153136640.0, "grad_norm": 1.8222645508164372, "language_loss": 0.83178544, "learning_rate": 3.8493386645922e-06, "loss": 0.85372692, "num_input_tokens_seen": 54399745, "step": 2505, "time_per_iteration": 2.7706007957458496 }, { "auxiliary_loss_clip": 0.01125699, "auxiliary_loss_mlp": 0.01053819, "balance_loss_clip": 1.05586648, "balance_loss_mlp": 1.03590202, "epoch": 0.15066887118593117, "flos": 16471902055680.0, "grad_norm": 2.0148067518000445, "language_loss": 0.76044405, "learning_rate": 3.849190333779117e-06, "loss": 0.7822392, "num_input_tokens_seen": 54417105, "step": 2506, "time_per_iteration": 2.70989990234375 }, { "auxiliary_loss_clip": 0.01165314, "auxiliary_loss_mlp": 0.01041911, "balance_loss_clip": 1.05785728, "balance_loss_mlp": 1.02305174, "epoch": 0.15072899443859913, "flos": 19859319083520.0, "grad_norm": 2.823460856599666, "language_loss": 0.76220375, "learning_rate": 3.849041932844552e-06, "loss": 0.78427601, "num_input_tokens_seen": 54433920, "step": 2507, "time_per_iteration": 2.5367634296417236 }, { "auxiliary_loss_clip": 0.01144479, "auxiliary_loss_mlp": 0.01041094, "balance_loss_clip": 1.05261898, "balance_loss_mlp": 1.02306986, "epoch": 0.1507891176912671, "flos": 20776226584320.0, "grad_norm": 2.5197772895304906, "language_loss": 0.68633789, "learning_rate": 3.848893461794131e-06, "loss": 0.70819366, "num_input_tokens_seen": 54451540, "step": 2508, "time_per_iteration": 4.303388833999634 }, { "auxiliary_loss_clip": 0.01130299, "auxiliary_loss_mlp": 0.01046507, "balance_loss_clip": 1.05477214, "balance_loss_mlp": 1.02835178, "epoch": 0.15084924094393506, "flos": 23586631534080.0, "grad_norm": 2.840517748098311, "language_loss": 0.77994299, "learning_rate": 3.8487449206334845e-06, "loss": 0.80171108, "num_input_tokens_seen": 54470800, "step": 2509, "time_per_iteration": 4.380200147628784 }, { "auxiliary_loss_clip": 0.01141335, "auxiliary_loss_mlp": 0.00776843, "balance_loss_clip": 1.05463386, "balance_loss_mlp": 1.00027037, "epoch": 0.15090936419660303, "flos": 18911313383040.0, "grad_norm": 2.53406994590866, "language_loss": 0.79959804, "learning_rate": 3.848596309368246e-06, "loss": 0.81877983, "num_input_tokens_seen": 54486525, "step": 2510, "time_per_iteration": 4.219487428665161 }, { "auxiliary_loss_clip": 0.01150641, "auxiliary_loss_mlp": 0.01047345, "balance_loss_clip": 1.05529225, "balance_loss_mlp": 1.02794981, "epoch": 0.150969487449271, "flos": 17928223073280.0, "grad_norm": 1.8628702139594306, "language_loss": 0.73398602, "learning_rate": 3.8484476280040495e-06, "loss": 0.75596589, "num_input_tokens_seen": 54503795, "step": 2511, "time_per_iteration": 2.62237811088562 }, { "auxiliary_loss_clip": 0.01094269, "auxiliary_loss_mlp": 0.0104236, "balance_loss_clip": 1.04747009, "balance_loss_mlp": 1.02365553, "epoch": 0.151029610701939, "flos": 24243078539520.0, "grad_norm": 2.20399257021602, "language_loss": 0.68716824, "learning_rate": 3.848298876546534e-06, "loss": 0.70853454, "num_input_tokens_seen": 54523025, "step": 2512, "time_per_iteration": 2.823359489440918 }, { "auxiliary_loss_clip": 0.01149398, "auxiliary_loss_mlp": 0.01043296, "balance_loss_clip": 1.05574036, "balance_loss_mlp": 1.02615356, "epoch": 0.15108973395460695, "flos": 30262496641920.0, "grad_norm": 2.6278607305338877, "language_loss": 0.73833561, "learning_rate": 3.84815005500134e-06, "loss": 0.76026255, "num_input_tokens_seen": 54545025, "step": 2513, "time_per_iteration": 4.386258602142334 }, { "auxiliary_loss_clip": 0.01059691, "auxiliary_loss_mlp": 0.01109321, "balance_loss_clip": 1.0685482, "balance_loss_mlp": 1.10529137, "epoch": 0.15114985720727492, "flos": 60437624428800.0, "grad_norm": 0.9017688875456507, "language_loss": 0.64720047, "learning_rate": 3.84800116337411e-06, "loss": 0.6688906, "num_input_tokens_seen": 54604545, "step": 2514, "time_per_iteration": 3.254983425140381 }, { "auxiliary_loss_clip": 0.01146323, "auxiliary_loss_mlp": 0.0104352, "balance_loss_clip": 1.05674648, "balance_loss_mlp": 1.02584124, "epoch": 0.15120998045994288, "flos": 20521691832960.0, "grad_norm": 3.178381755435586, "language_loss": 0.72995645, "learning_rate": 3.8478522016704916e-06, "loss": 0.7518549, "num_input_tokens_seen": 54620590, "step": 2515, "time_per_iteration": 2.67921781539917 }, { "auxiliary_loss_clip": 0.01133382, "auxiliary_loss_mlp": 0.01040315, "balance_loss_clip": 1.05675673, "balance_loss_mlp": 1.02120531, "epoch": 0.15127010371261085, "flos": 21178893024000.0, "grad_norm": 2.0712989062813243, "language_loss": 0.7773214, "learning_rate": 3.8477031698961325e-06, "loss": 0.79905832, "num_input_tokens_seen": 54640410, "step": 2516, "time_per_iteration": 2.763467788696289 }, { "auxiliary_loss_clip": 0.01087601, "auxiliary_loss_mlp": 0.01004779, "balance_loss_clip": 1.05344796, "balance_loss_mlp": 1.00160813, "epoch": 0.1513302269652788, "flos": 65320648974720.0, "grad_norm": 0.7270407819118658, "language_loss": 0.54622567, "learning_rate": 3.8475540680566835e-06, "loss": 0.56714946, "num_input_tokens_seen": 54701430, "step": 2517, "time_per_iteration": 3.2293660640716553 }, { "auxiliary_loss_clip": 0.01110142, "auxiliary_loss_mlp": 0.0104362, "balance_loss_clip": 1.04499209, "balance_loss_mlp": 1.02427244, "epoch": 0.15139035021794678, "flos": 19135827342720.0, "grad_norm": 3.035771526476276, "language_loss": 0.78264821, "learning_rate": 3.8474048961577995e-06, "loss": 0.80418587, "num_input_tokens_seen": 54720845, "step": 2518, "time_per_iteration": 2.8154754638671875 }, { "auxiliary_loss_clip": 0.01147342, "auxiliary_loss_mlp": 0.01056368, "balance_loss_clip": 1.05279088, "balance_loss_mlp": 1.03681803, "epoch": 0.15145047347061477, "flos": 26578564842240.0, "grad_norm": 2.1881526177791097, "language_loss": 0.70480245, "learning_rate": 3.847255654205137e-06, "loss": 0.72683954, "num_input_tokens_seen": 54740495, "step": 2519, "time_per_iteration": 2.7098515033721924 }, { "auxiliary_loss_clip": 0.01152463, "auxiliary_loss_mlp": 0.01056975, "balance_loss_clip": 1.05683672, "balance_loss_mlp": 1.03802037, "epoch": 0.15151059672328274, "flos": 20302959962880.0, "grad_norm": 1.9048594994100874, "language_loss": 0.78681207, "learning_rate": 3.847106342204354e-06, "loss": 0.80890644, "num_input_tokens_seen": 54758415, "step": 2520, "time_per_iteration": 2.664187431335449 }, { "auxiliary_loss_clip": 0.01140573, "auxiliary_loss_mlp": 0.01071607, "balance_loss_clip": 1.05435348, "balance_loss_mlp": 1.05244994, "epoch": 0.1515707199759507, "flos": 27228367831680.0, "grad_norm": 3.950911503454746, "language_loss": 0.74849677, "learning_rate": 3.846956960161114e-06, "loss": 0.77061862, "num_input_tokens_seen": 54779355, "step": 2521, "time_per_iteration": 2.7900772094726562 }, { "auxiliary_loss_clip": 0.01132038, "auxiliary_loss_mlp": 0.01055874, "balance_loss_clip": 1.05052209, "balance_loss_mlp": 1.0360136, "epoch": 0.15163084322861867, "flos": 23587349806080.0, "grad_norm": 4.620979243079986, "language_loss": 0.8253814, "learning_rate": 3.84680750808108e-06, "loss": 0.84726053, "num_input_tokens_seen": 54799465, "step": 2522, "time_per_iteration": 2.7216525077819824 }, { "auxiliary_loss_clip": 0.01051858, "auxiliary_loss_mlp": 0.01048797, "balance_loss_clip": 1.05645704, "balance_loss_mlp": 1.04595995, "epoch": 0.15169096648128663, "flos": 66889622021760.0, "grad_norm": 0.8362305181264502, "language_loss": 0.57885599, "learning_rate": 3.846657985969922e-06, "loss": 0.59986252, "num_input_tokens_seen": 54857665, "step": 2523, "time_per_iteration": 3.2375056743621826 }, { "auxiliary_loss_clip": 0.0114147, "auxiliary_loss_mlp": 0.01057964, "balance_loss_clip": 1.05213499, "balance_loss_mlp": 1.0368042, "epoch": 0.1517510897339546, "flos": 29095435848960.0, "grad_norm": 1.8054087157705183, "language_loss": 0.74795163, "learning_rate": 3.8465083938333066e-06, "loss": 0.76994598, "num_input_tokens_seen": 54879895, "step": 2524, "time_per_iteration": 2.711557388305664 }, { "auxiliary_loss_clip": 0.01138185, "auxiliary_loss_mlp": 0.01057236, "balance_loss_clip": 1.05304718, "balance_loss_mlp": 1.03865099, "epoch": 0.1518112129866226, "flos": 18406553512320.0, "grad_norm": 1.8255227790100423, "language_loss": 0.74631184, "learning_rate": 3.8463587316769085e-06, "loss": 0.76826608, "num_input_tokens_seen": 54898245, "step": 2525, "time_per_iteration": 2.6936984062194824 }, { "auxiliary_loss_clip": 0.01144047, "auxiliary_loss_mlp": 0.01057009, "balance_loss_clip": 1.05403006, "balance_loss_mlp": 1.03747034, "epoch": 0.15187133623929056, "flos": 19425410789760.0, "grad_norm": 1.8907352833287865, "language_loss": 0.79600316, "learning_rate": 3.846208999506402e-06, "loss": 0.81801373, "num_input_tokens_seen": 54917060, "step": 2526, "time_per_iteration": 2.651494264602661 }, { "auxiliary_loss_clip": 0.01135228, "auxiliary_loss_mlp": 0.01047798, "balance_loss_clip": 1.05538774, "balance_loss_mlp": 1.03056002, "epoch": 0.15193145949195852, "flos": 17566207850880.0, "grad_norm": 1.7677336965262924, "language_loss": 0.8443349, "learning_rate": 3.846059197327466e-06, "loss": 0.86616516, "num_input_tokens_seen": 54936365, "step": 2527, "time_per_iteration": 2.702683448791504 }, { "auxiliary_loss_clip": 0.01124925, "auxiliary_loss_mlp": 0.01049207, "balance_loss_clip": 1.04976487, "balance_loss_mlp": 1.02985954, "epoch": 0.15199158274462649, "flos": 36176265866880.0, "grad_norm": 1.85678489681458, "language_loss": 0.69361663, "learning_rate": 3.845909325145779e-06, "loss": 0.7153579, "num_input_tokens_seen": 54961365, "step": 2528, "time_per_iteration": 2.9250690937042236 }, { "auxiliary_loss_clip": 0.01134092, "auxiliary_loss_mlp": 0.01055056, "balance_loss_clip": 1.05266535, "balance_loss_mlp": 1.03587484, "epoch": 0.15205170599729445, "flos": 23074042498560.0, "grad_norm": 2.004144148858156, "language_loss": 0.86482549, "learning_rate": 3.845759382967026e-06, "loss": 0.88671696, "num_input_tokens_seen": 54980750, "step": 2529, "time_per_iteration": 2.7277863025665283 }, { "auxiliary_loss_clip": 0.01124798, "auxiliary_loss_mlp": 0.01041651, "balance_loss_clip": 1.05046487, "balance_loss_mlp": 1.02297091, "epoch": 0.15211182924996242, "flos": 21908382336000.0, "grad_norm": 2.544775548600603, "language_loss": 0.83399373, "learning_rate": 3.845609370796893e-06, "loss": 0.85565823, "num_input_tokens_seen": 54999675, "step": 2530, "time_per_iteration": 2.8717291355133057 }, { "auxiliary_loss_clip": 0.01125761, "auxiliary_loss_mlp": 0.01048121, "balance_loss_clip": 1.05035281, "balance_loss_mlp": 1.02940559, "epoch": 0.15217195250263038, "flos": 13881521865600.0, "grad_norm": 2.1410437006568723, "language_loss": 0.80404246, "learning_rate": 3.845459288641066e-06, "loss": 0.82578129, "num_input_tokens_seen": 55018295, "step": 2531, "time_per_iteration": 2.8444995880126953 }, { "auxiliary_loss_clip": 0.01143114, "auxiliary_loss_mlp": 0.01043494, "balance_loss_clip": 1.05216551, "balance_loss_mlp": 1.02613723, "epoch": 0.15223207575529837, "flos": 24535319592960.0, "grad_norm": 1.7922494378130023, "language_loss": 0.78874445, "learning_rate": 3.8453091365052394e-06, "loss": 0.81061059, "num_input_tokens_seen": 55037975, "step": 2532, "time_per_iteration": 2.9122390747070312 }, { "auxiliary_loss_clip": 0.01149502, "auxiliary_loss_mlp": 0.0104596, "balance_loss_clip": 1.05737543, "balance_loss_mlp": 1.02676702, "epoch": 0.15229219900796634, "flos": 25556798563200.0, "grad_norm": 1.9533698136575197, "language_loss": 0.87679356, "learning_rate": 3.845158914395105e-06, "loss": 0.89874816, "num_input_tokens_seen": 55057135, "step": 2533, "time_per_iteration": 2.7987985610961914 }, { "auxiliary_loss_clip": 0.01117955, "auxiliary_loss_mlp": 0.01048672, "balance_loss_clip": 1.05235386, "balance_loss_mlp": 1.02983665, "epoch": 0.1523523222606343, "flos": 18217806520320.0, "grad_norm": 2.391026063452041, "language_loss": 0.78886449, "learning_rate": 3.84500862231636e-06, "loss": 0.81053078, "num_input_tokens_seen": 55075525, "step": 2534, "time_per_iteration": 2.7587406635284424 }, { "auxiliary_loss_clip": 0.01164218, "auxiliary_loss_mlp": 0.0104722, "balance_loss_clip": 1.05609345, "balance_loss_mlp": 1.0270381, "epoch": 0.15241244551330227, "flos": 13260087642240.0, "grad_norm": 2.689732363294508, "language_loss": 0.76809752, "learning_rate": 3.844858260274702e-06, "loss": 0.79021192, "num_input_tokens_seen": 55090845, "step": 2535, "time_per_iteration": 2.7494406700134277 }, { "auxiliary_loss_clip": 0.01142628, "auxiliary_loss_mlp": 0.01042905, "balance_loss_clip": 1.05345285, "balance_loss_mlp": 1.02401042, "epoch": 0.15247256876597023, "flos": 19715568854400.0, "grad_norm": 2.2235871255319446, "language_loss": 0.78301942, "learning_rate": 3.844707828275835e-06, "loss": 0.80487478, "num_input_tokens_seen": 55108750, "step": 2536, "time_per_iteration": 2.738638401031494 }, { "auxiliary_loss_clip": 0.01128919, "auxiliary_loss_mlp": 0.0105368, "balance_loss_clip": 1.05349088, "balance_loss_mlp": 1.03497589, "epoch": 0.1525326920186382, "flos": 20375858615040.0, "grad_norm": 2.311649941233105, "language_loss": 0.75824189, "learning_rate": 3.844557326325461e-06, "loss": 0.78006792, "num_input_tokens_seen": 55126750, "step": 2537, "time_per_iteration": 2.632373809814453 }, { "auxiliary_loss_clip": 0.0114911, "auxiliary_loss_mlp": 0.01041421, "balance_loss_clip": 1.05675745, "balance_loss_mlp": 1.02331281, "epoch": 0.15259281527130616, "flos": 13589963170560.0, "grad_norm": 2.193148723631548, "language_loss": 0.77737647, "learning_rate": 3.8444067544292896e-06, "loss": 0.79928178, "num_input_tokens_seen": 55144690, "step": 2538, "time_per_iteration": 2.6835639476776123 }, { "auxiliary_loss_clip": 0.01109367, "auxiliary_loss_mlp": 0.01042256, "balance_loss_clip": 1.05477905, "balance_loss_mlp": 1.02480412, "epoch": 0.15265293852397416, "flos": 22860374446080.0, "grad_norm": 2.951423477379744, "language_loss": 0.89502335, "learning_rate": 3.844256112593029e-06, "loss": 0.91653961, "num_input_tokens_seen": 55166055, "step": 2539, "time_per_iteration": 2.7825794219970703 }, { "auxiliary_loss_clip": 0.01142581, "auxiliary_loss_mlp": 0.01045856, "balance_loss_clip": 1.05367279, "balance_loss_mlp": 1.02721143, "epoch": 0.15271306177664212, "flos": 29238108670080.0, "grad_norm": 2.1073423273657044, "language_loss": 0.93423879, "learning_rate": 3.844105400822391e-06, "loss": 0.95612311, "num_input_tokens_seen": 55186285, "step": 2540, "time_per_iteration": 2.717541456222534 }, { "auxiliary_loss_clip": 0.01131603, "auxiliary_loss_mlp": 0.01041863, "balance_loss_clip": 1.05122495, "balance_loss_mlp": 1.0240885, "epoch": 0.1527731850293101, "flos": 31246269310080.0, "grad_norm": 2.084754505375857, "language_loss": 0.75217843, "learning_rate": 3.843954619123092e-06, "loss": 0.77391309, "num_input_tokens_seen": 55207915, "step": 2541, "time_per_iteration": 2.8376123905181885 }, { "auxiliary_loss_clip": 0.01116303, "auxiliary_loss_mlp": 0.01045227, "balance_loss_clip": 1.04877007, "balance_loss_mlp": 1.0268805, "epoch": 0.15283330828197805, "flos": 22382079920640.0, "grad_norm": 2.037290364787748, "language_loss": 0.80996066, "learning_rate": 3.84380376750085e-06, "loss": 0.83157599, "num_input_tokens_seen": 55227860, "step": 2542, "time_per_iteration": 2.7110376358032227 }, { "auxiliary_loss_clip": 0.01160331, "auxiliary_loss_mlp": 0.01048661, "balance_loss_clip": 1.0566076, "balance_loss_mlp": 1.02992105, "epoch": 0.15289343153464602, "flos": 25520133755520.0, "grad_norm": 3.2152362880248857, "language_loss": 0.77796149, "learning_rate": 3.843652845961383e-06, "loss": 0.80005145, "num_input_tokens_seen": 55247330, "step": 2543, "time_per_iteration": 2.674131155014038 }, { "auxiliary_loss_clip": 0.01145565, "auxiliary_loss_mlp": 0.01042133, "balance_loss_clip": 1.05380869, "balance_loss_mlp": 1.02388239, "epoch": 0.15295355478731398, "flos": 22710016114560.0, "grad_norm": 2.4890924021550918, "language_loss": 0.85898137, "learning_rate": 3.843501854510416e-06, "loss": 0.88085836, "num_input_tokens_seen": 55266195, "step": 2544, "time_per_iteration": 2.685840606689453 }, { "auxiliary_loss_clip": 0.01149904, "auxiliary_loss_mlp": 0.01051141, "balance_loss_clip": 1.05162692, "balance_loss_mlp": 1.03061318, "epoch": 0.15301367803998198, "flos": 23251907669760.0, "grad_norm": 1.9817931887295275, "language_loss": 0.83159137, "learning_rate": 3.843350793153673e-06, "loss": 0.85360181, "num_input_tokens_seen": 55283305, "step": 2545, "time_per_iteration": 2.7415812015533447 }, { "auxiliary_loss_clip": 0.01158976, "auxiliary_loss_mlp": 0.01040888, "balance_loss_clip": 1.05556524, "balance_loss_mlp": 1.02257705, "epoch": 0.15307380129264994, "flos": 25886279041920.0, "grad_norm": 6.0131413628182, "language_loss": 0.71669161, "learning_rate": 3.843199661896884e-06, "loss": 0.73869026, "num_input_tokens_seen": 55303035, "step": 2546, "time_per_iteration": 2.6626265048980713 }, { "auxiliary_loss_clip": 0.01130357, "auxiliary_loss_mlp": 0.01047635, "balance_loss_clip": 1.05013335, "balance_loss_mlp": 1.02688098, "epoch": 0.1531339245453179, "flos": 46973239205760.0, "grad_norm": 1.6563553629779504, "language_loss": 0.77438712, "learning_rate": 3.843048460745779e-06, "loss": 0.79616702, "num_input_tokens_seen": 55327570, "step": 2547, "time_per_iteration": 4.451423168182373 }, { "auxiliary_loss_clip": 0.01107553, "auxiliary_loss_mlp": 0.01044692, "balance_loss_clip": 1.04845536, "balance_loss_mlp": 1.02517736, "epoch": 0.15319404779798587, "flos": 35882049565440.0, "grad_norm": 2.3544675813743834, "language_loss": 0.74357474, "learning_rate": 3.842897189706092e-06, "loss": 0.7650972, "num_input_tokens_seen": 55351090, "step": 2548, "time_per_iteration": 2.846991539001465 }, { "auxiliary_loss_clip": 0.01138346, "auxiliary_loss_mlp": 0.0105294, "balance_loss_clip": 1.05340147, "balance_loss_mlp": 1.03304434, "epoch": 0.15325417105065384, "flos": 25664638170240.0, "grad_norm": 1.446042531021912, "language_loss": 0.80296385, "learning_rate": 3.842745848783558e-06, "loss": 0.82487667, "num_input_tokens_seen": 55371050, "step": 2549, "time_per_iteration": 5.8849101066589355 }, { "auxiliary_loss_clip": 0.01144858, "auxiliary_loss_mlp": 0.01041292, "balance_loss_clip": 1.05108786, "balance_loss_mlp": 1.02255249, "epoch": 0.1533142943033218, "flos": 18770831291520.0, "grad_norm": 1.6149920159034452, "language_loss": 0.74602014, "learning_rate": 3.842594437983917e-06, "loss": 0.76788169, "num_input_tokens_seen": 55390375, "step": 2550, "time_per_iteration": 2.684868812561035 }, { "auxiliary_loss_clip": 0.01149823, "auxiliary_loss_mlp": 0.01040743, "balance_loss_clip": 1.05212283, "balance_loss_mlp": 1.02129996, "epoch": 0.15337441755598977, "flos": 23107367341440.0, "grad_norm": 2.33086854575276, "language_loss": 0.76910275, "learning_rate": 3.8424429573129115e-06, "loss": 0.79100841, "num_input_tokens_seen": 55408890, "step": 2551, "time_per_iteration": 4.415414333343506 }, { "auxiliary_loss_clip": 0.01086721, "auxiliary_loss_mlp": 0.01054065, "balance_loss_clip": 1.05333817, "balance_loss_mlp": 1.05116868, "epoch": 0.15343454080865776, "flos": 59861079227520.0, "grad_norm": 0.9493148205555214, "language_loss": 0.5665558, "learning_rate": 3.842291406776283e-06, "loss": 0.5879637, "num_input_tokens_seen": 55463815, "step": 2552, "time_per_iteration": 3.1105730533599854 }, { "auxiliary_loss_clip": 0.011128, "auxiliary_loss_mlp": 0.01039619, "balance_loss_clip": 1.05131924, "balance_loss_mlp": 1.0204618, "epoch": 0.15349466406132573, "flos": 11910887959680.0, "grad_norm": 2.183188616823757, "language_loss": 0.88550794, "learning_rate": 3.84213978637978e-06, "loss": 0.90703207, "num_input_tokens_seen": 55481050, "step": 2553, "time_per_iteration": 2.748298406600952 }, { "auxiliary_loss_clip": 0.01147024, "auxiliary_loss_mlp": 0.01042929, "balance_loss_clip": 1.05247378, "balance_loss_mlp": 1.0232954, "epoch": 0.1535547873139937, "flos": 24096922099200.0, "grad_norm": 1.8094820084348213, "language_loss": 0.7800495, "learning_rate": 3.841988096129152e-06, "loss": 0.80194902, "num_input_tokens_seen": 55500050, "step": 2554, "time_per_iteration": 2.6555569171905518 }, { "auxiliary_loss_clip": 0.01094445, "auxiliary_loss_mlp": 0.01053684, "balance_loss_clip": 1.04876757, "balance_loss_mlp": 1.03291798, "epoch": 0.15361491056666166, "flos": 17566459246080.0, "grad_norm": 2.372022486587551, "language_loss": 0.77472258, "learning_rate": 3.841836336030151e-06, "loss": 0.79620385, "num_input_tokens_seen": 55518125, "step": 2555, "time_per_iteration": 2.7507212162017822 }, { "auxiliary_loss_clip": 0.01129555, "auxiliary_loss_mlp": 0.01046723, "balance_loss_clip": 1.05400753, "balance_loss_mlp": 1.02873409, "epoch": 0.15367503381932962, "flos": 25046041121280.0, "grad_norm": 1.5517643759455655, "language_loss": 0.77453947, "learning_rate": 3.8416845060885305e-06, "loss": 0.79630232, "num_input_tokens_seen": 55540960, "step": 2556, "time_per_iteration": 2.7947654724121094 }, { "auxiliary_loss_clip": 0.01140725, "auxiliary_loss_mlp": 0.0077646, "balance_loss_clip": 1.05336452, "balance_loss_mlp": 1.00054574, "epoch": 0.15373515707199759, "flos": 21507332008320.0, "grad_norm": 1.8786460244833383, "language_loss": 0.90098578, "learning_rate": 3.84153260631005e-06, "loss": 0.92015761, "num_input_tokens_seen": 55559210, "step": 2557, "time_per_iteration": 2.702029228210449 }, { "auxiliary_loss_clip": 0.01137441, "auxiliary_loss_mlp": 0.01048546, "balance_loss_clip": 1.05146766, "balance_loss_mlp": 1.02862656, "epoch": 0.15379528032466555, "flos": 25994729180160.0, "grad_norm": 2.4046585493240102, "language_loss": 0.7092281, "learning_rate": 3.841380636700468e-06, "loss": 0.73108798, "num_input_tokens_seen": 55578925, "step": 2558, "time_per_iteration": 2.815653085708618 }, { "auxiliary_loss_clip": 0.01131603, "auxiliary_loss_mlp": 0.01045983, "balance_loss_clip": 1.04937947, "balance_loss_mlp": 1.02659965, "epoch": 0.15385540357733354, "flos": 19277315015040.0, "grad_norm": 2.1050139676488535, "language_loss": 0.92165422, "learning_rate": 3.841228597265548e-06, "loss": 0.94343007, "num_input_tokens_seen": 55597255, "step": 2559, "time_per_iteration": 2.7363967895507812 }, { "auxiliary_loss_clip": 0.011375, "auxiliary_loss_mlp": 0.01057878, "balance_loss_clip": 1.05492043, "balance_loss_mlp": 1.03711152, "epoch": 0.1539155268300015, "flos": 28549126920960.0, "grad_norm": 2.149412909113977, "language_loss": 0.63330692, "learning_rate": 3.841076488011055e-06, "loss": 0.65526068, "num_input_tokens_seen": 55619515, "step": 2560, "time_per_iteration": 2.811800003051758 }, { "auxiliary_loss_clip": 0.01132154, "auxiliary_loss_mlp": 0.01043974, "balance_loss_clip": 1.04914606, "balance_loss_mlp": 1.02416182, "epoch": 0.15397565008266947, "flos": 23547883737600.0, "grad_norm": 2.066473237183783, "language_loss": 0.88155699, "learning_rate": 3.8409243089427574e-06, "loss": 0.90331829, "num_input_tokens_seen": 55640050, "step": 2561, "time_per_iteration": 2.7991089820861816 }, { "auxiliary_loss_clip": 0.0114054, "auxiliary_loss_mlp": 0.01041879, "balance_loss_clip": 1.05085099, "balance_loss_mlp": 1.02380693, "epoch": 0.15403577333533744, "flos": 17129821518720.0, "grad_norm": 1.906051405357337, "language_loss": 0.83117974, "learning_rate": 3.840772060066425e-06, "loss": 0.85300398, "num_input_tokens_seen": 55658695, "step": 2562, "time_per_iteration": 2.6410810947418213 }, { "auxiliary_loss_clip": 0.01128756, "auxiliary_loss_mlp": 0.00778205, "balance_loss_clip": 1.04988563, "balance_loss_mlp": 1.00058532, "epoch": 0.1540958965880054, "flos": 17894503180800.0, "grad_norm": 2.3547297997270906, "language_loss": 0.74647415, "learning_rate": 3.840619741387832e-06, "loss": 0.76554382, "num_input_tokens_seen": 55676340, "step": 2563, "time_per_iteration": 2.6813745498657227 }, { "auxiliary_loss_clip": 0.01116857, "auxiliary_loss_mlp": 0.0104411, "balance_loss_clip": 1.05126941, "balance_loss_mlp": 1.02444029, "epoch": 0.15415601984067337, "flos": 32161057908480.0, "grad_norm": 2.842824767177756, "language_loss": 0.7609179, "learning_rate": 3.8404673529127534e-06, "loss": 0.78252757, "num_input_tokens_seen": 55698890, "step": 2564, "time_per_iteration": 2.832885265350342 }, { "auxiliary_loss_clip": 0.01133461, "auxiliary_loss_mlp": 0.01052887, "balance_loss_clip": 1.05174518, "balance_loss_mlp": 1.03443313, "epoch": 0.15421614309334136, "flos": 24024418496640.0, "grad_norm": 2.0125869911748575, "language_loss": 0.70960921, "learning_rate": 3.840314894646969e-06, "loss": 0.73147273, "num_input_tokens_seen": 55718535, "step": 2565, "time_per_iteration": 2.7352514266967773 }, { "auxiliary_loss_clip": 0.01137766, "auxiliary_loss_mlp": 0.01046908, "balance_loss_clip": 1.04731965, "balance_loss_mlp": 1.02787066, "epoch": 0.15427626634600933, "flos": 24386290064640.0, "grad_norm": 2.1021891280826965, "language_loss": 0.71605748, "learning_rate": 3.840162366596259e-06, "loss": 0.73790431, "num_input_tokens_seen": 55738970, "step": 2566, "time_per_iteration": 2.681710720062256 }, { "auxiliary_loss_clip": 0.01150619, "auxiliary_loss_mlp": 0.01040725, "balance_loss_clip": 1.04834008, "balance_loss_mlp": 1.02271223, "epoch": 0.1543363895986773, "flos": 23331522165120.0, "grad_norm": 1.7167104030167524, "language_loss": 0.84746087, "learning_rate": 3.840009768766408e-06, "loss": 0.86937428, "num_input_tokens_seen": 55759585, "step": 2567, "time_per_iteration": 2.6413686275482178 }, { "auxiliary_loss_clip": 0.01104646, "auxiliary_loss_mlp": 0.01050344, "balance_loss_clip": 1.04447246, "balance_loss_mlp": 1.03164053, "epoch": 0.15439651285134526, "flos": 24274284480000.0, "grad_norm": 2.9101336164483014, "language_loss": 0.78074998, "learning_rate": 3.839857101163202e-06, "loss": 0.80229992, "num_input_tokens_seen": 55779250, "step": 2568, "time_per_iteration": 2.7385261058807373 }, { "auxiliary_loss_clip": 0.01121993, "auxiliary_loss_mlp": 0.01037084, "balance_loss_clip": 1.04715753, "balance_loss_mlp": 1.01684201, "epoch": 0.15445663610401322, "flos": 22456163721600.0, "grad_norm": 1.852436867559063, "language_loss": 0.6991998, "learning_rate": 3.83970436379243e-06, "loss": 0.72079051, "num_input_tokens_seen": 55800470, "step": 2569, "time_per_iteration": 2.746974229812622 }, { "auxiliary_loss_clip": 0.01124209, "auxiliary_loss_mlp": 0.01040299, "balance_loss_clip": 1.04695952, "balance_loss_mlp": 1.02178574, "epoch": 0.1545167593566812, "flos": 22049510872320.0, "grad_norm": 1.7212875994527412, "language_loss": 0.76482332, "learning_rate": 3.839551556659884e-06, "loss": 0.78646845, "num_input_tokens_seen": 55817795, "step": 2570, "time_per_iteration": 2.7470619678497314 }, { "auxiliary_loss_clip": 0.01137702, "auxiliary_loss_mlp": 0.01038561, "balance_loss_clip": 1.04993737, "balance_loss_mlp": 1.0192852, "epoch": 0.15457688260934915, "flos": 19318253541120.0, "grad_norm": 2.5033166184578066, "language_loss": 0.77997506, "learning_rate": 3.839398679771359e-06, "loss": 0.80173767, "num_input_tokens_seen": 55836125, "step": 2571, "time_per_iteration": 2.692863702774048 }, { "auxiliary_loss_clip": 0.0113208, "auxiliary_loss_mlp": 0.0104519, "balance_loss_clip": 1.0498451, "balance_loss_mlp": 1.02704597, "epoch": 0.15463700586201715, "flos": 24133981956480.0, "grad_norm": 4.3242380509309015, "language_loss": 0.82932413, "learning_rate": 3.839245733132652e-06, "loss": 0.85109681, "num_input_tokens_seen": 55855280, "step": 2572, "time_per_iteration": 2.8341822624206543 }, { "auxiliary_loss_clip": 0.01156188, "auxiliary_loss_mlp": 0.01042592, "balance_loss_clip": 1.05181205, "balance_loss_mlp": 1.02383995, "epoch": 0.1546971291146851, "flos": 22420935457920.0, "grad_norm": 1.5874704718869805, "language_loss": 0.90373385, "learning_rate": 3.839092716749563e-06, "loss": 0.92572165, "num_input_tokens_seen": 55875695, "step": 2573, "time_per_iteration": 2.740121364593506 }, { "auxiliary_loss_clip": 0.01088424, "auxiliary_loss_mlp": 0.01049893, "balance_loss_clip": 1.04328668, "balance_loss_mlp": 1.03003311, "epoch": 0.15475725236735308, "flos": 17530225401600.0, "grad_norm": 1.596795561637076, "language_loss": 0.70298707, "learning_rate": 3.838939630627893e-06, "loss": 0.72437024, "num_input_tokens_seen": 55894575, "step": 2574, "time_per_iteration": 2.7629144191741943 }, { "auxiliary_loss_clip": 0.01127537, "auxiliary_loss_mlp": 0.01045732, "balance_loss_clip": 1.04714394, "balance_loss_mlp": 1.02509642, "epoch": 0.15481737562002104, "flos": 22561740771840.0, "grad_norm": 6.018921028505516, "language_loss": 0.82426423, "learning_rate": 3.838786474773448e-06, "loss": 0.84599686, "num_input_tokens_seen": 55912855, "step": 2575, "time_per_iteration": 2.656783103942871 }, { "auxiliary_loss_clip": 0.01127415, "auxiliary_loss_mlp": 0.01043354, "balance_loss_clip": 1.04681587, "balance_loss_mlp": 1.02584219, "epoch": 0.154877498872689, "flos": 24900567039360.0, "grad_norm": 1.8376318938002576, "language_loss": 0.85038638, "learning_rate": 3.838633249192036e-06, "loss": 0.87209404, "num_input_tokens_seen": 55932375, "step": 2576, "time_per_iteration": 2.648484230041504 }, { "auxiliary_loss_clip": 0.01152547, "auxiliary_loss_mlp": 0.01043401, "balance_loss_clip": 1.04872847, "balance_loss_mlp": 1.02499545, "epoch": 0.15493762212535697, "flos": 28147501975680.0, "grad_norm": 1.8027999188827728, "language_loss": 0.82271254, "learning_rate": 3.838479953889465e-06, "loss": 0.84467208, "num_input_tokens_seen": 55953970, "step": 2577, "time_per_iteration": 2.6355643272399902 }, { "auxiliary_loss_clip": 0.01126009, "auxiliary_loss_mlp": 0.01049018, "balance_loss_clip": 1.05147958, "balance_loss_mlp": 1.02984881, "epoch": 0.15499774537802496, "flos": 25411073086080.0, "grad_norm": 2.1677069711314463, "language_loss": 0.76556361, "learning_rate": 3.8383265888715525e-06, "loss": 0.78731394, "num_input_tokens_seen": 55973120, "step": 2578, "time_per_iteration": 2.649043560028076 }, { "auxiliary_loss_clip": 0.01123677, "auxiliary_loss_mlp": 0.01044461, "balance_loss_clip": 1.05155993, "balance_loss_mlp": 1.0253042, "epoch": 0.15505786863069293, "flos": 22091562720000.0, "grad_norm": 1.9614380224881987, "language_loss": 0.82443559, "learning_rate": 3.83817315414411e-06, "loss": 0.8461169, "num_input_tokens_seen": 55993260, "step": 2579, "time_per_iteration": 2.62631893157959 }, { "auxiliary_loss_clip": 0.01143904, "auxiliary_loss_mlp": 0.01044324, "balance_loss_clip": 1.05856657, "balance_loss_mlp": 1.02556014, "epoch": 0.1551179918833609, "flos": 18917131386240.0, "grad_norm": 2.610374735790095, "language_loss": 0.80465376, "learning_rate": 3.838019649712958e-06, "loss": 0.82653606, "num_input_tokens_seen": 56012130, "step": 2580, "time_per_iteration": 2.6512253284454346 }, { "auxiliary_loss_clip": 0.0107737, "auxiliary_loss_mlp": 0.01006304, "balance_loss_clip": 1.04551053, "balance_loss_mlp": 1.00360954, "epoch": 0.15517811513602886, "flos": 66239172587520.0, "grad_norm": 0.842131683094019, "language_loss": 0.58823448, "learning_rate": 3.8378660755839166e-06, "loss": 0.60907125, "num_input_tokens_seen": 56079045, "step": 2581, "time_per_iteration": 3.357855796813965 }, { "auxiliary_loss_clip": 0.01108206, "auxiliary_loss_mlp": 0.01047031, "balance_loss_clip": 1.04392648, "balance_loss_mlp": 1.0249418, "epoch": 0.15523823838869683, "flos": 24021078531840.0, "grad_norm": 1.9584677228939371, "language_loss": 0.84773678, "learning_rate": 3.8377124317628095e-06, "loss": 0.86928916, "num_input_tokens_seen": 56098745, "step": 2582, "time_per_iteration": 2.727062702178955 }, { "auxiliary_loss_clip": 0.01144131, "auxiliary_loss_mlp": 0.01051911, "balance_loss_clip": 1.05233002, "balance_loss_mlp": 1.03175235, "epoch": 0.1552983616413648, "flos": 20485062938880.0, "grad_norm": 2.466663791870015, "language_loss": 0.79050052, "learning_rate": 3.8375587182554625e-06, "loss": 0.81246096, "num_input_tokens_seen": 56117655, "step": 2583, "time_per_iteration": 2.664794683456421 }, { "auxiliary_loss_clip": 0.01139818, "auxiliary_loss_mlp": 0.01054771, "balance_loss_clip": 1.04957032, "balance_loss_mlp": 1.03252697, "epoch": 0.15535848489403276, "flos": 32123710742400.0, "grad_norm": 1.8743170599575527, "language_loss": 0.76320136, "learning_rate": 3.837404935067705e-06, "loss": 0.78514719, "num_input_tokens_seen": 56141960, "step": 2584, "time_per_iteration": 2.757392168045044 }, { "auxiliary_loss_clip": 0.01137324, "auxiliary_loss_mlp": 0.01042496, "balance_loss_clip": 1.04884958, "balance_loss_mlp": 1.02302885, "epoch": 0.15541860814670075, "flos": 19098444263040.0, "grad_norm": 1.6493041410587026, "language_loss": 0.75269651, "learning_rate": 3.837251082205368e-06, "loss": 0.77449471, "num_input_tokens_seen": 56161430, "step": 2585, "time_per_iteration": 2.6497461795806885 }, { "auxiliary_loss_clip": 0.01116144, "auxiliary_loss_mlp": 0.01042356, "balance_loss_clip": 1.04862189, "balance_loss_mlp": 1.02321053, "epoch": 0.1554787313993687, "flos": 19172097100800.0, "grad_norm": 2.068989677221064, "language_loss": 0.61187196, "learning_rate": 3.837097159674286e-06, "loss": 0.63345695, "num_input_tokens_seen": 56179390, "step": 2586, "time_per_iteration": 2.697852373123169 }, { "auxiliary_loss_clip": 0.01129408, "auxiliary_loss_mlp": 0.01042187, "balance_loss_clip": 1.04842281, "balance_loss_mlp": 1.02341127, "epoch": 0.15553885465203668, "flos": 16143822207360.0, "grad_norm": 1.8484108176722505, "language_loss": 0.81318939, "learning_rate": 3.836943167480296e-06, "loss": 0.83490539, "num_input_tokens_seen": 56198020, "step": 2587, "time_per_iteration": 4.212551593780518 }, { "auxiliary_loss_clip": 0.01160891, "auxiliary_loss_mlp": 0.01054822, "balance_loss_clip": 1.05309868, "balance_loss_mlp": 1.03325701, "epoch": 0.15559897790470464, "flos": 25337779384320.0, "grad_norm": 1.866779523391448, "language_loss": 0.88716942, "learning_rate": 3.836789105629236e-06, "loss": 0.90932655, "num_input_tokens_seen": 56218165, "step": 2588, "time_per_iteration": 4.192267894744873 }, { "auxiliary_loss_clip": 0.01094981, "auxiliary_loss_mlp": 0.01052123, "balance_loss_clip": 1.04558384, "balance_loss_mlp": 1.03164268, "epoch": 0.1556591011573726, "flos": 23148772744320.0, "grad_norm": 2.018423224363699, "language_loss": 0.64624381, "learning_rate": 3.83663497412695e-06, "loss": 0.66771483, "num_input_tokens_seen": 56237160, "step": 2589, "time_per_iteration": 4.303871154785156 }, { "auxiliary_loss_clip": 0.01104407, "auxiliary_loss_mlp": 0.01041976, "balance_loss_clip": 1.04520249, "balance_loss_mlp": 1.02123344, "epoch": 0.15571922441004057, "flos": 25370888745600.0, "grad_norm": 1.784618480549341, "language_loss": 0.82832813, "learning_rate": 3.836480772979281e-06, "loss": 0.84979194, "num_input_tokens_seen": 56257610, "step": 2590, "time_per_iteration": 4.460350751876831 }, { "auxiliary_loss_clip": 0.011248, "auxiliary_loss_mlp": 0.01047287, "balance_loss_clip": 1.05032134, "balance_loss_mlp": 1.02694952, "epoch": 0.15577934766270854, "flos": 14501375890560.0, "grad_norm": 2.6687659077907484, "language_loss": 0.78766, "learning_rate": 3.836326502192077e-06, "loss": 0.80938083, "num_input_tokens_seen": 56275215, "step": 2591, "time_per_iteration": 2.73305606842041 }, { "auxiliary_loss_clip": 0.01143879, "auxiliary_loss_mlp": 0.01049015, "balance_loss_clip": 1.05174232, "balance_loss_mlp": 1.03137255, "epoch": 0.15583947091537653, "flos": 37414537372800.0, "grad_norm": 2.0331558547393054, "language_loss": 0.65025747, "learning_rate": 3.836172161771189e-06, "loss": 0.67218637, "num_input_tokens_seen": 56297130, "step": 2592, "time_per_iteration": 2.8582632541656494 }, { "auxiliary_loss_clip": 0.01136043, "auxiliary_loss_mlp": 0.01052096, "balance_loss_clip": 1.05417228, "balance_loss_mlp": 1.0322001, "epoch": 0.1558995941680445, "flos": 21834729498240.0, "grad_norm": 2.311634250072179, "language_loss": 0.82506329, "learning_rate": 3.836017751722467e-06, "loss": 0.84694475, "num_input_tokens_seen": 56314995, "step": 2593, "time_per_iteration": 2.7230453491210938 }, { "auxiliary_loss_clip": 0.01142565, "auxiliary_loss_mlp": 0.01046037, "balance_loss_clip": 1.05237365, "balance_loss_mlp": 1.02676034, "epoch": 0.15595971742071246, "flos": 19792633484160.0, "grad_norm": 2.778410683125911, "language_loss": 0.73220694, "learning_rate": 3.8358632720517695e-06, "loss": 0.75409293, "num_input_tokens_seen": 56334005, "step": 2594, "time_per_iteration": 2.708063840866089 }, { "auxiliary_loss_clip": 0.01117989, "auxiliary_loss_mlp": 0.01040106, "balance_loss_clip": 1.0453043, "balance_loss_mlp": 1.02077007, "epoch": 0.15601984067338043, "flos": 26722135503360.0, "grad_norm": 2.1444704922101105, "language_loss": 0.81569934, "learning_rate": 3.835708722764952e-06, "loss": 0.83728027, "num_input_tokens_seen": 56353795, "step": 2595, "time_per_iteration": 2.716334581375122 }, { "auxiliary_loss_clip": 0.01155359, "auxiliary_loss_mlp": 0.01043269, "balance_loss_clip": 1.05093551, "balance_loss_mlp": 1.0238502, "epoch": 0.1560799639260484, "flos": 18369278173440.0, "grad_norm": 1.8943501893042642, "language_loss": 0.86674929, "learning_rate": 3.835554103867876e-06, "loss": 0.88873553, "num_input_tokens_seen": 56373195, "step": 2596, "time_per_iteration": 2.5947446823120117 }, { "auxiliary_loss_clip": 0.01144729, "auxiliary_loss_mlp": 0.01042109, "balance_loss_clip": 1.05225515, "balance_loss_mlp": 1.02360725, "epoch": 0.15614008717871636, "flos": 22598980197120.0, "grad_norm": 1.8059460934517404, "language_loss": 0.68772388, "learning_rate": 3.835399415366404e-06, "loss": 0.70959222, "num_input_tokens_seen": 56391525, "step": 2597, "time_per_iteration": 2.8101041316986084 }, { "auxiliary_loss_clip": 0.01130069, "auxiliary_loss_mlp": 0.01050835, "balance_loss_clip": 1.05409336, "balance_loss_mlp": 1.03165436, "epoch": 0.15620021043138435, "flos": 22746860490240.0, "grad_norm": 1.9103744906429732, "language_loss": 0.79860938, "learning_rate": 3.8352446572664035e-06, "loss": 0.82041842, "num_input_tokens_seen": 56410715, "step": 2598, "time_per_iteration": 2.695117950439453 }, { "auxiliary_loss_clip": 0.0112861, "auxiliary_loss_mlp": 0.00776118, "balance_loss_clip": 1.04750216, "balance_loss_mlp": 1.0006249, "epoch": 0.15626033368405232, "flos": 13114936782720.0, "grad_norm": 3.1104681024188827, "language_loss": 0.83092594, "learning_rate": 3.8350898295737405e-06, "loss": 0.84997326, "num_input_tokens_seen": 56429170, "step": 2599, "time_per_iteration": 2.665703773498535 }, { "auxiliary_loss_clip": 0.01160593, "auxiliary_loss_mlp": 0.0105002, "balance_loss_clip": 1.05274248, "balance_loss_mlp": 1.02924192, "epoch": 0.15632045693672028, "flos": 16472297105280.0, "grad_norm": 2.2910683048406266, "language_loss": 0.81530893, "learning_rate": 3.834934932294287e-06, "loss": 0.83741504, "num_input_tokens_seen": 56445685, "step": 2600, "time_per_iteration": 2.615651845932007 }, { "auxiliary_loss_clip": 0.01161023, "auxiliary_loss_mlp": 0.00776671, "balance_loss_clip": 1.05562234, "balance_loss_mlp": 1.00063944, "epoch": 0.15638058018938825, "flos": 20850346298880.0, "grad_norm": 1.7832591469657297, "language_loss": 0.88511437, "learning_rate": 3.834779965433917e-06, "loss": 0.90449131, "num_input_tokens_seen": 56465900, "step": 2601, "time_per_iteration": 2.6833529472351074 }, { "auxiliary_loss_clip": 0.0116257, "auxiliary_loss_mlp": 0.0106307, "balance_loss_clip": 1.05569744, "balance_loss_mlp": 1.04120743, "epoch": 0.1564407034420562, "flos": 21872220318720.0, "grad_norm": 1.9421054688538308, "language_loss": 0.78707534, "learning_rate": 3.834624928998508e-06, "loss": 0.80933177, "num_input_tokens_seen": 56485020, "step": 2602, "time_per_iteration": 2.6296608448028564 }, { "auxiliary_loss_clip": 0.01126653, "auxiliary_loss_mlp": 0.01043676, "balance_loss_clip": 1.05035329, "balance_loss_mlp": 1.02419758, "epoch": 0.15650082669472418, "flos": 21834549930240.0, "grad_norm": 1.8230718276715763, "language_loss": 0.74029547, "learning_rate": 3.8344698229939376e-06, "loss": 0.76199877, "num_input_tokens_seen": 56505205, "step": 2603, "time_per_iteration": 2.744508743286133 }, { "auxiliary_loss_clip": 0.01143305, "auxiliary_loss_mlp": 0.01051047, "balance_loss_clip": 1.04820418, "balance_loss_mlp": 1.03112721, "epoch": 0.15656094994739214, "flos": 13800542653440.0, "grad_norm": 4.041164356714064, "language_loss": 0.87723601, "learning_rate": 3.8343146474260865e-06, "loss": 0.89917958, "num_input_tokens_seen": 56521495, "step": 2604, "time_per_iteration": 2.682457447052002 }, { "auxiliary_loss_clip": 0.01145351, "auxiliary_loss_mlp": 0.01044759, "balance_loss_clip": 1.04976749, "balance_loss_mlp": 1.0256021, "epoch": 0.15662107320006013, "flos": 27308197808640.0, "grad_norm": 2.260429022209425, "language_loss": 0.8573193, "learning_rate": 3.834159402300841e-06, "loss": 0.87922043, "num_input_tokens_seen": 56540665, "step": 2605, "time_per_iteration": 2.7724974155426025 }, { "auxiliary_loss_clip": 0.0115108, "auxiliary_loss_mlp": 0.01047256, "balance_loss_clip": 1.05181313, "balance_loss_mlp": 1.02676356, "epoch": 0.1566811964527281, "flos": 26685075646080.0, "grad_norm": 1.7309636492693905, "language_loss": 0.73101914, "learning_rate": 3.834004087624087e-06, "loss": 0.75300246, "num_input_tokens_seen": 56560805, "step": 2606, "time_per_iteration": 2.7490081787109375 }, { "auxiliary_loss_clip": 0.01158388, "auxiliary_loss_mlp": 0.01049752, "balance_loss_clip": 1.0552665, "balance_loss_mlp": 1.03165627, "epoch": 0.15674131970539606, "flos": 16103422385280.0, "grad_norm": 2.968092109370304, "language_loss": 0.76497948, "learning_rate": 3.8338487034017145e-06, "loss": 0.78706092, "num_input_tokens_seen": 56576335, "step": 2607, "time_per_iteration": 2.6597230434417725 }, { "auxiliary_loss_clip": 0.01120645, "auxiliary_loss_mlp": 0.01047174, "balance_loss_clip": 1.05131412, "balance_loss_mlp": 1.0284934, "epoch": 0.15680144295806403, "flos": 19169690889600.0, "grad_norm": 1.7981763092074996, "language_loss": 0.82107675, "learning_rate": 3.833693249639615e-06, "loss": 0.84275496, "num_input_tokens_seen": 56595880, "step": 2608, "time_per_iteration": 2.7072103023529053 }, { "auxiliary_loss_clip": 0.0112834, "auxiliary_loss_mlp": 0.01045106, "balance_loss_clip": 1.04685056, "balance_loss_mlp": 1.02436399, "epoch": 0.156861566210732, "flos": 20813430096000.0, "grad_norm": 1.6817301031159713, "language_loss": 0.72335941, "learning_rate": 3.833537726343684e-06, "loss": 0.74509382, "num_input_tokens_seen": 56615130, "step": 2609, "time_per_iteration": 2.690690755844116 }, { "auxiliary_loss_clip": 0.01143972, "auxiliary_loss_mlp": 0.01036718, "balance_loss_clip": 1.04901087, "balance_loss_mlp": 1.01756072, "epoch": 0.15692168946339996, "flos": 20047922421120.0, "grad_norm": 5.132438477880424, "language_loss": 0.72317064, "learning_rate": 3.833382133519818e-06, "loss": 0.74497753, "num_input_tokens_seen": 56634005, "step": 2610, "time_per_iteration": 2.6515614986419678 }, { "auxiliary_loss_clip": 0.01159588, "auxiliary_loss_mlp": 0.01051513, "balance_loss_clip": 1.05216432, "balance_loss_mlp": 1.03063977, "epoch": 0.15698181271606793, "flos": 21398019943680.0, "grad_norm": 2.0600295188113935, "language_loss": 0.72915608, "learning_rate": 3.833226471173919e-06, "loss": 0.75126708, "num_input_tokens_seen": 56653480, "step": 2611, "time_per_iteration": 2.630988359451294 }, { "auxiliary_loss_clip": 0.01141924, "auxiliary_loss_mlp": 0.01042538, "balance_loss_clip": 1.04917872, "balance_loss_mlp": 1.0231905, "epoch": 0.15704193596873592, "flos": 20845785271680.0, "grad_norm": 2.0339762399532186, "language_loss": 0.70766544, "learning_rate": 3.833070739311887e-06, "loss": 0.72951007, "num_input_tokens_seen": 56672270, "step": 2612, "time_per_iteration": 2.6569461822509766 }, { "auxiliary_loss_clip": 0.01116284, "auxiliary_loss_mlp": 0.01051299, "balance_loss_clip": 1.04844582, "balance_loss_mlp": 1.03221321, "epoch": 0.15710205922140388, "flos": 21762908254080.0, "grad_norm": 1.9704781930994688, "language_loss": 0.76294881, "learning_rate": 3.83291493793963e-06, "loss": 0.78462464, "num_input_tokens_seen": 56691510, "step": 2613, "time_per_iteration": 2.7188539505004883 }, { "auxiliary_loss_clip": 0.01115155, "auxiliary_loss_mlp": 0.01049301, "balance_loss_clip": 1.04504919, "balance_loss_mlp": 1.02956033, "epoch": 0.15716218247407185, "flos": 25007760201600.0, "grad_norm": 2.137998057111896, "language_loss": 0.65944499, "learning_rate": 3.832759067063055e-06, "loss": 0.68108952, "num_input_tokens_seen": 56712230, "step": 2614, "time_per_iteration": 2.7550084590911865 }, { "auxiliary_loss_clip": 0.01151987, "auxiliary_loss_mlp": 0.01044173, "balance_loss_clip": 1.05387104, "balance_loss_mlp": 1.02374101, "epoch": 0.1572223057267398, "flos": 20191780391040.0, "grad_norm": 2.2755662506820915, "language_loss": 0.75204211, "learning_rate": 3.832603126688072e-06, "loss": 0.77400374, "num_input_tokens_seen": 56727490, "step": 2615, "time_per_iteration": 2.683225154876709 }, { "auxiliary_loss_clip": 0.01138545, "auxiliary_loss_mlp": 0.01050891, "balance_loss_clip": 1.05209839, "balance_loss_mlp": 1.03078008, "epoch": 0.15728242897940778, "flos": 20959514709120.0, "grad_norm": 2.581872009488739, "language_loss": 0.73064095, "learning_rate": 3.832447116820594e-06, "loss": 0.75253528, "num_input_tokens_seen": 56747385, "step": 2616, "time_per_iteration": 2.6660919189453125 }, { "auxiliary_loss_clip": 0.01130717, "auxiliary_loss_mlp": 0.01047511, "balance_loss_clip": 1.04999971, "balance_loss_mlp": 1.02794933, "epoch": 0.15734255223207574, "flos": 23038275530880.0, "grad_norm": 2.813587490853999, "language_loss": 0.72425079, "learning_rate": 3.832291037466539e-06, "loss": 0.74603307, "num_input_tokens_seen": 56768055, "step": 2617, "time_per_iteration": 2.768561363220215 }, { "auxiliary_loss_clip": 0.01138315, "auxiliary_loss_mlp": 0.0104637, "balance_loss_clip": 1.04947805, "balance_loss_mlp": 1.02548432, "epoch": 0.15740267548474374, "flos": 20551281661440.0, "grad_norm": 2.3222819484870016, "language_loss": 0.74358094, "learning_rate": 3.8321348886318235e-06, "loss": 0.76542777, "num_input_tokens_seen": 56785110, "step": 2618, "time_per_iteration": 2.66121768951416 }, { "auxiliary_loss_clip": 0.01162954, "auxiliary_loss_mlp": 0.01046178, "balance_loss_clip": 1.05417252, "balance_loss_mlp": 1.02526867, "epoch": 0.1574627987374117, "flos": 22666922772480.0, "grad_norm": 1.8808629075569874, "language_loss": 0.78896272, "learning_rate": 3.8319786703223695e-06, "loss": 0.81105405, "num_input_tokens_seen": 56804975, "step": 2619, "time_per_iteration": 2.6743338108062744 }, { "auxiliary_loss_clip": 0.01126081, "auxiliary_loss_mlp": 0.01055551, "balance_loss_clip": 1.05046356, "balance_loss_mlp": 1.03576207, "epoch": 0.15752292199007967, "flos": 16800664262400.0, "grad_norm": 1.9082963728737496, "language_loss": 0.76517296, "learning_rate": 3.831822382544101e-06, "loss": 0.78698927, "num_input_tokens_seen": 56822470, "step": 2620, "time_per_iteration": 2.6481080055236816 }, { "auxiliary_loss_clip": 0.01136128, "auxiliary_loss_mlp": 0.0104575, "balance_loss_clip": 1.05097985, "balance_loss_mlp": 1.02488887, "epoch": 0.15758304524274763, "flos": 29826002568960.0, "grad_norm": 1.6603432400664486, "language_loss": 0.7136035, "learning_rate": 3.831666025302944e-06, "loss": 0.73542225, "num_input_tokens_seen": 56842100, "step": 2621, "time_per_iteration": 2.70985746383667 }, { "auxiliary_loss_clip": 0.01103274, "auxiliary_loss_mlp": 0.01052522, "balance_loss_clip": 1.04624665, "balance_loss_mlp": 1.02921629, "epoch": 0.1576431684954156, "flos": 53577426723840.0, "grad_norm": 2.1843515622778624, "language_loss": 0.72136736, "learning_rate": 3.831509598604828e-06, "loss": 0.74292529, "num_input_tokens_seen": 56865920, "step": 2622, "time_per_iteration": 3.024561643600464 }, { "auxiliary_loss_clip": 0.01095163, "auxiliary_loss_mlp": 0.01043948, "balance_loss_clip": 1.04474711, "balance_loss_mlp": 1.02464843, "epoch": 0.15770329174808356, "flos": 20813609664000.0, "grad_norm": 1.6586715789846178, "language_loss": 0.87637675, "learning_rate": 3.831353102455684e-06, "loss": 0.8977679, "num_input_tokens_seen": 56885265, "step": 2623, "time_per_iteration": 2.9600114822387695 }, { "auxiliary_loss_clip": 0.01158714, "auxiliary_loss_mlp": 0.01044337, "balance_loss_clip": 1.05476475, "balance_loss_mlp": 1.02564478, "epoch": 0.15776341500075153, "flos": 24974004395520.0, "grad_norm": 1.6915331173398198, "language_loss": 0.81600082, "learning_rate": 3.831196536861448e-06, "loss": 0.83803129, "num_input_tokens_seen": 56906710, "step": 2624, "time_per_iteration": 2.6621103286743164 }, { "auxiliary_loss_clip": 0.01122344, "auxiliary_loss_mlp": 0.01049423, "balance_loss_clip": 1.04776418, "balance_loss_mlp": 1.02990842, "epoch": 0.15782353825341952, "flos": 21907915459200.0, "grad_norm": 2.879465237309773, "language_loss": 0.79977828, "learning_rate": 3.831039901828054e-06, "loss": 0.82149595, "num_input_tokens_seen": 56924275, "step": 2625, "time_per_iteration": 2.7291064262390137 }, { "auxiliary_loss_clip": 0.01157938, "auxiliary_loss_mlp": 0.01046203, "balance_loss_clip": 1.05403268, "balance_loss_mlp": 1.02857196, "epoch": 0.15788366150608749, "flos": 26177191292160.0, "grad_norm": 2.133783972400447, "language_loss": 0.80332482, "learning_rate": 3.830883197361445e-06, "loss": 0.8253662, "num_input_tokens_seen": 56941525, "step": 2626, "time_per_iteration": 4.252760171890259 }, { "auxiliary_loss_clip": 0.01102762, "auxiliary_loss_mlp": 0.01057658, "balance_loss_clip": 1.05214024, "balance_loss_mlp": 1.03512752, "epoch": 0.15794378475875545, "flos": 27709822753920.0, "grad_norm": 3.9802810067864045, "language_loss": 0.73636395, "learning_rate": 3.830726423467561e-06, "loss": 0.75796819, "num_input_tokens_seen": 56962145, "step": 2627, "time_per_iteration": 4.328871250152588 }, { "auxiliary_loss_clip": 0.01117433, "auxiliary_loss_mlp": 0.01055032, "balance_loss_clip": 1.0503006, "balance_loss_mlp": 1.0351001, "epoch": 0.15800390801142342, "flos": 12130158533760.0, "grad_norm": 2.0211273696228216, "language_loss": 0.84589541, "learning_rate": 3.830569580152348e-06, "loss": 0.86762005, "num_input_tokens_seen": 56977505, "step": 2628, "time_per_iteration": 2.6785013675689697 }, { "auxiliary_loss_clip": 0.01129476, "auxiliary_loss_mlp": 0.01040858, "balance_loss_clip": 1.05065978, "balance_loss_mlp": 1.02308416, "epoch": 0.15806403126409138, "flos": 20704728562560.0, "grad_norm": 1.897214582222077, "language_loss": 0.76437485, "learning_rate": 3.830412667421752e-06, "loss": 0.78607821, "num_input_tokens_seen": 56996770, "step": 2629, "time_per_iteration": 4.2878499031066895 }, { "auxiliary_loss_clip": 0.01143973, "auxiliary_loss_mlp": 0.01046449, "balance_loss_clip": 1.0529623, "balance_loss_mlp": 1.02675569, "epoch": 0.15812415451675935, "flos": 17821712269440.0, "grad_norm": 2.252423233454998, "language_loss": 0.73337436, "learning_rate": 3.8302556852817245e-06, "loss": 0.75527859, "num_input_tokens_seen": 57014970, "step": 2630, "time_per_iteration": 4.253108263015747 }, { "auxiliary_loss_clip": 0.01156261, "auxiliary_loss_mlp": 0.01045602, "balance_loss_clip": 1.05644512, "balance_loss_mlp": 1.02615929, "epoch": 0.15818427776942734, "flos": 20084048524800.0, "grad_norm": 2.390369083551665, "language_loss": 0.83678091, "learning_rate": 3.8300986337382184e-06, "loss": 0.85879952, "num_input_tokens_seen": 57034045, "step": 2631, "time_per_iteration": 2.6145882606506348 }, { "auxiliary_loss_clip": 0.01159092, "auxiliary_loss_mlp": 0.01045772, "balance_loss_clip": 1.05313432, "balance_loss_mlp": 1.02746117, "epoch": 0.1582444010220953, "flos": 21214911386880.0, "grad_norm": 1.8755653224160422, "language_loss": 0.78415525, "learning_rate": 3.8299415127971895e-06, "loss": 0.80620384, "num_input_tokens_seen": 57053695, "step": 2632, "time_per_iteration": 2.656691551208496 }, { "auxiliary_loss_clip": 0.01151481, "auxiliary_loss_mlp": 0.01057283, "balance_loss_clip": 1.05574381, "balance_loss_mlp": 1.03769732, "epoch": 0.15830452427476327, "flos": 17858341163520.0, "grad_norm": 2.079450153413421, "language_loss": 0.8301838, "learning_rate": 3.829784322464594e-06, "loss": 0.85227144, "num_input_tokens_seen": 57071290, "step": 2633, "time_per_iteration": 2.622725248336792 }, { "auxiliary_loss_clip": 0.01165069, "auxiliary_loss_mlp": 0.01041545, "balance_loss_clip": 1.05761647, "balance_loss_mlp": 1.02223265, "epoch": 0.15836464752743123, "flos": 24534960456960.0, "grad_norm": 2.1719104392782813, "language_loss": 0.77448404, "learning_rate": 3.829627062746394e-06, "loss": 0.79655015, "num_input_tokens_seen": 57091465, "step": 2634, "time_per_iteration": 2.6383235454559326 }, { "auxiliary_loss_clip": 0.01127407, "auxiliary_loss_mlp": 0.00777775, "balance_loss_clip": 1.05277348, "balance_loss_mlp": 1.00136137, "epoch": 0.1584247707800992, "flos": 20120821073280.0, "grad_norm": 3.5133527254089087, "language_loss": 0.88479185, "learning_rate": 3.829469733648552e-06, "loss": 0.90384364, "num_input_tokens_seen": 57110075, "step": 2635, "time_per_iteration": 2.725924491882324 }, { "auxiliary_loss_clip": 0.01096223, "auxiliary_loss_mlp": 0.01058885, "balance_loss_clip": 1.04816198, "balance_loss_mlp": 1.03847599, "epoch": 0.15848489403276717, "flos": 20375966355840.0, "grad_norm": 2.8627721083207627, "language_loss": 0.75762677, "learning_rate": 3.829312335177034e-06, "loss": 0.77917778, "num_input_tokens_seen": 57128945, "step": 2636, "time_per_iteration": 2.775310516357422 }, { "auxiliary_loss_clip": 0.01120174, "auxiliary_loss_mlp": 0.01043834, "balance_loss_clip": 1.05117822, "balance_loss_mlp": 1.02350879, "epoch": 0.15854501728543513, "flos": 39346890359040.0, "grad_norm": 2.388418559522659, "language_loss": 0.71977961, "learning_rate": 3.82915486733781e-06, "loss": 0.74141967, "num_input_tokens_seen": 57152385, "step": 2637, "time_per_iteration": 2.8375279903411865 }, { "auxiliary_loss_clip": 0.0115052, "auxiliary_loss_mlp": 0.01044842, "balance_loss_clip": 1.05661607, "balance_loss_mlp": 1.02640057, "epoch": 0.15860514053810312, "flos": 24864225454080.0, "grad_norm": 2.1640345554565057, "language_loss": 0.78352648, "learning_rate": 3.82899733013685e-06, "loss": 0.80548006, "num_input_tokens_seen": 57172620, "step": 2638, "time_per_iteration": 2.7298176288604736 }, { "auxiliary_loss_clip": 0.01129706, "auxiliary_loss_mlp": 0.01057375, "balance_loss_clip": 1.05311394, "balance_loss_mlp": 1.03715718, "epoch": 0.1586652637907711, "flos": 26177694082560.0, "grad_norm": 2.325769963269074, "language_loss": 0.75845039, "learning_rate": 3.828839723580128e-06, "loss": 0.78032124, "num_input_tokens_seen": 57194680, "step": 2639, "time_per_iteration": 2.7731449604034424 }, { "auxiliary_loss_clip": 0.01104856, "auxiliary_loss_mlp": 0.01057283, "balance_loss_clip": 1.05350864, "balance_loss_mlp": 1.03772068, "epoch": 0.15872538704343905, "flos": 19792058866560.0, "grad_norm": 2.173238447343554, "language_loss": 0.81319505, "learning_rate": 3.82868204767362e-06, "loss": 0.83481646, "num_input_tokens_seen": 57214675, "step": 2640, "time_per_iteration": 2.8024139404296875 }, { "auxiliary_loss_clip": 0.01135166, "auxiliary_loss_mlp": 0.01054673, "balance_loss_clip": 1.05492401, "balance_loss_mlp": 1.03426492, "epoch": 0.15878551029610702, "flos": 28475366342400.0, "grad_norm": 2.013499020988034, "language_loss": 0.66893363, "learning_rate": 3.828524302423306e-06, "loss": 0.69083202, "num_input_tokens_seen": 57235830, "step": 2641, "time_per_iteration": 2.7519116401672363 }, { "auxiliary_loss_clip": 0.01149448, "auxiliary_loss_mlp": 0.01051949, "balance_loss_clip": 1.05758858, "balance_loss_mlp": 1.0326376, "epoch": 0.15884563354877498, "flos": 24206701040640.0, "grad_norm": 2.139760259286454, "language_loss": 0.7552591, "learning_rate": 3.828366487835167e-06, "loss": 0.77727306, "num_input_tokens_seen": 57255970, "step": 2642, "time_per_iteration": 2.706136465072632 }, { "auxiliary_loss_clip": 0.01156917, "auxiliary_loss_mlp": 0.01042142, "balance_loss_clip": 1.06263423, "balance_loss_mlp": 1.02323556, "epoch": 0.15890575680144295, "flos": 23949795991680.0, "grad_norm": 1.9419610036505286, "language_loss": 0.70564604, "learning_rate": 3.828208603915186e-06, "loss": 0.72763658, "num_input_tokens_seen": 57274435, "step": 2643, "time_per_iteration": 2.682015895843506 }, { "auxiliary_loss_clip": 0.01161783, "auxiliary_loss_mlp": 0.01041643, "balance_loss_clip": 1.05891204, "balance_loss_mlp": 1.02389312, "epoch": 0.15896588005411091, "flos": 21215019127680.0, "grad_norm": 1.846517711414915, "language_loss": 0.78057045, "learning_rate": 3.828050650669353e-06, "loss": 0.80260473, "num_input_tokens_seen": 57293115, "step": 2644, "time_per_iteration": 2.683790922164917 }, { "auxiliary_loss_clip": 0.01151239, "auxiliary_loss_mlp": 0.01050105, "balance_loss_clip": 1.05701637, "balance_loss_mlp": 1.03154373, "epoch": 0.1590260033067789, "flos": 24352390604160.0, "grad_norm": 3.757920662841351, "language_loss": 0.81961924, "learning_rate": 3.827892628103657e-06, "loss": 0.84163266, "num_input_tokens_seen": 57312565, "step": 2645, "time_per_iteration": 2.698085069656372 }, { "auxiliary_loss_clip": 0.01162748, "auxiliary_loss_mlp": 0.01048492, "balance_loss_clip": 1.05487716, "balance_loss_mlp": 1.02854836, "epoch": 0.15908612655944687, "flos": 32048944583040.0, "grad_norm": 2.056693785790565, "language_loss": 0.69412929, "learning_rate": 3.827734536224087e-06, "loss": 0.71624172, "num_input_tokens_seen": 57333360, "step": 2646, "time_per_iteration": 2.7166528701782227 }, { "auxiliary_loss_clip": 0.01135067, "auxiliary_loss_mlp": 0.01040314, "balance_loss_clip": 1.05435526, "balance_loss_mlp": 1.02223015, "epoch": 0.15914624981211484, "flos": 17785370684160.0, "grad_norm": 2.5975497323405055, "language_loss": 0.62932581, "learning_rate": 3.827576375036642e-06, "loss": 0.65107965, "num_input_tokens_seen": 57350575, "step": 2647, "time_per_iteration": 2.7405354976654053 }, { "auxiliary_loss_clip": 0.01160144, "auxiliary_loss_mlp": 0.01047955, "balance_loss_clip": 1.05654776, "balance_loss_mlp": 1.02896523, "epoch": 0.1592063730647828, "flos": 17712507945600.0, "grad_norm": 2.2161421076431025, "language_loss": 0.89490473, "learning_rate": 3.827418144547318e-06, "loss": 0.91698575, "num_input_tokens_seen": 57367570, "step": 2648, "time_per_iteration": 2.6193346977233887 }, { "auxiliary_loss_clip": 0.01158791, "auxiliary_loss_mlp": 0.01048086, "balance_loss_clip": 1.05630398, "balance_loss_mlp": 1.03072906, "epoch": 0.15926649631745077, "flos": 18803545603200.0, "grad_norm": 1.9960039108301237, "language_loss": 0.91307199, "learning_rate": 3.827259844762114e-06, "loss": 0.93514073, "num_input_tokens_seen": 57383980, "step": 2649, "time_per_iteration": 2.6137378215789795 }, { "auxiliary_loss_clip": 0.01099661, "auxiliary_loss_mlp": 0.01044384, "balance_loss_clip": 1.05474401, "balance_loss_mlp": 1.02439272, "epoch": 0.15932661957011873, "flos": 17566243764480.0, "grad_norm": 2.3504548368335767, "language_loss": 0.71782613, "learning_rate": 3.827101475687033e-06, "loss": 0.73926663, "num_input_tokens_seen": 57400840, "step": 2650, "time_per_iteration": 2.8883376121520996 }, { "auxiliary_loss_clip": 0.01146809, "auxiliary_loss_mlp": 0.01041815, "balance_loss_clip": 1.05386841, "balance_loss_mlp": 1.02476835, "epoch": 0.15938674282278673, "flos": 13334351011200.0, "grad_norm": 1.8238326955956992, "language_loss": 0.71427429, "learning_rate": 3.826943037328082e-06, "loss": 0.73616046, "num_input_tokens_seen": 57419230, "step": 2651, "time_per_iteration": 2.607879638671875 }, { "auxiliary_loss_clip": 0.01118842, "auxiliary_loss_mlp": 0.00777496, "balance_loss_clip": 1.05154157, "balance_loss_mlp": 1.00132799, "epoch": 0.1594468660754547, "flos": 22488842119680.0, "grad_norm": 1.8928974850955373, "language_loss": 0.80185902, "learning_rate": 3.8267845296912674e-06, "loss": 0.82082248, "num_input_tokens_seen": 57439315, "step": 2652, "time_per_iteration": 2.718695640563965 }, { "auxiliary_loss_clip": 0.01138048, "auxiliary_loss_mlp": 0.00775, "balance_loss_clip": 1.0567826, "balance_loss_mlp": 1.00124729, "epoch": 0.15950698932812266, "flos": 15007320910080.0, "grad_norm": 2.6116065834427387, "language_loss": 0.69539076, "learning_rate": 3.826625952782601e-06, "loss": 0.71452117, "num_input_tokens_seen": 57454635, "step": 2653, "time_per_iteration": 2.7088639736175537 }, { "auxiliary_loss_clip": 0.01144826, "auxiliary_loss_mlp": 0.01038735, "balance_loss_clip": 1.05257821, "balance_loss_mlp": 1.02050805, "epoch": 0.15956711258079062, "flos": 30155052084480.0, "grad_norm": 2.1937273620657307, "language_loss": 0.76670635, "learning_rate": 3.826467306608095e-06, "loss": 0.78854191, "num_input_tokens_seen": 57476805, "step": 2654, "time_per_iteration": 2.79425048828125 }, { "auxiliary_loss_clip": 0.01114313, "auxiliary_loss_mlp": 0.01041134, "balance_loss_clip": 1.04714727, "balance_loss_mlp": 1.02248931, "epoch": 0.1596272358334586, "flos": 21032700670080.0, "grad_norm": 2.0572535633716247, "language_loss": 0.81873977, "learning_rate": 3.826308591173765e-06, "loss": 0.84029424, "num_input_tokens_seen": 57496400, "step": 2655, "time_per_iteration": 2.6990878582000732 }, { "auxiliary_loss_clip": 0.01112525, "auxiliary_loss_mlp": 0.01046346, "balance_loss_clip": 1.04670715, "balance_loss_mlp": 1.02849984, "epoch": 0.15968735908612655, "flos": 15268032800640.0, "grad_norm": 2.0964800101687486, "language_loss": 0.73768878, "learning_rate": 3.826149806485631e-06, "loss": 0.75927746, "num_input_tokens_seen": 57513700, "step": 2656, "time_per_iteration": 2.7409873008728027 }, { "auxiliary_loss_clip": 0.01111218, "auxiliary_loss_mlp": 0.01039948, "balance_loss_clip": 1.04749918, "balance_loss_mlp": 1.02220988, "epoch": 0.15974748233879452, "flos": 52665726695040.0, "grad_norm": 2.516351978408242, "language_loss": 0.77637637, "learning_rate": 3.825990952549713e-06, "loss": 0.79788804, "num_input_tokens_seen": 57536180, "step": 2657, "time_per_iteration": 2.984161376953125 }, { "auxiliary_loss_clip": 0.01142397, "auxiliary_loss_mlp": 0.01048058, "balance_loss_clip": 1.05276513, "balance_loss_mlp": 1.02984321, "epoch": 0.1598076055914625, "flos": 18733232730240.0, "grad_norm": 2.1741432296797303, "language_loss": 0.74654955, "learning_rate": 3.825832029372035e-06, "loss": 0.76845407, "num_input_tokens_seen": 57555025, "step": 2658, "time_per_iteration": 2.6795172691345215 }, { "auxiliary_loss_clip": 0.01137294, "auxiliary_loss_mlp": 0.01047097, "balance_loss_clip": 1.05887127, "balance_loss_mlp": 1.02581763, "epoch": 0.15986772884413047, "flos": 34349238535680.0, "grad_norm": 2.2676743120149916, "language_loss": 0.75164986, "learning_rate": 3.825673036958624e-06, "loss": 0.77349377, "num_input_tokens_seen": 57577660, "step": 2659, "time_per_iteration": 2.885744094848633 }, { "auxiliary_loss_clip": 0.01122752, "auxiliary_loss_mlp": 0.0105323, "balance_loss_clip": 1.0512991, "balance_loss_mlp": 1.0334295, "epoch": 0.15992785209679844, "flos": 22054969739520.0, "grad_norm": 2.181311046841435, "language_loss": 0.90998709, "learning_rate": 3.825513975315508e-06, "loss": 0.93174696, "num_input_tokens_seen": 57596335, "step": 2660, "time_per_iteration": 2.7562267780303955 }, { "auxiliary_loss_clip": 0.01114547, "auxiliary_loss_mlp": 0.01058378, "balance_loss_clip": 1.05538487, "balance_loss_mlp": 1.03590751, "epoch": 0.1599879753494664, "flos": 33066652625280.0, "grad_norm": 1.746468400789071, "language_loss": 0.77724659, "learning_rate": 3.82535484444872e-06, "loss": 0.79897583, "num_input_tokens_seen": 57616830, "step": 2661, "time_per_iteration": 2.9896914958953857 }, { "auxiliary_loss_clip": 0.0113781, "auxiliary_loss_mlp": 0.00777461, "balance_loss_clip": 1.05382478, "balance_loss_mlp": 1.00132632, "epoch": 0.16004809860213437, "flos": 28038010343040.0, "grad_norm": 2.0483033922540086, "language_loss": 0.74442393, "learning_rate": 3.825195644364292e-06, "loss": 0.76357663, "num_input_tokens_seen": 57635515, "step": 2662, "time_per_iteration": 2.7993714809417725 }, { "auxiliary_loss_clip": 0.01135674, "auxiliary_loss_mlp": 0.00780783, "balance_loss_clip": 1.05392313, "balance_loss_mlp": 1.0016191, "epoch": 0.16010822185480234, "flos": 22780113505920.0, "grad_norm": 2.9903694104875984, "language_loss": 0.82515085, "learning_rate": 3.825036375068263e-06, "loss": 0.84431541, "num_input_tokens_seen": 57654250, "step": 2663, "time_per_iteration": 2.678490161895752 }, { "auxiliary_loss_clip": 0.01112205, "auxiliary_loss_mlp": 0.01044917, "balance_loss_clip": 1.05182636, "balance_loss_mlp": 1.02574801, "epoch": 0.16016834510747033, "flos": 20084012611200.0, "grad_norm": 2.06786422122115, "language_loss": 0.7951405, "learning_rate": 3.824877036566672e-06, "loss": 0.81671166, "num_input_tokens_seen": 57672645, "step": 2664, "time_per_iteration": 2.819880962371826 }, { "auxiliary_loss_clip": 0.01151449, "auxiliary_loss_mlp": 0.01048023, "balance_loss_clip": 1.05374622, "balance_loss_mlp": 1.02886605, "epoch": 0.1602284683601383, "flos": 21173829206400.0, "grad_norm": 1.6697703441146605, "language_loss": 0.93748474, "learning_rate": 3.824717628865561e-06, "loss": 0.95947945, "num_input_tokens_seen": 57691055, "step": 2665, "time_per_iteration": 2.697660446166992 }, { "auxiliary_loss_clip": 0.01127607, "auxiliary_loss_mlp": 0.01047415, "balance_loss_clip": 1.05185676, "balance_loss_mlp": 1.02774525, "epoch": 0.16028859161280626, "flos": 14647568244480.0, "grad_norm": 2.9655602739253095, "language_loss": 0.85237324, "learning_rate": 3.824558151970974e-06, "loss": 0.87412339, "num_input_tokens_seen": 57707235, "step": 2666, "time_per_iteration": 4.282273530960083 }, { "auxiliary_loss_clip": 0.01129818, "auxiliary_loss_mlp": 0.00777125, "balance_loss_clip": 1.05257225, "balance_loss_mlp": 1.00145936, "epoch": 0.16034871486547422, "flos": 20990325600000.0, "grad_norm": 1.8366839898970433, "language_loss": 0.81284773, "learning_rate": 3.8243986058889595e-06, "loss": 0.83191717, "num_input_tokens_seen": 57724190, "step": 2667, "time_per_iteration": 2.69508695602417 }, { "auxiliary_loss_clip": 0.0116556, "auxiliary_loss_mlp": 0.01046526, "balance_loss_clip": 1.06089485, "balance_loss_mlp": 1.02643883, "epoch": 0.1604088381181422, "flos": 21397732634880.0, "grad_norm": 1.958935842080623, "language_loss": 0.74031079, "learning_rate": 3.824238990625567e-06, "loss": 0.76243162, "num_input_tokens_seen": 57743620, "step": 2668, "time_per_iteration": 4.2559425830841064 }, { "auxiliary_loss_clip": 0.01148853, "auxiliary_loss_mlp": 0.01051992, "balance_loss_clip": 1.05547619, "balance_loss_mlp": 1.03240585, "epoch": 0.16046896137081015, "flos": 23877040993920.0, "grad_norm": 1.7737626564305047, "language_loss": 0.77495629, "learning_rate": 3.824079306186848e-06, "loss": 0.7969647, "num_input_tokens_seen": 57764810, "step": 2669, "time_per_iteration": 2.6424050331115723 }, { "auxiliary_loss_clip": 0.01097339, "auxiliary_loss_mlp": 0.01012737, "balance_loss_clip": 1.06351233, "balance_loss_mlp": 1.00986385, "epoch": 0.16052908462347812, "flos": 59806709015040.0, "grad_norm": 0.8041290684345284, "language_loss": 0.5549804, "learning_rate": 3.823919552578861e-06, "loss": 0.57608116, "num_input_tokens_seen": 57824390, "step": 2670, "time_per_iteration": 4.765664100646973 }, { "auxiliary_loss_clip": 0.01149639, "auxiliary_loss_mlp": 0.01043383, "balance_loss_clip": 1.05322218, "balance_loss_mlp": 1.02430916, "epoch": 0.1605892078761461, "flos": 18296559089280.0, "grad_norm": 2.6306224128650464, "language_loss": 0.77778888, "learning_rate": 3.82375972980766e-06, "loss": 0.7997191, "num_input_tokens_seen": 57843665, "step": 2671, "time_per_iteration": 2.6876416206359863 }, { "auxiliary_loss_clip": 0.01151164, "auxiliary_loss_mlp": 0.01043962, "balance_loss_clip": 1.05529547, "balance_loss_mlp": 1.02503204, "epoch": 0.16064933112881408, "flos": 32160734686080.0, "grad_norm": 1.9167251889277674, "language_loss": 0.64766788, "learning_rate": 3.8235998378793086e-06, "loss": 0.66961908, "num_input_tokens_seen": 57863305, "step": 2672, "time_per_iteration": 2.7102553844451904 }, { "auxiliary_loss_clip": 0.01150206, "auxiliary_loss_mlp": 0.01046785, "balance_loss_clip": 1.05674481, "balance_loss_mlp": 1.02554154, "epoch": 0.16070945438148204, "flos": 19828795501440.0, "grad_norm": 2.045175098484539, "language_loss": 0.85708207, "learning_rate": 3.8234398767998675e-06, "loss": 0.87905198, "num_input_tokens_seen": 57883025, "step": 2673, "time_per_iteration": 2.656360626220703 }, { "auxiliary_loss_clip": 0.01125542, "auxiliary_loss_mlp": 0.01055838, "balance_loss_clip": 1.05366015, "balance_loss_mlp": 1.03716969, "epoch": 0.16076957763415, "flos": 18913144976640.0, "grad_norm": 2.339006860757087, "language_loss": 0.7289716, "learning_rate": 3.823279846575403e-06, "loss": 0.75078535, "num_input_tokens_seen": 57901430, "step": 2674, "time_per_iteration": 2.7122414112091064 }, { "auxiliary_loss_clip": 0.01150063, "auxiliary_loss_mlp": 0.01045468, "balance_loss_clip": 1.05416465, "balance_loss_mlp": 1.02464211, "epoch": 0.16082970088681797, "flos": 16764358590720.0, "grad_norm": 1.9341682597436423, "language_loss": 0.84438515, "learning_rate": 3.823119747211986e-06, "loss": 0.86634052, "num_input_tokens_seen": 57919550, "step": 2675, "time_per_iteration": 2.6646435260772705 }, { "auxiliary_loss_clip": 0.01116221, "auxiliary_loss_mlp": 0.01049343, "balance_loss_clip": 1.05220723, "balance_loss_mlp": 1.02823126, "epoch": 0.16088982413948594, "flos": 35150261783040.0, "grad_norm": 1.871909119220515, "language_loss": 0.82216591, "learning_rate": 3.822959578715685e-06, "loss": 0.84382153, "num_input_tokens_seen": 57939890, "step": 2676, "time_per_iteration": 2.8457534313201904 }, { "auxiliary_loss_clip": 0.01151157, "auxiliary_loss_mlp": 0.01049874, "balance_loss_clip": 1.05746996, "balance_loss_mlp": 1.03162253, "epoch": 0.1609499473921539, "flos": 18625105814400.0, "grad_norm": 2.1166154816193923, "language_loss": 0.73485494, "learning_rate": 3.822799341092573e-06, "loss": 0.75686526, "num_input_tokens_seen": 57957410, "step": 2677, "time_per_iteration": 2.65387225151062 }, { "auxiliary_loss_clip": 0.01138188, "auxiliary_loss_mlp": 0.01044363, "balance_loss_clip": 1.05438483, "balance_loss_mlp": 1.02537322, "epoch": 0.1610100706448219, "flos": 33145728416640.0, "grad_norm": 3.229282061984371, "language_loss": 0.76305777, "learning_rate": 3.822639034348728e-06, "loss": 0.78488332, "num_input_tokens_seen": 57977900, "step": 2678, "time_per_iteration": 2.836071014404297 }, { "auxiliary_loss_clip": 0.01148252, "auxiliary_loss_mlp": 0.01047887, "balance_loss_clip": 1.05379987, "balance_loss_mlp": 1.02789569, "epoch": 0.16107019389748986, "flos": 34676707852800.0, "grad_norm": 8.295814069484678, "language_loss": 0.70340431, "learning_rate": 3.822478658490228e-06, "loss": 0.7253657, "num_input_tokens_seen": 57998210, "step": 2679, "time_per_iteration": 2.771185874938965 }, { "auxiliary_loss_clip": 0.01059502, "auxiliary_loss_mlp": 0.00758644, "balance_loss_clip": 1.04695845, "balance_loss_mlp": 1.00150955, "epoch": 0.16113031715015783, "flos": 65713403260800.0, "grad_norm": 0.7819629653273137, "language_loss": 0.51843339, "learning_rate": 3.822318213523154e-06, "loss": 0.53661484, "num_input_tokens_seen": 58059420, "step": 2680, "time_per_iteration": 3.3107378482818604 }, { "auxiliary_loss_clip": 0.01144342, "auxiliary_loss_mlp": 0.01047358, "balance_loss_clip": 1.05360317, "balance_loss_mlp": 1.02632904, "epoch": 0.1611904404028258, "flos": 20810413353600.0, "grad_norm": 1.6718368455031125, "language_loss": 0.8028667, "learning_rate": 3.8221576994535925e-06, "loss": 0.82478368, "num_input_tokens_seen": 58078370, "step": 2681, "time_per_iteration": 2.6986513137817383 }, { "auxiliary_loss_clip": 0.01139192, "auxiliary_loss_mlp": 0.01055518, "balance_loss_clip": 1.05603266, "balance_loss_mlp": 1.03602743, "epoch": 0.16125056365549376, "flos": 27013335062400.0, "grad_norm": 2.154781054673542, "language_loss": 0.68957973, "learning_rate": 3.821997116287627e-06, "loss": 0.71152687, "num_input_tokens_seen": 58097395, "step": 2682, "time_per_iteration": 2.794686794281006 }, { "auxiliary_loss_clip": 0.01139216, "auxiliary_loss_mlp": 0.01052349, "balance_loss_clip": 1.05670619, "balance_loss_mlp": 1.03195262, "epoch": 0.16131068690816172, "flos": 19276524915840.0, "grad_norm": 1.9802191055590168, "language_loss": 0.87362224, "learning_rate": 3.821836464031348e-06, "loss": 0.89553785, "num_input_tokens_seen": 58115630, "step": 2683, "time_per_iteration": 2.703634262084961 }, { "auxiliary_loss_clip": 0.01165497, "auxiliary_loss_mlp": 0.0105575, "balance_loss_clip": 1.05714059, "balance_loss_mlp": 1.03491259, "epoch": 0.16137081016082971, "flos": 35337931367040.0, "grad_norm": 1.939499216066865, "language_loss": 0.74143028, "learning_rate": 3.821675742690849e-06, "loss": 0.76364273, "num_input_tokens_seen": 58138655, "step": 2684, "time_per_iteration": 2.7890264987945557 }, { "auxiliary_loss_clip": 0.01136683, "auxiliary_loss_mlp": 0.00778989, "balance_loss_clip": 1.05435085, "balance_loss_mlp": 1.00176883, "epoch": 0.16143093341349768, "flos": 34235257703040.0, "grad_norm": 1.9009911635557044, "language_loss": 0.70506597, "learning_rate": 3.821514952272223e-06, "loss": 0.72422272, "num_input_tokens_seen": 58157440, "step": 2685, "time_per_iteration": 2.803942918777466 }, { "auxiliary_loss_clip": 0.01116315, "auxiliary_loss_mlp": 0.01059092, "balance_loss_clip": 1.05291295, "balance_loss_mlp": 1.03757524, "epoch": 0.16149105666616564, "flos": 27999262546560.0, "grad_norm": 2.295686008167468, "language_loss": 0.72060591, "learning_rate": 3.821354092781567e-06, "loss": 0.74236, "num_input_tokens_seen": 58176660, "step": 2686, "time_per_iteration": 2.850309133529663 }, { "auxiliary_loss_clip": 0.01153803, "auxiliary_loss_mlp": 0.01048887, "balance_loss_clip": 1.05603862, "balance_loss_mlp": 1.02922952, "epoch": 0.1615511799188336, "flos": 19422214479360.0, "grad_norm": 2.056921120199424, "language_loss": 0.81720114, "learning_rate": 3.821193164224981e-06, "loss": 0.83922803, "num_input_tokens_seen": 58195085, "step": 2687, "time_per_iteration": 2.7077832221984863 }, { "auxiliary_loss_clip": 0.01154388, "auxiliary_loss_mlp": 0.01050682, "balance_loss_clip": 1.05335689, "balance_loss_mlp": 1.02910483, "epoch": 0.16161130317150157, "flos": 22854915578880.0, "grad_norm": 1.6747986106054085, "language_loss": 0.71680355, "learning_rate": 3.821032166608568e-06, "loss": 0.73885429, "num_input_tokens_seen": 58213540, "step": 2688, "time_per_iteration": 2.700073480606079 }, { "auxiliary_loss_clip": 0.0112226, "auxiliary_loss_mlp": 0.0105252, "balance_loss_clip": 1.0517168, "balance_loss_mlp": 1.03330338, "epoch": 0.16167142642416954, "flos": 26110577520000.0, "grad_norm": 2.2887064413695253, "language_loss": 0.76168394, "learning_rate": 3.8208710999384325e-06, "loss": 0.78343177, "num_input_tokens_seen": 58236995, "step": 2689, "time_per_iteration": 2.846964120864868 }, { "auxiliary_loss_clip": 0.01166324, "auxiliary_loss_mlp": 0.01052979, "balance_loss_clip": 1.05979431, "balance_loss_mlp": 1.03308284, "epoch": 0.1617315496768375, "flos": 22779646629120.0, "grad_norm": 2.045037041298705, "language_loss": 0.87211925, "learning_rate": 3.820709964220683e-06, "loss": 0.89431226, "num_input_tokens_seen": 58257230, "step": 2690, "time_per_iteration": 2.704497814178467 }, { "auxiliary_loss_clip": 0.01143898, "auxiliary_loss_mlp": 0.01046571, "balance_loss_clip": 1.05318451, "balance_loss_mlp": 1.02890396, "epoch": 0.1617916729295055, "flos": 22017299351040.0, "grad_norm": 1.7518031225399346, "language_loss": 0.87899524, "learning_rate": 3.8205487594614284e-06, "loss": 0.90089989, "num_input_tokens_seen": 58277080, "step": 2691, "time_per_iteration": 2.6763153076171875 }, { "auxiliary_loss_clip": 0.01150265, "auxiliary_loss_mlp": 0.01053114, "balance_loss_clip": 1.05237532, "balance_loss_mlp": 1.03142977, "epoch": 0.16185179618217346, "flos": 23438248450560.0, "grad_norm": 2.1723450057475313, "language_loss": 0.81989783, "learning_rate": 3.820387485666784e-06, "loss": 0.84193164, "num_input_tokens_seen": 58294815, "step": 2692, "time_per_iteration": 2.6381001472473145 }, { "auxiliary_loss_clip": 0.01167881, "auxiliary_loss_mlp": 0.0104606, "balance_loss_clip": 1.05555534, "balance_loss_mlp": 1.02499604, "epoch": 0.16191191943484143, "flos": 25666110627840.0, "grad_norm": 2.194958172554253, "language_loss": 0.81381011, "learning_rate": 3.820226142842862e-06, "loss": 0.83594954, "num_input_tokens_seen": 58313215, "step": 2693, "time_per_iteration": 2.6366944313049316 }, { "auxiliary_loss_clip": 0.01164466, "auxiliary_loss_mlp": 0.01058298, "balance_loss_clip": 1.0587461, "balance_loss_mlp": 1.03991616, "epoch": 0.1619720426875094, "flos": 23477355383040.0, "grad_norm": 2.778189532536263, "language_loss": 0.83837044, "learning_rate": 3.820064730995783e-06, "loss": 0.86059809, "num_input_tokens_seen": 58333215, "step": 2694, "time_per_iteration": 2.7802140712738037 }, { "auxiliary_loss_clip": 0.01116209, "auxiliary_loss_mlp": 0.0105764, "balance_loss_clip": 1.04927421, "balance_loss_mlp": 1.0366354, "epoch": 0.16203216594017736, "flos": 24133658734080.0, "grad_norm": 1.8201511645490482, "language_loss": 0.69709098, "learning_rate": 3.819903250131667e-06, "loss": 0.71882945, "num_input_tokens_seen": 58351160, "step": 2695, "time_per_iteration": 2.756904125213623 }, { "auxiliary_loss_clip": 0.01155526, "auxiliary_loss_mlp": 0.01050837, "balance_loss_clip": 1.05799723, "balance_loss_mlp": 1.03026128, "epoch": 0.16209228919284532, "flos": 22340889999360.0, "grad_norm": 2.1550523064219487, "language_loss": 0.82986331, "learning_rate": 3.819741700256637e-06, "loss": 0.85192692, "num_input_tokens_seen": 58368505, "step": 2696, "time_per_iteration": 2.651510238647461 }, { "auxiliary_loss_clip": 0.01174193, "auxiliary_loss_mlp": 0.01052819, "balance_loss_clip": 1.05826569, "balance_loss_mlp": 1.03095615, "epoch": 0.1621524124455133, "flos": 15815131827840.0, "grad_norm": 2.9267990143146503, "language_loss": 0.8862049, "learning_rate": 3.8195800813768194e-06, "loss": 0.90847504, "num_input_tokens_seen": 58385085, "step": 2697, "time_per_iteration": 2.5935380458831787 }, { "auxiliary_loss_clip": 0.01158945, "auxiliary_loss_mlp": 0.01045471, "balance_loss_clip": 1.0552485, "balance_loss_mlp": 1.02719641, "epoch": 0.16221253569818128, "flos": 30186688988160.0, "grad_norm": 1.7480298293719791, "language_loss": 0.80844599, "learning_rate": 3.819418393498343e-06, "loss": 0.83049017, "num_input_tokens_seen": 58406985, "step": 2698, "time_per_iteration": 2.6685965061187744 }, { "auxiliary_loss_clip": 0.01151678, "auxiliary_loss_mlp": 0.01050084, "balance_loss_clip": 1.05785704, "balance_loss_mlp": 1.03060579, "epoch": 0.16227265895084925, "flos": 24605991601920.0, "grad_norm": 1.590231062064763, "language_loss": 0.77499473, "learning_rate": 3.819256636627339e-06, "loss": 0.79701245, "num_input_tokens_seen": 58426205, "step": 2699, "time_per_iteration": 2.7206287384033203 }, { "auxiliary_loss_clip": 0.01134482, "auxiliary_loss_mlp": 0.01043888, "balance_loss_clip": 1.0504272, "balance_loss_mlp": 1.02510071, "epoch": 0.1623327822035172, "flos": 19573326996480.0, "grad_norm": 2.299083669251571, "language_loss": 0.85903585, "learning_rate": 3.81909481076994e-06, "loss": 0.88081944, "num_input_tokens_seen": 58443830, "step": 2700, "time_per_iteration": 2.6440224647521973 }, { "auxiliary_loss_clip": 0.01150266, "auxiliary_loss_mlp": 0.00778348, "balance_loss_clip": 1.05360484, "balance_loss_mlp": 1.00180686, "epoch": 0.16239290545618518, "flos": 26468462678400.0, "grad_norm": 1.7679372116400307, "language_loss": 0.80424523, "learning_rate": 3.818932915932284e-06, "loss": 0.82353133, "num_input_tokens_seen": 58464405, "step": 2701, "time_per_iteration": 2.6943976879119873 }, { "auxiliary_loss_clip": 0.01144477, "auxiliary_loss_mlp": 0.01046291, "balance_loss_clip": 1.05771017, "balance_loss_mlp": 1.02664542, "epoch": 0.16245302870885314, "flos": 15851940289920.0, "grad_norm": 1.6539412057050027, "language_loss": 0.72777367, "learning_rate": 3.818770952120511e-06, "loss": 0.74968135, "num_input_tokens_seen": 58483295, "step": 2702, "time_per_iteration": 2.6914141178131104 }, { "auxiliary_loss_clip": 0.01156069, "auxiliary_loss_mlp": 0.01050141, "balance_loss_clip": 1.05802381, "balance_loss_mlp": 1.02896905, "epoch": 0.1625131519615211, "flos": 14756521173120.0, "grad_norm": 1.8265391375227176, "language_loss": 0.7273894, "learning_rate": 3.81860891934076e-06, "loss": 0.74945152, "num_input_tokens_seen": 58501205, "step": 2703, "time_per_iteration": 2.6301820278167725 }, { "auxiliary_loss_clip": 0.01165642, "auxiliary_loss_mlp": 0.01050857, "balance_loss_clip": 1.0553968, "balance_loss_mlp": 1.02942359, "epoch": 0.1625732752141891, "flos": 28220508368640.0, "grad_norm": 3.0329584489902666, "language_loss": 0.70018482, "learning_rate": 3.818446817599176e-06, "loss": 0.72234988, "num_input_tokens_seen": 58522315, "step": 2704, "time_per_iteration": 2.6667227745056152 }, { "auxiliary_loss_clip": 0.01034679, "auxiliary_loss_mlp": 0.01001657, "balance_loss_clip": 1.03343439, "balance_loss_mlp": 0.99865305, "epoch": 0.16263339846685707, "flos": 67327947688320.0, "grad_norm": 0.7801109588151329, "language_loss": 0.5336051, "learning_rate": 3.818284646901907e-06, "loss": 0.55396849, "num_input_tokens_seen": 58586695, "step": 2705, "time_per_iteration": 4.808594465255737 }, { "auxiliary_loss_clip": 0.01138628, "auxiliary_loss_mlp": 0.00781324, "balance_loss_clip": 1.0539608, "balance_loss_mlp": 1.00171995, "epoch": 0.16269352171952503, "flos": 14319165173760.0, "grad_norm": 2.3827832530074455, "language_loss": 0.7536028, "learning_rate": 3.818122407255102e-06, "loss": 0.77280229, "num_input_tokens_seen": 58602435, "step": 2706, "time_per_iteration": 4.126614570617676 }, { "auxiliary_loss_clip": 0.01130684, "auxiliary_loss_mlp": 0.01047489, "balance_loss_clip": 1.0523324, "balance_loss_mlp": 1.02859437, "epoch": 0.162753644972193, "flos": 28361205941760.0, "grad_norm": 2.2272392184651038, "language_loss": 0.72203928, "learning_rate": 3.817960098664914e-06, "loss": 0.74382102, "num_input_tokens_seen": 58621275, "step": 2707, "time_per_iteration": 4.2739410400390625 }, { "auxiliary_loss_clip": 0.01142142, "auxiliary_loss_mlp": 0.01047652, "balance_loss_clip": 1.05433679, "balance_loss_mlp": 1.02898431, "epoch": 0.16281376822486096, "flos": 19937856170880.0, "grad_norm": 3.192481802987827, "language_loss": 0.83481139, "learning_rate": 3.817797721137495e-06, "loss": 0.85670936, "num_input_tokens_seen": 58637550, "step": 2708, "time_per_iteration": 2.7163965702056885 }, { "auxiliary_loss_clip": 0.01101561, "auxiliary_loss_mlp": 0.00781217, "balance_loss_clip": 1.04896522, "balance_loss_mlp": 1.00177419, "epoch": 0.16287389147752893, "flos": 21251719848960.0, "grad_norm": 2.2850459718507654, "language_loss": 0.86162847, "learning_rate": 3.817635274679006e-06, "loss": 0.88045627, "num_input_tokens_seen": 58654135, "step": 2709, "time_per_iteration": 4.474989652633667 }, { "auxiliary_loss_clip": 0.0114031, "auxiliary_loss_mlp": 0.00777602, "balance_loss_clip": 1.05267572, "balance_loss_mlp": 1.00172114, "epoch": 0.1629340147301969, "flos": 19244672530560.0, "grad_norm": 2.581053296112052, "language_loss": 0.91410124, "learning_rate": 3.817472759295605e-06, "loss": 0.93328035, "num_input_tokens_seen": 58674320, "step": 2710, "time_per_iteration": 2.6951892375946045 }, { "auxiliary_loss_clip": 0.01118597, "auxiliary_loss_mlp": 0.01054854, "balance_loss_clip": 1.05254805, "balance_loss_mlp": 1.03451669, "epoch": 0.16299413798286488, "flos": 21249816428160.0, "grad_norm": 2.4322540773438437, "language_loss": 0.81690979, "learning_rate": 3.817310174993453e-06, "loss": 0.83864427, "num_input_tokens_seen": 58691000, "step": 2711, "time_per_iteration": 2.7854437828063965 }, { "auxiliary_loss_clip": 0.01146056, "auxiliary_loss_mlp": 0.01040648, "balance_loss_clip": 1.04954815, "balance_loss_mlp": 1.02107334, "epoch": 0.16305426123553285, "flos": 18770579896320.0, "grad_norm": 3.73256798888747, "language_loss": 0.8091476, "learning_rate": 3.817147521778719e-06, "loss": 0.83101463, "num_input_tokens_seen": 58710230, "step": 2712, "time_per_iteration": 2.834291458129883 }, { "auxiliary_loss_clip": 0.01171211, "auxiliary_loss_mlp": 0.01053015, "balance_loss_clip": 1.0590024, "balance_loss_mlp": 1.03273714, "epoch": 0.16311438448820081, "flos": 22087648137600.0, "grad_norm": 2.3460895846171996, "language_loss": 0.7681579, "learning_rate": 3.816984799657568e-06, "loss": 0.79040015, "num_input_tokens_seen": 58728610, "step": 2713, "time_per_iteration": 2.6188278198242188 }, { "auxiliary_loss_clip": 0.01156539, "auxiliary_loss_mlp": 0.0105792, "balance_loss_clip": 1.06240916, "balance_loss_mlp": 1.03832221, "epoch": 0.16317450774086878, "flos": 16467700164480.0, "grad_norm": 2.543173325075216, "language_loss": 0.79012156, "learning_rate": 3.8168220086361715e-06, "loss": 0.81226611, "num_input_tokens_seen": 58744385, "step": 2714, "time_per_iteration": 2.6534018516540527 }, { "auxiliary_loss_clip": 0.01149567, "auxiliary_loss_mlp": 0.01056152, "balance_loss_clip": 1.05467987, "balance_loss_mlp": 1.03724504, "epoch": 0.16323463099353674, "flos": 24352929308160.0, "grad_norm": 1.614702766215493, "language_loss": 0.77693665, "learning_rate": 3.816659148720702e-06, "loss": 0.79899377, "num_input_tokens_seen": 58763905, "step": 2715, "time_per_iteration": 2.856006383895874 }, { "auxiliary_loss_clip": 0.01129437, "auxiliary_loss_mlp": 0.01044046, "balance_loss_clip": 1.04810584, "balance_loss_mlp": 1.02525854, "epoch": 0.1632947542462047, "flos": 24900782520960.0, "grad_norm": 2.374975046722651, "language_loss": 0.81513858, "learning_rate": 3.816496219917336e-06, "loss": 0.83687335, "num_input_tokens_seen": 58785580, "step": 2716, "time_per_iteration": 2.6750845909118652 }, { "auxiliary_loss_clip": 0.01144393, "auxiliary_loss_mlp": 0.01055927, "balance_loss_clip": 1.05851114, "balance_loss_mlp": 1.03703237, "epoch": 0.1633548774988727, "flos": 24900279730560.0, "grad_norm": 1.8186679286330678, "language_loss": 0.86522418, "learning_rate": 3.816333222232251e-06, "loss": 0.88722742, "num_input_tokens_seen": 58806075, "step": 2717, "time_per_iteration": 2.761622428894043 }, { "auxiliary_loss_clip": 0.01135377, "auxiliary_loss_mlp": 0.01045964, "balance_loss_clip": 1.05334044, "balance_loss_mlp": 1.0274632, "epoch": 0.16341500075154067, "flos": 30441798357120.0, "grad_norm": 1.8799656187942837, "language_loss": 0.76924133, "learning_rate": 3.816170155671629e-06, "loss": 0.79105473, "num_input_tokens_seen": 58827405, "step": 2718, "time_per_iteration": 2.7946770191192627 }, { "auxiliary_loss_clip": 0.01145146, "auxiliary_loss_mlp": 0.01043682, "balance_loss_clip": 1.05553615, "balance_loss_mlp": 1.02566922, "epoch": 0.16347512400420863, "flos": 22784530878720.0, "grad_norm": 2.2449478392049906, "language_loss": 0.73827291, "learning_rate": 3.816007020241652e-06, "loss": 0.76016116, "num_input_tokens_seen": 58847205, "step": 2719, "time_per_iteration": 2.719980478286743 }, { "auxiliary_loss_clip": 0.01128361, "auxiliary_loss_mlp": 0.01045887, "balance_loss_clip": 1.04900515, "balance_loss_mlp": 1.02732563, "epoch": 0.1635352472568766, "flos": 22633274707200.0, "grad_norm": 1.7092252575708884, "language_loss": 0.72267497, "learning_rate": 3.815843815948507e-06, "loss": 0.74441749, "num_input_tokens_seen": 58866865, "step": 2720, "time_per_iteration": 2.8737292289733887 }, { "auxiliary_loss_clip": 0.01109456, "auxiliary_loss_mlp": 0.01049703, "balance_loss_clip": 1.05004287, "balance_loss_mlp": 1.02840054, "epoch": 0.16359537050954456, "flos": 15522998515200.0, "grad_norm": 2.1621365878543153, "language_loss": 0.75120997, "learning_rate": 3.8156805427983824e-06, "loss": 0.77280164, "num_input_tokens_seen": 58885200, "step": 2721, "time_per_iteration": 2.785296678543091 }, { "auxiliary_loss_clip": 0.01110342, "auxiliary_loss_mlp": 0.01059955, "balance_loss_clip": 1.04597676, "balance_loss_mlp": 1.03734064, "epoch": 0.16365549376221253, "flos": 22090162089600.0, "grad_norm": 1.9032438792006017, "language_loss": 0.79073942, "learning_rate": 3.8155172007974695e-06, "loss": 0.81244236, "num_input_tokens_seen": 58906385, "step": 2722, "time_per_iteration": 2.7850708961486816 }, { "auxiliary_loss_clip": 0.01149809, "auxiliary_loss_mlp": 0.00778798, "balance_loss_clip": 1.05395257, "balance_loss_mlp": 1.00171757, "epoch": 0.1637156170148805, "flos": 24060400945920.0, "grad_norm": 2.3019049903761215, "language_loss": 0.84954333, "learning_rate": 3.8153537899519624e-06, "loss": 0.86882937, "num_input_tokens_seen": 58925040, "step": 2723, "time_per_iteration": 2.7268764972686768 }, { "auxiliary_loss_clip": 0.01108328, "auxiliary_loss_mlp": 0.01044851, "balance_loss_clip": 1.04805517, "balance_loss_mlp": 1.02493143, "epoch": 0.1637757402675485, "flos": 26685362954880.0, "grad_norm": 1.8985615531712963, "language_loss": 0.71018666, "learning_rate": 3.815190310268058e-06, "loss": 0.73171842, "num_input_tokens_seen": 58944790, "step": 2724, "time_per_iteration": 2.7691783905029297 }, { "auxiliary_loss_clip": 0.01118053, "auxiliary_loss_mlp": 0.01041883, "balance_loss_clip": 1.05226958, "balance_loss_mlp": 1.02364373, "epoch": 0.16383586352021645, "flos": 16106941918080.0, "grad_norm": 2.1059770262776136, "language_loss": 0.70552838, "learning_rate": 3.815026761751955e-06, "loss": 0.72712779, "num_input_tokens_seen": 58962500, "step": 2725, "time_per_iteration": 2.6936957836151123 }, { "auxiliary_loss_clip": 0.01112368, "auxiliary_loss_mlp": 0.01046594, "balance_loss_clip": 1.04912174, "balance_loss_mlp": 1.028391, "epoch": 0.16389598677288442, "flos": 19165991788800.0, "grad_norm": 2.27810298992254, "language_loss": 0.88491893, "learning_rate": 3.814863144409855e-06, "loss": 0.90650856, "num_input_tokens_seen": 58980355, "step": 2726, "time_per_iteration": 2.7967143058776855 }, { "auxiliary_loss_clip": 0.01157668, "auxiliary_loss_mlp": 0.0105068, "balance_loss_clip": 1.06062055, "balance_loss_mlp": 1.03099847, "epoch": 0.16395611002555238, "flos": 21507008785920.0, "grad_norm": 2.0584475237926303, "language_loss": 0.7469939, "learning_rate": 3.814699458247963e-06, "loss": 0.7690773, "num_input_tokens_seen": 58999505, "step": 2727, "time_per_iteration": 2.6818623542785645 }, { "auxiliary_loss_clip": 0.01150971, "auxiliary_loss_mlp": 0.01052077, "balance_loss_clip": 1.0570507, "balance_loss_mlp": 1.03527999, "epoch": 0.16401623327822035, "flos": 21470918595840.0, "grad_norm": 1.6112579442237729, "language_loss": 0.83097756, "learning_rate": 3.8145357032724855e-06, "loss": 0.85300803, "num_input_tokens_seen": 59017930, "step": 2728, "time_per_iteration": 2.675360918045044 }, { "auxiliary_loss_clip": 0.01156153, "auxiliary_loss_mlp": 0.01045609, "balance_loss_clip": 1.05826735, "balance_loss_mlp": 1.02602315, "epoch": 0.1640763565308883, "flos": 13626232928640.0, "grad_norm": 2.5738755626941106, "language_loss": 0.84892929, "learning_rate": 3.814371879489633e-06, "loss": 0.87094688, "num_input_tokens_seen": 59035130, "step": 2729, "time_per_iteration": 2.7004599571228027 }, { "auxiliary_loss_clip": 0.01167293, "auxiliary_loss_mlp": 0.01048461, "balance_loss_clip": 1.0591594, "balance_loss_mlp": 1.03053224, "epoch": 0.16413647978355628, "flos": 15451464579840.0, "grad_norm": 1.9897225699042427, "language_loss": 0.72895479, "learning_rate": 3.814207986905616e-06, "loss": 0.75111228, "num_input_tokens_seen": 59053080, "step": 2730, "time_per_iteration": 2.593179702758789 }, { "auxiliary_loss_clip": 0.01142509, "auxiliary_loss_mlp": 0.01050071, "balance_loss_clip": 1.05208349, "balance_loss_mlp": 1.02908981, "epoch": 0.16419660303622427, "flos": 45878682015360.0, "grad_norm": 1.6754501336017709, "language_loss": 0.74384654, "learning_rate": 3.814044025526651e-06, "loss": 0.76577234, "num_input_tokens_seen": 59075610, "step": 2731, "time_per_iteration": 2.8702962398529053 }, { "auxiliary_loss_clip": 0.01122791, "auxiliary_loss_mlp": 0.01047176, "balance_loss_clip": 1.05006754, "balance_loss_mlp": 1.02650499, "epoch": 0.16425672628889224, "flos": 18952826526720.0, "grad_norm": 2.031351475505915, "language_loss": 0.79190683, "learning_rate": 3.8138799953589548e-06, "loss": 0.8136065, "num_input_tokens_seen": 59094555, "step": 2732, "time_per_iteration": 2.734529972076416 }, { "auxiliary_loss_clip": 0.01141118, "auxiliary_loss_mlp": 0.01047385, "balance_loss_clip": 1.05340672, "balance_loss_mlp": 1.02796555, "epoch": 0.1643168495415602, "flos": 24312996362880.0, "grad_norm": 2.250003976384769, "language_loss": 0.69526887, "learning_rate": 3.8137158964087473e-06, "loss": 0.71715385, "num_input_tokens_seen": 59113515, "step": 2733, "time_per_iteration": 2.672377109527588 }, { "auxiliary_loss_clip": 0.01143332, "auxiliary_loss_mlp": 0.01053232, "balance_loss_clip": 1.05603123, "balance_loss_mlp": 1.0325135, "epoch": 0.16437697279422817, "flos": 26428421992320.0, "grad_norm": 2.000873580428856, "language_loss": 0.80976766, "learning_rate": 3.8135517286822508e-06, "loss": 0.83173329, "num_input_tokens_seen": 59133275, "step": 2734, "time_per_iteration": 2.710293769836426 }, { "auxiliary_loss_clip": 0.01135758, "auxiliary_loss_mlp": 0.01056722, "balance_loss_clip": 1.05488348, "balance_loss_mlp": 1.03470409, "epoch": 0.16443709604689613, "flos": 34532239351680.0, "grad_norm": 2.100664117201308, "language_loss": 0.81810421, "learning_rate": 3.8133874921856914e-06, "loss": 0.840029, "num_input_tokens_seen": 59154095, "step": 2735, "time_per_iteration": 2.8074140548706055 }, { "auxiliary_loss_clip": 0.01070875, "auxiliary_loss_mlp": 0.01044313, "balance_loss_clip": 1.04323888, "balance_loss_mlp": 1.02508426, "epoch": 0.1644972192995641, "flos": 23258048895360.0, "grad_norm": 2.405088987017839, "language_loss": 0.78515649, "learning_rate": 3.813223186925296e-06, "loss": 0.80630839, "num_input_tokens_seen": 59173795, "step": 2736, "time_per_iteration": 2.839087963104248 }, { "auxiliary_loss_clip": 0.01147998, "auxiliary_loss_mlp": 0.01054659, "balance_loss_clip": 1.05859447, "balance_loss_mlp": 1.03513288, "epoch": 0.1645573425522321, "flos": 26979543342720.0, "grad_norm": 1.9462182296456145, "language_loss": 0.81052899, "learning_rate": 3.8130588129072964e-06, "loss": 0.83255553, "num_input_tokens_seen": 59191610, "step": 2737, "time_per_iteration": 2.7328996658325195 }, { "auxiliary_loss_clip": 0.01150424, "auxiliary_loss_mlp": 0.01052207, "balance_loss_clip": 1.0559026, "balance_loss_mlp": 1.03065443, "epoch": 0.16461746580490005, "flos": 28731768600960.0, "grad_norm": 1.8596348168124566, "language_loss": 0.87449318, "learning_rate": 3.8128943701379246e-06, "loss": 0.89651948, "num_input_tokens_seen": 59213000, "step": 2738, "time_per_iteration": 2.7345526218414307 }, { "auxiliary_loss_clip": 0.01139154, "auxiliary_loss_mlp": 0.0106055, "balance_loss_clip": 1.05534518, "balance_loss_mlp": 1.04079759, "epoch": 0.16467758905756802, "flos": 24930156867840.0, "grad_norm": 1.728421510231393, "language_loss": 0.71997833, "learning_rate": 3.8127298586234167e-06, "loss": 0.74197543, "num_input_tokens_seen": 59232340, "step": 2739, "time_per_iteration": 2.7091422080993652 }, { "auxiliary_loss_clip": 0.01154419, "auxiliary_loss_mlp": 0.0105106, "balance_loss_clip": 1.05673754, "balance_loss_mlp": 1.0312835, "epoch": 0.16473771231023598, "flos": 24826519152000.0, "grad_norm": 1.8559436932352185, "language_loss": 0.81645715, "learning_rate": 3.8125652783700104e-06, "loss": 0.83851194, "num_input_tokens_seen": 59253950, "step": 2740, "time_per_iteration": 2.712658166885376 }, { "auxiliary_loss_clip": 0.01114061, "auxiliary_loss_mlp": 0.01068725, "balance_loss_clip": 1.04991829, "balance_loss_mlp": 1.04307163, "epoch": 0.16479783556290395, "flos": 39896072375040.0, "grad_norm": 2.0528021789830837, "language_loss": 0.69467485, "learning_rate": 3.8124006293839475e-06, "loss": 0.71650267, "num_input_tokens_seen": 59275545, "step": 2741, "time_per_iteration": 2.8629493713378906 }, { "auxiliary_loss_clip": 0.01167543, "auxiliary_loss_mlp": 0.01048721, "balance_loss_clip": 1.05907226, "balance_loss_mlp": 1.02906334, "epoch": 0.16485795881557191, "flos": 19897061299200.0, "grad_norm": 1.7765193730452222, "language_loss": 0.79811072, "learning_rate": 3.812235911671472e-06, "loss": 0.8202734, "num_input_tokens_seen": 59293480, "step": 2742, "time_per_iteration": 2.626775026321411 }, { "auxiliary_loss_clip": 0.01141681, "auxiliary_loss_mlp": 0.01055663, "balance_loss_clip": 1.05664062, "balance_loss_mlp": 1.03477716, "epoch": 0.16491808206823988, "flos": 20556129997440.0, "grad_norm": 1.91797408289014, "language_loss": 0.8499459, "learning_rate": 3.8120711252388274e-06, "loss": 0.87191939, "num_input_tokens_seen": 59313435, "step": 2743, "time_per_iteration": 2.8218302726745605 }, { "auxiliary_loss_clip": 0.01162447, "auxiliary_loss_mlp": 0.01051969, "balance_loss_clip": 1.05743837, "balance_loss_mlp": 1.03196514, "epoch": 0.16497820532090787, "flos": 23800802376960.0, "grad_norm": 1.4425200129075006, "language_loss": 0.85558498, "learning_rate": 3.811906270092265e-06, "loss": 0.87772918, "num_input_tokens_seen": 59331535, "step": 2744, "time_per_iteration": 4.206263542175293 }, { "auxiliary_loss_clip": 0.01131671, "auxiliary_loss_mlp": 0.0104676, "balance_loss_clip": 1.05206287, "balance_loss_mlp": 1.02812767, "epoch": 0.16503832857357584, "flos": 25482642935040.0, "grad_norm": 1.6285200980820358, "language_loss": 0.82770813, "learning_rate": 3.811741346238036e-06, "loss": 0.84949243, "num_input_tokens_seen": 59350680, "step": 2745, "time_per_iteration": 4.331594467163086 }, { "auxiliary_loss_clip": 0.011344, "auxiliary_loss_mlp": 0.01057242, "balance_loss_clip": 1.05874014, "balance_loss_mlp": 1.03825223, "epoch": 0.1650984518262438, "flos": 17676058619520.0, "grad_norm": 6.766690288332402, "language_loss": 0.76811314, "learning_rate": 3.8115763536823923e-06, "loss": 0.79002959, "num_input_tokens_seen": 59367020, "step": 2746, "time_per_iteration": 4.225586414337158 }, { "auxiliary_loss_clip": 0.01164296, "auxiliary_loss_mlp": 0.01055636, "balance_loss_clip": 1.05781221, "balance_loss_mlp": 1.03533494, "epoch": 0.16515857507891177, "flos": 18698327688960.0, "grad_norm": 1.9760186874049024, "language_loss": 0.80818808, "learning_rate": 3.811411292431592e-06, "loss": 0.83038735, "num_input_tokens_seen": 59386075, "step": 2747, "time_per_iteration": 2.6862480640411377 }, { "auxiliary_loss_clip": 0.01157975, "auxiliary_loss_mlp": 0.0104673, "balance_loss_clip": 1.05990267, "balance_loss_mlp": 1.02664328, "epoch": 0.16521869833157973, "flos": 15010481306880.0, "grad_norm": 2.0608482379031337, "language_loss": 0.69433749, "learning_rate": 3.8112461624918945e-06, "loss": 0.71638453, "num_input_tokens_seen": 59402690, "step": 2748, "time_per_iteration": 2.6520986557006836 }, { "auxiliary_loss_clip": 0.01169692, "auxiliary_loss_mlp": 0.00778195, "balance_loss_clip": 1.06237423, "balance_loss_mlp": 1.00173104, "epoch": 0.1652788215842477, "flos": 22121152548480.0, "grad_norm": 2.259215537482641, "language_loss": 0.88012803, "learning_rate": 3.811080963869561e-06, "loss": 0.89960694, "num_input_tokens_seen": 59421130, "step": 2749, "time_per_iteration": 4.260679244995117 }, { "auxiliary_loss_clip": 0.01154179, "auxiliary_loss_mlp": 0.01045617, "balance_loss_clip": 1.05586052, "balance_loss_mlp": 1.02542281, "epoch": 0.16533894483691566, "flos": 18333080242560.0, "grad_norm": 2.0880864906339864, "language_loss": 0.79240286, "learning_rate": 3.8109156965708557e-06, "loss": 0.81440079, "num_input_tokens_seen": 59438970, "step": 2750, "time_per_iteration": 2.6335251331329346 }, { "auxiliary_loss_clip": 0.01153343, "auxiliary_loss_mlp": 0.0104591, "balance_loss_clip": 1.0579437, "balance_loss_mlp": 1.02602625, "epoch": 0.16539906808958366, "flos": 22382115834240.0, "grad_norm": 1.6952801391084946, "language_loss": 0.94854712, "learning_rate": 3.8107503606020455e-06, "loss": 0.97053963, "num_input_tokens_seen": 59458510, "step": 2751, "time_per_iteration": 2.697174310684204 }, { "auxiliary_loss_clip": 0.0106803, "auxiliary_loss_mlp": 0.0105236, "balance_loss_clip": 1.04625726, "balance_loss_mlp": 1.03247619, "epoch": 0.16545919134225162, "flos": 22711093522560.0, "grad_norm": 2.614588592950962, "language_loss": 0.71231711, "learning_rate": 3.8105849559693997e-06, "loss": 0.73352098, "num_input_tokens_seen": 59477110, "step": 2752, "time_per_iteration": 2.7780745029449463 }, { "auxiliary_loss_clip": 0.01090521, "auxiliary_loss_mlp": 0.01022104, "balance_loss_clip": 1.05741131, "balance_loss_mlp": 1.01941013, "epoch": 0.1655193145949196, "flos": 67802974076160.0, "grad_norm": 0.7721529651221379, "language_loss": 0.54058975, "learning_rate": 3.810419482679192e-06, "loss": 0.56171602, "num_input_tokens_seen": 59541155, "step": 2753, "time_per_iteration": 3.3371469974517822 }, { "auxiliary_loss_clip": 0.01163808, "auxiliary_loss_mlp": 0.00778536, "balance_loss_clip": 1.05587018, "balance_loss_mlp": 1.00172091, "epoch": 0.16557943784758755, "flos": 24280389792000.0, "grad_norm": 1.6411537728312637, "language_loss": 0.75436741, "learning_rate": 3.8102539407376954e-06, "loss": 0.7737909, "num_input_tokens_seen": 59561155, "step": 2754, "time_per_iteration": 2.6382133960723877 }, { "auxiliary_loss_clip": 0.01139421, "auxiliary_loss_mlp": 0.01060584, "balance_loss_clip": 1.05406713, "balance_loss_mlp": 1.03768396, "epoch": 0.16563956110025552, "flos": 20083617561600.0, "grad_norm": 2.4067479946694137, "language_loss": 0.86654639, "learning_rate": 3.810088330151188e-06, "loss": 0.88854647, "num_input_tokens_seen": 59580460, "step": 2755, "time_per_iteration": 2.6590075492858887 }, { "auxiliary_loss_clip": 0.01122817, "auxiliary_loss_mlp": 0.01053169, "balance_loss_clip": 1.04948378, "balance_loss_mlp": 1.03293943, "epoch": 0.16569968435292348, "flos": 28034454896640.0, "grad_norm": 1.7268487777137649, "language_loss": 0.73350251, "learning_rate": 3.80992265092595e-06, "loss": 0.75526237, "num_input_tokens_seen": 59600025, "step": 2756, "time_per_iteration": 2.771820545196533 }, { "auxiliary_loss_clip": 0.01128662, "auxiliary_loss_mlp": 0.01049666, "balance_loss_clip": 1.05550277, "balance_loss_mlp": 1.02969813, "epoch": 0.16575980760559147, "flos": 26250233598720.0, "grad_norm": 1.5540667033085804, "language_loss": 0.75308084, "learning_rate": 3.8097569030682636e-06, "loss": 0.77486414, "num_input_tokens_seen": 59620600, "step": 2757, "time_per_iteration": 2.8106157779693604 }, { "auxiliary_loss_clip": 0.01143608, "auxiliary_loss_mlp": 0.01054064, "balance_loss_clip": 1.057634, "balance_loss_mlp": 1.03390563, "epoch": 0.16581993085825944, "flos": 26943955943040.0, "grad_norm": 1.8675154897424497, "language_loss": 0.84604371, "learning_rate": 3.8095910865844137e-06, "loss": 0.86802036, "num_input_tokens_seen": 59641385, "step": 2758, "time_per_iteration": 2.8663368225097656 }, { "auxiliary_loss_clip": 0.01168186, "auxiliary_loss_mlp": 0.01058337, "balance_loss_clip": 1.06166434, "balance_loss_mlp": 1.03952527, "epoch": 0.1658800541109274, "flos": 21653632103040.0, "grad_norm": 2.0824774555850243, "language_loss": 0.78848934, "learning_rate": 3.809425201480689e-06, "loss": 0.81075454, "num_input_tokens_seen": 59659865, "step": 2759, "time_per_iteration": 2.655371904373169 }, { "auxiliary_loss_clip": 0.01098973, "auxiliary_loss_mlp": 0.0104879, "balance_loss_clip": 1.0491066, "balance_loss_mlp": 1.02846527, "epoch": 0.16594017736359537, "flos": 16435488643200.0, "grad_norm": 2.4005603702739613, "language_loss": 0.75130272, "learning_rate": 3.8092592477633793e-06, "loss": 0.77278036, "num_input_tokens_seen": 59678780, "step": 2760, "time_per_iteration": 2.767866611480713 }, { "auxiliary_loss_clip": 0.01117278, "auxiliary_loss_mlp": 0.0104823, "balance_loss_clip": 1.05129814, "balance_loss_mlp": 1.02867997, "epoch": 0.16600030061626334, "flos": 22637297030400.0, "grad_norm": 1.5792623632565632, "language_loss": 0.73425764, "learning_rate": 3.8090932254387774e-06, "loss": 0.75591272, "num_input_tokens_seen": 59698795, "step": 2761, "time_per_iteration": 2.762836456298828 }, { "auxiliary_loss_clip": 0.0113507, "auxiliary_loss_mlp": 0.01050415, "balance_loss_clip": 1.05250192, "balance_loss_mlp": 1.03018475, "epoch": 0.1660604238689313, "flos": 26396569607040.0, "grad_norm": 2.9515424803015033, "language_loss": 0.88832974, "learning_rate": 3.8089271345131788e-06, "loss": 0.91018462, "num_input_tokens_seen": 59718795, "step": 2762, "time_per_iteration": 2.766324281692505 }, { "auxiliary_loss_clip": 0.01115163, "auxiliary_loss_mlp": 0.01050144, "balance_loss_clip": 1.05208707, "balance_loss_mlp": 1.03080845, "epoch": 0.16612054712159927, "flos": 23039999383680.0, "grad_norm": 1.84507980271118, "language_loss": 0.87992418, "learning_rate": 3.8087609749928822e-06, "loss": 0.90157735, "num_input_tokens_seen": 59737555, "step": 2763, "time_per_iteration": 2.7734055519104004 }, { "auxiliary_loss_clip": 0.01086152, "auxiliary_loss_mlp": 0.01013622, "balance_loss_clip": 1.0448606, "balance_loss_mlp": 1.01065338, "epoch": 0.16618067037426726, "flos": 59241225202560.0, "grad_norm": 0.7790832079967882, "language_loss": 0.59799927, "learning_rate": 3.8085947468841885e-06, "loss": 0.61899698, "num_input_tokens_seen": 59800915, "step": 2764, "time_per_iteration": 3.1728692054748535 }, { "auxiliary_loss_clip": 0.01152232, "auxiliary_loss_mlp": 0.01053607, "balance_loss_clip": 1.05467176, "balance_loss_mlp": 1.03254318, "epoch": 0.16624079362693522, "flos": 27198813916800.0, "grad_norm": 1.7436496772383425, "language_loss": 0.82260036, "learning_rate": 3.808428450193401e-06, "loss": 0.84465873, "num_input_tokens_seen": 59822910, "step": 2765, "time_per_iteration": 2.72440767288208 }, { "auxiliary_loss_clip": 0.01171844, "auxiliary_loss_mlp": 0.01049085, "balance_loss_clip": 1.05882454, "balance_loss_mlp": 1.02746069, "epoch": 0.1663009168796032, "flos": 10925068216320.0, "grad_norm": 2.128015994498251, "language_loss": 0.69980019, "learning_rate": 3.8082620849268244e-06, "loss": 0.72200948, "num_input_tokens_seen": 59838805, "step": 2766, "time_per_iteration": 2.5810647010803223 }, { "auxiliary_loss_clip": 0.0115036, "auxiliary_loss_mlp": 0.01047665, "balance_loss_clip": 1.05772817, "balance_loss_mlp": 1.02792454, "epoch": 0.16636104013227115, "flos": 17894431353600.0, "grad_norm": 2.107381123394178, "language_loss": 0.8845337, "learning_rate": 3.808095651090769e-06, "loss": 0.90651393, "num_input_tokens_seen": 59855345, "step": 2767, "time_per_iteration": 2.659240245819092 }, { "auxiliary_loss_clip": 0.01077283, "auxiliary_loss_mlp": 0.01002999, "balance_loss_clip": 1.046556, "balance_loss_mlp": 1.00020981, "epoch": 0.16642116338493912, "flos": 66726050463360.0, "grad_norm": 0.6403612433239105, "language_loss": 0.5289067, "learning_rate": 3.8079291486915447e-06, "loss": 0.54970956, "num_input_tokens_seen": 59917710, "step": 2768, "time_per_iteration": 3.28488826751709 }, { "auxiliary_loss_clip": 0.01137637, "auxiliary_loss_mlp": 0.01051692, "balance_loss_clip": 1.05451822, "balance_loss_mlp": 1.03034163, "epoch": 0.16648128663760708, "flos": 19026048401280.0, "grad_norm": 2.4342686570828267, "language_loss": 0.84962058, "learning_rate": 3.8077625777354667e-06, "loss": 0.87151396, "num_input_tokens_seen": 59935105, "step": 2769, "time_per_iteration": 2.753257989883423 }, { "auxiliary_loss_clip": 0.01068987, "auxiliary_loss_mlp": 0.0100573, "balance_loss_clip": 1.04678345, "balance_loss_mlp": 1.00316668, "epoch": 0.16654140989027508, "flos": 70134976759680.0, "grad_norm": 0.8107434108728753, "language_loss": 0.57455683, "learning_rate": 3.80759593822885e-06, "loss": 0.59530401, "num_input_tokens_seen": 59984085, "step": 2770, "time_per_iteration": 3.2202906608581543 }, { "auxiliary_loss_clip": 0.01054548, "auxiliary_loss_mlp": 0.01003676, "balance_loss_clip": 1.04637623, "balance_loss_mlp": 1.00086308, "epoch": 0.16660153314294304, "flos": 70272406195200.0, "grad_norm": 0.8940719168038874, "language_loss": 0.56241393, "learning_rate": 3.807429230178015e-06, "loss": 0.58299619, "num_input_tokens_seen": 60043470, "step": 2771, "time_per_iteration": 3.3302085399627686 }, { "auxiliary_loss_clip": 0.01110714, "auxiliary_loss_mlp": 0.01053994, "balance_loss_clip": 1.04819679, "balance_loss_mlp": 1.03316772, "epoch": 0.166661656395611, "flos": 23075048079360.0, "grad_norm": 2.9137693497887778, "language_loss": 0.70419657, "learning_rate": 3.8072624535892817e-06, "loss": 0.72584367, "num_input_tokens_seen": 60063045, "step": 2772, "time_per_iteration": 2.845414161682129 }, { "auxiliary_loss_clip": 0.0114592, "auxiliary_loss_mlp": 0.01049708, "balance_loss_clip": 1.05082583, "balance_loss_mlp": 1.02923954, "epoch": 0.16672177964827897, "flos": 28366341586560.0, "grad_norm": 2.20945076195277, "language_loss": 0.86324167, "learning_rate": 3.807095608468975e-06, "loss": 0.88519788, "num_input_tokens_seen": 60081945, "step": 2773, "time_per_iteration": 2.669412851333618 }, { "auxiliary_loss_clip": 0.01095425, "auxiliary_loss_mlp": 0.01049097, "balance_loss_clip": 1.04436934, "balance_loss_mlp": 1.0300827, "epoch": 0.16678190290094694, "flos": 19091010147840.0, "grad_norm": 2.0211952616678937, "language_loss": 0.82141376, "learning_rate": 3.8069286948234224e-06, "loss": 0.84285897, "num_input_tokens_seen": 60096820, "step": 2774, "time_per_iteration": 2.7111308574676514 }, { "auxiliary_loss_clip": 0.01123493, "auxiliary_loss_mlp": 0.01045144, "balance_loss_clip": 1.05252421, "balance_loss_mlp": 1.02446127, "epoch": 0.1668420261536149, "flos": 21799106184960.0, "grad_norm": 3.3781068524499, "language_loss": 0.8298822, "learning_rate": 3.806761712658952e-06, "loss": 0.85156858, "num_input_tokens_seen": 60116140, "step": 2775, "time_per_iteration": 2.7367632389068604 }, { "auxiliary_loss_clip": 0.01150495, "auxiliary_loss_mlp": 0.01051475, "balance_loss_clip": 1.05761933, "balance_loss_mlp": 1.03264022, "epoch": 0.16690214940628287, "flos": 19062533640960.0, "grad_norm": 1.8115651629444076, "language_loss": 0.80919641, "learning_rate": 3.806594661981897e-06, "loss": 0.8312161, "num_input_tokens_seen": 60134235, "step": 2776, "time_per_iteration": 2.651723623275757 }, { "auxiliary_loss_clip": 0.0113775, "auxiliary_loss_mlp": 0.01054199, "balance_loss_clip": 1.05518723, "balance_loss_mlp": 1.0346483, "epoch": 0.16696227265895086, "flos": 18588548747520.0, "grad_norm": 2.7510345221850336, "language_loss": 0.80203485, "learning_rate": 3.8064275427985906e-06, "loss": 0.82395434, "num_input_tokens_seen": 60153275, "step": 2777, "time_per_iteration": 2.6380929946899414 }, { "auxiliary_loss_clip": 0.01147967, "auxiliary_loss_mlp": 0.01045166, "balance_loss_clip": 1.05270481, "balance_loss_mlp": 1.02640271, "epoch": 0.16702239591161883, "flos": 23294139085440.0, "grad_norm": 1.6179722336290305, "language_loss": 0.85384095, "learning_rate": 3.806260355115371e-06, "loss": 0.87577224, "num_input_tokens_seen": 60173215, "step": 2778, "time_per_iteration": 2.754652500152588 }, { "auxiliary_loss_clip": 0.01136802, "auxiliary_loss_mlp": 0.01040643, "balance_loss_clip": 1.0531714, "balance_loss_mlp": 1.02148652, "epoch": 0.1670825191642868, "flos": 24425648392320.0, "grad_norm": 3.2091470007324414, "language_loss": 0.74180603, "learning_rate": 3.8060930989385778e-06, "loss": 0.76358056, "num_input_tokens_seen": 60190515, "step": 2779, "time_per_iteration": 2.777193784713745 }, { "auxiliary_loss_clip": 0.01112683, "auxiliary_loss_mlp": 0.00777451, "balance_loss_clip": 1.04981184, "balance_loss_mlp": 1.0015173, "epoch": 0.16714264241695476, "flos": 26797512193920.0, "grad_norm": 2.127789274190337, "language_loss": 0.6557346, "learning_rate": 3.805925774274554e-06, "loss": 0.67463589, "num_input_tokens_seen": 60211655, "step": 2780, "time_per_iteration": 2.896976947784424 }, { "auxiliary_loss_clip": 0.01120921, "auxiliary_loss_mlp": 0.01045506, "balance_loss_clip": 1.04843462, "balance_loss_mlp": 1.02547836, "epoch": 0.16720276566962272, "flos": 21835304115840.0, "grad_norm": 2.46647860258999, "language_loss": 0.78422606, "learning_rate": 3.805758381129643e-06, "loss": 0.80589032, "num_input_tokens_seen": 60230860, "step": 2781, "time_per_iteration": 2.725782632827759 }, { "auxiliary_loss_clip": 0.01094692, "auxiliary_loss_mlp": 0.01050104, "balance_loss_clip": 1.04439843, "balance_loss_mlp": 1.03056526, "epoch": 0.1672628889222907, "flos": 21470415805440.0, "grad_norm": 26.23767952829368, "language_loss": 0.75119764, "learning_rate": 3.805590919510193e-06, "loss": 0.77264553, "num_input_tokens_seen": 60250535, "step": 2782, "time_per_iteration": 2.7064197063446045 }, { "auxiliary_loss_clip": 0.01129162, "auxiliary_loss_mlp": 0.01047612, "balance_loss_clip": 1.05152631, "balance_loss_mlp": 1.02764392, "epoch": 0.16732301217495865, "flos": 30774008269440.0, "grad_norm": 2.116531296279042, "language_loss": 0.67398441, "learning_rate": 3.8054233894225547e-06, "loss": 0.69575214, "num_input_tokens_seen": 60269530, "step": 2783, "time_per_iteration": 2.7901556491851807 }, { "auxiliary_loss_clip": 0.01158882, "auxiliary_loss_mlp": 0.0105166, "balance_loss_clip": 1.05460215, "balance_loss_mlp": 1.03271747, "epoch": 0.16738313542762664, "flos": 23474625949440.0, "grad_norm": 1.7768362036873409, "language_loss": 0.69919086, "learning_rate": 3.805255790873081e-06, "loss": 0.72129631, "num_input_tokens_seen": 60289900, "step": 2784, "time_per_iteration": 5.714844226837158 }, { "auxiliary_loss_clip": 0.01137618, "auxiliary_loss_mlp": 0.01056022, "balance_loss_clip": 1.05217624, "balance_loss_mlp": 1.03539932, "epoch": 0.1674432586802946, "flos": 29789086366080.0, "grad_norm": 4.741795209709136, "language_loss": 0.60970068, "learning_rate": 3.805088123868126e-06, "loss": 0.6316371, "num_input_tokens_seen": 60310025, "step": 2785, "time_per_iteration": 4.219547510147095 }, { "auxiliary_loss_clip": 0.01057886, "auxiliary_loss_mlp": 0.0100398, "balance_loss_clip": 1.03758883, "balance_loss_mlp": 1.00141752, "epoch": 0.16750338193296258, "flos": 66136073575680.0, "grad_norm": 0.773077721474628, "language_loss": 0.58780885, "learning_rate": 3.8049203884140492e-06, "loss": 0.60842752, "num_input_tokens_seen": 60377800, "step": 2786, "time_per_iteration": 3.2306320667266846 }, { "auxiliary_loss_clip": 0.0113927, "auxiliary_loss_mlp": 0.01044966, "balance_loss_clip": 1.0496738, "balance_loss_mlp": 1.02589226, "epoch": 0.16756350518563054, "flos": 25696777864320.0, "grad_norm": 1.7333132735183339, "language_loss": 0.76308596, "learning_rate": 3.80475258451721e-06, "loss": 0.78492826, "num_input_tokens_seen": 60398215, "step": 2787, "time_per_iteration": 2.6434125900268555 }, { "auxiliary_loss_clip": 0.01146924, "auxiliary_loss_mlp": 0.01043386, "balance_loss_clip": 1.0529089, "balance_loss_mlp": 1.02544546, "epoch": 0.1676236284382985, "flos": 23836102467840.0, "grad_norm": 1.7210472408736244, "language_loss": 0.7717936, "learning_rate": 3.804584712183972e-06, "loss": 0.79369676, "num_input_tokens_seen": 60416910, "step": 2788, "time_per_iteration": 4.359618425369263 }, { "auxiliary_loss_clip": 0.01054629, "auxiliary_loss_mlp": 0.00999991, "balance_loss_clip": 1.03482509, "balance_loss_mlp": 0.99746382, "epoch": 0.16768375169096647, "flos": 59874902985600.0, "grad_norm": 0.8596744797543817, "language_loss": 0.59331679, "learning_rate": 3.8044167714207013e-06, "loss": 0.61386299, "num_input_tokens_seen": 60468660, "step": 2789, "time_per_iteration": 3.0742650032043457 }, { "auxiliary_loss_clip": 0.01148272, "auxiliary_loss_mlp": 0.01053856, "balance_loss_clip": 1.05450928, "balance_loss_mlp": 1.03428209, "epoch": 0.16774387494363446, "flos": 38435657207040.0, "grad_norm": 1.689036486923415, "language_loss": 0.7012763, "learning_rate": 3.804248762233765e-06, "loss": 0.7232976, "num_input_tokens_seen": 60492370, "step": 2790, "time_per_iteration": 2.872232437133789 }, { "auxiliary_loss_clip": 0.0112492, "auxiliary_loss_mlp": 0.01051622, "balance_loss_clip": 1.0497216, "balance_loss_mlp": 1.0334661, "epoch": 0.16780399819630243, "flos": 22637620252800.0, "grad_norm": 1.864386369112868, "language_loss": 0.79464513, "learning_rate": 3.8040806846295356e-06, "loss": 0.81641054, "num_input_tokens_seen": 60512655, "step": 2791, "time_per_iteration": 2.7180140018463135 }, { "auxiliary_loss_clip": 0.01122456, "auxiliary_loss_mlp": 0.01050939, "balance_loss_clip": 1.04977369, "balance_loss_mlp": 1.03106701, "epoch": 0.1678641214489704, "flos": 32891516887680.0, "grad_norm": 1.705849915566178, "language_loss": 0.71547955, "learning_rate": 3.8039125386143853e-06, "loss": 0.73721349, "num_input_tokens_seen": 60533090, "step": 2792, "time_per_iteration": 2.9221818447113037 }, { "auxiliary_loss_clip": 0.01131469, "auxiliary_loss_mlp": 0.01044061, "balance_loss_clip": 1.05479562, "balance_loss_mlp": 1.02551246, "epoch": 0.16792424470163836, "flos": 19974916028160.0, "grad_norm": 1.9301593564774673, "language_loss": 0.71581644, "learning_rate": 3.803744324194691e-06, "loss": 0.73757172, "num_input_tokens_seen": 60553190, "step": 2793, "time_per_iteration": 2.75104022026062 }, { "auxiliary_loss_clip": 0.01143072, "auxiliary_loss_mlp": 0.01053231, "balance_loss_clip": 1.05276942, "balance_loss_mlp": 1.03452659, "epoch": 0.16798436795430632, "flos": 19719878486400.0, "grad_norm": 2.3859650274226833, "language_loss": 0.7717455, "learning_rate": 3.803576041376831e-06, "loss": 0.79370856, "num_input_tokens_seen": 60571995, "step": 2794, "time_per_iteration": 2.6007745265960693 }, { "auxiliary_loss_clip": 0.01137828, "auxiliary_loss_mlp": 0.0104987, "balance_loss_clip": 1.05250025, "balance_loss_mlp": 1.03010476, "epoch": 0.1680444912069743, "flos": 28104839596800.0, "grad_norm": 2.7692472240964747, "language_loss": 0.71609265, "learning_rate": 3.803407690167187e-06, "loss": 0.73796958, "num_input_tokens_seen": 60591275, "step": 2795, "time_per_iteration": 2.693826198577881 }, { "auxiliary_loss_clip": 0.01131865, "auxiliary_loss_mlp": 0.01041012, "balance_loss_clip": 1.04973865, "balance_loss_mlp": 1.02302384, "epoch": 0.16810461445964225, "flos": 18075205526400.0, "grad_norm": 1.990096863808903, "language_loss": 0.84230494, "learning_rate": 3.803239270572142e-06, "loss": 0.8640337, "num_input_tokens_seen": 60609235, "step": 2796, "time_per_iteration": 2.697253465652466 }, { "auxiliary_loss_clip": 0.01101634, "auxiliary_loss_mlp": 0.01045196, "balance_loss_clip": 1.04877055, "balance_loss_mlp": 1.0262773, "epoch": 0.16816473771231025, "flos": 23878657105920.0, "grad_norm": 1.9272276676322646, "language_loss": 0.81609607, "learning_rate": 3.8030707825980838e-06, "loss": 0.83756441, "num_input_tokens_seen": 60629880, "step": 2797, "time_per_iteration": 2.8784244060516357 }, { "auxiliary_loss_clip": 0.0114057, "auxiliary_loss_mlp": 0.01041282, "balance_loss_clip": 1.05136061, "balance_loss_mlp": 1.02448523, "epoch": 0.1682248609649782, "flos": 22783597125120.0, "grad_norm": 1.7015769336052518, "language_loss": 0.74811113, "learning_rate": 3.802902226251401e-06, "loss": 0.76992965, "num_input_tokens_seen": 60651175, "step": 2798, "time_per_iteration": 2.700727939605713 }, { "auxiliary_loss_clip": 0.01161342, "auxiliary_loss_mlp": 0.01048462, "balance_loss_clip": 1.05728281, "balance_loss_mlp": 1.03075945, "epoch": 0.16828498421764618, "flos": 20705123612160.0, "grad_norm": 1.5964091182578661, "language_loss": 0.79693568, "learning_rate": 3.8027336015384845e-06, "loss": 0.81903368, "num_input_tokens_seen": 60670210, "step": 2799, "time_per_iteration": 2.6582021713256836 }, { "auxiliary_loss_clip": 0.01077177, "auxiliary_loss_mlp": 0.01045216, "balance_loss_clip": 1.04514158, "balance_loss_mlp": 1.02374637, "epoch": 0.16834510747031414, "flos": 29420606695680.0, "grad_norm": 4.227726163531211, "language_loss": 0.70963746, "learning_rate": 3.8025649084657296e-06, "loss": 0.73086143, "num_input_tokens_seen": 60690895, "step": 2800, "time_per_iteration": 2.8856699466705322 }, { "auxiliary_loss_clip": 0.01108822, "auxiliary_loss_mlp": 0.00777078, "balance_loss_clip": 1.04776788, "balance_loss_mlp": 1.00161195, "epoch": 0.1684052307229821, "flos": 18145374744960.0, "grad_norm": 1.9902029671619985, "language_loss": 0.83663505, "learning_rate": 3.8023961470395326e-06, "loss": 0.85549408, "num_input_tokens_seen": 60708280, "step": 2801, "time_per_iteration": 2.6917035579681396 }, { "auxiliary_loss_clip": 0.01128148, "auxiliary_loss_mlp": 0.01049324, "balance_loss_clip": 1.05011535, "balance_loss_mlp": 1.03084683, "epoch": 0.16846535397565007, "flos": 16574929240320.0, "grad_norm": 2.4052305427948735, "language_loss": 0.82509923, "learning_rate": 3.8022273172662933e-06, "loss": 0.84687394, "num_input_tokens_seen": 60724150, "step": 2802, "time_per_iteration": 2.882611036300659 }, { "auxiliary_loss_clip": 0.01150156, "auxiliary_loss_mlp": 0.01048717, "balance_loss_clip": 1.05517435, "balance_loss_mlp": 1.02885723, "epoch": 0.16852547722831807, "flos": 30408868563840.0, "grad_norm": 3.107584498439891, "language_loss": 0.80643189, "learning_rate": 3.802058419152413e-06, "loss": 0.8284207, "num_input_tokens_seen": 60746485, "step": 2803, "time_per_iteration": 2.7886922359466553 }, { "auxiliary_loss_clip": 0.01148107, "auxiliary_loss_mlp": 0.01047852, "balance_loss_clip": 1.0556829, "balance_loss_mlp": 1.02918339, "epoch": 0.16858560048098603, "flos": 33507420416640.0, "grad_norm": 2.2127389669880713, "language_loss": 0.76168799, "learning_rate": 3.801889452704297e-06, "loss": 0.7836476, "num_input_tokens_seen": 60762875, "step": 2804, "time_per_iteration": 2.7588601112365723 }, { "auxiliary_loss_clip": 0.01045171, "auxiliary_loss_mlp": 0.01013955, "balance_loss_clip": 1.03581083, "balance_loss_mlp": 1.01078367, "epoch": 0.168645723733654, "flos": 67370502326400.0, "grad_norm": 0.8536034833258724, "language_loss": 0.55464876, "learning_rate": 3.8017204179283526e-06, "loss": 0.57524002, "num_input_tokens_seen": 60825510, "step": 2805, "time_per_iteration": 3.2089412212371826 }, { "auxiliary_loss_clip": 0.01138275, "auxiliary_loss_mlp": 0.0103974, "balance_loss_clip": 1.05013156, "balance_loss_mlp": 1.02239537, "epoch": 0.16870584698632196, "flos": 21324618501120.0, "grad_norm": 2.2836767274778427, "language_loss": 0.73090243, "learning_rate": 3.8015513148309892e-06, "loss": 0.75268269, "num_input_tokens_seen": 60844440, "step": 2806, "time_per_iteration": 2.643596649169922 }, { "auxiliary_loss_clip": 0.01117063, "auxiliary_loss_mlp": 0.01045402, "balance_loss_clip": 1.05330753, "balance_loss_mlp": 1.02766335, "epoch": 0.16876597023898993, "flos": 20740746925440.0, "grad_norm": 1.8406859431587912, "language_loss": 0.69773197, "learning_rate": 3.80138214341862e-06, "loss": 0.71935666, "num_input_tokens_seen": 60863210, "step": 2807, "time_per_iteration": 2.6946568489074707 }, { "auxiliary_loss_clip": 0.01130702, "auxiliary_loss_mlp": 0.01047199, "balance_loss_clip": 1.04842246, "balance_loss_mlp": 1.02794707, "epoch": 0.1688260934916579, "flos": 20303498666880.0, "grad_norm": 3.042021842274248, "language_loss": 0.70280695, "learning_rate": 3.8012129036976587e-06, "loss": 0.72458601, "num_input_tokens_seen": 60882510, "step": 2808, "time_per_iteration": 2.6656088829040527 }, { "auxiliary_loss_clip": 0.01119025, "auxiliary_loss_mlp": 0.01041739, "balance_loss_clip": 1.05019665, "balance_loss_mlp": 1.02164018, "epoch": 0.16888621674432586, "flos": 20340702178560.0, "grad_norm": 2.0835789337145965, "language_loss": 0.79903001, "learning_rate": 3.8010435956745236e-06, "loss": 0.8206377, "num_input_tokens_seen": 60901105, "step": 2809, "time_per_iteration": 2.7665679454803467 }, { "auxiliary_loss_clip": 0.01155146, "auxiliary_loss_mlp": 0.01042018, "balance_loss_clip": 1.0557605, "balance_loss_mlp": 1.02252758, "epoch": 0.16894633999699385, "flos": 16244802316800.0, "grad_norm": 2.0672093223845245, "language_loss": 0.88076419, "learning_rate": 3.8008742193556358e-06, "loss": 0.90273583, "num_input_tokens_seen": 60915340, "step": 2810, "time_per_iteration": 2.6186363697052 }, { "auxiliary_loss_clip": 0.01149997, "auxiliary_loss_mlp": 0.0104631, "balance_loss_clip": 1.05503082, "balance_loss_mlp": 1.02715337, "epoch": 0.16900646324966181, "flos": 19610171372160.0, "grad_norm": 1.8921026809528976, "language_loss": 0.92376304, "learning_rate": 3.800704774747416e-06, "loss": 0.9457261, "num_input_tokens_seen": 60933735, "step": 2811, "time_per_iteration": 2.6567442417144775 }, { "auxiliary_loss_clip": 0.01140053, "auxiliary_loss_mlp": 0.01049063, "balance_loss_clip": 1.05383325, "balance_loss_mlp": 1.03039432, "epoch": 0.16906658650232978, "flos": 22018089450240.0, "grad_norm": 2.116573413654177, "language_loss": 0.78582352, "learning_rate": 3.800535261856291e-06, "loss": 0.8077147, "num_input_tokens_seen": 60953105, "step": 2812, "time_per_iteration": 2.6796023845672607 }, { "auxiliary_loss_clip": 0.01147895, "auxiliary_loss_mlp": 0.01043917, "balance_loss_clip": 1.05772316, "balance_loss_mlp": 1.02653646, "epoch": 0.16912670975499774, "flos": 11763690024960.0, "grad_norm": 2.5483899062625093, "language_loss": 0.75195068, "learning_rate": 3.8003656806886887e-06, "loss": 0.7738688, "num_input_tokens_seen": 60969150, "step": 2813, "time_per_iteration": 2.621772050857544 }, { "auxiliary_loss_clip": 0.01136313, "auxiliary_loss_mlp": 0.01045037, "balance_loss_clip": 1.05311871, "balance_loss_mlp": 1.02599943, "epoch": 0.1691868330076657, "flos": 17161386595200.0, "grad_norm": 3.0041182480764554, "language_loss": 0.69118392, "learning_rate": 3.8001960312510396e-06, "loss": 0.7129975, "num_input_tokens_seen": 60982825, "step": 2814, "time_per_iteration": 2.837264060974121 }, { "auxiliary_loss_clip": 0.01163835, "auxiliary_loss_mlp": 0.01039837, "balance_loss_clip": 1.05900145, "balance_loss_mlp": 1.02134776, "epoch": 0.16924695626033368, "flos": 22416553998720.0, "grad_norm": 3.1079956206415833, "language_loss": 0.61439502, "learning_rate": 3.800026313549776e-06, "loss": 0.63643175, "num_input_tokens_seen": 61000875, "step": 2815, "time_per_iteration": 2.6967194080352783 }, { "auxiliary_loss_clip": 0.01129827, "auxiliary_loss_mlp": 0.01042692, "balance_loss_clip": 1.05139673, "balance_loss_mlp": 1.02382088, "epoch": 0.16930707951300164, "flos": 25739655724800.0, "grad_norm": 1.7930623183302479, "language_loss": 0.82490849, "learning_rate": 3.7998565275913342e-06, "loss": 0.84663367, "num_input_tokens_seen": 61021940, "step": 2816, "time_per_iteration": 2.7227163314819336 }, { "auxiliary_loss_clip": 0.01133129, "auxiliary_loss_mlp": 0.01047914, "balance_loss_clip": 1.05375743, "balance_loss_mlp": 1.02853012, "epoch": 0.16936720276566963, "flos": 22747040058240.0, "grad_norm": 3.083808689594852, "language_loss": 0.87322289, "learning_rate": 3.799686673382153e-06, "loss": 0.89503324, "num_input_tokens_seen": 61040285, "step": 2817, "time_per_iteration": 2.733180522918701 }, { "auxiliary_loss_clip": 0.01141455, "auxiliary_loss_mlp": 0.01052753, "balance_loss_clip": 1.05800366, "balance_loss_mlp": 1.03352427, "epoch": 0.1694273260183376, "flos": 19573973441280.0, "grad_norm": 1.8594303503608436, "language_loss": 0.81247765, "learning_rate": 3.799516750928672e-06, "loss": 0.83441973, "num_input_tokens_seen": 61059020, "step": 2818, "time_per_iteration": 2.7384097576141357 }, { "auxiliary_loss_clip": 0.01160132, "auxiliary_loss_mlp": 0.01044196, "balance_loss_clip": 1.05699944, "balance_loss_mlp": 1.02496791, "epoch": 0.16948744927100556, "flos": 12457843332480.0, "grad_norm": 2.739998367204505, "language_loss": 0.80788404, "learning_rate": 3.799346760237336e-06, "loss": 0.82992733, "num_input_tokens_seen": 61074245, "step": 2819, "time_per_iteration": 2.609870672225952 }, { "auxiliary_loss_clip": 0.01069019, "auxiliary_loss_mlp": 0.01015301, "balance_loss_clip": 1.0485003, "balance_loss_mlp": 1.0125947, "epoch": 0.16954757252367353, "flos": 71291694435840.0, "grad_norm": 0.9309223426502673, "language_loss": 0.61031163, "learning_rate": 3.7991767013145902e-06, "loss": 0.63115478, "num_input_tokens_seen": 61127080, "step": 2820, "time_per_iteration": 3.161051034927368 }, { "auxiliary_loss_clip": 0.01125604, "auxiliary_loss_mlp": 0.0105036, "balance_loss_clip": 1.05106986, "balance_loss_mlp": 1.03207326, "epoch": 0.1696076957763415, "flos": 29606516513280.0, "grad_norm": 1.8682266790688726, "language_loss": 0.78265435, "learning_rate": 3.7990065741668844e-06, "loss": 0.80441403, "num_input_tokens_seen": 61146955, "step": 2821, "time_per_iteration": 2.838730573654175 }, { "auxiliary_loss_clip": 0.0113863, "auxiliary_loss_mlp": 0.01055528, "balance_loss_clip": 1.05282724, "balance_loss_mlp": 1.03494084, "epoch": 0.16966781902900946, "flos": 24388588535040.0, "grad_norm": 2.1667405259997516, "language_loss": 0.78521514, "learning_rate": 3.7988363788006685e-06, "loss": 0.80715668, "num_input_tokens_seen": 61166605, "step": 2822, "time_per_iteration": 2.783385753631592 }, { "auxiliary_loss_clip": 0.01143597, "auxiliary_loss_mlp": 0.00777154, "balance_loss_clip": 1.05367076, "balance_loss_mlp": 1.00129986, "epoch": 0.16972794228167745, "flos": 23038814234880.0, "grad_norm": 1.8038457392731222, "language_loss": 0.74939907, "learning_rate": 3.7986661152223967e-06, "loss": 0.76860654, "num_input_tokens_seen": 61186535, "step": 2823, "time_per_iteration": 4.329328298568726 }, { "auxiliary_loss_clip": 0.01129469, "auxiliary_loss_mlp": 0.0105385, "balance_loss_clip": 1.05166912, "balance_loss_mlp": 1.03496754, "epoch": 0.16978806553434542, "flos": 35228691129600.0, "grad_norm": 3.336653609493179, "language_loss": 0.60266119, "learning_rate": 3.7984957834385257e-06, "loss": 0.62449437, "num_input_tokens_seen": 61208965, "step": 2824, "time_per_iteration": 5.892346620559692 }, { "auxiliary_loss_clip": 0.01138249, "auxiliary_loss_mlp": 0.01042322, "balance_loss_clip": 1.05565047, "balance_loss_mlp": 1.02287912, "epoch": 0.16984818878701338, "flos": 32014290936960.0, "grad_norm": 2.152838804074104, "language_loss": 0.73322558, "learning_rate": 3.7983253834555144e-06, "loss": 0.75503135, "num_input_tokens_seen": 61230670, "step": 2825, "time_per_iteration": 2.834482431411743 }, { "auxiliary_loss_clip": 0.01161467, "auxiliary_loss_mlp": 0.01047701, "balance_loss_clip": 1.05502653, "balance_loss_mlp": 1.02762675, "epoch": 0.16990831203968135, "flos": 22818609907200.0, "grad_norm": 2.05671259677731, "language_loss": 0.85638934, "learning_rate": 3.7981549152798245e-06, "loss": 0.87848103, "num_input_tokens_seen": 61249510, "step": 2826, "time_per_iteration": 2.6443135738372803 }, { "auxiliary_loss_clip": 0.01139368, "auxiliary_loss_mlp": 0.01047749, "balance_loss_clip": 1.05266595, "balance_loss_mlp": 1.02856779, "epoch": 0.1699684352923493, "flos": 23039604334080.0, "grad_norm": 1.9562557148441426, "language_loss": 0.82465482, "learning_rate": 3.7979843789179196e-06, "loss": 0.84652597, "num_input_tokens_seen": 61269440, "step": 2827, "time_per_iteration": 2.7683157920837402 }, { "auxiliary_loss_clip": 0.01131885, "auxiliary_loss_mlp": 0.0104561, "balance_loss_clip": 1.05320346, "balance_loss_mlp": 1.02536786, "epoch": 0.17002855854501728, "flos": 21434110133760.0, "grad_norm": 1.7386401818136152, "language_loss": 0.73704529, "learning_rate": 3.797813774376267e-06, "loss": 0.75882024, "num_input_tokens_seen": 61288195, "step": 2828, "time_per_iteration": 4.465311288833618 }, { "auxiliary_loss_clip": 0.01061458, "auxiliary_loss_mlp": 0.01009538, "balance_loss_clip": 1.04764342, "balance_loss_mlp": 1.00620067, "epoch": 0.17008868179768524, "flos": 71453509205760.0, "grad_norm": 0.7670168832041738, "language_loss": 0.56426483, "learning_rate": 3.797643101661336e-06, "loss": 0.58497471, "num_input_tokens_seen": 61350850, "step": 2829, "time_per_iteration": 3.3114631175994873 }, { "auxiliary_loss_clip": 0.01111753, "auxiliary_loss_mlp": 0.01051557, "balance_loss_clip": 1.04527223, "balance_loss_mlp": 1.03088641, "epoch": 0.17014880505035324, "flos": 24900315644160.0, "grad_norm": 1.7961285206560338, "language_loss": 0.83465374, "learning_rate": 3.7974723607795983e-06, "loss": 0.85628688, "num_input_tokens_seen": 61370765, "step": 2830, "time_per_iteration": 2.795253038406372 }, { "auxiliary_loss_clip": 0.01121533, "auxiliary_loss_mlp": 0.0104408, "balance_loss_clip": 1.04901659, "balance_loss_mlp": 1.02442193, "epoch": 0.1702089283030212, "flos": 29862415981440.0, "grad_norm": 2.4873654173451727, "language_loss": 0.78360993, "learning_rate": 3.797301551737529e-06, "loss": 0.80526608, "num_input_tokens_seen": 61388935, "step": 2831, "time_per_iteration": 2.7864232063293457 }, { "auxiliary_loss_clip": 0.01123612, "auxiliary_loss_mlp": 0.01051154, "balance_loss_clip": 1.05275893, "balance_loss_mlp": 1.0311985, "epoch": 0.17026905155568917, "flos": 17744180762880.0, "grad_norm": 2.532473263441992, "language_loss": 0.79668158, "learning_rate": 3.7971306745416044e-06, "loss": 0.81842923, "num_input_tokens_seen": 61407350, "step": 2832, "time_per_iteration": 2.842217206954956 }, { "auxiliary_loss_clip": 0.01127135, "auxiliary_loss_mlp": 0.01048966, "balance_loss_clip": 1.05029321, "balance_loss_mlp": 1.02984488, "epoch": 0.17032917480835713, "flos": 23148665003520.0, "grad_norm": 1.8387196201649116, "language_loss": 0.88638175, "learning_rate": 3.7969597291983046e-06, "loss": 0.90814275, "num_input_tokens_seen": 61429010, "step": 2833, "time_per_iteration": 2.75942325592041 }, { "auxiliary_loss_clip": 0.01158799, "auxiliary_loss_mlp": 0.01046883, "balance_loss_clip": 1.05633831, "balance_loss_mlp": 1.02842951, "epoch": 0.1703892980610251, "flos": 39202565512320.0, "grad_norm": 2.49094605220443, "language_loss": 0.71924698, "learning_rate": 3.7967887157141115e-06, "loss": 0.74130386, "num_input_tokens_seen": 61450040, "step": 2834, "time_per_iteration": 2.9035184383392334 }, { "auxiliary_loss_clip": 0.01119873, "auxiliary_loss_mlp": 0.01052215, "balance_loss_clip": 1.05165124, "balance_loss_mlp": 1.03428626, "epoch": 0.17044942131369306, "flos": 23039101543680.0, "grad_norm": 1.9093816511111852, "language_loss": 0.86831236, "learning_rate": 3.7966176340955106e-06, "loss": 0.89003325, "num_input_tokens_seen": 61468585, "step": 2835, "time_per_iteration": 2.7627484798431396 }, { "auxiliary_loss_clip": 0.01149332, "auxiliary_loss_mlp": 0.01049844, "balance_loss_clip": 1.0536654, "balance_loss_mlp": 1.02887547, "epoch": 0.17050954456636103, "flos": 17054983532160.0, "grad_norm": 2.1227367002258153, "language_loss": 0.74483943, "learning_rate": 3.796446484348989e-06, "loss": 0.76683116, "num_input_tokens_seen": 61486330, "step": 2836, "time_per_iteration": 2.6748619079589844 }, { "auxiliary_loss_clip": 0.01102249, "auxiliary_loss_mlp": 0.01049533, "balance_loss_clip": 1.04775679, "balance_loss_mlp": 1.02790809, "epoch": 0.17056966781902902, "flos": 16836969934080.0, "grad_norm": 2.1718385109372824, "language_loss": 0.79959226, "learning_rate": 3.796275266481036e-06, "loss": 0.82111007, "num_input_tokens_seen": 61503950, "step": 2837, "time_per_iteration": 2.757340908050537 }, { "auxiliary_loss_clip": 0.01144378, "auxiliary_loss_mlp": 0.01044803, "balance_loss_clip": 1.05493581, "balance_loss_mlp": 1.02644491, "epoch": 0.17062979107169698, "flos": 17712543859200.0, "grad_norm": 1.6825251002952497, "language_loss": 0.83258498, "learning_rate": 3.7961039804981456e-06, "loss": 0.85447681, "num_input_tokens_seen": 61523550, "step": 2838, "time_per_iteration": 2.705357551574707 }, { "auxiliary_loss_clip": 0.0110604, "auxiliary_loss_mlp": 0.01044889, "balance_loss_clip": 1.05217135, "balance_loss_mlp": 1.02685261, "epoch": 0.17068991432436495, "flos": 22525040050560.0, "grad_norm": 1.7789799751303759, "language_loss": 0.93788463, "learning_rate": 3.795932626406812e-06, "loss": 0.95939398, "num_input_tokens_seen": 61542720, "step": 2839, "time_per_iteration": 2.7881791591644287 }, { "auxiliary_loss_clip": 0.01126465, "auxiliary_loss_mlp": 0.01045617, "balance_loss_clip": 1.05183244, "balance_loss_mlp": 1.0250175, "epoch": 0.17075003757703291, "flos": 25882939077120.0, "grad_norm": 2.3337760403585435, "language_loss": 0.83974946, "learning_rate": 3.7957612042135336e-06, "loss": 0.86147022, "num_input_tokens_seen": 61563040, "step": 2840, "time_per_iteration": 2.7564892768859863 }, { "auxiliary_loss_clip": 0.01151834, "auxiliary_loss_mlp": 0.01044417, "balance_loss_clip": 1.05555129, "balance_loss_mlp": 1.02449679, "epoch": 0.17081016082970088, "flos": 20120713332480.0, "grad_norm": 1.9037435592597944, "language_loss": 0.76307738, "learning_rate": 3.79558971392481e-06, "loss": 0.7850399, "num_input_tokens_seen": 61581890, "step": 2841, "time_per_iteration": 2.695525646209717 }, { "auxiliary_loss_clip": 0.01136217, "auxiliary_loss_mlp": 0.01045847, "balance_loss_clip": 1.0527097, "balance_loss_mlp": 1.02744126, "epoch": 0.17087028408236885, "flos": 24936477661440.0, "grad_norm": 1.7844240011089845, "language_loss": 0.77076876, "learning_rate": 3.7954181555471443e-06, "loss": 0.79258937, "num_input_tokens_seen": 61602095, "step": 2842, "time_per_iteration": 2.773792266845703 }, { "auxiliary_loss_clip": 0.01155915, "auxiliary_loss_mlp": 0.01043896, "balance_loss_clip": 1.05616069, "balance_loss_mlp": 1.02503705, "epoch": 0.17093040733503684, "flos": 19057864872960.0, "grad_norm": 1.8430349199993477, "language_loss": 0.85694385, "learning_rate": 3.795246529087043e-06, "loss": 0.87894201, "num_input_tokens_seen": 61620400, "step": 2843, "time_per_iteration": 2.5860671997070312 }, { "auxiliary_loss_clip": 0.01154742, "auxiliary_loss_mlp": 0.01044059, "balance_loss_clip": 1.05549574, "balance_loss_mlp": 1.02608204, "epoch": 0.1709905305877048, "flos": 13078954333440.0, "grad_norm": 2.0353470349004485, "language_loss": 0.68646181, "learning_rate": 3.7950748345510126e-06, "loss": 0.70844984, "num_input_tokens_seen": 61637680, "step": 2844, "time_per_iteration": 2.5961523056030273 }, { "auxiliary_loss_clip": 0.01133396, "auxiliary_loss_mlp": 0.00778162, "balance_loss_clip": 1.05117011, "balance_loss_mlp": 1.00112617, "epoch": 0.17105065384037277, "flos": 19209336526080.0, "grad_norm": 2.027694794878894, "language_loss": 0.78771943, "learning_rate": 3.7949030719455646e-06, "loss": 0.806835, "num_input_tokens_seen": 61655630, "step": 2845, "time_per_iteration": 2.720193386077881 }, { "auxiliary_loss_clip": 0.01145033, "auxiliary_loss_mlp": 0.01047407, "balance_loss_clip": 1.05443549, "balance_loss_mlp": 1.02914453, "epoch": 0.17111077709304073, "flos": 18515183218560.0, "grad_norm": 2.2586144454646306, "language_loss": 0.7811147, "learning_rate": 3.7947312412772127e-06, "loss": 0.80303913, "num_input_tokens_seen": 61673475, "step": 2846, "time_per_iteration": 2.691033363342285 }, { "auxiliary_loss_clip": 0.01143809, "auxiliary_loss_mlp": 0.0104645, "balance_loss_clip": 1.05425262, "balance_loss_mlp": 1.02865243, "epoch": 0.1711709003457087, "flos": 25082670015360.0, "grad_norm": 2.2208975060456426, "language_loss": 0.79762948, "learning_rate": 3.794559342552472e-06, "loss": 0.8195321, "num_input_tokens_seen": 61693370, "step": 2847, "time_per_iteration": 2.7504522800445557 }, { "auxiliary_loss_clip": 0.01142651, "auxiliary_loss_mlp": 0.01045695, "balance_loss_clip": 1.05101562, "balance_loss_mlp": 1.02668071, "epoch": 0.17123102359837666, "flos": 17566387418880.0, "grad_norm": 2.4457083156230017, "language_loss": 0.8665086, "learning_rate": 3.7943873757778614e-06, "loss": 0.88839209, "num_input_tokens_seen": 61710820, "step": 2848, "time_per_iteration": 2.642946720123291 }, { "auxiliary_loss_clip": 0.0111167, "auxiliary_loss_mlp": 0.01044479, "balance_loss_clip": 1.04839015, "balance_loss_mlp": 1.02559662, "epoch": 0.17129114685104463, "flos": 26173635845760.0, "grad_norm": 3.6033710399461856, "language_loss": 0.75238276, "learning_rate": 3.794215340959902e-06, "loss": 0.77394426, "num_input_tokens_seen": 61729855, "step": 2849, "time_per_iteration": 2.7511017322540283 }, { "auxiliary_loss_clip": 0.0103263, "auxiliary_loss_mlp": 0.01006833, "balance_loss_clip": 1.02775574, "balance_loss_mlp": 1.00413883, "epoch": 0.17135127010371262, "flos": 69269710037760.0, "grad_norm": 0.7881928427119427, "language_loss": 0.57514679, "learning_rate": 3.7940432381051163e-06, "loss": 0.59554148, "num_input_tokens_seen": 61790290, "step": 2850, "time_per_iteration": 3.234609603881836 }, { "auxiliary_loss_clip": 0.01115021, "auxiliary_loss_mlp": 0.01044381, "balance_loss_clip": 1.05049884, "balance_loss_mlp": 1.02661848, "epoch": 0.1714113933563806, "flos": 23550110380800.0, "grad_norm": 2.962731712990184, "language_loss": 0.81328994, "learning_rate": 3.793871067220031e-06, "loss": 0.83488399, "num_input_tokens_seen": 61809265, "step": 2851, "time_per_iteration": 2.78957200050354 }, { "auxiliary_loss_clip": 0.01114419, "auxiliary_loss_mlp": 0.01043587, "balance_loss_clip": 1.05193233, "balance_loss_mlp": 1.02592039, "epoch": 0.17147151660904855, "flos": 21142443697920.0, "grad_norm": 2.049906502724323, "language_loss": 0.93085313, "learning_rate": 3.7936988283111764e-06, "loss": 0.95243311, "num_input_tokens_seen": 61828980, "step": 2852, "time_per_iteration": 2.8247029781341553 }, { "auxiliary_loss_clip": 0.01123258, "auxiliary_loss_mlp": 0.01048953, "balance_loss_clip": 1.04961288, "balance_loss_mlp": 1.03045225, "epoch": 0.17153163986171652, "flos": 18624890332800.0, "grad_norm": 1.8770741979814063, "language_loss": 0.69465554, "learning_rate": 3.7935265213850817e-06, "loss": 0.71637762, "num_input_tokens_seen": 61847915, "step": 2853, "time_per_iteration": 2.814162492752075 }, { "auxiliary_loss_clip": 0.01120856, "auxiliary_loss_mlp": 0.0104692, "balance_loss_clip": 1.05593121, "balance_loss_mlp": 1.02899122, "epoch": 0.17159176311438448, "flos": 18223265387520.0, "grad_norm": 2.5884803351111705, "language_loss": 0.66611075, "learning_rate": 3.7933541464482815e-06, "loss": 0.68778855, "num_input_tokens_seen": 61865570, "step": 2854, "time_per_iteration": 2.7968995571136475 }, { "auxiliary_loss_clip": 0.01120742, "auxiliary_loss_mlp": 0.01052217, "balance_loss_clip": 1.04853106, "balance_loss_mlp": 1.0349679, "epoch": 0.17165188636705245, "flos": 20738987159040.0, "grad_norm": 1.705510390491261, "language_loss": 0.8929621, "learning_rate": 3.7931817035073124e-06, "loss": 0.91469175, "num_input_tokens_seen": 61883340, "step": 2855, "time_per_iteration": 2.7045016288757324 }, { "auxiliary_loss_clip": 0.01157319, "auxiliary_loss_mlp": 0.01043813, "balance_loss_clip": 1.05505848, "balance_loss_mlp": 1.02662265, "epoch": 0.17171200961972044, "flos": 24899884680960.0, "grad_norm": 2.117219134143716, "language_loss": 0.83963835, "learning_rate": 3.7930091925687134e-06, "loss": 0.86164963, "num_input_tokens_seen": 61900610, "step": 2856, "time_per_iteration": 2.7349936962127686 }, { "auxiliary_loss_clip": 0.01150108, "auxiliary_loss_mlp": 0.0104615, "balance_loss_clip": 1.05812418, "balance_loss_mlp": 1.02783966, "epoch": 0.1717721328723884, "flos": 20157234485760.0, "grad_norm": 2.234025867710235, "language_loss": 0.86309886, "learning_rate": 3.792836613639026e-06, "loss": 0.88506144, "num_input_tokens_seen": 61916795, "step": 2857, "time_per_iteration": 2.749356746673584 }, { "auxiliary_loss_clip": 0.01144467, "auxiliary_loss_mlp": 0.0105057, "balance_loss_clip": 1.05469525, "balance_loss_mlp": 1.0324626, "epoch": 0.17183225612505637, "flos": 23361650697600.0, "grad_norm": 2.069122070501307, "language_loss": 0.78334701, "learning_rate": 3.7926639667247947e-06, "loss": 0.80529737, "num_input_tokens_seen": 61936665, "step": 2858, "time_per_iteration": 2.6673583984375 }, { "auxiliary_loss_clip": 0.01147374, "auxiliary_loss_mlp": 0.0105371, "balance_loss_clip": 1.05591416, "balance_loss_mlp": 1.03263378, "epoch": 0.17189237937772434, "flos": 18114240631680.0, "grad_norm": 2.1629422323642453, "language_loss": 0.77565676, "learning_rate": 3.7924912518325663e-06, "loss": 0.79766762, "num_input_tokens_seen": 61954415, "step": 2859, "time_per_iteration": 2.646648645401001 }, { "auxiliary_loss_clip": 0.0110879, "auxiliary_loss_mlp": 0.01047481, "balance_loss_clip": 1.05317724, "balance_loss_mlp": 1.02887201, "epoch": 0.1719525026303923, "flos": 23258408031360.0, "grad_norm": 2.088627069497316, "language_loss": 0.77088714, "learning_rate": 3.7923184689688902e-06, "loss": 0.79244983, "num_input_tokens_seen": 61973940, "step": 2860, "time_per_iteration": 2.7671573162078857 }, { "auxiliary_loss_clip": 0.01145562, "auxiliary_loss_mlp": 0.01042048, "balance_loss_clip": 1.05316472, "balance_loss_mlp": 1.02416611, "epoch": 0.17201262588306027, "flos": 20810413353600.0, "grad_norm": 2.1608688480628304, "language_loss": 0.81384242, "learning_rate": 3.792145618140317e-06, "loss": 0.83571851, "num_input_tokens_seen": 61991845, "step": 2861, "time_per_iteration": 2.6492061614990234 }, { "auxiliary_loss_clip": 0.011306, "auxiliary_loss_mlp": 0.01051558, "balance_loss_clip": 1.05280077, "balance_loss_mlp": 1.0335927, "epoch": 0.17207274913572823, "flos": 20375858615040.0, "grad_norm": 2.0128324416816192, "language_loss": 0.85691392, "learning_rate": 3.7919726993534038e-06, "loss": 0.87873554, "num_input_tokens_seen": 62009395, "step": 2862, "time_per_iteration": 4.290126323699951 }, { "auxiliary_loss_clip": 0.01116765, "auxiliary_loss_mlp": 0.01043444, "balance_loss_clip": 1.05126834, "balance_loss_mlp": 1.02655208, "epoch": 0.17213287238839622, "flos": 26797727675520.0, "grad_norm": 3.7047120479299993, "language_loss": 0.78047049, "learning_rate": 3.7917997126147054e-06, "loss": 0.80207253, "num_input_tokens_seen": 62029005, "step": 2863, "time_per_iteration": 4.275500774383545 }, { "auxiliary_loss_clip": 0.01122315, "auxiliary_loss_mlp": 0.00776596, "balance_loss_clip": 1.05132961, "balance_loss_mlp": 1.00090909, "epoch": 0.1721929956410642, "flos": 26030819370240.0, "grad_norm": 1.7350128683820358, "language_loss": 0.72135127, "learning_rate": 3.7916266579307823e-06, "loss": 0.74034035, "num_input_tokens_seen": 62048730, "step": 2864, "time_per_iteration": 4.414710998535156 }, { "auxiliary_loss_clip": 0.01121488, "auxiliary_loss_mlp": 0.01049611, "balance_loss_clip": 1.05114079, "balance_loss_mlp": 1.03099, "epoch": 0.17225311889373215, "flos": 22273091078400.0, "grad_norm": 1.9270646210248614, "language_loss": 0.73002023, "learning_rate": 3.7914535353081973e-06, "loss": 0.75173128, "num_input_tokens_seen": 62069000, "step": 2865, "time_per_iteration": 2.7463715076446533 }, { "auxiliary_loss_clip": 0.01145037, "auxiliary_loss_mlp": 0.0077644, "balance_loss_clip": 1.05669165, "balance_loss_mlp": 1.00120521, "epoch": 0.17231324214640012, "flos": 21287774125440.0, "grad_norm": 2.669585642962841, "language_loss": 0.78357804, "learning_rate": 3.7912803447535145e-06, "loss": 0.80279285, "num_input_tokens_seen": 62086750, "step": 2866, "time_per_iteration": 2.785146713256836 }, { "auxiliary_loss_clip": 0.01157272, "auxiliary_loss_mlp": 0.01044358, "balance_loss_clip": 1.05600274, "balance_loss_mlp": 1.02536821, "epoch": 0.17237336539906808, "flos": 19680735640320.0, "grad_norm": 2.551277931358127, "language_loss": 0.79755104, "learning_rate": 3.7911070862733016e-06, "loss": 0.81956732, "num_input_tokens_seen": 62106240, "step": 2867, "time_per_iteration": 4.3145318031311035 }, { "auxiliary_loss_clip": 0.01132297, "auxiliary_loss_mlp": 0.01041396, "balance_loss_clip": 1.0529356, "balance_loss_mlp": 1.02274013, "epoch": 0.17243348865173605, "flos": 17529650784000.0, "grad_norm": 1.8689780270661371, "language_loss": 0.79206991, "learning_rate": 3.7909337598741276e-06, "loss": 0.81380683, "num_input_tokens_seen": 62124895, "step": 2868, "time_per_iteration": 2.7683827877044678 }, { "auxiliary_loss_clip": 0.01111702, "auxiliary_loss_mlp": 0.01041717, "balance_loss_clip": 1.05331647, "balance_loss_mlp": 1.02427697, "epoch": 0.17249361190440402, "flos": 18259858368000.0, "grad_norm": 2.0344588273772923, "language_loss": 0.84221756, "learning_rate": 3.7907603655625674e-06, "loss": 0.86375177, "num_input_tokens_seen": 62143510, "step": 2869, "time_per_iteration": 2.729156970977783 }, { "auxiliary_loss_clip": 0.01132999, "auxiliary_loss_mlp": 0.01048405, "balance_loss_clip": 1.0535363, "balance_loss_mlp": 1.02955842, "epoch": 0.172553735157072, "flos": 21174367910400.0, "grad_norm": 1.8935704627114847, "language_loss": 0.77299273, "learning_rate": 3.7905869033451932e-06, "loss": 0.79480684, "num_input_tokens_seen": 62162285, "step": 2870, "time_per_iteration": 2.752739191055298 }, { "auxiliary_loss_clip": 0.0115398, "auxiliary_loss_mlp": 0.01037809, "balance_loss_clip": 1.05671024, "balance_loss_mlp": 1.02110744, "epoch": 0.17261385840973997, "flos": 22273270646400.0, "grad_norm": 2.0115587398764396, "language_loss": 0.77409238, "learning_rate": 3.7904133732285857e-06, "loss": 0.79601026, "num_input_tokens_seen": 62180970, "step": 2871, "time_per_iteration": 2.660627603530884 }, { "auxiliary_loss_clip": 0.01132474, "auxiliary_loss_mlp": 0.01041073, "balance_loss_clip": 1.05313993, "balance_loss_mlp": 1.0222379, "epoch": 0.17267398166240794, "flos": 27922233830400.0, "grad_norm": 2.203011669690562, "language_loss": 0.74197829, "learning_rate": 3.7902397752193228e-06, "loss": 0.76371384, "num_input_tokens_seen": 62198965, "step": 2872, "time_per_iteration": 2.6959900856018066 }, { "auxiliary_loss_clip": 0.01150773, "auxiliary_loss_mlp": 0.01041508, "balance_loss_clip": 1.05359554, "balance_loss_mlp": 1.02362645, "epoch": 0.1727341049150759, "flos": 21945118970880.0, "grad_norm": 1.7914171074077658, "language_loss": 0.82336062, "learning_rate": 3.790066109323988e-06, "loss": 0.84528345, "num_input_tokens_seen": 62219890, "step": 2873, "time_per_iteration": 2.603564977645874 }, { "auxiliary_loss_clip": 0.01108819, "auxiliary_loss_mlp": 0.01044995, "balance_loss_clip": 1.04744792, "balance_loss_mlp": 1.02522969, "epoch": 0.17279422816774387, "flos": 18107883924480.0, "grad_norm": 3.7341652608759297, "language_loss": 0.75355422, "learning_rate": 3.7898923755491678e-06, "loss": 0.77509236, "num_input_tokens_seen": 62237140, "step": 2874, "time_per_iteration": 2.8438260555267334 }, { "auxiliary_loss_clip": 0.01159322, "auxiliary_loss_mlp": 0.01044415, "balance_loss_clip": 1.05658269, "balance_loss_mlp": 1.02404249, "epoch": 0.17285435142041183, "flos": 21835447770240.0, "grad_norm": 2.7053876793207037, "language_loss": 0.80239916, "learning_rate": 3.7897185739014487e-06, "loss": 0.82443655, "num_input_tokens_seen": 62255405, "step": 2875, "time_per_iteration": 2.625183343887329 }, { "auxiliary_loss_clip": 0.01135727, "auxiliary_loss_mlp": 0.0105273, "balance_loss_clip": 1.0535475, "balance_loss_mlp": 1.03297722, "epoch": 0.17291447467307983, "flos": 18368452160640.0, "grad_norm": 3.840653645811056, "language_loss": 0.87621164, "learning_rate": 3.7895447043874217e-06, "loss": 0.8980962, "num_input_tokens_seen": 62271280, "step": 2876, "time_per_iteration": 2.6782751083374023 }, { "auxiliary_loss_clip": 0.01136898, "auxiliary_loss_mlp": 0.01044228, "balance_loss_clip": 1.05730534, "balance_loss_mlp": 1.02559566, "epoch": 0.1729745979257478, "flos": 18624638937600.0, "grad_norm": 1.8931416121171032, "language_loss": 0.84386718, "learning_rate": 3.789370767013681e-06, "loss": 0.86567843, "num_input_tokens_seen": 62289140, "step": 2877, "time_per_iteration": 2.681131362915039 }, { "auxiliary_loss_clip": 0.01120759, "auxiliary_loss_mlp": 0.01043962, "balance_loss_clip": 1.05222571, "balance_loss_mlp": 1.02499604, "epoch": 0.17303472117841576, "flos": 22998234844800.0, "grad_norm": 2.106635210245156, "language_loss": 0.79660022, "learning_rate": 3.7891967617868204e-06, "loss": 0.81824744, "num_input_tokens_seen": 62307490, "step": 2878, "time_per_iteration": 2.8118834495544434 }, { "auxiliary_loss_clip": 0.01136112, "auxiliary_loss_mlp": 0.01047222, "balance_loss_clip": 1.05593777, "balance_loss_mlp": 1.02953172, "epoch": 0.17309484443108372, "flos": 25664386775040.0, "grad_norm": 1.9675557254753375, "language_loss": 0.70236337, "learning_rate": 3.78902268871344e-06, "loss": 0.72419673, "num_input_tokens_seen": 62328570, "step": 2879, "time_per_iteration": 2.7998502254486084 }, { "auxiliary_loss_clip": 0.01130517, "auxiliary_loss_mlp": 0.01051722, "balance_loss_clip": 1.05183411, "balance_loss_mlp": 1.03337598, "epoch": 0.1731549676837517, "flos": 13552903313280.0, "grad_norm": 2.0545155253910163, "language_loss": 0.82884222, "learning_rate": 3.78884854780014e-06, "loss": 0.85066462, "num_input_tokens_seen": 62345735, "step": 2880, "time_per_iteration": 2.6707684993743896 }, { "auxiliary_loss_clip": 0.01110706, "auxiliary_loss_mlp": 0.01054327, "balance_loss_clip": 1.05214918, "balance_loss_mlp": 1.03303647, "epoch": 0.17321509093641965, "flos": 22857070394880.0, "grad_norm": 1.9029231217608267, "language_loss": 0.80879176, "learning_rate": 3.7886743390535236e-06, "loss": 0.83044201, "num_input_tokens_seen": 62365525, "step": 2881, "time_per_iteration": 2.7851576805114746 }, { "auxiliary_loss_clip": 0.01135983, "auxiliary_loss_mlp": 0.01046895, "balance_loss_clip": 1.05544055, "balance_loss_mlp": 1.02921653, "epoch": 0.17327521418908762, "flos": 24352785653760.0, "grad_norm": 2.753231520615002, "language_loss": 0.77268815, "learning_rate": 3.788500062480197e-06, "loss": 0.79451692, "num_input_tokens_seen": 62385160, "step": 2882, "time_per_iteration": 2.7785212993621826 }, { "auxiliary_loss_clip": 0.01124099, "auxiliary_loss_mlp": 0.01047516, "balance_loss_clip": 1.0633558, "balance_loss_mlp": 1.02947998, "epoch": 0.1733353374417556, "flos": 33105651816960.0, "grad_norm": 2.096311926604511, "language_loss": 0.76714236, "learning_rate": 3.788325718086769e-06, "loss": 0.78885853, "num_input_tokens_seen": 62405280, "step": 2883, "time_per_iteration": 2.838848352432251 }, { "auxiliary_loss_clip": 0.01110924, "auxiliary_loss_mlp": 0.0104619, "balance_loss_clip": 1.04929209, "balance_loss_mlp": 1.02821302, "epoch": 0.17339546069442358, "flos": 24388947671040.0, "grad_norm": 2.1194201700326873, "language_loss": 0.8555252, "learning_rate": 3.7881513058798503e-06, "loss": 0.87709635, "num_input_tokens_seen": 62423665, "step": 2884, "time_per_iteration": 2.829376220703125 }, { "auxiliary_loss_clip": 0.01133962, "auxiliary_loss_mlp": 0.00775817, "balance_loss_clip": 1.05472779, "balance_loss_mlp": 1.00088096, "epoch": 0.17345558394709154, "flos": 27454174680960.0, "grad_norm": 1.7131036779262108, "language_loss": 0.74756771, "learning_rate": 3.787976825866055e-06, "loss": 0.76666546, "num_input_tokens_seen": 62445170, "step": 2885, "time_per_iteration": 2.8710989952087402 }, { "auxiliary_loss_clip": 0.01128977, "auxiliary_loss_mlp": 0.01044901, "balance_loss_clip": 1.05498922, "balance_loss_mlp": 1.0280925, "epoch": 0.1735157071997595, "flos": 24682158391680.0, "grad_norm": 2.374438581614022, "language_loss": 0.7107017, "learning_rate": 3.7878022780519998e-06, "loss": 0.73244053, "num_input_tokens_seen": 62466135, "step": 2886, "time_per_iteration": 2.726621150970459 }, { "auxiliary_loss_clip": 0.01142411, "auxiliary_loss_mlp": 0.01041857, "balance_loss_clip": 1.05233932, "balance_loss_mlp": 1.02408338, "epoch": 0.17357583045242747, "flos": 21688932193920.0, "grad_norm": 2.0566537172661747, "language_loss": 0.69906294, "learning_rate": 3.7876276624443024e-06, "loss": 0.72090566, "num_input_tokens_seen": 62483910, "step": 2887, "time_per_iteration": 2.7066688537597656 }, { "auxiliary_loss_clip": 0.01116425, "auxiliary_loss_mlp": 0.01045383, "balance_loss_clip": 1.05328536, "balance_loss_mlp": 1.02728677, "epoch": 0.17363595370509544, "flos": 15375728753280.0, "grad_norm": 2.038016964464323, "language_loss": 0.85257947, "learning_rate": 3.787452979049585e-06, "loss": 0.87419748, "num_input_tokens_seen": 62501530, "step": 2888, "time_per_iteration": 2.7514970302581787 }, { "auxiliary_loss_clip": 0.01095063, "auxiliary_loss_mlp": 0.01049413, "balance_loss_clip": 1.05020595, "balance_loss_mlp": 1.02822983, "epoch": 0.1736960769577634, "flos": 23440941970560.0, "grad_norm": 2.196318077733749, "language_loss": 0.78491282, "learning_rate": 3.7872782278744718e-06, "loss": 0.80635762, "num_input_tokens_seen": 62521295, "step": 2889, "time_per_iteration": 2.8221559524536133 }, { "auxiliary_loss_clip": 0.01112139, "auxiliary_loss_mlp": 0.0077601, "balance_loss_clip": 1.05236733, "balance_loss_mlp": 1.00114667, "epoch": 0.1737562002104314, "flos": 18587830475520.0, "grad_norm": 2.333227367674716, "language_loss": 0.84076989, "learning_rate": 3.7871034089255883e-06, "loss": 0.85965133, "num_input_tokens_seen": 62539615, "step": 2890, "time_per_iteration": 2.7213382720947266 }, { "auxiliary_loss_clip": 0.01142218, "auxiliary_loss_mlp": 0.01054918, "balance_loss_clip": 1.05530691, "balance_loss_mlp": 1.03752589, "epoch": 0.17381632346309936, "flos": 15998060816640.0, "grad_norm": 2.7278091568285596, "language_loss": 0.82205319, "learning_rate": 3.7869285222095653e-06, "loss": 0.84402454, "num_input_tokens_seen": 62556820, "step": 2891, "time_per_iteration": 2.625162363052368 }, { "auxiliary_loss_clip": 0.01097361, "auxiliary_loss_mlp": 0.01050012, "balance_loss_clip": 1.04281187, "balance_loss_mlp": 1.02876878, "epoch": 0.17387644671576732, "flos": 13369830670080.0, "grad_norm": 1.9017653264876209, "language_loss": 0.81200826, "learning_rate": 3.7867535677330334e-06, "loss": 0.83348203, "num_input_tokens_seen": 62572450, "step": 2892, "time_per_iteration": 2.7682459354400635 }, { "auxiliary_loss_clip": 0.01148834, "auxiliary_loss_mlp": 0.0105551, "balance_loss_clip": 1.05707812, "balance_loss_mlp": 1.03631687, "epoch": 0.1739365699684353, "flos": 26615516958720.0, "grad_norm": 2.0056711213447436, "language_loss": 0.73950225, "learning_rate": 3.786578545502627e-06, "loss": 0.76154572, "num_input_tokens_seen": 62592580, "step": 2893, "time_per_iteration": 2.8463022708892822 }, { "auxiliary_loss_clip": 0.01132474, "auxiliary_loss_mlp": 0.01043509, "balance_loss_clip": 1.05198765, "balance_loss_mlp": 1.02443516, "epoch": 0.17399669322110325, "flos": 23367971491200.0, "grad_norm": 4.010773627073901, "language_loss": 0.82507658, "learning_rate": 3.7864034555249828e-06, "loss": 0.84683645, "num_input_tokens_seen": 62611220, "step": 2894, "time_per_iteration": 2.719564914703369 }, { "auxiliary_loss_clip": 0.01113951, "auxiliary_loss_mlp": 0.01046249, "balance_loss_clip": 1.0506922, "balance_loss_mlp": 1.02463603, "epoch": 0.17405681647377122, "flos": 22054107813120.0, "grad_norm": 2.3322053123967574, "language_loss": 0.73826683, "learning_rate": 3.786228297806741e-06, "loss": 0.7598688, "num_input_tokens_seen": 62629185, "step": 2895, "time_per_iteration": 2.743992805480957 }, { "auxiliary_loss_clip": 0.01037578, "auxiliary_loss_mlp": 0.01011099, "balance_loss_clip": 1.0404408, "balance_loss_mlp": 1.00788069, "epoch": 0.1741169397264392, "flos": 61457559114240.0, "grad_norm": 0.8765647158253519, "language_loss": 0.62754023, "learning_rate": 3.7860530723545435e-06, "loss": 0.64802706, "num_input_tokens_seen": 62691895, "step": 2896, "time_per_iteration": 3.345099687576294 }, { "auxiliary_loss_clip": 0.0113101, "auxiliary_loss_mlp": 0.00776588, "balance_loss_clip": 1.05246758, "balance_loss_mlp": 1.00102258, "epoch": 0.17417706297910718, "flos": 27017680608000.0, "grad_norm": 1.7338863964520728, "language_loss": 0.75822324, "learning_rate": 3.785877779175034e-06, "loss": 0.77729923, "num_input_tokens_seen": 62713790, "step": 2897, "time_per_iteration": 2.772292137145996 }, { "auxiliary_loss_clip": 0.01141357, "auxiliary_loss_mlp": 0.01042983, "balance_loss_clip": 1.0545547, "balance_loss_mlp": 1.02512598, "epoch": 0.17423718623177514, "flos": 33508856960640.0, "grad_norm": 1.944569306659421, "language_loss": 0.6883949, "learning_rate": 3.7857024182748606e-06, "loss": 0.71023834, "num_input_tokens_seen": 62736285, "step": 2898, "time_per_iteration": 2.7278554439544678 }, { "auxiliary_loss_clip": 0.01128715, "auxiliary_loss_mlp": 0.01044216, "balance_loss_clip": 1.05251193, "balance_loss_mlp": 1.02504694, "epoch": 0.1742973094844431, "flos": 27198634348800.0, "grad_norm": 2.99011081330885, "language_loss": 0.76445562, "learning_rate": 3.7855269896606717e-06, "loss": 0.78618491, "num_input_tokens_seen": 62756240, "step": 2899, "time_per_iteration": 2.8052010536193848 }, { "auxiliary_loss_clip": 0.01095069, "auxiliary_loss_mlp": 0.01045896, "balance_loss_clip": 1.04680347, "balance_loss_mlp": 1.02632213, "epoch": 0.17435743273711107, "flos": 22710734386560.0, "grad_norm": 3.2965812335226357, "language_loss": 0.72860038, "learning_rate": 3.785351493339121e-06, "loss": 0.75001007, "num_input_tokens_seen": 62775910, "step": 2900, "time_per_iteration": 2.868218421936035 }, { "auxiliary_loss_clip": 0.01110522, "auxiliary_loss_mlp": 0.00776698, "balance_loss_clip": 1.05202782, "balance_loss_mlp": 1.000983, "epoch": 0.17441755598977904, "flos": 41646466039680.0, "grad_norm": 1.5488662608930523, "language_loss": 0.69946706, "learning_rate": 3.785175929316863e-06, "loss": 0.71833932, "num_input_tokens_seen": 62799385, "step": 2901, "time_per_iteration": 4.407040596008301 }, { "auxiliary_loss_clip": 0.01129098, "auxiliary_loss_mlp": 0.01045525, "balance_loss_clip": 1.05246592, "balance_loss_mlp": 1.02764344, "epoch": 0.174477679242447, "flos": 26287077974400.0, "grad_norm": 2.1785959913748965, "language_loss": 0.76588804, "learning_rate": 3.7850002976005543e-06, "loss": 0.78763425, "num_input_tokens_seen": 62819380, "step": 2902, "time_per_iteration": 4.2244462966918945 }, { "auxiliary_loss_clip": 0.01145685, "auxiliary_loss_mlp": 0.0104382, "balance_loss_clip": 1.0531354, "balance_loss_mlp": 1.02567625, "epoch": 0.174537802495115, "flos": 17858412990720.0, "grad_norm": 2.2508699895191073, "language_loss": 0.81588745, "learning_rate": 3.7848245981968558e-06, "loss": 0.83778256, "num_input_tokens_seen": 62836205, "step": 2903, "time_per_iteration": 4.132925271987915 }, { "auxiliary_loss_clip": 0.01126443, "auxiliary_loss_mlp": 0.0103942, "balance_loss_clip": 1.05449986, "balance_loss_mlp": 1.02135992, "epoch": 0.17459792574778296, "flos": 16940715390720.0, "grad_norm": 2.4085694554154187, "language_loss": 0.73316491, "learning_rate": 3.784648831112429e-06, "loss": 0.75482351, "num_input_tokens_seen": 62854045, "step": 2904, "time_per_iteration": 2.7033374309539795 }, { "auxiliary_loss_clip": 0.01105192, "auxiliary_loss_mlp": 0.0104577, "balance_loss_clip": 1.05250716, "balance_loss_mlp": 1.02822256, "epoch": 0.17465804900045093, "flos": 25520026014720.0, "grad_norm": 1.8783326609306377, "language_loss": 0.64233291, "learning_rate": 3.7844729963539406e-06, "loss": 0.66384256, "num_input_tokens_seen": 62873075, "step": 2905, "time_per_iteration": 2.8325791358947754 }, { "auxiliary_loss_clip": 0.01135256, "auxiliary_loss_mlp": 0.01053006, "balance_loss_clip": 1.05869055, "balance_loss_mlp": 1.03370619, "epoch": 0.1747181722531189, "flos": 24129708238080.0, "grad_norm": 2.820817719352069, "language_loss": 0.79504299, "learning_rate": 3.7842970939280566e-06, "loss": 0.81692564, "num_input_tokens_seen": 62892675, "step": 2906, "time_per_iteration": 4.491498231887817 }, { "auxiliary_loss_clip": 0.01146195, "auxiliary_loss_mlp": 0.01050729, "balance_loss_clip": 1.05623174, "balance_loss_mlp": 1.03258538, "epoch": 0.17477829550578686, "flos": 17748813617280.0, "grad_norm": 2.262709441571415, "language_loss": 0.81318873, "learning_rate": 3.784121123841449e-06, "loss": 0.83515799, "num_input_tokens_seen": 62910675, "step": 2907, "time_per_iteration": 2.6855854988098145 }, { "auxiliary_loss_clip": 0.01143202, "auxiliary_loss_mlp": 0.01043315, "balance_loss_clip": 1.05374384, "balance_loss_mlp": 1.0253861, "epoch": 0.17483841875845482, "flos": 15377344865280.0, "grad_norm": 2.068635027461873, "language_loss": 0.81342787, "learning_rate": 3.7839450861007886e-06, "loss": 0.83529305, "num_input_tokens_seen": 62928130, "step": 2908, "time_per_iteration": 2.6449570655822754 }, { "auxiliary_loss_clip": 0.01127136, "auxiliary_loss_mlp": 0.01050925, "balance_loss_clip": 1.05178046, "balance_loss_mlp": 1.03163743, "epoch": 0.17489854201112282, "flos": 17163254102400.0, "grad_norm": 3.147433356867123, "language_loss": 0.80020624, "learning_rate": 3.7837689807127518e-06, "loss": 0.82198691, "num_input_tokens_seen": 62944290, "step": 2909, "time_per_iteration": 2.6820569038391113 }, { "auxiliary_loss_clip": 0.0109059, "auxiliary_loss_mlp": 0.01052625, "balance_loss_clip": 1.05020881, "balance_loss_mlp": 1.0310595, "epoch": 0.17495866526379078, "flos": 19755286318080.0, "grad_norm": 1.6978440546881337, "language_loss": 0.76742244, "learning_rate": 3.783592807684017e-06, "loss": 0.7888546, "num_input_tokens_seen": 62963505, "step": 2910, "time_per_iteration": 2.6980416774749756 }, { "auxiliary_loss_clip": 0.01158552, "auxiliary_loss_mlp": 0.01049407, "balance_loss_clip": 1.05618358, "balance_loss_mlp": 1.03059566, "epoch": 0.17501878851645875, "flos": 28511133310080.0, "grad_norm": 1.9812610358315632, "language_loss": 0.8698765, "learning_rate": 3.7834165670212645e-06, "loss": 0.89195609, "num_input_tokens_seen": 62985020, "step": 2911, "time_per_iteration": 2.692662477493286 }, { "auxiliary_loss_clip": 0.01154744, "auxiliary_loss_mlp": 0.00777232, "balance_loss_clip": 1.05323184, "balance_loss_mlp": 1.00110698, "epoch": 0.1750789117691267, "flos": 17931203902080.0, "grad_norm": 3.030740090796483, "language_loss": 0.89883876, "learning_rate": 3.7832402587311764e-06, "loss": 0.91815847, "num_input_tokens_seen": 63001745, "step": 2912, "time_per_iteration": 2.600738763809204 }, { "auxiliary_loss_clip": 0.01146165, "auxiliary_loss_mlp": 0.01045616, "balance_loss_clip": 1.0538094, "balance_loss_mlp": 1.02655411, "epoch": 0.17513903502179468, "flos": 18259427404800.0, "grad_norm": 2.03479884577424, "language_loss": 0.72818935, "learning_rate": 3.783063882820439e-06, "loss": 0.75010711, "num_input_tokens_seen": 63019750, "step": 2913, "time_per_iteration": 2.623342275619507 }, { "auxiliary_loss_clip": 0.01140074, "auxiliary_loss_mlp": 0.01043928, "balance_loss_clip": 1.05781865, "balance_loss_mlp": 1.02557003, "epoch": 0.17519915827446264, "flos": 20704728562560.0, "grad_norm": 2.137073079496124, "language_loss": 0.6891731, "learning_rate": 3.782887439295741e-06, "loss": 0.71101314, "num_input_tokens_seen": 63039500, "step": 2914, "time_per_iteration": 2.7065770626068115 }, { "auxiliary_loss_clip": 0.01142434, "auxiliary_loss_mlp": 0.01045043, "balance_loss_clip": 1.05532789, "balance_loss_mlp": 1.02649403, "epoch": 0.1752592815271306, "flos": 20523415685760.0, "grad_norm": 2.051329837479214, "language_loss": 0.93125081, "learning_rate": 3.782710928163772e-06, "loss": 0.9531256, "num_input_tokens_seen": 63059785, "step": 2915, "time_per_iteration": 2.659029245376587 }, { "auxiliary_loss_clip": 0.01114731, "auxiliary_loss_mlp": 0.01040999, "balance_loss_clip": 1.04957223, "balance_loss_mlp": 1.02243853, "epoch": 0.1753194047797986, "flos": 21799178012160.0, "grad_norm": 1.604344576738792, "language_loss": 0.81092978, "learning_rate": 3.782534349431226e-06, "loss": 0.83248705, "num_input_tokens_seen": 63079385, "step": 2916, "time_per_iteration": 2.7099549770355225 }, { "auxiliary_loss_clip": 0.0114211, "auxiliary_loss_mlp": 0.01046221, "balance_loss_clip": 1.05090034, "balance_loss_mlp": 1.02780342, "epoch": 0.17537952803246656, "flos": 20668351063680.0, "grad_norm": 3.7582760939418716, "language_loss": 0.73829222, "learning_rate": 3.782357703104799e-06, "loss": 0.76017547, "num_input_tokens_seen": 63098970, "step": 2917, "time_per_iteration": 2.666717767715454 }, { "auxiliary_loss_clip": 0.01133449, "auxiliary_loss_mlp": 0.01047353, "balance_loss_clip": 1.05319786, "balance_loss_mlp": 1.02821994, "epoch": 0.17543965128513453, "flos": 23295072839040.0, "grad_norm": 1.813699779869167, "language_loss": 0.76739681, "learning_rate": 3.7821809891911897e-06, "loss": 0.78920484, "num_input_tokens_seen": 63118750, "step": 2918, "time_per_iteration": 2.647634744644165 }, { "auxiliary_loss_clip": 0.01093958, "auxiliary_loss_mlp": 0.01045643, "balance_loss_clip": 1.0476644, "balance_loss_mlp": 1.02425694, "epoch": 0.1754997745378025, "flos": 29095615416960.0, "grad_norm": 2.436739755969174, "language_loss": 0.73624814, "learning_rate": 3.782004207697098e-06, "loss": 0.75764406, "num_input_tokens_seen": 63136865, "step": 2919, "time_per_iteration": 2.7904632091522217 }, { "auxiliary_loss_clip": 0.0112465, "auxiliary_loss_mlp": 0.01046524, "balance_loss_clip": 1.04938293, "balance_loss_mlp": 1.02805829, "epoch": 0.17555989779047046, "flos": 30371844620160.0, "grad_norm": 2.5113730227003814, "language_loss": 0.74840331, "learning_rate": 3.781827358629228e-06, "loss": 0.77011508, "num_input_tokens_seen": 63158325, "step": 2920, "time_per_iteration": 2.727890968322754 }, { "auxiliary_loss_clip": 0.01117257, "auxiliary_loss_mlp": 0.01042893, "balance_loss_clip": 1.0462867, "balance_loss_mlp": 1.02371216, "epoch": 0.17562002104313842, "flos": 23287746464640.0, "grad_norm": 3.6617213109535536, "language_loss": 0.79731411, "learning_rate": 3.7816504419942873e-06, "loss": 0.81891561, "num_input_tokens_seen": 63173115, "step": 2921, "time_per_iteration": 2.753817558288574 }, { "auxiliary_loss_clip": 0.01121718, "auxiliary_loss_mlp": 0.01046234, "balance_loss_clip": 1.05232286, "balance_loss_mlp": 1.02679133, "epoch": 0.1756801442958064, "flos": 24790500789120.0, "grad_norm": 2.6301689129577546, "language_loss": 0.87826073, "learning_rate": 3.7814734577989823e-06, "loss": 0.89994025, "num_input_tokens_seen": 63192880, "step": 2922, "time_per_iteration": 2.7411837577819824 }, { "auxiliary_loss_clip": 0.01144004, "auxiliary_loss_mlp": 0.01047403, "balance_loss_clip": 1.05196273, "balance_loss_mlp": 1.02778149, "epoch": 0.17574026754847438, "flos": 25771651764480.0, "grad_norm": 4.4893841411537085, "language_loss": 0.62347209, "learning_rate": 3.7812964060500253e-06, "loss": 0.64538622, "num_input_tokens_seen": 63214395, "step": 2923, "time_per_iteration": 2.7666683197021484 }, { "auxiliary_loss_clip": 0.01134872, "auxiliary_loss_mlp": 0.01048692, "balance_loss_clip": 1.05887377, "balance_loss_mlp": 1.02847457, "epoch": 0.17580039080114235, "flos": 17456608477440.0, "grad_norm": 2.8552131957437914, "language_loss": 0.80392253, "learning_rate": 3.78111928675413e-06, "loss": 0.82575822, "num_input_tokens_seen": 63231020, "step": 2924, "time_per_iteration": 2.729403257369995 }, { "auxiliary_loss_clip": 0.01132783, "auxiliary_loss_mlp": 0.01051456, "balance_loss_clip": 1.05193377, "balance_loss_mlp": 1.03082108, "epoch": 0.1758605140538103, "flos": 14864648088960.0, "grad_norm": 5.080042666316876, "language_loss": 0.71374178, "learning_rate": 3.7809420999180126e-06, "loss": 0.73558426, "num_input_tokens_seen": 63246245, "step": 2925, "time_per_iteration": 2.9538233280181885 }, { "auxiliary_loss_clip": 0.01117196, "auxiliary_loss_mlp": 0.01045706, "balance_loss_clip": 1.05052948, "balance_loss_mlp": 1.02744341, "epoch": 0.17592063730647828, "flos": 23004268329600.0, "grad_norm": 1.6620026542608322, "language_loss": 0.71931666, "learning_rate": 3.7807648455483934e-06, "loss": 0.74094564, "num_input_tokens_seen": 63267790, "step": 2926, "time_per_iteration": 2.7738964557647705 }, { "auxiliary_loss_clip": 0.01105944, "auxiliary_loss_mlp": 0.01045732, "balance_loss_clip": 1.04915071, "balance_loss_mlp": 1.02253425, "epoch": 0.17598076055914624, "flos": 20741501111040.0, "grad_norm": 2.6318732447225837, "language_loss": 0.84724289, "learning_rate": 3.7805875236519918e-06, "loss": 0.86875963, "num_input_tokens_seen": 63286830, "step": 2927, "time_per_iteration": 2.704437494277954 }, { "auxiliary_loss_clip": 0.01100437, "auxiliary_loss_mlp": 0.01046684, "balance_loss_clip": 1.05039644, "balance_loss_mlp": 1.02887452, "epoch": 0.1760408838118142, "flos": 34092441227520.0, "grad_norm": 1.9547597089289632, "language_loss": 0.72147644, "learning_rate": 3.7804101342355336e-06, "loss": 0.74294758, "num_input_tokens_seen": 63308870, "step": 2928, "time_per_iteration": 2.793802261352539 }, { "auxiliary_loss_clip": 0.01120251, "auxiliary_loss_mlp": 0.01045623, "balance_loss_clip": 1.0516876, "balance_loss_mlp": 1.02679992, "epoch": 0.1761010070644822, "flos": 24168384207360.0, "grad_norm": 1.8474008440192304, "language_loss": 0.83097279, "learning_rate": 3.780232677305744e-06, "loss": 0.85263157, "num_input_tokens_seen": 63329005, "step": 2929, "time_per_iteration": 2.733339786529541 }, { "auxiliary_loss_clip": 0.01124127, "auxiliary_loss_mlp": 0.01042521, "balance_loss_clip": 1.04853475, "balance_loss_mlp": 1.02479422, "epoch": 0.17616113031715017, "flos": 26576697335040.0, "grad_norm": 2.4427170552109163, "language_loss": 0.79211783, "learning_rate": 3.7800551528693535e-06, "loss": 0.81378424, "num_input_tokens_seen": 63349390, "step": 2930, "time_per_iteration": 2.748080015182495 }, { "auxiliary_loss_clip": 0.01160654, "auxiliary_loss_mlp": 0.01047281, "balance_loss_clip": 1.05925918, "balance_loss_mlp": 1.02758813, "epoch": 0.17622125356981813, "flos": 25666685245440.0, "grad_norm": 2.504124366499191, "language_loss": 0.76502466, "learning_rate": 3.7798775609330927e-06, "loss": 0.78710401, "num_input_tokens_seen": 63368835, "step": 2931, "time_per_iteration": 2.6691603660583496 }, { "auxiliary_loss_clip": 0.01076453, "auxiliary_loss_mlp": 0.01043586, "balance_loss_clip": 1.04577017, "balance_loss_mlp": 1.02478647, "epoch": 0.1762813768224861, "flos": 16508530949760.0, "grad_norm": 2.941321746162514, "language_loss": 0.76070881, "learning_rate": 3.779699901503696e-06, "loss": 0.78190923, "num_input_tokens_seen": 63385220, "step": 2932, "time_per_iteration": 2.809630870819092 }, { "auxiliary_loss_clip": 0.01148627, "auxiliary_loss_mlp": 0.01043149, "balance_loss_clip": 1.05284405, "balance_loss_mlp": 1.0229789, "epoch": 0.17634150007515406, "flos": 11211850402560.0, "grad_norm": 5.168612276821382, "language_loss": 0.90027422, "learning_rate": 3.7795221745879016e-06, "loss": 0.92219198, "num_input_tokens_seen": 63400865, "step": 2933, "time_per_iteration": 2.6665337085723877 }, { "auxiliary_loss_clip": 0.01154114, "auxiliary_loss_mlp": 0.01055985, "balance_loss_clip": 1.05539656, "balance_loss_mlp": 1.03766203, "epoch": 0.17640162332782203, "flos": 23659925235840.0, "grad_norm": 2.009210784374188, "language_loss": 0.88323247, "learning_rate": 3.779344380192448e-06, "loss": 0.90533352, "num_input_tokens_seen": 63421390, "step": 2934, "time_per_iteration": 2.6649580001831055 }, { "auxiliary_loss_clip": 0.01128495, "auxiliary_loss_mlp": 0.01048067, "balance_loss_clip": 1.05581188, "balance_loss_mlp": 1.03028131, "epoch": 0.17646174658049, "flos": 53796984606720.0, "grad_norm": 1.6302121247923247, "language_loss": 0.70403945, "learning_rate": 3.779166518324077e-06, "loss": 0.72580504, "num_input_tokens_seen": 63444715, "step": 2935, "time_per_iteration": 3.006019115447998 }, { "auxiliary_loss_clip": 0.01126189, "auxiliary_loss_mlp": 0.01040034, "balance_loss_clip": 1.05360174, "balance_loss_mlp": 1.02135396, "epoch": 0.17652186983315798, "flos": 24243868638720.0, "grad_norm": 2.5931578566124807, "language_loss": 0.69721985, "learning_rate": 3.7789885889895325e-06, "loss": 0.71888208, "num_input_tokens_seen": 63465525, "step": 2936, "time_per_iteration": 2.7517428398132324 }, { "auxiliary_loss_clip": 0.01105644, "auxiliary_loss_mlp": 0.01045896, "balance_loss_clip": 1.05023837, "balance_loss_mlp": 1.02737129, "epoch": 0.17658199308582595, "flos": 27454282421760.0, "grad_norm": 1.9170676229980566, "language_loss": 0.71288073, "learning_rate": 3.7788105921955634e-06, "loss": 0.73439616, "num_input_tokens_seen": 63485815, "step": 2937, "time_per_iteration": 2.837181329727173 }, { "auxiliary_loss_clip": 0.01141008, "auxiliary_loss_mlp": 0.01046843, "balance_loss_clip": 1.05945122, "balance_loss_mlp": 1.02674472, "epoch": 0.17664211633849392, "flos": 22418672901120.0, "grad_norm": 2.267148270780071, "language_loss": 0.75439745, "learning_rate": 3.7786325279489184e-06, "loss": 0.77627593, "num_input_tokens_seen": 63503905, "step": 2938, "time_per_iteration": 2.883162021636963 }, { "auxiliary_loss_clip": 0.01147345, "auxiliary_loss_mlp": 0.01043976, "balance_loss_clip": 1.05576169, "balance_loss_mlp": 1.02553487, "epoch": 0.17670223959116188, "flos": 24715124098560.0, "grad_norm": 2.921726967662053, "language_loss": 0.71015209, "learning_rate": 3.7784543962563495e-06, "loss": 0.73206532, "num_input_tokens_seen": 63521985, "step": 2939, "time_per_iteration": 2.6938419342041016 }, { "auxiliary_loss_clip": 0.01160437, "auxiliary_loss_mlp": 0.01046921, "balance_loss_clip": 1.05818558, "balance_loss_mlp": 1.02794337, "epoch": 0.17676236284382985, "flos": 22527051212160.0, "grad_norm": 3.114901170192376, "language_loss": 0.73513985, "learning_rate": 3.7782761971246115e-06, "loss": 0.75721341, "num_input_tokens_seen": 63539830, "step": 2940, "time_per_iteration": 4.145469665527344 }, { "auxiliary_loss_clip": 0.0112582, "auxiliary_loss_mlp": 0.01046611, "balance_loss_clip": 1.05631542, "balance_loss_mlp": 1.02731109, "epoch": 0.1768224860964978, "flos": 12385160161920.0, "grad_norm": 3.071469776016301, "language_loss": 0.85375023, "learning_rate": 3.7780979305604616e-06, "loss": 0.87547457, "num_input_tokens_seen": 63555495, "step": 2941, "time_per_iteration": 4.279599666595459 }, { "auxiliary_loss_clip": 0.01161068, "auxiliary_loss_mlp": 0.01045254, "balance_loss_clip": 1.05717027, "balance_loss_mlp": 1.0257628, "epoch": 0.1768826093491658, "flos": 24353360271360.0, "grad_norm": 2.434766510066968, "language_loss": 0.76885259, "learning_rate": 3.7779195965706607e-06, "loss": 0.79091585, "num_input_tokens_seen": 63575290, "step": 2942, "time_per_iteration": 4.2280871868133545 }, { "auxiliary_loss_clip": 0.01106234, "auxiliary_loss_mlp": 0.00780676, "balance_loss_clip": 1.04992843, "balance_loss_mlp": 1.00087166, "epoch": 0.17694273260183377, "flos": 23587062497280.0, "grad_norm": 3.301743041114179, "language_loss": 0.8024286, "learning_rate": 3.77774119516197e-06, "loss": 0.82129776, "num_input_tokens_seen": 63594670, "step": 2943, "time_per_iteration": 2.8921029567718506 }, { "auxiliary_loss_clip": 0.01132848, "auxiliary_loss_mlp": 0.01052225, "balance_loss_clip": 1.05352235, "balance_loss_mlp": 1.03124392, "epoch": 0.17700285585450173, "flos": 26760991040640.0, "grad_norm": 5.7613375603973465, "language_loss": 0.80809408, "learning_rate": 3.777562726341155e-06, "loss": 0.82994485, "num_input_tokens_seen": 63614780, "step": 2944, "time_per_iteration": 2.692831039428711 }, { "auxiliary_loss_clip": 0.01161854, "auxiliary_loss_mlp": 0.01056825, "balance_loss_clip": 1.05807233, "balance_loss_mlp": 1.03796625, "epoch": 0.1770629791071697, "flos": 42776323320960.0, "grad_norm": 2.4257754996125227, "language_loss": 0.73812854, "learning_rate": 3.7773841901149835e-06, "loss": 0.7603153, "num_input_tokens_seen": 63637190, "step": 2945, "time_per_iteration": 2.782910108566284 }, { "auxiliary_loss_clip": 0.011481, "auxiliary_loss_mlp": 0.01047361, "balance_loss_clip": 1.05756998, "balance_loss_mlp": 1.02862108, "epoch": 0.17712310235983766, "flos": 17345572560000.0, "grad_norm": 2.8106797532110637, "language_loss": 0.7793628, "learning_rate": 3.7772055864902256e-06, "loss": 0.80131739, "num_input_tokens_seen": 63652140, "step": 2946, "time_per_iteration": 4.278741121292114 }, { "auxiliary_loss_clip": 0.01109059, "auxiliary_loss_mlp": 0.01052842, "balance_loss_clip": 1.04997015, "balance_loss_mlp": 1.03341079, "epoch": 0.17718322561250563, "flos": 23878477537920.0, "grad_norm": 2.172386857191393, "language_loss": 0.76068008, "learning_rate": 3.7770269154736535e-06, "loss": 0.7822991, "num_input_tokens_seen": 63671700, "step": 2947, "time_per_iteration": 2.7949914932250977 }, { "auxiliary_loss_clip": 0.0114934, "auxiliary_loss_mlp": 0.01044342, "balance_loss_clip": 1.05480659, "balance_loss_mlp": 1.025388, "epoch": 0.1772433488651736, "flos": 36466352104320.0, "grad_norm": 2.6793588646204745, "language_loss": 0.72557831, "learning_rate": 3.7768481770720424e-06, "loss": 0.74751514, "num_input_tokens_seen": 63691685, "step": 2948, "time_per_iteration": 2.901662826538086 }, { "auxiliary_loss_clip": 0.01151572, "auxiliary_loss_mlp": 0.01050692, "balance_loss_clip": 1.05921662, "balance_loss_mlp": 1.03236949, "epoch": 0.1773034721178416, "flos": 26684716510080.0, "grad_norm": 1.8296543316983853, "language_loss": 0.81782824, "learning_rate": 3.776669371292171e-06, "loss": 0.8398509, "num_input_tokens_seen": 63711720, "step": 2949, "time_per_iteration": 2.7284891605377197 }, { "auxiliary_loss_clip": 0.01080853, "auxiliary_loss_mlp": 0.0100651, "balance_loss_clip": 1.04975748, "balance_loss_mlp": 1.00226629, "epoch": 0.17736359537050955, "flos": 57117467617920.0, "grad_norm": 0.768126622018234, "language_loss": 0.64989161, "learning_rate": 3.7764904981408186e-06, "loss": 0.67076528, "num_input_tokens_seen": 63776280, "step": 2950, "time_per_iteration": 3.2761552333831787 }, { "auxiliary_loss_clip": 0.01121454, "auxiliary_loss_mlp": 0.01045861, "balance_loss_clip": 1.05373287, "balance_loss_mlp": 1.02743077, "epoch": 0.17742371862317752, "flos": 27198203385600.0, "grad_norm": 2.9882590699755927, "language_loss": 0.83619881, "learning_rate": 3.7763115576247686e-06, "loss": 0.85787189, "num_input_tokens_seen": 63797535, "step": 2951, "time_per_iteration": 2.7637627124786377 }, { "auxiliary_loss_clip": 0.01125929, "auxiliary_loss_mlp": 0.01046039, "balance_loss_clip": 1.05109882, "balance_loss_mlp": 1.02682269, "epoch": 0.17748384187584548, "flos": 20959694277120.0, "grad_norm": 2.3133151959471796, "language_loss": 0.80395055, "learning_rate": 3.776132549750806e-06, "loss": 0.82567012, "num_input_tokens_seen": 63817045, "step": 2952, "time_per_iteration": 2.7605957984924316 }, { "auxiliary_loss_clip": 0.01162679, "auxiliary_loss_mlp": 0.01044862, "balance_loss_clip": 1.05858529, "balance_loss_mlp": 1.02513337, "epoch": 0.17754396512851345, "flos": 25009986844800.0, "grad_norm": 2.8185319653472116, "language_loss": 0.79273909, "learning_rate": 3.7759534745257194e-06, "loss": 0.81481451, "num_input_tokens_seen": 63837665, "step": 2953, "time_per_iteration": 2.798912525177002 }, { "auxiliary_loss_clip": 0.0112399, "auxiliary_loss_mlp": 0.01043314, "balance_loss_clip": 1.05482125, "balance_loss_mlp": 1.02470589, "epoch": 0.1776040883811814, "flos": 32051566275840.0, "grad_norm": 2.017710353628998, "language_loss": 0.87963271, "learning_rate": 3.7757743319562994e-06, "loss": 0.90130568, "num_input_tokens_seen": 63858455, "step": 2954, "time_per_iteration": 2.838931083679199 }, { "auxiliary_loss_clip": 0.01144028, "auxiliary_loss_mlp": 0.01052958, "balance_loss_clip": 1.06043494, "balance_loss_mlp": 1.03296697, "epoch": 0.17766421163384938, "flos": 21574125348480.0, "grad_norm": 1.9130853947826985, "language_loss": 0.85313326, "learning_rate": 3.7755951220493386e-06, "loss": 0.87510312, "num_input_tokens_seen": 63876935, "step": 2955, "time_per_iteration": 2.7965714931488037 }, { "auxiliary_loss_clip": 0.01127677, "auxiliary_loss_mlp": 0.01047004, "balance_loss_clip": 1.05093336, "balance_loss_mlp": 1.02660692, "epoch": 0.17772433488651737, "flos": 22419319345920.0, "grad_norm": 18.24238703278013, "language_loss": 0.71152055, "learning_rate": 3.7754158448116327e-06, "loss": 0.73326737, "num_input_tokens_seen": 63896815, "step": 2956, "time_per_iteration": 2.8358442783355713 }, { "auxiliary_loss_clip": 0.01150063, "auxiliary_loss_mlp": 0.010506, "balance_loss_clip": 1.05813813, "balance_loss_mlp": 1.03156281, "epoch": 0.17778445813918534, "flos": 25629445820160.0, "grad_norm": 2.981126112172262, "language_loss": 0.82881534, "learning_rate": 3.7752365002499795e-06, "loss": 0.85082197, "num_input_tokens_seen": 63916140, "step": 2957, "time_per_iteration": 2.7034976482391357 }, { "auxiliary_loss_clip": 0.01100452, "auxiliary_loss_mlp": 0.01047239, "balance_loss_clip": 1.04976833, "balance_loss_mlp": 1.02789164, "epoch": 0.1778445813918533, "flos": 25628871202560.0, "grad_norm": 2.7180995933425622, "language_loss": 0.75164193, "learning_rate": 3.7750570883711807e-06, "loss": 0.77311885, "num_input_tokens_seen": 63935220, "step": 2958, "time_per_iteration": 2.8312718868255615 }, { "auxiliary_loss_clip": 0.01146025, "auxiliary_loss_mlp": 0.01043359, "balance_loss_clip": 1.06117964, "balance_loss_mlp": 1.02502513, "epoch": 0.17790470464452127, "flos": 22345522853760.0, "grad_norm": 9.439636088267013, "language_loss": 0.80363399, "learning_rate": 3.7748776091820397e-06, "loss": 0.82552785, "num_input_tokens_seen": 63954550, "step": 2959, "time_per_iteration": 2.722102642059326 }, { "auxiliary_loss_clip": 0.01164621, "auxiliary_loss_mlp": 0.01049069, "balance_loss_clip": 1.05812871, "balance_loss_mlp": 1.02938771, "epoch": 0.17796482789718923, "flos": 18765875214720.0, "grad_norm": 2.62580469975692, "language_loss": 0.51511085, "learning_rate": 3.774698062689362e-06, "loss": 0.53724772, "num_input_tokens_seen": 63972425, "step": 2960, "time_per_iteration": 2.6222047805786133 }, { "auxiliary_loss_clip": 0.01111843, "auxiliary_loss_mlp": 0.01052801, "balance_loss_clip": 1.05275989, "balance_loss_mlp": 1.03228474, "epoch": 0.1780249511498572, "flos": 23440941970560.0, "grad_norm": 1.7626913000215665, "language_loss": 0.88908094, "learning_rate": 3.7745184488999548e-06, "loss": 0.91072738, "num_input_tokens_seen": 63992165, "step": 2961, "time_per_iteration": 2.8088786602020264 }, { "auxiliary_loss_clip": 0.01116231, "auxiliary_loss_mlp": 0.01054867, "balance_loss_clip": 1.05181062, "balance_loss_mlp": 1.03385067, "epoch": 0.1780850744025252, "flos": 23367468700800.0, "grad_norm": 1.716412227369414, "language_loss": 0.79170465, "learning_rate": 3.774338767820631e-06, "loss": 0.81341565, "num_input_tokens_seen": 64013470, "step": 2962, "time_per_iteration": 2.7546913623809814 }, { "auxiliary_loss_clip": 0.01145526, "auxiliary_loss_mlp": 0.01052794, "balance_loss_clip": 1.05649889, "balance_loss_mlp": 1.03104997, "epoch": 0.17814519765519315, "flos": 13771994319360.0, "grad_norm": 2.3241756501763446, "language_loss": 0.74910223, "learning_rate": 3.774159019458203e-06, "loss": 0.77108544, "num_input_tokens_seen": 64030975, "step": 2963, "time_per_iteration": 2.680356979370117 }, { "auxiliary_loss_clip": 0.01140656, "auxiliary_loss_mlp": 0.01043225, "balance_loss_clip": 1.05769885, "balance_loss_mlp": 1.02347231, "epoch": 0.17820532090786112, "flos": 21976396738560.0, "grad_norm": 1.747536927551571, "language_loss": 0.78837025, "learning_rate": 3.7739792038194877e-06, "loss": 0.81020904, "num_input_tokens_seen": 64050075, "step": 2964, "time_per_iteration": 2.748398780822754 }, { "auxiliary_loss_clip": 0.01151685, "auxiliary_loss_mlp": 0.00776982, "balance_loss_clip": 1.05950594, "balance_loss_mlp": 1.00098181, "epoch": 0.17826544416052909, "flos": 24790752184320.0, "grad_norm": 3.046027397796258, "language_loss": 0.81160808, "learning_rate": 3.7737993209113027e-06, "loss": 0.83089471, "num_input_tokens_seen": 64071920, "step": 2965, "time_per_iteration": 2.8090012073516846 }, { "auxiliary_loss_clip": 0.01151658, "auxiliary_loss_mlp": 0.01047086, "balance_loss_clip": 1.06002402, "balance_loss_mlp": 1.02916884, "epoch": 0.17832556741319705, "flos": 13879582531200.0, "grad_norm": 2.554359630612449, "language_loss": 0.95307338, "learning_rate": 3.7736193707404698e-06, "loss": 0.97506082, "num_input_tokens_seen": 64086835, "step": 2966, "time_per_iteration": 2.7159550189971924 }, { "auxiliary_loss_clip": 0.01112928, "auxiliary_loss_mlp": 0.00777395, "balance_loss_clip": 1.05336046, "balance_loss_mlp": 1.00083637, "epoch": 0.17838569066586502, "flos": 36641703323520.0, "grad_norm": 7.5683867487642065, "language_loss": 0.72833109, "learning_rate": 3.7734393533138127e-06, "loss": 0.74723434, "num_input_tokens_seen": 64107360, "step": 2967, "time_per_iteration": 2.9540669918060303 }, { "auxiliary_loss_clip": 0.01129124, "auxiliary_loss_mlp": 0.01046817, "balance_loss_clip": 1.05574143, "balance_loss_mlp": 1.02775562, "epoch": 0.17844581391853298, "flos": 18727271072640.0, "grad_norm": 2.1617023205672523, "language_loss": 0.76897681, "learning_rate": 3.773259268638157e-06, "loss": 0.7907362, "num_input_tokens_seen": 64124690, "step": 2968, "time_per_iteration": 2.752717971801758 }, { "auxiliary_loss_clip": 0.01085006, "auxiliary_loss_mlp": 0.01044958, "balance_loss_clip": 1.04640651, "balance_loss_mlp": 1.02559829, "epoch": 0.17850593717120097, "flos": 27378259286400.0, "grad_norm": 2.039560504387258, "language_loss": 0.75839806, "learning_rate": 3.7730791167203333e-06, "loss": 0.77969772, "num_input_tokens_seen": 64146315, "step": 2969, "time_per_iteration": 2.9161994457244873 }, { "auxiliary_loss_clip": 0.01075271, "auxiliary_loss_mlp": 0.01013071, "balance_loss_clip": 1.06177902, "balance_loss_mlp": 1.00932813, "epoch": 0.17856606042386894, "flos": 66996025084800.0, "grad_norm": 0.8520394227890811, "language_loss": 0.69012916, "learning_rate": 3.772898897567171e-06, "loss": 0.7110126, "num_input_tokens_seen": 64210875, "step": 2970, "time_per_iteration": 3.3269262313842773 }, { "auxiliary_loss_clip": 0.011313, "auxiliary_loss_mlp": 0.01044166, "balance_loss_clip": 1.05561864, "balance_loss_mlp": 1.02493763, "epoch": 0.1786261836765369, "flos": 36977001805440.0, "grad_norm": 1.9951166568015506, "language_loss": 0.67617297, "learning_rate": 3.772718611185505e-06, "loss": 0.69792765, "num_input_tokens_seen": 64230740, "step": 2971, "time_per_iteration": 2.8691961765289307 }, { "auxiliary_loss_clip": 0.01110831, "auxiliary_loss_mlp": 0.01052779, "balance_loss_clip": 1.05309939, "balance_loss_mlp": 1.03266823, "epoch": 0.17868630692920487, "flos": 24825441744000.0, "grad_norm": 1.5664358375440484, "language_loss": 0.8971802, "learning_rate": 3.7725382575821717e-06, "loss": 0.91881633, "num_input_tokens_seen": 64252300, "step": 2972, "time_per_iteration": 2.893923759460449 }, { "auxiliary_loss_clip": 0.01124705, "auxiliary_loss_mlp": 0.01055871, "balance_loss_clip": 1.05635929, "balance_loss_mlp": 1.03466403, "epoch": 0.17874643018187283, "flos": 16981977139200.0, "grad_norm": 2.4611679901229153, "language_loss": 0.88593906, "learning_rate": 3.77235783676401e-06, "loss": 0.90774482, "num_input_tokens_seen": 64270105, "step": 2973, "time_per_iteration": 2.7340333461761475 }, { "auxiliary_loss_clip": 0.01164127, "auxiliary_loss_mlp": 0.01047073, "balance_loss_clip": 1.06285155, "balance_loss_mlp": 1.0283215, "epoch": 0.1788065534345408, "flos": 21032233793280.0, "grad_norm": 3.4039298885336557, "language_loss": 0.7668556, "learning_rate": 3.7721773487378615e-06, "loss": 0.78896761, "num_input_tokens_seen": 64287250, "step": 2974, "time_per_iteration": 2.632495403289795 }, { "auxiliary_loss_clip": 0.0114187, "auxiliary_loss_mlp": 0.01053, "balance_loss_clip": 1.06101942, "balance_loss_mlp": 1.03390288, "epoch": 0.17886667668720876, "flos": 23987717775360.0, "grad_norm": 2.484949778027245, "language_loss": 0.74701655, "learning_rate": 3.7719967935105705e-06, "loss": 0.76896524, "num_input_tokens_seen": 64307140, "step": 2975, "time_per_iteration": 2.704012870788574 }, { "auxiliary_loss_clip": 0.01149026, "auxiliary_loss_mlp": 0.01048788, "balance_loss_clip": 1.05678535, "balance_loss_mlp": 1.03004813, "epoch": 0.17892679993987676, "flos": 25739476156800.0, "grad_norm": 1.518747487377626, "language_loss": 0.73032069, "learning_rate": 3.7718161710889833e-06, "loss": 0.75229883, "num_input_tokens_seen": 64328760, "step": 2976, "time_per_iteration": 2.7357017993927 }, { "auxiliary_loss_clip": 0.01150398, "auxiliary_loss_mlp": 0.01038685, "balance_loss_clip": 1.06239033, "balance_loss_mlp": 1.0229373, "epoch": 0.17898692319254472, "flos": 25699686865920.0, "grad_norm": 1.4579507247258654, "language_loss": 0.770594, "learning_rate": 3.7716354814799495e-06, "loss": 0.79248488, "num_input_tokens_seen": 64348800, "step": 2977, "time_per_iteration": 2.727318286895752 }, { "auxiliary_loss_clip": 0.01131521, "auxiliary_loss_mlp": 0.01045834, "balance_loss_clip": 1.06618452, "balance_loss_mlp": 1.02841735, "epoch": 0.1790470464452127, "flos": 19317786664320.0, "grad_norm": 2.7286854986191282, "language_loss": 0.80235189, "learning_rate": 3.7714547246903203e-06, "loss": 0.82412547, "num_input_tokens_seen": 64367955, "step": 2978, "time_per_iteration": 2.8178791999816895 }, { "auxiliary_loss_clip": 0.0114307, "auxiliary_loss_mlp": 0.01052978, "balance_loss_clip": 1.05818772, "balance_loss_mlp": 1.03330874, "epoch": 0.17910716969788065, "flos": 30044267562240.0, "grad_norm": 1.4967765935497133, "language_loss": 0.76192784, "learning_rate": 3.7712739007269508e-06, "loss": 0.7838884, "num_input_tokens_seen": 64389805, "step": 2979, "time_per_iteration": 4.241487741470337 }, { "auxiliary_loss_clip": 0.01122958, "auxiliary_loss_mlp": 0.0104457, "balance_loss_clip": 1.0590893, "balance_loss_mlp": 1.02660525, "epoch": 0.17916729295054862, "flos": 19427709260160.0, "grad_norm": 1.9491816848203256, "language_loss": 0.68945503, "learning_rate": 3.7710930095966976e-06, "loss": 0.71113026, "num_input_tokens_seen": 64408220, "step": 2980, "time_per_iteration": 2.6817352771759033 }, { "auxiliary_loss_clip": 0.01152986, "auxiliary_loss_mlp": 0.0104519, "balance_loss_clip": 1.0588038, "balance_loss_mlp": 1.02497244, "epoch": 0.17922741620321658, "flos": 14611549881600.0, "grad_norm": 1.9134992191513662, "language_loss": 0.70793843, "learning_rate": 3.7709120513064196e-06, "loss": 0.72992027, "num_input_tokens_seen": 64426380, "step": 2981, "time_per_iteration": 4.310532331466675 }, { "auxiliary_loss_clip": 0.01137747, "auxiliary_loss_mlp": 0.01056086, "balance_loss_clip": 1.06083858, "balance_loss_mlp": 1.03686976, "epoch": 0.17928753945588458, "flos": 17165301177600.0, "grad_norm": 2.529665562311581, "language_loss": 0.8190546, "learning_rate": 3.7707310258629796e-06, "loss": 0.84099293, "num_input_tokens_seen": 64444355, "step": 2982, "time_per_iteration": 2.710726261138916 }, { "auxiliary_loss_clip": 0.01162978, "auxiliary_loss_mlp": 0.01041014, "balance_loss_clip": 1.06181359, "balance_loss_mlp": 1.02306128, "epoch": 0.17934766270855254, "flos": 31395622060800.0, "grad_norm": 1.6440716861921114, "language_loss": 0.83123535, "learning_rate": 3.7705499332732413e-06, "loss": 0.85327524, "num_input_tokens_seen": 64467800, "step": 2983, "time_per_iteration": 2.700378656387329 }, { "auxiliary_loss_clip": 0.01153001, "auxiliary_loss_mlp": 0.01048341, "balance_loss_clip": 1.05694914, "balance_loss_mlp": 1.02932739, "epoch": 0.1794077859612205, "flos": 20814184281600.0, "grad_norm": 1.6703280507743268, "language_loss": 0.85149562, "learning_rate": 3.7703687735440718e-06, "loss": 0.87350899, "num_input_tokens_seen": 64487230, "step": 2984, "time_per_iteration": 2.6529407501220703 }, { "auxiliary_loss_clip": 0.01126981, "auxiliary_loss_mlp": 0.01043442, "balance_loss_clip": 1.05520201, "balance_loss_mlp": 1.02424896, "epoch": 0.17946790921388847, "flos": 28986447006720.0, "grad_norm": 2.4609160562432053, "language_loss": 0.8935222, "learning_rate": 3.7701875466823416e-06, "loss": 0.9152264, "num_input_tokens_seen": 64509165, "step": 2985, "time_per_iteration": 4.528426170349121 }, { "auxiliary_loss_clip": 0.01160091, "auxiliary_loss_mlp": 0.01040749, "balance_loss_clip": 1.06142831, "balance_loss_mlp": 1.02434587, "epoch": 0.17952803246655644, "flos": 20737406960640.0, "grad_norm": 2.095497349072142, "language_loss": 0.69538593, "learning_rate": 3.770006252694922e-06, "loss": 0.71739429, "num_input_tokens_seen": 64527940, "step": 2986, "time_per_iteration": 2.6890172958374023 }, { "auxiliary_loss_clip": 0.01158556, "auxiliary_loss_mlp": 0.00776, "balance_loss_clip": 1.05752599, "balance_loss_mlp": 1.00081134, "epoch": 0.1795881557192244, "flos": 28255988027520.0, "grad_norm": 2.4599229747435123, "language_loss": 0.77855188, "learning_rate": 3.769824891588688e-06, "loss": 0.79789746, "num_input_tokens_seen": 64545230, "step": 2987, "time_per_iteration": 2.650761842727661 }, { "auxiliary_loss_clip": 0.0116216, "auxiliary_loss_mlp": 0.01043775, "balance_loss_clip": 1.05775642, "balance_loss_mlp": 1.02441502, "epoch": 0.17964827897189237, "flos": 18552027594240.0, "grad_norm": 2.0190394876224467, "language_loss": 0.77958816, "learning_rate": 3.7696434633705164e-06, "loss": 0.80164748, "num_input_tokens_seen": 64563820, "step": 2988, "time_per_iteration": 2.6151437759399414 }, { "auxiliary_loss_clip": 0.01059513, "auxiliary_loss_mlp": 0.00756906, "balance_loss_clip": 1.07071137, "balance_loss_mlp": 1.00131369, "epoch": 0.17970840222456036, "flos": 58165088711040.0, "grad_norm": 0.7650122273387262, "language_loss": 0.62709254, "learning_rate": 3.7694619680472875e-06, "loss": 0.64525676, "num_input_tokens_seen": 64621315, "step": 2989, "time_per_iteration": 3.1990275382995605 }, { "auxiliary_loss_clip": 0.01137168, "auxiliary_loss_mlp": 0.01038826, "balance_loss_clip": 1.05553865, "balance_loss_mlp": 1.02128983, "epoch": 0.17976852547722832, "flos": 20300805146880.0, "grad_norm": 2.3566032567209483, "language_loss": 0.71070904, "learning_rate": 3.7692804056258837e-06, "loss": 0.73246896, "num_input_tokens_seen": 64639885, "step": 2990, "time_per_iteration": 2.7275335788726807 }, { "auxiliary_loss_clip": 0.01135847, "auxiliary_loss_mlp": 0.01044966, "balance_loss_clip": 1.05398035, "balance_loss_mlp": 1.02639365, "epoch": 0.1798286487298963, "flos": 39669367685760.0, "grad_norm": 1.8035266350414116, "language_loss": 0.68888462, "learning_rate": 3.7690987761131893e-06, "loss": 0.7106927, "num_input_tokens_seen": 64661220, "step": 2991, "time_per_iteration": 2.8237311840057373 }, { "auxiliary_loss_clip": 0.01104375, "auxiliary_loss_mlp": 0.01046061, "balance_loss_clip": 1.05156851, "balance_loss_mlp": 1.02663028, "epoch": 0.17988877198256426, "flos": 25520313323520.0, "grad_norm": 1.6063564491400402, "language_loss": 0.82933879, "learning_rate": 3.7689170795160924e-06, "loss": 0.85084313, "num_input_tokens_seen": 64682530, "step": 2992, "time_per_iteration": 2.8303778171539307 }, { "auxiliary_loss_clip": 0.01140805, "auxiliary_loss_mlp": 0.01035603, "balance_loss_clip": 1.05302262, "balance_loss_mlp": 1.0187583, "epoch": 0.17994889523523222, "flos": 18807496099200.0, "grad_norm": 2.076285453641059, "language_loss": 0.82228035, "learning_rate": 3.7687353158414822e-06, "loss": 0.84404445, "num_input_tokens_seen": 64701025, "step": 2993, "time_per_iteration": 2.710369110107422 }, { "auxiliary_loss_clip": 0.01135151, "auxiliary_loss_mlp": 0.01040493, "balance_loss_clip": 1.05135202, "balance_loss_mlp": 1.02236176, "epoch": 0.18000901848790019, "flos": 21104450087040.0, "grad_norm": 1.7027458997386926, "language_loss": 0.78129464, "learning_rate": 3.7685534850962517e-06, "loss": 0.80305111, "num_input_tokens_seen": 64719570, "step": 2994, "time_per_iteration": 2.6666738986968994 }, { "auxiliary_loss_clip": 0.01158877, "auxiliary_loss_mlp": 0.01045455, "balance_loss_clip": 1.05657315, "balance_loss_mlp": 1.02819359, "epoch": 0.18006914174056818, "flos": 19646441130240.0, "grad_norm": 2.4198973911698434, "language_loss": 0.81139499, "learning_rate": 3.768371587287296e-06, "loss": 0.83343828, "num_input_tokens_seen": 64738110, "step": 2995, "time_per_iteration": 2.699521541595459 }, { "auxiliary_loss_clip": 0.01142902, "auxiliary_loss_mlp": 0.01047606, "balance_loss_clip": 1.05350447, "balance_loss_mlp": 1.0310601, "epoch": 0.18012926499323614, "flos": 19499889640320.0, "grad_norm": 1.8607496799697536, "language_loss": 0.84162772, "learning_rate": 3.768189622421512e-06, "loss": 0.86353278, "num_input_tokens_seen": 64756345, "step": 2996, "time_per_iteration": 2.696723461151123 }, { "auxiliary_loss_clip": 0.01127214, "auxiliary_loss_mlp": 0.01039953, "balance_loss_clip": 1.06094205, "balance_loss_mlp": 1.02273917, "epoch": 0.1801893882459041, "flos": 19464553635840.0, "grad_norm": 2.1291201116421283, "language_loss": 0.88189137, "learning_rate": 3.7680075905058006e-06, "loss": 0.90356302, "num_input_tokens_seen": 64776375, "step": 2997, "time_per_iteration": 2.785522699356079 }, { "auxiliary_loss_clip": 0.01134376, "auxiliary_loss_mlp": 0.01045962, "balance_loss_clip": 1.04949927, "balance_loss_mlp": 1.02753246, "epoch": 0.18024951149857207, "flos": 26870590414080.0, "grad_norm": 1.7579499924576911, "language_loss": 0.85068727, "learning_rate": 3.7678254915470643e-06, "loss": 0.87249064, "num_input_tokens_seen": 64796210, "step": 2998, "time_per_iteration": 2.6912384033203125 }, { "auxiliary_loss_clip": 0.01159537, "auxiliary_loss_mlp": 0.01044427, "balance_loss_clip": 1.06019807, "balance_loss_mlp": 1.02641416, "epoch": 0.18030963475124004, "flos": 30226621933440.0, "grad_norm": 1.8075624565441775, "language_loss": 0.84176779, "learning_rate": 3.7676433255522084e-06, "loss": 0.86380744, "num_input_tokens_seen": 64818590, "step": 2999, "time_per_iteration": 2.722447395324707 }, { "auxiliary_loss_clip": 0.01143605, "auxiliary_loss_mlp": 0.01047321, "balance_loss_clip": 1.05324686, "balance_loss_mlp": 1.02870023, "epoch": 0.180369758003908, "flos": 22307493329280.0, "grad_norm": 1.8789697336390492, "language_loss": 0.75206578, "learning_rate": 3.76746109252814e-06, "loss": 0.77397501, "num_input_tokens_seen": 64838350, "step": 3000, "time_per_iteration": 2.669875144958496 }, { "auxiliary_loss_clip": 0.01130052, "auxiliary_loss_mlp": 0.00775745, "balance_loss_clip": 1.0526886, "balance_loss_mlp": 1.00060582, "epoch": 0.18042988125657597, "flos": 23732033788800.0, "grad_norm": 2.1714361871851704, "language_loss": 0.71088028, "learning_rate": 3.76727879248177e-06, "loss": 0.72993821, "num_input_tokens_seen": 64858065, "step": 3001, "time_per_iteration": 2.7207603454589844 }, { "auxiliary_loss_clip": 0.01150091, "auxiliary_loss_mlp": 0.01044695, "balance_loss_clip": 1.05701649, "balance_loss_mlp": 1.02605033, "epoch": 0.18049000450924396, "flos": 24093582134400.0, "grad_norm": 2.218812983953599, "language_loss": 0.8849982, "learning_rate": 3.767096425420011e-06, "loss": 0.90694606, "num_input_tokens_seen": 64877305, "step": 3002, "time_per_iteration": 2.6577625274658203 }, { "auxiliary_loss_clip": 0.01157827, "auxiliary_loss_mlp": 0.01048268, "balance_loss_clip": 1.05624068, "balance_loss_mlp": 1.03076851, "epoch": 0.18055012776191193, "flos": 22163168482560.0, "grad_norm": 1.6287780165264572, "language_loss": 0.80328667, "learning_rate": 3.7669139913497788e-06, "loss": 0.8253476, "num_input_tokens_seen": 64896955, "step": 3003, "time_per_iteration": 2.6274783611297607 }, { "auxiliary_loss_clip": 0.01158367, "auxiliary_loss_mlp": 0.01043654, "balance_loss_clip": 1.05622995, "balance_loss_mlp": 1.02596307, "epoch": 0.1806102510145799, "flos": 28913512440960.0, "grad_norm": 2.3308952017896956, "language_loss": 0.67250973, "learning_rate": 3.7667314902779907e-06, "loss": 0.69452989, "num_input_tokens_seen": 64917080, "step": 3004, "time_per_iteration": 2.6652631759643555 }, { "auxiliary_loss_clip": 0.01147517, "auxiliary_loss_mlp": 0.01054518, "balance_loss_clip": 1.05606318, "balance_loss_mlp": 1.03528929, "epoch": 0.18067037426724786, "flos": 19025689265280.0, "grad_norm": 2.592432277036083, "language_loss": 0.85111535, "learning_rate": 3.7665489222115677e-06, "loss": 0.87313569, "num_input_tokens_seen": 64935215, "step": 3005, "time_per_iteration": 2.654977560043335 }, { "auxiliary_loss_clip": 0.0114499, "auxiliary_loss_mlp": 0.01041993, "balance_loss_clip": 1.05690646, "balance_loss_mlp": 1.02489829, "epoch": 0.18073049751991582, "flos": 27453635976960.0, "grad_norm": 1.5217876402754629, "language_loss": 0.83215338, "learning_rate": 3.766366287157432e-06, "loss": 0.85402322, "num_input_tokens_seen": 64956275, "step": 3006, "time_per_iteration": 2.7118306159973145 }, { "auxiliary_loss_clip": 0.01127168, "auxiliary_loss_mlp": 0.01050084, "balance_loss_clip": 1.05063033, "balance_loss_mlp": 1.03105807, "epoch": 0.1807906207725838, "flos": 28729039167360.0, "grad_norm": 1.6327495611050657, "language_loss": 0.77377248, "learning_rate": 3.7661835851225103e-06, "loss": 0.79554498, "num_input_tokens_seen": 64979390, "step": 3007, "time_per_iteration": 2.7996537685394287 }, { "auxiliary_loss_clip": 0.01070026, "auxiliary_loss_mlp": 0.01030441, "balance_loss_clip": 1.04936945, "balance_loss_mlp": 1.02712655, "epoch": 0.18085074402525175, "flos": 64466515468800.0, "grad_norm": 0.801982400183398, "language_loss": 0.56987137, "learning_rate": 3.7660008161137294e-06, "loss": 0.5908761, "num_input_tokens_seen": 65043135, "step": 3008, "time_per_iteration": 3.4269092082977295 }, { "auxiliary_loss_clip": 0.01130838, "auxiliary_loss_mlp": 0.01047085, "balance_loss_clip": 1.05308366, "balance_loss_mlp": 1.02686691, "epoch": 0.18091086727791975, "flos": 23476960333440.0, "grad_norm": 1.8424126412451678, "language_loss": 0.67248082, "learning_rate": 3.765817980138021e-06, "loss": 0.69426012, "num_input_tokens_seen": 65062845, "step": 3009, "time_per_iteration": 2.7875866889953613 }, { "auxiliary_loss_clip": 0.01161719, "auxiliary_loss_mlp": 0.01044187, "balance_loss_clip": 1.0595516, "balance_loss_mlp": 1.02673507, "epoch": 0.1809709905305877, "flos": 24170467196160.0, "grad_norm": 2.4429360498363986, "language_loss": 0.75690198, "learning_rate": 3.7656350772023177e-06, "loss": 0.778961, "num_input_tokens_seen": 65082110, "step": 3010, "time_per_iteration": 2.6060268878936768 }, { "auxiliary_loss_clip": 0.01127916, "auxiliary_loss_mlp": 0.01037817, "balance_loss_clip": 1.05715132, "balance_loss_mlp": 1.02063942, "epoch": 0.18103111378325568, "flos": 21650902669440.0, "grad_norm": 1.6324915654296899, "language_loss": 0.67356348, "learning_rate": 3.7654521073135553e-06, "loss": 0.69522083, "num_input_tokens_seen": 65101985, "step": 3011, "time_per_iteration": 2.763596534729004 }, { "auxiliary_loss_clip": 0.01105034, "auxiliary_loss_mlp": 0.00777475, "balance_loss_clip": 1.04540467, "balance_loss_mlp": 1.00078559, "epoch": 0.18109123703592364, "flos": 53686918356480.0, "grad_norm": 1.551526807882757, "language_loss": 0.71288514, "learning_rate": 3.7652690704786723e-06, "loss": 0.73171026, "num_input_tokens_seen": 65129295, "step": 3012, "time_per_iteration": 3.037775993347168 }, { "auxiliary_loss_clip": 0.01132189, "auxiliary_loss_mlp": 0.01052085, "balance_loss_clip": 1.05564284, "balance_loss_mlp": 1.03348863, "epoch": 0.1811513602885916, "flos": 35845564325760.0, "grad_norm": 2.095737131475866, "language_loss": 0.62309992, "learning_rate": 3.765085966704609e-06, "loss": 0.64494264, "num_input_tokens_seen": 65150625, "step": 3013, "time_per_iteration": 2.7692227363586426 }, { "auxiliary_loss_clip": 0.01131323, "auxiliary_loss_mlp": 0.0105253, "balance_loss_clip": 1.05343401, "balance_loss_mlp": 1.03486276, "epoch": 0.18121148354125957, "flos": 23732572492800.0, "grad_norm": 1.6679267545988328, "language_loss": 0.76147234, "learning_rate": 3.764902795998309e-06, "loss": 0.78331089, "num_input_tokens_seen": 65170880, "step": 3014, "time_per_iteration": 2.7296786308288574 }, { "auxiliary_loss_clip": 0.01163543, "auxiliary_loss_mlp": 0.01050053, "balance_loss_clip": 1.05964816, "balance_loss_mlp": 1.02987087, "epoch": 0.18127160679392756, "flos": 28728320895360.0, "grad_norm": 2.1234423596691796, "language_loss": 0.66310829, "learning_rate": 3.7647195583667184e-06, "loss": 0.6852442, "num_input_tokens_seen": 65192530, "step": 3015, "time_per_iteration": 2.7575571537017822 }, { "auxiliary_loss_clip": 0.0113004, "auxiliary_loss_mlp": 0.00776613, "balance_loss_clip": 1.05429327, "balance_loss_mlp": 1.00067461, "epoch": 0.18133173004659553, "flos": 20485062938880.0, "grad_norm": 1.7837261279259933, "language_loss": 0.78152305, "learning_rate": 3.764536253816785e-06, "loss": 0.80058956, "num_input_tokens_seen": 65211675, "step": 3016, "time_per_iteration": 2.6718828678131104 }, { "auxiliary_loss_clip": 0.01145073, "auxiliary_loss_mlp": 0.01049504, "balance_loss_clip": 1.05684161, "balance_loss_mlp": 1.03068125, "epoch": 0.1813918532992635, "flos": 22852078404480.0, "grad_norm": 1.7248072345223011, "language_loss": 0.8351965, "learning_rate": 3.7643528823554602e-06, "loss": 0.85714233, "num_input_tokens_seen": 65231185, "step": 3017, "time_per_iteration": 2.6879045963287354 }, { "auxiliary_loss_clip": 0.0114091, "auxiliary_loss_mlp": 0.01042994, "balance_loss_clip": 1.05404854, "balance_loss_mlp": 1.02539897, "epoch": 0.18145197655193146, "flos": 36065122208640.0, "grad_norm": 2.2664795482488924, "language_loss": 0.6769017, "learning_rate": 3.764169443989697e-06, "loss": 0.69874066, "num_input_tokens_seen": 65251645, "step": 3018, "time_per_iteration": 4.31333327293396 }, { "auxiliary_loss_clip": 0.01147629, "auxiliary_loss_mlp": 0.00776661, "balance_loss_clip": 1.05706179, "balance_loss_mlp": 1.00074184, "epoch": 0.18151209980459942, "flos": 24023951619840.0, "grad_norm": 1.8935259017451227, "language_loss": 0.76396847, "learning_rate": 3.7639859387264518e-06, "loss": 0.78321135, "num_input_tokens_seen": 65271125, "step": 3019, "time_per_iteration": 2.7667160034179688 }, { "auxiliary_loss_clip": 0.01121465, "auxiliary_loss_mlp": 0.01046742, "balance_loss_clip": 1.05550635, "balance_loss_mlp": 1.02722728, "epoch": 0.1815722230572674, "flos": 23951627585280.0, "grad_norm": 2.042490471678265, "language_loss": 0.81550395, "learning_rate": 3.7638023665726834e-06, "loss": 0.83718598, "num_input_tokens_seen": 65290600, "step": 3020, "time_per_iteration": 4.3900346755981445 }, { "auxiliary_loss_clip": 0.01136424, "auxiliary_loss_mlp": 0.01046217, "balance_loss_clip": 1.05758023, "balance_loss_mlp": 1.02567708, "epoch": 0.18163234630993536, "flos": 24386469632640.0, "grad_norm": 1.9628186536024828, "language_loss": 0.7757082, "learning_rate": 3.763618727535352e-06, "loss": 0.79753458, "num_input_tokens_seen": 65311040, "step": 3021, "time_per_iteration": 4.3029396533966064 }, { "auxiliary_loss_clip": 0.01143245, "auxiliary_loss_mlp": 0.01047278, "balance_loss_clip": 1.05453348, "balance_loss_mlp": 1.02907431, "epoch": 0.18169246956260335, "flos": 24681332378880.0, "grad_norm": 1.725306643191844, "language_loss": 0.84863859, "learning_rate": 3.763435021621422e-06, "loss": 0.87054378, "num_input_tokens_seen": 65332115, "step": 3022, "time_per_iteration": 2.7353312969207764 }, { "auxiliary_loss_clip": 0.01132435, "auxiliary_loss_mlp": 0.01042747, "balance_loss_clip": 1.05769348, "balance_loss_mlp": 1.0235188, "epoch": 0.1817525928152713, "flos": 24243294021120.0, "grad_norm": 2.230341519134859, "language_loss": 0.69367266, "learning_rate": 3.763251248837859e-06, "loss": 0.71542448, "num_input_tokens_seen": 65352210, "step": 3023, "time_per_iteration": 2.775200605392456 }, { "auxiliary_loss_clip": 0.01127605, "auxiliary_loss_mlp": 0.01043947, "balance_loss_clip": 1.04900002, "balance_loss_mlp": 1.02556491, "epoch": 0.18181271606793928, "flos": 16472081623680.0, "grad_norm": 2.150764188548567, "language_loss": 0.74107385, "learning_rate": 3.7630674091916317e-06, "loss": 0.76278937, "num_input_tokens_seen": 65370600, "step": 3024, "time_per_iteration": 2.7364041805267334 }, { "auxiliary_loss_clip": 0.01145205, "auxiliary_loss_mlp": 0.01046837, "balance_loss_clip": 1.05719447, "balance_loss_mlp": 1.02900314, "epoch": 0.18187283932060724, "flos": 18581042805120.0, "grad_norm": 2.148591016046099, "language_loss": 0.8835662, "learning_rate": 3.7628835026897123e-06, "loss": 0.90548658, "num_input_tokens_seen": 65387270, "step": 3025, "time_per_iteration": 4.274658679962158 }, { "auxiliary_loss_clip": 0.01133667, "auxiliary_loss_mlp": 0.01050575, "balance_loss_clip": 1.05470932, "balance_loss_mlp": 1.03137028, "epoch": 0.1819329625732752, "flos": 20266833859200.0, "grad_norm": 3.6399614210311206, "language_loss": 0.79041791, "learning_rate": 3.7626995293390735e-06, "loss": 0.81226033, "num_input_tokens_seen": 65406550, "step": 3026, "time_per_iteration": 2.7589778900146484 }, { "auxiliary_loss_clip": 0.01132736, "auxiliary_loss_mlp": 0.01055367, "balance_loss_clip": 1.05774415, "balance_loss_mlp": 1.03679442, "epoch": 0.18199308582594317, "flos": 25915186512000.0, "grad_norm": 1.6980721374313217, "language_loss": 0.759978, "learning_rate": 3.762515489146692e-06, "loss": 0.78185904, "num_input_tokens_seen": 65425955, "step": 3027, "time_per_iteration": 2.7347826957702637 }, { "auxiliary_loss_clip": 0.01163558, "auxiliary_loss_mlp": 0.01053369, "balance_loss_clip": 1.05835891, "balance_loss_mlp": 1.03378284, "epoch": 0.18205320907861114, "flos": 15377524433280.0, "grad_norm": 2.2893837743041368, "language_loss": 0.85592651, "learning_rate": 3.762331382119546e-06, "loss": 0.87809575, "num_input_tokens_seen": 65442820, "step": 3028, "time_per_iteration": 2.598905563354492 }, { "auxiliary_loss_clip": 0.01156921, "auxiliary_loss_mlp": 0.0104449, "balance_loss_clip": 1.0578618, "balance_loss_mlp": 1.0260129, "epoch": 0.18211333233127913, "flos": 25624310175360.0, "grad_norm": 1.8897570500397638, "language_loss": 0.82807779, "learning_rate": 3.7621472082646183e-06, "loss": 0.85009193, "num_input_tokens_seen": 65461825, "step": 3029, "time_per_iteration": 2.677332639694214 }, { "auxiliary_loss_clip": 0.01114993, "auxiliary_loss_mlp": 0.01050232, "balance_loss_clip": 1.05223596, "balance_loss_mlp": 1.02931094, "epoch": 0.1821734555839471, "flos": 14976007228800.0, "grad_norm": 10.840079090220346, "language_loss": 0.78091359, "learning_rate": 3.761962967588891e-06, "loss": 0.80256593, "num_input_tokens_seen": 65479480, "step": 3030, "time_per_iteration": 2.6865499019622803 }, { "auxiliary_loss_clip": 0.01139676, "auxiliary_loss_mlp": 0.01043273, "balance_loss_clip": 1.05401075, "balance_loss_mlp": 1.0240562, "epoch": 0.18223357883661506, "flos": 20194007034240.0, "grad_norm": 2.05958060196279, "language_loss": 0.85162055, "learning_rate": 3.761778660099352e-06, "loss": 0.87345004, "num_input_tokens_seen": 65497775, "step": 3031, "time_per_iteration": 2.6336488723754883 }, { "auxiliary_loss_clip": 0.01116657, "auxiliary_loss_mlp": 0.00776186, "balance_loss_clip": 1.0497843, "balance_loss_mlp": 1.00052071, "epoch": 0.18229370208928303, "flos": 15231978524160.0, "grad_norm": 1.83501853384953, "language_loss": 0.79992211, "learning_rate": 3.76159428580299e-06, "loss": 0.81885058, "num_input_tokens_seen": 65516505, "step": 3032, "time_per_iteration": 2.6879780292510986 }, { "auxiliary_loss_clip": 0.01166412, "auxiliary_loss_mlp": 0.01048902, "balance_loss_clip": 1.06163025, "balance_loss_mlp": 1.03038836, "epoch": 0.182353825341951, "flos": 23840483927040.0, "grad_norm": 1.8132660189598853, "language_loss": 0.81316388, "learning_rate": 3.761409844706795e-06, "loss": 0.83531702, "num_input_tokens_seen": 65536160, "step": 3033, "time_per_iteration": 2.628100872039795 }, { "auxiliary_loss_clip": 0.01048591, "auxiliary_loss_mlp": 0.0100128, "balance_loss_clip": 1.05392861, "balance_loss_mlp": 0.99850291, "epoch": 0.18241394859461896, "flos": 61190957393280.0, "grad_norm": 0.8825814513625035, "language_loss": 0.63439631, "learning_rate": 3.7612253368177625e-06, "loss": 0.65489495, "num_input_tokens_seen": 65589375, "step": 3034, "time_per_iteration": 3.2329187393188477 }, { "auxiliary_loss_clip": 0.0112853, "auxiliary_loss_mlp": 0.01041043, "balance_loss_clip": 1.05698252, "balance_loss_mlp": 1.02384114, "epoch": 0.18247407184728695, "flos": 18471694826880.0, "grad_norm": 3.107937736318082, "language_loss": 0.79893476, "learning_rate": 3.7610407621428893e-06, "loss": 0.82063049, "num_input_tokens_seen": 65606720, "step": 3035, "time_per_iteration": 2.7644357681274414 }, { "auxiliary_loss_clip": 0.01134115, "auxiliary_loss_mlp": 0.01046396, "balance_loss_clip": 1.05675578, "balance_loss_mlp": 1.02906322, "epoch": 0.18253419509995492, "flos": 21795191602560.0, "grad_norm": 1.870086430131469, "language_loss": 0.85076666, "learning_rate": 3.7608561206891735e-06, "loss": 0.87257177, "num_input_tokens_seen": 65625495, "step": 3036, "time_per_iteration": 2.7102303504943848 }, { "auxiliary_loss_clip": 0.01140083, "auxiliary_loss_mlp": 0.01039078, "balance_loss_clip": 1.05572963, "balance_loss_mlp": 1.02192414, "epoch": 0.18259431835262288, "flos": 20149764456960.0, "grad_norm": 2.1821496235124727, "language_loss": 0.80254716, "learning_rate": 3.760671412463617e-06, "loss": 0.82433879, "num_input_tokens_seen": 65643515, "step": 3037, "time_per_iteration": 2.6703832149505615 }, { "auxiliary_loss_clip": 0.01139652, "auxiliary_loss_mlp": 0.00776941, "balance_loss_clip": 1.05986989, "balance_loss_mlp": 1.00062871, "epoch": 0.18265444160529085, "flos": 16981653916800.0, "grad_norm": 3.0764011293768023, "language_loss": 0.7950514, "learning_rate": 3.7604866374732246e-06, "loss": 0.81421733, "num_input_tokens_seen": 65658155, "step": 3038, "time_per_iteration": 2.7410895824432373 }, { "auxiliary_loss_clip": 0.01125628, "auxiliary_loss_mlp": 0.01044597, "balance_loss_clip": 1.05254972, "balance_loss_mlp": 1.02551126, "epoch": 0.1827145648579588, "flos": 34423250509440.0, "grad_norm": 1.9524772610579864, "language_loss": 0.67722493, "learning_rate": 3.7603017957250023e-06, "loss": 0.69892722, "num_input_tokens_seen": 65679310, "step": 3039, "time_per_iteration": 2.756833076477051 }, { "auxiliary_loss_clip": 0.0113051, "auxiliary_loss_mlp": 0.01051065, "balance_loss_clip": 1.053087, "balance_loss_mlp": 1.03304029, "epoch": 0.18277468811062678, "flos": 53287017264000.0, "grad_norm": 1.8757227718998248, "language_loss": 0.73394251, "learning_rate": 3.7601168872259593e-06, "loss": 0.75575823, "num_input_tokens_seen": 65705235, "step": 3040, "time_per_iteration": 3.026679039001465 }, { "auxiliary_loss_clip": 0.01143558, "auxiliary_loss_mlp": 0.01042261, "balance_loss_clip": 1.05585194, "balance_loss_mlp": 1.02373624, "epoch": 0.18283481136329474, "flos": 31650659602560.0, "grad_norm": 2.017308993436446, "language_loss": 0.60348576, "learning_rate": 3.7599319119831075e-06, "loss": 0.62534392, "num_input_tokens_seen": 65727575, "step": 3041, "time_per_iteration": 2.738554000854492 }, { "auxiliary_loss_clip": 0.01116972, "auxiliary_loss_mlp": 0.01053827, "balance_loss_clip": 1.05058599, "balance_loss_mlp": 1.03544497, "epoch": 0.18289493461596273, "flos": 53137664513280.0, "grad_norm": 2.3558133433802104, "language_loss": 0.59825706, "learning_rate": 3.7597468700034616e-06, "loss": 0.61996508, "num_input_tokens_seen": 65751370, "step": 3042, "time_per_iteration": 3.0009193420410156 }, { "auxiliary_loss_clip": 0.0112422, "auxiliary_loss_mlp": 0.01046569, "balance_loss_clip": 1.05319464, "balance_loss_mlp": 1.02917695, "epoch": 0.1829550578686307, "flos": 25589369220480.0, "grad_norm": 1.5313119565207096, "language_loss": 0.8757726, "learning_rate": 3.7595617612940374e-06, "loss": 0.89748049, "num_input_tokens_seen": 65771040, "step": 3043, "time_per_iteration": 2.7406487464904785 }, { "auxiliary_loss_clip": 0.01056788, "auxiliary_loss_mlp": 0.01056357, "balance_loss_clip": 1.04592645, "balance_loss_mlp": 1.03712869, "epoch": 0.18301518112129866, "flos": 22601422321920.0, "grad_norm": 2.144378235575635, "language_loss": 0.70980251, "learning_rate": 3.7593765858618552e-06, "loss": 0.73093396, "num_input_tokens_seen": 65789345, "step": 3044, "time_per_iteration": 2.785931348800659 }, { "auxiliary_loss_clip": 0.01105073, "auxiliary_loss_mlp": 0.01059118, "balance_loss_clip": 1.05111921, "balance_loss_mlp": 1.0381608, "epoch": 0.18307530437396663, "flos": 34020799551360.0, "grad_norm": 3.097061979225562, "language_loss": 0.64460731, "learning_rate": 3.7591913437139365e-06, "loss": 0.66624922, "num_input_tokens_seen": 65810990, "step": 3045, "time_per_iteration": 2.8085720539093018 }, { "auxiliary_loss_clip": 0.01155246, "auxiliary_loss_mlp": 0.01044973, "balance_loss_clip": 1.05604315, "balance_loss_mlp": 1.02780676, "epoch": 0.1831354276266346, "flos": 21279765392640.0, "grad_norm": 11.455833434854163, "language_loss": 0.78461385, "learning_rate": 3.7590060348573066e-06, "loss": 0.80661607, "num_input_tokens_seen": 65827230, "step": 3046, "time_per_iteration": 2.603299140930176 }, { "auxiliary_loss_clip": 0.01118725, "auxiliary_loss_mlp": 0.01042864, "balance_loss_clip": 1.04837, "balance_loss_mlp": 1.0240643, "epoch": 0.18319555087930256, "flos": 21032952065280.0, "grad_norm": 1.9889932097770582, "language_loss": 0.78733194, "learning_rate": 3.7588206592989903e-06, "loss": 0.8089478, "num_input_tokens_seen": 65845900, "step": 3047, "time_per_iteration": 2.7109453678131104 }, { "auxiliary_loss_clip": 0.01144516, "auxiliary_loss_mlp": 0.01042422, "balance_loss_clip": 1.05723858, "balance_loss_mlp": 1.0254705, "epoch": 0.18325567413197055, "flos": 34382958428160.0, "grad_norm": 1.5191744259185578, "language_loss": 0.80704039, "learning_rate": 3.7586352170460194e-06, "loss": 0.82890975, "num_input_tokens_seen": 65868730, "step": 3048, "time_per_iteration": 2.7485053539276123 }, { "auxiliary_loss_clip": 0.01139433, "auxiliary_loss_mlp": 0.01046004, "balance_loss_clip": 1.05405188, "balance_loss_mlp": 1.02552414, "epoch": 0.18331579738463852, "flos": 20558464381440.0, "grad_norm": 2.1437824577601354, "language_loss": 0.86579728, "learning_rate": 3.758449708105424e-06, "loss": 0.88765168, "num_input_tokens_seen": 65888420, "step": 3049, "time_per_iteration": 2.6876962184906006 }, { "auxiliary_loss_clip": 0.01143881, "auxiliary_loss_mlp": 0.01045208, "balance_loss_clip": 1.05379057, "balance_loss_mlp": 1.02544308, "epoch": 0.18337592063730648, "flos": 19607872901760.0, "grad_norm": 2.616661567020713, "language_loss": 0.77827966, "learning_rate": 3.75826413248424e-06, "loss": 0.80017054, "num_input_tokens_seen": 65905840, "step": 3050, "time_per_iteration": 2.5814058780670166 }, { "auxiliary_loss_clip": 0.01126116, "auxiliary_loss_mlp": 0.01041302, "balance_loss_clip": 1.04954183, "balance_loss_mlp": 1.0238502, "epoch": 0.18343604388997445, "flos": 20850885002880.0, "grad_norm": 2.3686375880611656, "language_loss": 0.99064422, "learning_rate": 3.7580784901895035e-06, "loss": 1.01231837, "num_input_tokens_seen": 65922845, "step": 3051, "time_per_iteration": 2.701848268508911 }, { "auxiliary_loss_clip": 0.01125492, "auxiliary_loss_mlp": 0.010397, "balance_loss_clip": 1.05189931, "balance_loss_mlp": 1.02078128, "epoch": 0.1834961671426424, "flos": 24394370624640.0, "grad_norm": 2.0338529701436237, "language_loss": 0.8607648, "learning_rate": 3.7578927812282542e-06, "loss": 0.88241673, "num_input_tokens_seen": 65945555, "step": 3052, "time_per_iteration": 2.7252042293548584 }, { "auxiliary_loss_clip": 0.01152967, "auxiliary_loss_mlp": 0.01044648, "balance_loss_clip": 1.05449986, "balance_loss_mlp": 1.02737474, "epoch": 0.18355629039531038, "flos": 21251612108160.0, "grad_norm": 1.8649432496703628, "language_loss": 0.73393309, "learning_rate": 3.7577070056075356e-06, "loss": 0.7559092, "num_input_tokens_seen": 65963965, "step": 3053, "time_per_iteration": 2.6331369876861572 }, { "auxiliary_loss_clip": 0.01158728, "auxiliary_loss_mlp": 0.01044052, "balance_loss_clip": 1.05783379, "balance_loss_mlp": 1.02565801, "epoch": 0.18361641364797834, "flos": 28656499651200.0, "grad_norm": 1.5358769917973574, "language_loss": 0.61891186, "learning_rate": 3.7575211633343902e-06, "loss": 0.64093965, "num_input_tokens_seen": 65985965, "step": 3054, "time_per_iteration": 2.6792421340942383 }, { "auxiliary_loss_clip": 0.01108826, "auxiliary_loss_mlp": 0.01042654, "balance_loss_clip": 1.05558836, "balance_loss_mlp": 1.02502322, "epoch": 0.18367653690064634, "flos": 20918827578240.0, "grad_norm": 2.2474279661883667, "language_loss": 0.78218341, "learning_rate": 3.7573352544158663e-06, "loss": 0.80369824, "num_input_tokens_seen": 66005645, "step": 3055, "time_per_iteration": 2.778691053390503 }, { "auxiliary_loss_clip": 0.01096638, "auxiliary_loss_mlp": 0.01050677, "balance_loss_clip": 1.05003095, "balance_loss_mlp": 1.03211594, "epoch": 0.1837366601533143, "flos": 28765596234240.0, "grad_norm": 1.8043720478204575, "language_loss": 0.7022509, "learning_rate": 3.757149278859014e-06, "loss": 0.72372401, "num_input_tokens_seen": 66025675, "step": 3056, "time_per_iteration": 2.794254779815674 }, { "auxiliary_loss_clip": 0.01140367, "auxiliary_loss_mlp": 0.01038358, "balance_loss_clip": 1.05211461, "balance_loss_mlp": 1.02181149, "epoch": 0.18379678340598227, "flos": 21251432540160.0, "grad_norm": 1.8709784760841586, "language_loss": 0.80357504, "learning_rate": 3.7569632366708842e-06, "loss": 0.82536227, "num_input_tokens_seen": 66046125, "step": 3057, "time_per_iteration": 2.644728899002075 }, { "auxiliary_loss_clip": 0.01150041, "auxiliary_loss_mlp": 0.01043781, "balance_loss_clip": 1.05482352, "balance_loss_mlp": 1.02332497, "epoch": 0.18385690665865023, "flos": 20449619193600.0, "grad_norm": 7.225766788646501, "language_loss": 0.82570755, "learning_rate": 3.756777127858533e-06, "loss": 0.84764576, "num_input_tokens_seen": 66064375, "step": 3058, "time_per_iteration": 4.136845588684082 }, { "auxiliary_loss_clip": 0.01119139, "auxiliary_loss_mlp": 0.00776668, "balance_loss_clip": 1.04992914, "balance_loss_mlp": 1.00066566, "epoch": 0.1839170299113182, "flos": 26140562398080.0, "grad_norm": 2.277694088171661, "language_loss": 0.85071868, "learning_rate": 3.756590952429017e-06, "loss": 0.86967677, "num_input_tokens_seen": 66084590, "step": 3059, "time_per_iteration": 2.745020866394043 }, { "auxiliary_loss_clip": 0.01151831, "auxiliary_loss_mlp": 0.00775088, "balance_loss_clip": 1.05359423, "balance_loss_mlp": 1.00077426, "epoch": 0.18397715316398616, "flos": 31758032332800.0, "grad_norm": 2.3540516696336216, "language_loss": 0.72983348, "learning_rate": 3.756404710389396e-06, "loss": 0.74910271, "num_input_tokens_seen": 66107105, "step": 3060, "time_per_iteration": 5.792214393615723 }, { "auxiliary_loss_clip": 0.01149482, "auxiliary_loss_mlp": 0.01041417, "balance_loss_clip": 1.05812132, "balance_loss_mlp": 1.02266574, "epoch": 0.18403727641665413, "flos": 24611989173120.0, "grad_norm": 1.5810457302838978, "language_loss": 0.73126459, "learning_rate": 3.7562184017467323e-06, "loss": 0.75317359, "num_input_tokens_seen": 66129295, "step": 3061, "time_per_iteration": 2.754167318344116 }, { "auxiliary_loss_clip": 0.01138281, "auxiliary_loss_mlp": 0.01043599, "balance_loss_clip": 1.05435956, "balance_loss_mlp": 1.02379823, "epoch": 0.18409739966932212, "flos": 23439900476160.0, "grad_norm": 1.8413104246803462, "language_loss": 0.81937188, "learning_rate": 3.7560320265080906e-06, "loss": 0.8411907, "num_input_tokens_seen": 66146910, "step": 3062, "time_per_iteration": 2.7545394897460938 }, { "auxiliary_loss_clip": 0.01144664, "auxiliary_loss_mlp": 0.01040639, "balance_loss_clip": 1.05668104, "balance_loss_mlp": 1.02259111, "epoch": 0.18415752292199009, "flos": 21872112577920.0, "grad_norm": 2.011374259171591, "language_loss": 0.72994816, "learning_rate": 3.7558455846805383e-06, "loss": 0.75180125, "num_input_tokens_seen": 66165370, "step": 3063, "time_per_iteration": 2.738293170928955 }, { "auxiliary_loss_clip": 0.01133824, "auxiliary_loss_mlp": 0.01040987, "balance_loss_clip": 1.05164194, "balance_loss_mlp": 1.02490544, "epoch": 0.18421764617465805, "flos": 25410678036480.0, "grad_norm": 2.2975785147287953, "language_loss": 0.65614092, "learning_rate": 3.7556590762711463e-06, "loss": 0.67788899, "num_input_tokens_seen": 66186210, "step": 3064, "time_per_iteration": 4.404583930969238 }, { "auxiliary_loss_clip": 0.01141547, "auxiliary_loss_mlp": 0.01042996, "balance_loss_clip": 1.05395937, "balance_loss_mlp": 1.02498376, "epoch": 0.18427776942732602, "flos": 27198131558400.0, "grad_norm": 2.1874829734431898, "language_loss": 0.68347883, "learning_rate": 3.7554725012869853e-06, "loss": 0.70532429, "num_input_tokens_seen": 66204800, "step": 3065, "time_per_iteration": 2.7149577140808105 }, { "auxiliary_loss_clip": 0.01136969, "auxiliary_loss_mlp": 0.01045319, "balance_loss_clip": 1.05518305, "balance_loss_mlp": 1.02674615, "epoch": 0.18433789267999398, "flos": 27852351920640.0, "grad_norm": 2.2758854533642925, "language_loss": 0.73142231, "learning_rate": 3.7552858597351318e-06, "loss": 0.75324523, "num_input_tokens_seen": 66222195, "step": 3066, "time_per_iteration": 2.672675609588623 }, { "auxiliary_loss_clip": 0.01125186, "auxiliary_loss_mlp": 0.01043389, "balance_loss_clip": 1.04947495, "balance_loss_mlp": 1.0256983, "epoch": 0.18439801593266195, "flos": 17856940533120.0, "grad_norm": 2.1067167513095444, "language_loss": 0.82191038, "learning_rate": 3.7550991516226622e-06, "loss": 0.8435961, "num_input_tokens_seen": 66239505, "step": 3067, "time_per_iteration": 2.697768211364746 }, { "auxiliary_loss_clip": 0.01082345, "auxiliary_loss_mlp": 0.00756782, "balance_loss_clip": 1.04466891, "balance_loss_mlp": 1.00113225, "epoch": 0.18445813918532994, "flos": 56389522590720.0, "grad_norm": 0.7960107429271657, "language_loss": 0.59750569, "learning_rate": 3.754912376956657e-06, "loss": 0.61589694, "num_input_tokens_seen": 66295695, "step": 3068, "time_per_iteration": 3.0305213928222656 }, { "auxiliary_loss_clip": 0.01127048, "auxiliary_loss_mlp": 0.01041294, "balance_loss_clip": 1.05452299, "balance_loss_mlp": 1.02356791, "epoch": 0.1845182624379979, "flos": 20957180325120.0, "grad_norm": 3.7299324256794244, "language_loss": 0.76434112, "learning_rate": 3.7547255357441987e-06, "loss": 0.78602457, "num_input_tokens_seen": 66315315, "step": 3069, "time_per_iteration": 2.6757962703704834 }, { "auxiliary_loss_clip": 0.01146412, "auxiliary_loss_mlp": 0.010456, "balance_loss_clip": 1.05468106, "balance_loss_mlp": 1.02798057, "epoch": 0.18457838569066587, "flos": 20485170679680.0, "grad_norm": 1.9225240149566294, "language_loss": 0.8491416, "learning_rate": 3.7545386279923718e-06, "loss": 0.87106168, "num_input_tokens_seen": 66333675, "step": 3070, "time_per_iteration": 2.617023229598999 }, { "auxiliary_loss_clip": 0.01127789, "auxiliary_loss_mlp": 0.01043452, "balance_loss_clip": 1.0553112, "balance_loss_mlp": 1.02510571, "epoch": 0.18463850894333383, "flos": 25010022758400.0, "grad_norm": 6.700503585098448, "language_loss": 0.77807182, "learning_rate": 3.754351653708265e-06, "loss": 0.79978424, "num_input_tokens_seen": 66354075, "step": 3071, "time_per_iteration": 2.847329616546631 }, { "auxiliary_loss_clip": 0.01109458, "auxiliary_loss_mlp": 0.01049978, "balance_loss_clip": 1.05054557, "balance_loss_mlp": 1.03154778, "epoch": 0.1846986321960018, "flos": 16800628348800.0, "grad_norm": 2.0836336776071565, "language_loss": 0.77414191, "learning_rate": 3.7541646128989674e-06, "loss": 0.79573631, "num_input_tokens_seen": 66372520, "step": 3072, "time_per_iteration": 2.780921220779419 }, { "auxiliary_loss_clip": 0.01138997, "auxiliary_loss_mlp": 0.01043594, "balance_loss_clip": 1.05106127, "balance_loss_mlp": 1.02465141, "epoch": 0.18475875544866976, "flos": 20814327936000.0, "grad_norm": 4.959080593148226, "language_loss": 0.86546457, "learning_rate": 3.7539775055715715e-06, "loss": 0.88729048, "num_input_tokens_seen": 66390745, "step": 3073, "time_per_iteration": 2.631913661956787 }, { "auxiliary_loss_clip": 0.01158717, "auxiliary_loss_mlp": 0.0104013, "balance_loss_clip": 1.05862749, "balance_loss_mlp": 1.02366686, "epoch": 0.18481887870133773, "flos": 22601422321920.0, "grad_norm": 2.162700927804164, "language_loss": 0.91831195, "learning_rate": 3.7537903317331732e-06, "loss": 0.94030046, "num_input_tokens_seen": 66410525, "step": 3074, "time_per_iteration": 2.6152567863464355 }, { "auxiliary_loss_clip": 0.01104968, "auxiliary_loss_mlp": 0.01047718, "balance_loss_clip": 1.04757643, "balance_loss_mlp": 1.02763104, "epoch": 0.18487900195400572, "flos": 29458815788160.0, "grad_norm": 1.9967983521568784, "language_loss": 0.64783108, "learning_rate": 3.75360309139087e-06, "loss": 0.66935796, "num_input_tokens_seen": 66432535, "step": 3075, "time_per_iteration": 2.763559103012085 }, { "auxiliary_loss_clip": 0.01135247, "auxiliary_loss_mlp": 0.01046601, "balance_loss_clip": 1.05689573, "balance_loss_mlp": 1.02913702, "epoch": 0.1849391252066737, "flos": 20628777254400.0, "grad_norm": 1.8996898495981898, "language_loss": 0.72803432, "learning_rate": 3.753415784551761e-06, "loss": 0.74985278, "num_input_tokens_seen": 66450620, "step": 3076, "time_per_iteration": 2.76629376411438 }, { "auxiliary_loss_clip": 0.01124833, "auxiliary_loss_mlp": 0.01042344, "balance_loss_clip": 1.0584389, "balance_loss_mlp": 1.0249157, "epoch": 0.18499924845934165, "flos": 14428549065600.0, "grad_norm": 2.4862024108169556, "language_loss": 0.80772626, "learning_rate": 3.7532284112229507e-06, "loss": 0.82939804, "num_input_tokens_seen": 66467865, "step": 3077, "time_per_iteration": 2.7296142578125 }, { "auxiliary_loss_clip": 0.01128471, "auxiliary_loss_mlp": 0.01041495, "balance_loss_clip": 1.05401397, "balance_loss_mlp": 1.02428079, "epoch": 0.18505937171200962, "flos": 23727652329600.0, "grad_norm": 1.8214336253769514, "language_loss": 0.78693211, "learning_rate": 3.7530409714115424e-06, "loss": 0.80863178, "num_input_tokens_seen": 66486245, "step": 3078, "time_per_iteration": 2.715838670730591 }, { "auxiliary_loss_clip": 0.01154963, "auxiliary_loss_mlp": 0.01043373, "balance_loss_clip": 1.05546641, "balance_loss_mlp": 1.02655268, "epoch": 0.18511949496467758, "flos": 25957489754880.0, "grad_norm": 1.7455066055145632, "language_loss": 0.77326959, "learning_rate": 3.7528534651246453e-06, "loss": 0.79525292, "num_input_tokens_seen": 66506510, "step": 3079, "time_per_iteration": 2.674128770828247 }, { "auxiliary_loss_clip": 0.01119079, "auxiliary_loss_mlp": 0.01041512, "balance_loss_clip": 1.04717147, "balance_loss_mlp": 1.02328515, "epoch": 0.18517961821734555, "flos": 42413553912960.0, "grad_norm": 1.885086933557342, "language_loss": 0.82143807, "learning_rate": 3.752665892369369e-06, "loss": 0.84304404, "num_input_tokens_seen": 66530960, "step": 3080, "time_per_iteration": 2.906940460205078 }, { "auxiliary_loss_clip": 0.01123637, "auxiliary_loss_mlp": 0.01044031, "balance_loss_clip": 1.05894399, "balance_loss_mlp": 1.02563691, "epoch": 0.18523974147001354, "flos": 24097568544000.0, "grad_norm": 2.065822240576764, "language_loss": 0.73973286, "learning_rate": 3.7524782531528266e-06, "loss": 0.76140958, "num_input_tokens_seen": 66550275, "step": 3081, "time_per_iteration": 2.7960739135742188 }, { "auxiliary_loss_clip": 0.01126977, "auxiliary_loss_mlp": 0.01051674, "balance_loss_clip": 1.05360913, "balance_loss_mlp": 1.03286242, "epoch": 0.1852998647226815, "flos": 27375278457600.0, "grad_norm": 1.9854893879184425, "language_loss": 0.71991849, "learning_rate": 3.7522905474821334e-06, "loss": 0.74170506, "num_input_tokens_seen": 66569040, "step": 3082, "time_per_iteration": 2.6965079307556152 }, { "auxiliary_loss_clip": 0.01124933, "auxiliary_loss_mlp": 0.01046296, "balance_loss_clip": 1.05649543, "balance_loss_mlp": 1.02694798, "epoch": 0.18535998797534947, "flos": 18332757020160.0, "grad_norm": 2.0424653419479886, "language_loss": 0.69580144, "learning_rate": 3.752102775364407e-06, "loss": 0.71751374, "num_input_tokens_seen": 66587775, "step": 3083, "time_per_iteration": 2.727252721786499 }, { "auxiliary_loss_clip": 0.01122388, "auxiliary_loss_mlp": 0.01046999, "balance_loss_clip": 1.05204451, "balance_loss_mlp": 1.02964258, "epoch": 0.18542011122801744, "flos": 37845859887360.0, "grad_norm": 2.185713468975319, "language_loss": 0.68965334, "learning_rate": 3.751914936806767e-06, "loss": 0.71134722, "num_input_tokens_seen": 66610800, "step": 3084, "time_per_iteration": 2.95849871635437 }, { "auxiliary_loss_clip": 0.01155184, "auxiliary_loss_mlp": 0.01043029, "balance_loss_clip": 1.05578482, "balance_loss_mlp": 1.0257436, "epoch": 0.1854802344806854, "flos": 25186128163200.0, "grad_norm": 1.6859724806626923, "language_loss": 0.77390355, "learning_rate": 3.7517270318163377e-06, "loss": 0.79588568, "num_input_tokens_seen": 66630960, "step": 3085, "time_per_iteration": 2.68961501121521 }, { "auxiliary_loss_clip": 0.01152089, "auxiliary_loss_mlp": 0.01049004, "balance_loss_clip": 1.05316019, "balance_loss_mlp": 1.03142118, "epoch": 0.18554035773335337, "flos": 26684788337280.0, "grad_norm": 1.993169596996871, "language_loss": 0.73752379, "learning_rate": 3.751539060400244e-06, "loss": 0.75953472, "num_input_tokens_seen": 66650585, "step": 3086, "time_per_iteration": 2.652475595474243 }, { "auxiliary_loss_clip": 0.01142754, "auxiliary_loss_mlp": 0.01049865, "balance_loss_clip": 1.05530787, "balance_loss_mlp": 1.03134012, "epoch": 0.18560048098602133, "flos": 22346887570560.0, "grad_norm": 7.927127736744579, "language_loss": 0.69762361, "learning_rate": 3.7513510225656132e-06, "loss": 0.71954978, "num_input_tokens_seen": 66670045, "step": 3087, "time_per_iteration": 2.668849229812622 }, { "auxiliary_loss_clip": 0.01119022, "auxiliary_loss_mlp": 0.01055302, "balance_loss_clip": 1.05543649, "balance_loss_mlp": 1.03546548, "epoch": 0.18566060423868933, "flos": 17748526308480.0, "grad_norm": 2.1117122734340263, "language_loss": 0.72513628, "learning_rate": 3.7511629183195764e-06, "loss": 0.74687952, "num_input_tokens_seen": 66688790, "step": 3088, "time_per_iteration": 2.7150719165802 }, { "auxiliary_loss_clip": 0.0112638, "auxiliary_loss_mlp": 0.01044188, "balance_loss_clip": 1.04933047, "balance_loss_mlp": 1.02616334, "epoch": 0.1857207274913573, "flos": 24677274142080.0, "grad_norm": 2.112009927874319, "language_loss": 0.91859758, "learning_rate": 3.7509747476692663e-06, "loss": 0.94030321, "num_input_tokens_seen": 66708090, "step": 3089, "time_per_iteration": 2.7239248752593994 }, { "auxiliary_loss_clip": 0.01104754, "auxiliary_loss_mlp": 0.01046981, "balance_loss_clip": 1.0494597, "balance_loss_mlp": 1.02919531, "epoch": 0.18578085074402526, "flos": 28147825198080.0, "grad_norm": 2.490831087537115, "language_loss": 0.57275403, "learning_rate": 3.7507865106218176e-06, "loss": 0.59427136, "num_input_tokens_seen": 66727320, "step": 3090, "time_per_iteration": 2.8263309001922607 }, { "auxiliary_loss_clip": 0.01125877, "auxiliary_loss_mlp": 0.0104478, "balance_loss_clip": 1.04981184, "balance_loss_mlp": 1.02636242, "epoch": 0.18584097399669322, "flos": 23951878980480.0, "grad_norm": 1.7797305478565062, "language_loss": 0.81704801, "learning_rate": 3.7505982071843695e-06, "loss": 0.83875453, "num_input_tokens_seen": 66747505, "step": 3091, "time_per_iteration": 2.697525978088379 }, { "auxiliary_loss_clip": 0.01101743, "auxiliary_loss_mlp": 0.01050837, "balance_loss_clip": 1.04999971, "balance_loss_mlp": 1.03277707, "epoch": 0.18590109724936119, "flos": 17201678676480.0, "grad_norm": 2.0826959244757832, "language_loss": 0.83704746, "learning_rate": 3.7504098373640617e-06, "loss": 0.8585732, "num_input_tokens_seen": 66766425, "step": 3092, "time_per_iteration": 2.8379435539245605 }, { "auxiliary_loss_clip": 0.01136846, "auxiliary_loss_mlp": 0.01048758, "balance_loss_clip": 1.05389428, "balance_loss_mlp": 1.03036356, "epoch": 0.18596122050202915, "flos": 17234644383360.0, "grad_norm": 5.439917179387958, "language_loss": 0.93443698, "learning_rate": 3.750221401168038e-06, "loss": 0.95629299, "num_input_tokens_seen": 66781130, "step": 3093, "time_per_iteration": 2.8053483963012695 }, { "auxiliary_loss_clip": 0.01130362, "auxiliary_loss_mlp": 0.01042367, "balance_loss_clip": 1.05440521, "balance_loss_mlp": 1.02464092, "epoch": 0.18602134375469712, "flos": 19020733188480.0, "grad_norm": 1.7318887555782294, "language_loss": 0.77516603, "learning_rate": 3.750032898603443e-06, "loss": 0.7968933, "num_input_tokens_seen": 66797535, "step": 3094, "time_per_iteration": 2.7402310371398926 }, { "auxiliary_loss_clip": 0.0109741, "auxiliary_loss_mlp": 0.01049219, "balance_loss_clip": 1.0519228, "balance_loss_mlp": 1.0323391, "epoch": 0.1860814670073651, "flos": 50950094417280.0, "grad_norm": 1.7033453736007413, "language_loss": 0.69854707, "learning_rate": 3.749844329677425e-06, "loss": 0.72001338, "num_input_tokens_seen": 66821720, "step": 3095, "time_per_iteration": 3.133192777633667 }, { "auxiliary_loss_clip": 0.01113224, "auxiliary_loss_mlp": 0.010546, "balance_loss_clip": 1.0511899, "balance_loss_mlp": 1.03415525, "epoch": 0.18614159026003307, "flos": 19390972625280.0, "grad_norm": 2.2828801406167307, "language_loss": 0.81214821, "learning_rate": 3.749655694397135e-06, "loss": 0.83382642, "num_input_tokens_seen": 66839060, "step": 3096, "time_per_iteration": 2.7599101066589355 }, { "auxiliary_loss_clip": 0.01147399, "auxiliary_loss_mlp": 0.0104683, "balance_loss_clip": 1.05678356, "balance_loss_mlp": 1.02810192, "epoch": 0.18620171351270104, "flos": 21798782962560.0, "grad_norm": 2.430947734084612, "language_loss": 0.75326216, "learning_rate": 3.7494669927697255e-06, "loss": 0.77520448, "num_input_tokens_seen": 66857760, "step": 3097, "time_per_iteration": 4.255983114242554 }, { "auxiliary_loss_clip": 0.01133757, "auxiliary_loss_mlp": 0.01050365, "balance_loss_clip": 1.05756521, "balance_loss_mlp": 1.03228104, "epoch": 0.186261836765369, "flos": 16362877299840.0, "grad_norm": 2.553895603581972, "language_loss": 0.66602015, "learning_rate": 3.749278224802352e-06, "loss": 0.68786132, "num_input_tokens_seen": 66876460, "step": 3098, "time_per_iteration": 2.723567247390747 }, { "auxiliary_loss_clip": 0.01163461, "auxiliary_loss_mlp": 0.01052357, "balance_loss_clip": 1.05991709, "balance_loss_mlp": 1.03212702, "epoch": 0.18632196001803697, "flos": 23370054480000.0, "grad_norm": 1.6168121451860142, "language_loss": 0.69838905, "learning_rate": 3.7490893905021733e-06, "loss": 0.7205472, "num_input_tokens_seen": 66897960, "step": 3099, "time_per_iteration": 5.687380075454712 }, { "auxiliary_loss_clip": 0.01148363, "auxiliary_loss_mlp": 0.01051556, "balance_loss_clip": 1.05713868, "balance_loss_mlp": 1.03243458, "epoch": 0.18638208327070493, "flos": 22492002516480.0, "grad_norm": 1.7060244708994476, "language_loss": 0.71840072, "learning_rate": 3.7489004898763494e-06, "loss": 0.74039996, "num_input_tokens_seen": 66917675, "step": 3100, "time_per_iteration": 2.6711015701293945 }, { "auxiliary_loss_clip": 0.01138377, "auxiliary_loss_mlp": 0.01050667, "balance_loss_clip": 1.05749035, "balance_loss_mlp": 1.03133154, "epoch": 0.18644220652337293, "flos": 29165245931520.0, "grad_norm": 1.9639279354826686, "language_loss": 0.80343997, "learning_rate": 3.7487115229320444e-06, "loss": 0.82533038, "num_input_tokens_seen": 66936000, "step": 3101, "time_per_iteration": 2.6996583938598633 }, { "auxiliary_loss_clip": 0.01112778, "auxiliary_loss_mlp": 0.01042097, "balance_loss_clip": 1.05307627, "balance_loss_mlp": 1.02478826, "epoch": 0.1865023297760409, "flos": 24243796811520.0, "grad_norm": 1.8804860702941575, "language_loss": 0.77053607, "learning_rate": 3.7485224896764222e-06, "loss": 0.79208481, "num_input_tokens_seen": 66955700, "step": 3102, "time_per_iteration": 2.726146936416626 }, { "auxiliary_loss_clip": 0.01150817, "auxiliary_loss_mlp": 0.01039303, "balance_loss_clip": 1.057688, "balance_loss_mlp": 1.0213027, "epoch": 0.18656245302870886, "flos": 19128716449920.0, "grad_norm": 2.314682178811096, "language_loss": 0.76689744, "learning_rate": 3.7483333901166525e-06, "loss": 0.78879869, "num_input_tokens_seen": 66972815, "step": 3103, "time_per_iteration": 4.374122619628906 }, { "auxiliary_loss_clip": 0.01132531, "auxiliary_loss_mlp": 0.0104481, "balance_loss_clip": 1.05477643, "balance_loss_mlp": 1.02671361, "epoch": 0.18662257628137682, "flos": 17786088956160.0, "grad_norm": 1.6956506235876265, "language_loss": 0.79252636, "learning_rate": 3.7481442242599054e-06, "loss": 0.8142997, "num_input_tokens_seen": 66992280, "step": 3104, "time_per_iteration": 2.695012092590332 }, { "auxiliary_loss_clip": 0.01106786, "auxiliary_loss_mlp": 0.01050273, "balance_loss_clip": 1.05117702, "balance_loss_mlp": 1.03096056, "epoch": 0.1866826995340448, "flos": 24024382583040.0, "grad_norm": 2.065624302338532, "language_loss": 0.8496474, "learning_rate": 3.747954992113354e-06, "loss": 0.87121809, "num_input_tokens_seen": 67012220, "step": 3105, "time_per_iteration": 2.761521816253662 }, { "auxiliary_loss_clip": 0.0112324, "auxiliary_loss_mlp": 0.01043689, "balance_loss_clip": 1.05166531, "balance_loss_mlp": 1.02407932, "epoch": 0.18674282278671275, "flos": 26141244756480.0, "grad_norm": 1.8352441384571676, "language_loss": 0.86880243, "learning_rate": 3.7477656936841742e-06, "loss": 0.8904717, "num_input_tokens_seen": 67032030, "step": 3106, "time_per_iteration": 2.785738706588745 }, { "auxiliary_loss_clip": 0.01150222, "auxiliary_loss_mlp": 0.01040973, "balance_loss_clip": 1.0566026, "balance_loss_mlp": 1.02281737, "epoch": 0.18680294603938072, "flos": 19201938324480.0, "grad_norm": 2.128833658771433, "language_loss": 0.78226906, "learning_rate": 3.7475763289795445e-06, "loss": 0.80418098, "num_input_tokens_seen": 67048920, "step": 3107, "time_per_iteration": 2.693995237350464 }, { "auxiliary_loss_clip": 0.01153763, "auxiliary_loss_mlp": 0.01053056, "balance_loss_clip": 1.05873394, "balance_loss_mlp": 1.03341043, "epoch": 0.1868630692920487, "flos": 28544889116160.0, "grad_norm": 3.0927798335187506, "language_loss": 0.74159014, "learning_rate": 3.7473868980066446e-06, "loss": 0.7636584, "num_input_tokens_seen": 67068645, "step": 3108, "time_per_iteration": 2.795715570449829 }, { "auxiliary_loss_clip": 0.01107582, "auxiliary_loss_mlp": 0.01042714, "balance_loss_clip": 1.05207491, "balance_loss_mlp": 1.02451098, "epoch": 0.18692319254471668, "flos": 17238020261760.0, "grad_norm": 1.6837485322309411, "language_loss": 0.74348569, "learning_rate": 3.747197400772658e-06, "loss": 0.76498872, "num_input_tokens_seen": 67087075, "step": 3109, "time_per_iteration": 2.7627830505371094 }, { "auxiliary_loss_clip": 0.01145572, "auxiliary_loss_mlp": 0.01044117, "balance_loss_clip": 1.05631042, "balance_loss_mlp": 1.02526462, "epoch": 0.18698331579738464, "flos": 23185186156800.0, "grad_norm": 1.499459601293056, "language_loss": 0.84250218, "learning_rate": 3.747007837284772e-06, "loss": 0.86439908, "num_input_tokens_seen": 67108040, "step": 3110, "time_per_iteration": 2.7665328979492188 }, { "auxiliary_loss_clip": 0.01147578, "auxiliary_loss_mlp": 0.01042389, "balance_loss_clip": 1.05929494, "balance_loss_mlp": 1.02381575, "epoch": 0.1870434390500526, "flos": 25516721963520.0, "grad_norm": 1.9108380391903876, "language_loss": 0.84738445, "learning_rate": 3.7468182075501737e-06, "loss": 0.86928415, "num_input_tokens_seen": 67127605, "step": 3111, "time_per_iteration": 2.729233741760254 }, { "auxiliary_loss_clip": 0.01128, "auxiliary_loss_mlp": 0.01044544, "balance_loss_clip": 1.05348754, "balance_loss_mlp": 1.02635229, "epoch": 0.18710356230272057, "flos": 19500823393920.0, "grad_norm": 1.8704338434966796, "language_loss": 0.76875687, "learning_rate": 3.7466285115760536e-06, "loss": 0.79048228, "num_input_tokens_seen": 67145785, "step": 3112, "time_per_iteration": 2.7392494678497314 }, { "auxiliary_loss_clip": 0.0114846, "auxiliary_loss_mlp": 0.0104709, "balance_loss_clip": 1.05636978, "balance_loss_mlp": 1.02913654, "epoch": 0.18716368555538854, "flos": 26760847386240.0, "grad_norm": 1.8996972204761096, "language_loss": 0.64466536, "learning_rate": 3.7464387493696046e-06, "loss": 0.66662085, "num_input_tokens_seen": 67165930, "step": 3113, "time_per_iteration": 2.7393765449523926 }, { "auxiliary_loss_clip": 0.01153807, "auxiliary_loss_mlp": 0.01048748, "balance_loss_clip": 1.05685568, "balance_loss_mlp": 1.02900672, "epoch": 0.1872238088080565, "flos": 25189827264000.0, "grad_norm": 6.483287708452815, "language_loss": 0.817972, "learning_rate": 3.746248920938024e-06, "loss": 0.83999759, "num_input_tokens_seen": 67185830, "step": 3114, "time_per_iteration": 2.740229368209839 }, { "auxiliary_loss_clip": 0.01104278, "auxiliary_loss_mlp": 0.01050738, "balance_loss_clip": 1.04921412, "balance_loss_mlp": 1.03024614, "epoch": 0.1872839320607245, "flos": 24134305178880.0, "grad_norm": 2.3064843449079175, "language_loss": 0.57413173, "learning_rate": 3.74605902628851e-06, "loss": 0.59568191, "num_input_tokens_seen": 67206930, "step": 3115, "time_per_iteration": 2.811549663543701 }, { "auxiliary_loss_clip": 0.01123025, "auxiliary_loss_mlp": 0.01052226, "balance_loss_clip": 1.05446446, "balance_loss_mlp": 1.03241396, "epoch": 0.18734405531339246, "flos": 21173793292800.0, "grad_norm": 2.577640519639585, "language_loss": 0.70842528, "learning_rate": 3.745869065428261e-06, "loss": 0.73017788, "num_input_tokens_seen": 67226290, "step": 3116, "time_per_iteration": 2.8053951263427734 }, { "auxiliary_loss_clip": 0.0115042, "auxiliary_loss_mlp": 0.01035569, "balance_loss_clip": 1.05196476, "balance_loss_mlp": 1.01787841, "epoch": 0.18740417856606043, "flos": 17237697039360.0, "grad_norm": 3.010261965906642, "language_loss": 0.78994375, "learning_rate": 3.7456790383644833e-06, "loss": 0.81180358, "num_input_tokens_seen": 67244410, "step": 3117, "time_per_iteration": 2.819415330886841 }, { "auxiliary_loss_clip": 0.01132901, "auxiliary_loss_mlp": 0.01049724, "balance_loss_clip": 1.05260777, "balance_loss_mlp": 1.03047204, "epoch": 0.1874643018187284, "flos": 32558049999360.0, "grad_norm": 2.2828109389679865, "language_loss": 0.83903432, "learning_rate": 3.745488945104381e-06, "loss": 0.86086059, "num_input_tokens_seen": 67264470, "step": 3118, "time_per_iteration": 2.783804416656494 }, { "auxiliary_loss_clip": 0.01144867, "auxiliary_loss_mlp": 0.0104452, "balance_loss_clip": 1.05412436, "balance_loss_mlp": 1.02688873, "epoch": 0.18752442507139636, "flos": 23258156636160.0, "grad_norm": 3.566737352043019, "language_loss": 0.76283264, "learning_rate": 3.7452987856551636e-06, "loss": 0.78472656, "num_input_tokens_seen": 67284315, "step": 3119, "time_per_iteration": 2.6872506141662598 }, { "auxiliary_loss_clip": 0.01156835, "auxiliary_loss_mlp": 0.01046653, "balance_loss_clip": 1.05519438, "balance_loss_mlp": 1.02899814, "epoch": 0.18758454832406432, "flos": 21760933006080.0, "grad_norm": 1.7224942549361077, "language_loss": 0.82017547, "learning_rate": 3.7451085600240406e-06, "loss": 0.84221041, "num_input_tokens_seen": 67302780, "step": 3120, "time_per_iteration": 2.637505292892456 }, { "auxiliary_loss_clip": 0.0113033, "auxiliary_loss_mlp": 0.01035538, "balance_loss_clip": 1.05060756, "balance_loss_mlp": 1.01828837, "epoch": 0.1876446715767323, "flos": 29570210841600.0, "grad_norm": 2.5027223446471982, "language_loss": 0.84992659, "learning_rate": 3.7449182682182263e-06, "loss": 0.87158525, "num_input_tokens_seen": 67323405, "step": 3121, "time_per_iteration": 2.788353681564331 }, { "auxiliary_loss_clip": 0.01096681, "auxiliary_loss_mlp": 0.0104429, "balance_loss_clip": 1.045645, "balance_loss_mlp": 1.02599168, "epoch": 0.18770479482940028, "flos": 30339992234880.0, "grad_norm": 2.1738591443482362, "language_loss": 0.70032287, "learning_rate": 3.744727910244937e-06, "loss": 0.72173256, "num_input_tokens_seen": 67345800, "step": 3122, "time_per_iteration": 3.0225250720977783 }, { "auxiliary_loss_clip": 0.01153439, "auxiliary_loss_mlp": 0.01042355, "balance_loss_clip": 1.05445123, "balance_loss_mlp": 1.02288795, "epoch": 0.18776491808206824, "flos": 14465357527680.0, "grad_norm": 4.839579375412361, "language_loss": 0.70661515, "learning_rate": 3.7445374861113905e-06, "loss": 0.72857308, "num_input_tokens_seen": 67363575, "step": 3123, "time_per_iteration": 2.779904365539551 }, { "auxiliary_loss_clip": 0.01142265, "auxiliary_loss_mlp": 0.01041425, "balance_loss_clip": 1.05286181, "balance_loss_mlp": 1.02454507, "epoch": 0.1878250413347362, "flos": 24498547044480.0, "grad_norm": 2.057520579072589, "language_loss": 0.74103826, "learning_rate": 3.7443469958248066e-06, "loss": 0.76287514, "num_input_tokens_seen": 67381765, "step": 3124, "time_per_iteration": 2.6336071491241455 }, { "auxiliary_loss_clip": 0.01157579, "auxiliary_loss_mlp": 0.01052509, "balance_loss_clip": 1.05653572, "balance_loss_mlp": 1.03333998, "epoch": 0.18788516458740417, "flos": 39786185692800.0, "grad_norm": 3.0670363966795096, "language_loss": 0.80654436, "learning_rate": 3.7441564393924106e-06, "loss": 0.82864523, "num_input_tokens_seen": 67405000, "step": 3125, "time_per_iteration": 2.7224199771881104 }, { "auxiliary_loss_clip": 0.01046615, "auxiliary_loss_mlp": 0.01006504, "balance_loss_clip": 1.04444218, "balance_loss_mlp": 1.00435853, "epoch": 0.18794528784007214, "flos": 64699250664960.0, "grad_norm": 0.9424570711133922, "language_loss": 0.63647306, "learning_rate": 3.7439658168214273e-06, "loss": 0.65700436, "num_input_tokens_seen": 67467140, "step": 3126, "time_per_iteration": 3.313321113586426 }, { "auxiliary_loss_clip": 0.01128308, "auxiliary_loss_mlp": 0.01040458, "balance_loss_clip": 1.05377257, "balance_loss_mlp": 1.02236164, "epoch": 0.1880054110927401, "flos": 28622061486720.0, "grad_norm": 1.8734163453478039, "language_loss": 0.81308508, "learning_rate": 3.7437751281190857e-06, "loss": 0.83477271, "num_input_tokens_seen": 67487980, "step": 3127, "time_per_iteration": 2.7137866020202637 }, { "auxiliary_loss_clip": 0.01088267, "auxiliary_loss_mlp": 0.0101138, "balance_loss_clip": 1.04814553, "balance_loss_mlp": 1.00912714, "epoch": 0.1880655343454081, "flos": 64488958490880.0, "grad_norm": 0.7699217277386954, "language_loss": 0.61922526, "learning_rate": 3.7435843732926164e-06, "loss": 0.64022171, "num_input_tokens_seen": 67552500, "step": 3128, "time_per_iteration": 3.264270782470703 }, { "auxiliary_loss_clip": 0.01108205, "auxiliary_loss_mlp": 0.01049422, "balance_loss_clip": 1.04763842, "balance_loss_mlp": 1.02907288, "epoch": 0.18812565759807606, "flos": 32124464928000.0, "grad_norm": 2.4867495334212175, "language_loss": 0.70985162, "learning_rate": 3.7433935523492536e-06, "loss": 0.73142785, "num_input_tokens_seen": 67573295, "step": 3129, "time_per_iteration": 2.79929256439209 }, { "auxiliary_loss_clip": 0.01158485, "auxiliary_loss_mlp": 0.01050611, "balance_loss_clip": 1.05767536, "balance_loss_mlp": 1.03109634, "epoch": 0.18818578085074403, "flos": 20624539449600.0, "grad_norm": 2.4831518001798676, "language_loss": 0.85035253, "learning_rate": 3.7432026652962314e-06, "loss": 0.87244344, "num_input_tokens_seen": 67590010, "step": 3130, "time_per_iteration": 2.60624361038208 }, { "auxiliary_loss_clip": 0.01107202, "auxiliary_loss_mlp": 0.01049966, "balance_loss_clip": 1.04649067, "balance_loss_mlp": 1.03023696, "epoch": 0.188245904103412, "flos": 28840506048000.0, "grad_norm": 9.096753382647533, "language_loss": 0.7643525, "learning_rate": 3.7430117121407897e-06, "loss": 0.7859242, "num_input_tokens_seen": 67611110, "step": 3131, "time_per_iteration": 2.759230136871338 }, { "auxiliary_loss_clip": 0.0112329, "auxiliary_loss_mlp": 0.01049221, "balance_loss_clip": 1.05344164, "balance_loss_mlp": 1.03014708, "epoch": 0.18830602735607996, "flos": 29420319386880.0, "grad_norm": 2.109252219381847, "language_loss": 0.80713749, "learning_rate": 3.74282069289017e-06, "loss": 0.82886261, "num_input_tokens_seen": 67631990, "step": 3132, "time_per_iteration": 2.773817777633667 }, { "auxiliary_loss_clip": 0.01093588, "auxiliary_loss_mlp": 0.00779094, "balance_loss_clip": 1.04652429, "balance_loss_mlp": 1.00091529, "epoch": 0.18836615060874792, "flos": 28872933050880.0, "grad_norm": 2.092242478448591, "language_loss": 0.79653811, "learning_rate": 3.742629607551614e-06, "loss": 0.81526494, "num_input_tokens_seen": 67650490, "step": 3133, "time_per_iteration": 2.7873754501342773 }, { "auxiliary_loss_clip": 0.01119878, "auxiliary_loss_mlp": 0.01059381, "balance_loss_clip": 1.05341148, "balance_loss_mlp": 1.03921056, "epoch": 0.18842627386141592, "flos": 22601673717120.0, "grad_norm": 1.9069857551930867, "language_loss": 0.83001804, "learning_rate": 3.7424384561323698e-06, "loss": 0.85181063, "num_input_tokens_seen": 67668860, "step": 3134, "time_per_iteration": 2.9284298419952393 }, { "auxiliary_loss_clip": 0.01131578, "auxiliary_loss_mlp": 0.01046681, "balance_loss_clip": 1.05168402, "balance_loss_mlp": 1.02802503, "epoch": 0.18848639711408388, "flos": 24573600512640.0, "grad_norm": 2.0376543711114152, "language_loss": 0.82859468, "learning_rate": 3.742247238639684e-06, "loss": 0.85037726, "num_input_tokens_seen": 67690220, "step": 3135, "time_per_iteration": 2.8006811141967773 }, { "auxiliary_loss_clip": 0.01143148, "auxiliary_loss_mlp": 0.01050197, "balance_loss_clip": 1.05505157, "balance_loss_mlp": 1.03146911, "epoch": 0.18854652036675185, "flos": 34166920078080.0, "grad_norm": 1.9728388324049713, "language_loss": 0.78658557, "learning_rate": 3.7420559550808083e-06, "loss": 0.80851901, "num_input_tokens_seen": 67709820, "step": 3136, "time_per_iteration": 4.256143569946289 }, { "auxiliary_loss_clip": 0.01135545, "auxiliary_loss_mlp": 0.01048618, "balance_loss_clip": 1.05388892, "balance_loss_mlp": 1.03006911, "epoch": 0.1886066436194198, "flos": 24200236592640.0, "grad_norm": 1.7483697887361769, "language_loss": 0.80820233, "learning_rate": 3.741864605462996e-06, "loss": 0.83004391, "num_input_tokens_seen": 67729490, "step": 3137, "time_per_iteration": 2.7538130283355713 }, { "auxiliary_loss_clip": 0.01159054, "auxiliary_loss_mlp": 0.01048373, "balance_loss_clip": 1.05827475, "balance_loss_mlp": 1.03107548, "epoch": 0.18866676687208778, "flos": 21251109317760.0, "grad_norm": 1.9799764624272802, "language_loss": 0.81274408, "learning_rate": 3.741673189793504e-06, "loss": 0.83481836, "num_input_tokens_seen": 67749665, "step": 3138, "time_per_iteration": 4.143909931182861 }, { "auxiliary_loss_clip": 0.01150082, "auxiliary_loss_mlp": 0.01056444, "balance_loss_clip": 1.05626798, "balance_loss_mlp": 1.03713167, "epoch": 0.18872689012475574, "flos": 37308673013760.0, "grad_norm": 2.326218248348143, "language_loss": 0.63655496, "learning_rate": 3.7414817080795896e-06, "loss": 0.65862024, "num_input_tokens_seen": 67776230, "step": 3139, "time_per_iteration": 4.30991268157959 }, { "auxiliary_loss_clip": 0.0115289, "auxiliary_loss_mlp": 0.01043021, "balance_loss_clip": 1.05286491, "balance_loss_mlp": 1.02356625, "epoch": 0.1887870133774237, "flos": 21652303299840.0, "grad_norm": 2.1185902638296525, "language_loss": 0.7148211, "learning_rate": 3.741290160328514e-06, "loss": 0.73678017, "num_input_tokens_seen": 67795080, "step": 3140, "time_per_iteration": 2.6880578994750977 }, { "auxiliary_loss_clip": 0.01154738, "auxiliary_loss_mlp": 0.01043099, "balance_loss_clip": 1.05349982, "balance_loss_mlp": 1.02382278, "epoch": 0.1888471366300917, "flos": 15924659374080.0, "grad_norm": 2.6250212982316574, "language_loss": 0.87069929, "learning_rate": 3.7410985465475412e-06, "loss": 0.89267766, "num_input_tokens_seen": 67813110, "step": 3141, "time_per_iteration": 2.6677181720733643 }, { "auxiliary_loss_clip": 0.01130655, "auxiliary_loss_mlp": 0.01052882, "balance_loss_clip": 1.0507834, "balance_loss_mlp": 1.03243756, "epoch": 0.18890725988275966, "flos": 18551955767040.0, "grad_norm": 1.873404502116747, "language_loss": 0.7744689, "learning_rate": 3.7409068667439378e-06, "loss": 0.79630429, "num_input_tokens_seen": 67831070, "step": 3142, "time_per_iteration": 2.63077449798584 }, { "auxiliary_loss_clip": 0.01128192, "auxiliary_loss_mlp": 0.01038074, "balance_loss_clip": 1.05298221, "balance_loss_mlp": 1.02132463, "epoch": 0.18896738313542763, "flos": 28840865184000.0, "grad_norm": 1.6611052928231447, "language_loss": 0.78867507, "learning_rate": 3.740715120924971e-06, "loss": 0.81033778, "num_input_tokens_seen": 67852170, "step": 3143, "time_per_iteration": 4.417406797409058 }, { "auxiliary_loss_clip": 0.0111986, "auxiliary_loss_mlp": 0.01048019, "balance_loss_clip": 1.05024099, "balance_loss_mlp": 1.02821851, "epoch": 0.1890275063880956, "flos": 22412747157120.0, "grad_norm": 2.855732191409361, "language_loss": 0.71476078, "learning_rate": 3.740523309097912e-06, "loss": 0.73643959, "num_input_tokens_seen": 67869945, "step": 3144, "time_per_iteration": 2.8104894161224365 }, { "auxiliary_loss_clip": 0.01125398, "auxiliary_loss_mlp": 0.01044816, "balance_loss_clip": 1.05102479, "balance_loss_mlp": 1.02492023, "epoch": 0.18908762964076356, "flos": 24243904552320.0, "grad_norm": 2.5973078221757144, "language_loss": 0.73390597, "learning_rate": 3.7403314312700356e-06, "loss": 0.75560808, "num_input_tokens_seen": 67890240, "step": 3145, "time_per_iteration": 2.715609312057495 }, { "auxiliary_loss_clip": 0.01110308, "auxiliary_loss_mlp": 0.01042542, "balance_loss_clip": 1.04543984, "balance_loss_mlp": 1.02446938, "epoch": 0.18914775289343153, "flos": 16982910892800.0, "grad_norm": 2.915733862437625, "language_loss": 0.76263785, "learning_rate": 3.740139487448616e-06, "loss": 0.78416634, "num_input_tokens_seen": 67907825, "step": 3146, "time_per_iteration": 2.777221202850342 }, { "auxiliary_loss_clip": 0.01092807, "auxiliary_loss_mlp": 0.01049336, "balance_loss_clip": 1.04319823, "balance_loss_mlp": 1.02829611, "epoch": 0.1892078761460995, "flos": 21543781334400.0, "grad_norm": 1.988128972125699, "language_loss": 0.7837925, "learning_rate": 3.7399474776409326e-06, "loss": 0.80521393, "num_input_tokens_seen": 67926670, "step": 3147, "time_per_iteration": 2.8039205074310303 }, { "auxiliary_loss_clip": 0.01143577, "auxiliary_loss_mlp": 0.01042953, "balance_loss_clip": 1.0548687, "balance_loss_mlp": 1.02454758, "epoch": 0.18926799939876748, "flos": 23001538896000.0, "grad_norm": 3.932544798883504, "language_loss": 0.67477876, "learning_rate": 3.739755401854267e-06, "loss": 0.69664401, "num_input_tokens_seen": 67943645, "step": 3148, "time_per_iteration": 2.7273359298706055 }, { "auxiliary_loss_clip": 0.01112331, "auxiliary_loss_mlp": 0.01039139, "balance_loss_clip": 1.04617155, "balance_loss_mlp": 1.02014899, "epoch": 0.18932812265143545, "flos": 22273019251200.0, "grad_norm": 2.9848849244070315, "language_loss": 0.76207471, "learning_rate": 3.739563260095902e-06, "loss": 0.78358936, "num_input_tokens_seen": 67962345, "step": 3149, "time_per_iteration": 2.8031978607177734 }, { "auxiliary_loss_clip": 0.01130375, "auxiliary_loss_mlp": 0.01045773, "balance_loss_clip": 1.05438852, "balance_loss_mlp": 1.02797484, "epoch": 0.1893882459041034, "flos": 18624423456000.0, "grad_norm": 2.3661599820320136, "language_loss": 0.80378366, "learning_rate": 3.7393710523731245e-06, "loss": 0.82554519, "num_input_tokens_seen": 67979760, "step": 3150, "time_per_iteration": 2.7836129665374756 }, { "auxiliary_loss_clip": 0.01137112, "auxiliary_loss_mlp": 0.0104876, "balance_loss_clip": 1.0528239, "balance_loss_mlp": 1.03019929, "epoch": 0.18944836915677138, "flos": 22892981016960.0, "grad_norm": 2.0711129864945956, "language_loss": 0.85251844, "learning_rate": 3.7391787786932215e-06, "loss": 0.87437713, "num_input_tokens_seen": 67996895, "step": 3151, "time_per_iteration": 2.7782201766967773 }, { "auxiliary_loss_clip": 0.01121267, "auxiliary_loss_mlp": 0.01046776, "balance_loss_clip": 1.05223882, "balance_loss_mlp": 1.02839363, "epoch": 0.18950849240943934, "flos": 26796542526720.0, "grad_norm": 2.1337439707996673, "language_loss": 0.74114192, "learning_rate": 3.7389864390634857e-06, "loss": 0.76282233, "num_input_tokens_seen": 68018365, "step": 3152, "time_per_iteration": 2.8767755031585693 }, { "auxiliary_loss_clip": 0.01120312, "auxiliary_loss_mlp": 0.0104438, "balance_loss_clip": 1.05119991, "balance_loss_mlp": 1.02463925, "epoch": 0.1895686156621073, "flos": 24971239048320.0, "grad_norm": 1.9471461777193173, "language_loss": 0.75520492, "learning_rate": 3.738794033491209e-06, "loss": 0.77685189, "num_input_tokens_seen": 68037985, "step": 3153, "time_per_iteration": 2.7722980976104736 }, { "auxiliary_loss_clip": 0.01158287, "auxiliary_loss_mlp": 0.01049678, "balance_loss_clip": 1.0559293, "balance_loss_mlp": 1.03102183, "epoch": 0.1896287389147753, "flos": 21944544353280.0, "grad_norm": 2.099749434473157, "language_loss": 0.79984629, "learning_rate": 3.7386015619836887e-06, "loss": 0.82192594, "num_input_tokens_seen": 68057975, "step": 3154, "time_per_iteration": 2.6530587673187256 }, { "auxiliary_loss_clip": 0.01117992, "auxiliary_loss_mlp": 0.01056707, "balance_loss_clip": 1.04851115, "balance_loss_mlp": 1.03536844, "epoch": 0.18968886216744327, "flos": 18179058723840.0, "grad_norm": 3.210440214164498, "language_loss": 0.73046303, "learning_rate": 3.738409024548223e-06, "loss": 0.75220996, "num_input_tokens_seen": 68074175, "step": 3155, "time_per_iteration": 2.729832410812378 }, { "auxiliary_loss_clip": 0.01126019, "auxiliary_loss_mlp": 0.01045659, "balance_loss_clip": 1.05104291, "balance_loss_mlp": 1.02626419, "epoch": 0.18974898542011123, "flos": 20412487509120.0, "grad_norm": 1.8299076145086866, "language_loss": 0.73869717, "learning_rate": 3.7382164211921136e-06, "loss": 0.76041389, "num_input_tokens_seen": 68095230, "step": 3156, "time_per_iteration": 2.6747231483459473 }, { "auxiliary_loss_clip": 0.01156549, "auxiliary_loss_mlp": 0.0104418, "balance_loss_clip": 1.05489409, "balance_loss_mlp": 1.02645326, "epoch": 0.1898091086727792, "flos": 23985024255360.0, "grad_norm": 1.9629652277148564, "language_loss": 0.68053937, "learning_rate": 3.7380237519226623e-06, "loss": 0.70254672, "num_input_tokens_seen": 68113805, "step": 3157, "time_per_iteration": 2.7092478275299072 }, { "auxiliary_loss_clip": 0.01114914, "auxiliary_loss_mlp": 0.01044181, "balance_loss_clip": 1.04805827, "balance_loss_mlp": 1.02533436, "epoch": 0.18986923192544716, "flos": 27637067756160.0, "grad_norm": 1.7829025355963362, "language_loss": 0.79893303, "learning_rate": 3.737831016747176e-06, "loss": 0.82052404, "num_input_tokens_seen": 68133190, "step": 3158, "time_per_iteration": 2.7921364307403564 }, { "auxiliary_loss_clip": 0.01163231, "auxiliary_loss_mlp": 0.01049502, "balance_loss_clip": 1.05787683, "balance_loss_mlp": 1.02923679, "epoch": 0.18992935517811513, "flos": 25484151306240.0, "grad_norm": 1.856283461980025, "language_loss": 0.72348613, "learning_rate": 3.737638215672964e-06, "loss": 0.74561346, "num_input_tokens_seen": 68152330, "step": 3159, "time_per_iteration": 2.6111273765563965 }, { "auxiliary_loss_clip": 0.01149613, "auxiliary_loss_mlp": 0.01053808, "balance_loss_clip": 1.05840325, "balance_loss_mlp": 1.03386414, "epoch": 0.1899894784307831, "flos": 17420805596160.0, "grad_norm": 2.2573250756933647, "language_loss": 0.84977192, "learning_rate": 3.7374453487073366e-06, "loss": 0.87180614, "num_input_tokens_seen": 68170185, "step": 3160, "time_per_iteration": 2.659259796142578 }, { "auxiliary_loss_clip": 0.01129342, "auxiliary_loss_mlp": 0.01049909, "balance_loss_clip": 1.05297387, "balance_loss_mlp": 1.03289795, "epoch": 0.19004960168345109, "flos": 27492240119040.0, "grad_norm": 2.752358611011079, "language_loss": 0.73407793, "learning_rate": 3.7372524158576074e-06, "loss": 0.7558704, "num_input_tokens_seen": 68191665, "step": 3161, "time_per_iteration": 2.784040689468384 }, { "auxiliary_loss_clip": 0.01139858, "auxiliary_loss_mlp": 0.0105519, "balance_loss_clip": 1.05456805, "balance_loss_mlp": 1.03476942, "epoch": 0.19010972493611905, "flos": 38654676385920.0, "grad_norm": 1.6629026055958476, "language_loss": 0.8115741, "learning_rate": 3.7370594171310926e-06, "loss": 0.83352458, "num_input_tokens_seen": 68214635, "step": 3162, "time_per_iteration": 2.9375386238098145 }, { "auxiliary_loss_clip": 0.01157449, "auxiliary_loss_mlp": 0.01040035, "balance_loss_clip": 1.05625844, "balance_loss_mlp": 1.02062798, "epoch": 0.19016984818878702, "flos": 19244744357760.0, "grad_norm": 2.448016750033594, "language_loss": 0.75615001, "learning_rate": 3.73686635253511e-06, "loss": 0.77812481, "num_input_tokens_seen": 68232150, "step": 3163, "time_per_iteration": 2.7344541549682617 }, { "auxiliary_loss_clip": 0.0110099, "auxiliary_loss_mlp": 0.01050093, "balance_loss_clip": 1.050578, "balance_loss_mlp": 1.02880192, "epoch": 0.19022997144145498, "flos": 37596891744000.0, "grad_norm": 2.2644227245470514, "language_loss": 0.74093997, "learning_rate": 3.736673222076982e-06, "loss": 0.76245081, "num_input_tokens_seen": 68253370, "step": 3164, "time_per_iteration": 2.9165730476379395 }, { "auxiliary_loss_clip": 0.01141317, "auxiliary_loss_mlp": 0.01038043, "balance_loss_clip": 1.05518687, "balance_loss_mlp": 1.0195303, "epoch": 0.19029009469412295, "flos": 61530921665280.0, "grad_norm": 1.5484522746055986, "language_loss": 0.66844344, "learning_rate": 3.7364800257640313e-06, "loss": 0.69023699, "num_input_tokens_seen": 68278895, "step": 3165, "time_per_iteration": 3.006096124649048 }, { "auxiliary_loss_clip": 0.01146225, "auxiliary_loss_mlp": 0.0104856, "balance_loss_clip": 1.05512285, "balance_loss_mlp": 1.02848506, "epoch": 0.1903502179467909, "flos": 13954851480960.0, "grad_norm": 2.8598536292657144, "language_loss": 0.74239767, "learning_rate": 3.7362867636035835e-06, "loss": 0.76434553, "num_input_tokens_seen": 68294880, "step": 3166, "time_per_iteration": 2.678844928741455 }, { "auxiliary_loss_clip": 0.01050093, "auxiliary_loss_mlp": 0.01014959, "balance_loss_clip": 1.04342103, "balance_loss_mlp": 1.01201403, "epoch": 0.1904103411994589, "flos": 66899641916160.0, "grad_norm": 0.7754190343967906, "language_loss": 0.50311053, "learning_rate": 3.736093435602968e-06, "loss": 0.52376103, "num_input_tokens_seen": 68359665, "step": 3167, "time_per_iteration": 3.277529239654541 }, { "auxiliary_loss_clip": 0.01138483, "auxiliary_loss_mlp": 0.01051348, "balance_loss_clip": 1.05485487, "balance_loss_mlp": 1.03293037, "epoch": 0.19047046445212687, "flos": 21908741472000.0, "grad_norm": 2.3487387451986192, "language_loss": 0.74504036, "learning_rate": 3.7359000417695156e-06, "loss": 0.76693863, "num_input_tokens_seen": 68378950, "step": 3168, "time_per_iteration": 2.690995216369629 }, { "auxiliary_loss_clip": 0.01040165, "auxiliary_loss_mlp": 0.01023518, "balance_loss_clip": 1.03869283, "balance_loss_mlp": 1.02085996, "epoch": 0.19053058770479483, "flos": 59255156701440.0, "grad_norm": 0.8605055473788603, "language_loss": 0.60079956, "learning_rate": 3.73570658211056e-06, "loss": 0.62143636, "num_input_tokens_seen": 68434235, "step": 3169, "time_per_iteration": 3.2108101844787598 }, { "auxiliary_loss_clip": 0.01103792, "auxiliary_loss_mlp": 0.01056606, "balance_loss_clip": 1.05267787, "balance_loss_mlp": 1.03741288, "epoch": 0.1905907109574628, "flos": 23951304362880.0, "grad_norm": 1.5575975614891868, "language_loss": 0.78179795, "learning_rate": 3.735513056633436e-06, "loss": 0.80340189, "num_input_tokens_seen": 68453830, "step": 3170, "time_per_iteration": 2.832043409347534 }, { "auxiliary_loss_clip": 0.01142047, "auxiliary_loss_mlp": 0.01045041, "balance_loss_clip": 1.05325115, "balance_loss_mlp": 1.02605128, "epoch": 0.19065083421013077, "flos": 20812316774400.0, "grad_norm": 1.7671932984988854, "language_loss": 0.78177166, "learning_rate": 3.7353194653454834e-06, "loss": 0.80364257, "num_input_tokens_seen": 68473005, "step": 3171, "time_per_iteration": 2.7823612689971924 }, { "auxiliary_loss_clip": 0.01158227, "auxiliary_loss_mlp": 0.01047345, "balance_loss_clip": 1.05499291, "balance_loss_mlp": 1.0285697, "epoch": 0.19071095746279873, "flos": 31284981192960.0, "grad_norm": 2.1976685633770905, "language_loss": 0.77953529, "learning_rate": 3.7351258082540426e-06, "loss": 0.80159104, "num_input_tokens_seen": 68493470, "step": 3172, "time_per_iteration": 2.746279001235962 }, { "auxiliary_loss_clip": 0.01145112, "auxiliary_loss_mlp": 0.01055334, "balance_loss_clip": 1.05438328, "balance_loss_mlp": 1.03703523, "epoch": 0.1907710807154667, "flos": 14356117290240.0, "grad_norm": 1.5258786569967644, "language_loss": 0.80223799, "learning_rate": 3.7349320853664576e-06, "loss": 0.82424247, "num_input_tokens_seen": 68511290, "step": 3173, "time_per_iteration": 2.7396810054779053 }, { "auxiliary_loss_clip": 0.01113266, "auxiliary_loss_mlp": 0.00778142, "balance_loss_clip": 1.04967713, "balance_loss_mlp": 1.00094676, "epoch": 0.1908312039681347, "flos": 26907039740160.0, "grad_norm": 1.5341307852526682, "language_loss": 0.78495061, "learning_rate": 3.7347382966900735e-06, "loss": 0.80386466, "num_input_tokens_seen": 68532575, "step": 3174, "time_per_iteration": 2.8579304218292236 }, { "auxiliary_loss_clip": 0.01106714, "auxiliary_loss_mlp": 0.01047557, "balance_loss_clip": 1.04928994, "balance_loss_mlp": 1.02838778, "epoch": 0.19089132722080265, "flos": 14494695960960.0, "grad_norm": 1.8075853216546063, "language_loss": 0.81067109, "learning_rate": 3.7345444422322395e-06, "loss": 0.83221382, "num_input_tokens_seen": 68548760, "step": 3175, "time_per_iteration": 2.718254804611206 }, { "auxiliary_loss_clip": 0.01080497, "auxiliary_loss_mlp": 0.01053652, "balance_loss_clip": 1.04361629, "balance_loss_mlp": 1.0342685, "epoch": 0.19095145047347062, "flos": 13952876232960.0, "grad_norm": 2.2545261224105873, "language_loss": 0.85529047, "learning_rate": 3.7343505220003067e-06, "loss": 0.87663192, "num_input_tokens_seen": 68563100, "step": 3176, "time_per_iteration": 4.2962729930877686 }, { "auxiliary_loss_clip": 0.0113361, "auxiliary_loss_mlp": 0.01059849, "balance_loss_clip": 1.05418086, "balance_loss_mlp": 1.03928506, "epoch": 0.19101157372613858, "flos": 25301832848640.0, "grad_norm": 2.0896270593066832, "language_loss": 0.813025, "learning_rate": 3.7341565360016285e-06, "loss": 0.83495957, "num_input_tokens_seen": 68581650, "step": 3177, "time_per_iteration": 2.815127372741699 }, { "auxiliary_loss_clip": 0.01122377, "auxiliary_loss_mlp": 0.01044946, "balance_loss_clip": 1.0482533, "balance_loss_mlp": 1.0265398, "epoch": 0.19107169697880655, "flos": 20558212986240.0, "grad_norm": 2.67963335978105, "language_loss": 0.7530241, "learning_rate": 3.73396248424356e-06, "loss": 0.7746973, "num_input_tokens_seen": 68600360, "step": 3178, "time_per_iteration": 4.351228475570679 }, { "auxiliary_loss_clip": 0.01146729, "auxiliary_loss_mlp": 0.01042476, "balance_loss_clip": 1.05574143, "balance_loss_mlp": 1.02458286, "epoch": 0.19113182023147451, "flos": 22163204396160.0, "grad_norm": 4.753014277211421, "language_loss": 0.81381619, "learning_rate": 3.7337683667334606e-06, "loss": 0.83570826, "num_input_tokens_seen": 68617885, "step": 3179, "time_per_iteration": 4.259284019470215 }, { "auxiliary_loss_clip": 0.01147837, "auxiliary_loss_mlp": 0.01048144, "balance_loss_clip": 1.05645823, "balance_loss_mlp": 1.0291661, "epoch": 0.19119194348414248, "flos": 18581796990720.0, "grad_norm": 2.753081884541086, "language_loss": 0.79384613, "learning_rate": 3.733574183478691e-06, "loss": 0.81580591, "num_input_tokens_seen": 68634550, "step": 3180, "time_per_iteration": 2.6609203815460205 }, { "auxiliary_loss_clip": 0.01129361, "auxiliary_loss_mlp": 0.0105402, "balance_loss_clip": 1.05249727, "balance_loss_mlp": 1.03445804, "epoch": 0.19125206673681047, "flos": 19026623018880.0, "grad_norm": 2.660238694189741, "language_loss": 0.79517245, "learning_rate": 3.733379934486615e-06, "loss": 0.81700623, "num_input_tokens_seen": 68651895, "step": 3181, "time_per_iteration": 2.6877176761627197 }, { "auxiliary_loss_clip": 0.0114301, "auxiliary_loss_mlp": 0.01053621, "balance_loss_clip": 1.05339336, "balance_loss_mlp": 1.03527462, "epoch": 0.19131218998947844, "flos": 21690153256320.0, "grad_norm": 2.2179888965480243, "language_loss": 0.74570775, "learning_rate": 3.7331856197645973e-06, "loss": 0.76767409, "num_input_tokens_seen": 68671500, "step": 3182, "time_per_iteration": 4.2829508781433105 }, { "auxiliary_loss_clip": 0.01128679, "auxiliary_loss_mlp": 0.01044063, "balance_loss_clip": 1.05578041, "balance_loss_mlp": 1.02575254, "epoch": 0.1913723132421464, "flos": 18442500048000.0, "grad_norm": 1.7534728284311585, "language_loss": 0.64618582, "learning_rate": 3.7329912393200084e-06, "loss": 0.66791326, "num_input_tokens_seen": 68690570, "step": 3183, "time_per_iteration": 2.7652854919433594 }, { "auxiliary_loss_clip": 0.01132257, "auxiliary_loss_mlp": 0.01050867, "balance_loss_clip": 1.0512805, "balance_loss_mlp": 1.0311259, "epoch": 0.19143243649481437, "flos": 27160102033920.0, "grad_norm": 1.555926798692704, "language_loss": 0.73459226, "learning_rate": 3.7327967931602173e-06, "loss": 0.75642347, "num_input_tokens_seen": 68709735, "step": 3184, "time_per_iteration": 2.6929056644439697 }, { "auxiliary_loss_clip": 0.01122578, "auxiliary_loss_mlp": 0.01054123, "balance_loss_clip": 1.05015373, "balance_loss_mlp": 1.03347623, "epoch": 0.19149255974748233, "flos": 21718952985600.0, "grad_norm": 2.0989643169058514, "language_loss": 0.87983418, "learning_rate": 3.732602281292598e-06, "loss": 0.9016012, "num_input_tokens_seen": 68727565, "step": 3185, "time_per_iteration": 2.6859230995178223 }, { "auxiliary_loss_clip": 0.01153787, "auxiliary_loss_mlp": 0.01044436, "balance_loss_clip": 1.05334914, "balance_loss_mlp": 1.02505302, "epoch": 0.1915526830001503, "flos": 22963293889920.0, "grad_norm": 2.4520480945942587, "language_loss": 0.73240852, "learning_rate": 3.7324077037245267e-06, "loss": 0.75439072, "num_input_tokens_seen": 68748110, "step": 3186, "time_per_iteration": 2.6398978233337402 }, { "auxiliary_loss_clip": 0.01132874, "auxiliary_loss_mlp": 0.01044989, "balance_loss_clip": 1.05609488, "balance_loss_mlp": 1.02379346, "epoch": 0.1916128062528183, "flos": 26140741966080.0, "grad_norm": 2.739457234253781, "language_loss": 0.83550584, "learning_rate": 3.7322130604633825e-06, "loss": 0.85728443, "num_input_tokens_seen": 68769765, "step": 3187, "time_per_iteration": 2.7476372718811035 }, { "auxiliary_loss_clip": 0.01076264, "auxiliary_loss_mlp": 0.01021317, "balance_loss_clip": 1.04604995, "balance_loss_mlp": 1.01892138, "epoch": 0.19167292950548626, "flos": 54925767457920.0, "grad_norm": 0.8659386797819415, "language_loss": 0.55824959, "learning_rate": 3.732018351516544e-06, "loss": 0.57922542, "num_input_tokens_seen": 68826815, "step": 3188, "time_per_iteration": 3.2144031524658203 }, { "auxiliary_loss_clip": 0.01139007, "auxiliary_loss_mlp": 0.01054399, "balance_loss_clip": 1.054564, "balance_loss_mlp": 1.03537333, "epoch": 0.19173305275815422, "flos": 29935601942400.0, "grad_norm": 2.2897904709915573, "language_loss": 0.69839454, "learning_rate": 3.731823576891397e-06, "loss": 0.72032857, "num_input_tokens_seen": 68847585, "step": 3189, "time_per_iteration": 2.7998950481414795 }, { "auxiliary_loss_clip": 0.01118438, "auxiliary_loss_mlp": 0.01038566, "balance_loss_clip": 1.04930174, "balance_loss_mlp": 1.02116132, "epoch": 0.1917931760108222, "flos": 24752471264640.0, "grad_norm": 2.362312815249866, "language_loss": 0.74320328, "learning_rate": 3.7316287365953266e-06, "loss": 0.76477331, "num_input_tokens_seen": 68866620, "step": 3190, "time_per_iteration": 2.7386670112609863 }, { "auxiliary_loss_clip": 0.01111071, "auxiliary_loss_mlp": 0.0106718, "balance_loss_clip": 1.04946983, "balance_loss_mlp": 1.04702199, "epoch": 0.19185329926349015, "flos": 18843550375680.0, "grad_norm": 3.545467698458187, "language_loss": 0.8444041, "learning_rate": 3.73143383063572e-06, "loss": 0.8661865, "num_input_tokens_seen": 68885515, "step": 3191, "time_per_iteration": 2.7025794982910156 }, { "auxiliary_loss_clip": 0.01127894, "auxiliary_loss_mlp": 0.01039849, "balance_loss_clip": 1.05251908, "balance_loss_mlp": 1.02231336, "epoch": 0.19191342251615812, "flos": 22086858038400.0, "grad_norm": 2.0663841109071526, "language_loss": 0.89985192, "learning_rate": 3.73123885901997e-06, "loss": 0.92152941, "num_input_tokens_seen": 68903225, "step": 3192, "time_per_iteration": 2.802852153778076 }, { "auxiliary_loss_clip": 0.01130336, "auxiliary_loss_mlp": 0.01054766, "balance_loss_clip": 1.05716372, "balance_loss_mlp": 1.03509688, "epoch": 0.19197354576882608, "flos": 22199115018240.0, "grad_norm": 2.3467564445058775, "language_loss": 0.75159264, "learning_rate": 3.7310438217554687e-06, "loss": 0.77344358, "num_input_tokens_seen": 68922860, "step": 3193, "time_per_iteration": 2.7680914402008057 }, { "auxiliary_loss_clip": 0.01128303, "auxiliary_loss_mlp": 0.00777332, "balance_loss_clip": 1.05222785, "balance_loss_mlp": 1.00071752, "epoch": 0.19203366902149407, "flos": 24896185580160.0, "grad_norm": 2.078743387775855, "language_loss": 0.75189757, "learning_rate": 3.730848718849612e-06, "loss": 0.77095383, "num_input_tokens_seen": 68943000, "step": 3194, "time_per_iteration": 2.7537553310394287 }, { "auxiliary_loss_clip": 0.01068142, "auxiliary_loss_mlp": 0.01004387, "balance_loss_clip": 1.03910232, "balance_loss_mlp": 1.00182378, "epoch": 0.19209379227416204, "flos": 68416722789120.0, "grad_norm": 0.7955224937316553, "language_loss": 0.68507159, "learning_rate": 3.7306535503097985e-06, "loss": 0.70579696, "num_input_tokens_seen": 69000255, "step": 3195, "time_per_iteration": 3.117191791534424 }, { "auxiliary_loss_clip": 0.01116081, "auxiliary_loss_mlp": 0.01052392, "balance_loss_clip": 1.05205238, "balance_loss_mlp": 1.0320189, "epoch": 0.19215391552683, "flos": 22055185221120.0, "grad_norm": 2.6559439291645757, "language_loss": 0.73141015, "learning_rate": 3.730458316143429e-06, "loss": 0.75309479, "num_input_tokens_seen": 69019665, "step": 3196, "time_per_iteration": 2.7234303951263428 }, { "auxiliary_loss_clip": 0.01139018, "auxiliary_loss_mlp": 0.01044947, "balance_loss_clip": 1.06151462, "balance_loss_mlp": 1.02596927, "epoch": 0.19221403877949797, "flos": 20302959962880.0, "grad_norm": 3.0997718824135734, "language_loss": 0.83654135, "learning_rate": 3.7302630163579068e-06, "loss": 0.85838103, "num_input_tokens_seen": 69039055, "step": 3197, "time_per_iteration": 2.72575306892395 }, { "auxiliary_loss_clip": 0.01086216, "auxiliary_loss_mlp": 0.01055059, "balance_loss_clip": 1.04615641, "balance_loss_mlp": 1.03320754, "epoch": 0.19227416203216594, "flos": 23185329811200.0, "grad_norm": 2.2465298420006383, "language_loss": 0.80656433, "learning_rate": 3.7300676509606373e-06, "loss": 0.82797706, "num_input_tokens_seen": 69056370, "step": 3198, "time_per_iteration": 2.741678237915039 }, { "auxiliary_loss_clip": 0.01135487, "auxiliary_loss_mlp": 0.01056572, "balance_loss_clip": 1.05502987, "balance_loss_mlp": 1.03655636, "epoch": 0.1923342852848339, "flos": 25776607841280.0, "grad_norm": 1.9205907836873994, "language_loss": 0.78993976, "learning_rate": 3.729872219959029e-06, "loss": 0.81186032, "num_input_tokens_seen": 69075915, "step": 3199, "time_per_iteration": 2.7821297645568848 }, { "auxiliary_loss_clip": 0.01116808, "auxiliary_loss_mlp": 0.01056964, "balance_loss_clip": 1.05010581, "balance_loss_mlp": 1.036412, "epoch": 0.19239440853750187, "flos": 17128349061120.0, "grad_norm": 3.662083840248298, "language_loss": 0.83574522, "learning_rate": 3.7296767233604934e-06, "loss": 0.85748297, "num_input_tokens_seen": 69094145, "step": 3200, "time_per_iteration": 2.7095022201538086 }, { "auxiliary_loss_clip": 0.01159025, "auxiliary_loss_mlp": 0.01048823, "balance_loss_clip": 1.05997193, "balance_loss_mlp": 1.03060746, "epoch": 0.19245453179016986, "flos": 16435093593600.0, "grad_norm": 1.9278966392289572, "language_loss": 0.79092836, "learning_rate": 3.729481161172443e-06, "loss": 0.81300688, "num_input_tokens_seen": 69111110, "step": 3201, "time_per_iteration": 2.684979200363159 }, { "auxiliary_loss_clip": 0.01103349, "auxiliary_loss_mlp": 0.01053366, "balance_loss_clip": 1.04825675, "balance_loss_mlp": 1.03418541, "epoch": 0.19251465504283782, "flos": 20230276792320.0, "grad_norm": 2.4062417134527645, "language_loss": 0.69276404, "learning_rate": 3.7292855334022927e-06, "loss": 0.71433127, "num_input_tokens_seen": 69130280, "step": 3202, "time_per_iteration": 2.8284943103790283 }, { "auxiliary_loss_clip": 0.01132334, "auxiliary_loss_mlp": 0.01041011, "balance_loss_clip": 1.05389905, "balance_loss_mlp": 1.02256894, "epoch": 0.1925747782955058, "flos": 19464374067840.0, "grad_norm": 1.9491265782204168, "language_loss": 0.91396749, "learning_rate": 3.7290898400574627e-06, "loss": 0.93570089, "num_input_tokens_seen": 69149570, "step": 3203, "time_per_iteration": 2.802433729171753 }, { "auxiliary_loss_clip": 0.0114953, "auxiliary_loss_mlp": 0.01049732, "balance_loss_clip": 1.05674863, "balance_loss_mlp": 1.02959776, "epoch": 0.19263490154817375, "flos": 17785586165760.0, "grad_norm": 5.05881669068558, "language_loss": 0.81689429, "learning_rate": 3.7288940811453725e-06, "loss": 0.83888692, "num_input_tokens_seen": 69168190, "step": 3204, "time_per_iteration": 2.671285629272461 }, { "auxiliary_loss_clip": 0.01116988, "auxiliary_loss_mlp": 0.01048941, "balance_loss_clip": 1.04950142, "balance_loss_mlp": 1.0298202, "epoch": 0.19269502480084172, "flos": 17457075354240.0, "grad_norm": 2.296941025186916, "language_loss": 0.76167846, "learning_rate": 3.7286982566734454e-06, "loss": 0.78333771, "num_input_tokens_seen": 69186950, "step": 3205, "time_per_iteration": 2.8654470443725586 }, { "auxiliary_loss_clip": 0.01140852, "auxiliary_loss_mlp": 0.01046651, "balance_loss_clip": 1.05839586, "balance_loss_mlp": 1.02749407, "epoch": 0.19275514805350968, "flos": 21506901045120.0, "grad_norm": 3.761768843322395, "language_loss": 0.83394569, "learning_rate": 3.728502366649107e-06, "loss": 0.85582072, "num_input_tokens_seen": 69204850, "step": 3206, "time_per_iteration": 2.8610613346099854 }, { "auxiliary_loss_clip": 0.0105715, "auxiliary_loss_mlp": 0.01004055, "balance_loss_clip": 1.03779244, "balance_loss_mlp": 1.00174224, "epoch": 0.19281527130617768, "flos": 47695979738880.0, "grad_norm": 0.8644529519848262, "language_loss": 0.60561717, "learning_rate": 3.728306411079786e-06, "loss": 0.62622917, "num_input_tokens_seen": 69259200, "step": 3207, "time_per_iteration": 3.126537322998047 }, { "auxiliary_loss_clip": 0.01120285, "auxiliary_loss_mlp": 0.01045527, "balance_loss_clip": 1.05201781, "balance_loss_mlp": 1.02678764, "epoch": 0.19287539455884564, "flos": 11801252672640.0, "grad_norm": 2.296187182186814, "language_loss": 0.75463599, "learning_rate": 3.7281103899729125e-06, "loss": 0.77629405, "num_input_tokens_seen": 69275835, "step": 3208, "time_per_iteration": 2.6978750228881836 }, { "auxiliary_loss_clip": 0.01150534, "auxiliary_loss_mlp": 0.00777875, "balance_loss_clip": 1.05520236, "balance_loss_mlp": 1.00063884, "epoch": 0.1929355178115136, "flos": 20631434860800.0, "grad_norm": 1.9483983315924505, "language_loss": 0.60869855, "learning_rate": 3.7279143033359195e-06, "loss": 0.62798262, "num_input_tokens_seen": 69294810, "step": 3209, "time_per_iteration": 2.699798107147217 }, { "auxiliary_loss_clip": 0.01158758, "auxiliary_loss_mlp": 0.01053815, "balance_loss_clip": 1.05472994, "balance_loss_mlp": 1.03261995, "epoch": 0.19299564106418157, "flos": 40807916058240.0, "grad_norm": 1.9992177661428934, "language_loss": 0.80025005, "learning_rate": 3.727718151176243e-06, "loss": 0.82237577, "num_input_tokens_seen": 69316065, "step": 3210, "time_per_iteration": 2.832665205001831 }, { "auxiliary_loss_clip": 0.01118997, "auxiliary_loss_mlp": 0.01047494, "balance_loss_clip": 1.05044246, "balance_loss_mlp": 1.02920699, "epoch": 0.19305576431684954, "flos": 11361418634880.0, "grad_norm": 2.515510367397107, "language_loss": 0.82571948, "learning_rate": 3.7275219335013217e-06, "loss": 0.84738445, "num_input_tokens_seen": 69332900, "step": 3211, "time_per_iteration": 2.7664191722869873 }, { "auxiliary_loss_clip": 0.01073663, "auxiliary_loss_mlp": 0.01002544, "balance_loss_clip": 1.03501034, "balance_loss_mlp": 1.00021982, "epoch": 0.1931158875695175, "flos": 54511895975040.0, "grad_norm": 0.9633495631759209, "language_loss": 0.63641912, "learning_rate": 3.7273256503185953e-06, "loss": 0.6571812, "num_input_tokens_seen": 69382535, "step": 3212, "time_per_iteration": 2.974940299987793 }, { "auxiliary_loss_clip": 0.01131742, "auxiliary_loss_mlp": 0.01044059, "balance_loss_clip": 1.05586314, "balance_loss_mlp": 1.02565336, "epoch": 0.19317601082218547, "flos": 19828436365440.0, "grad_norm": 1.7209148950717332, "language_loss": 0.76375663, "learning_rate": 3.7271293016355074e-06, "loss": 0.78551459, "num_input_tokens_seen": 69400600, "step": 3213, "time_per_iteration": 2.7898454666137695 }, { "auxiliary_loss_clip": 0.01123196, "auxiliary_loss_mlp": 0.0105066, "balance_loss_clip": 1.05261111, "balance_loss_mlp": 1.03116894, "epoch": 0.19323613407485346, "flos": 13152068467200.0, "grad_norm": 2.349758973823363, "language_loss": 0.70871878, "learning_rate": 3.726932887459503e-06, "loss": 0.73045731, "num_input_tokens_seen": 69417350, "step": 3214, "time_per_iteration": 2.8155152797698975 }, { "auxiliary_loss_clip": 0.01155585, "auxiliary_loss_mlp": 0.01047831, "balance_loss_clip": 1.05412841, "balance_loss_mlp": 1.02807808, "epoch": 0.19329625732752143, "flos": 14027247342720.0, "grad_norm": 2.190607045917922, "language_loss": 0.75067955, "learning_rate": 3.72673640779803e-06, "loss": 0.77271378, "num_input_tokens_seen": 69431845, "step": 3215, "time_per_iteration": 4.111938238143921 }, { "auxiliary_loss_clip": 0.01112217, "auxiliary_loss_mlp": 0.01049964, "balance_loss_clip": 1.04928339, "balance_loss_mlp": 1.0323447, "epoch": 0.1933563805801894, "flos": 23441732069760.0, "grad_norm": 1.7842520268521305, "language_loss": 0.88426638, "learning_rate": 3.72653986265854e-06, "loss": 0.9058882, "num_input_tokens_seen": 69453275, "step": 3216, "time_per_iteration": 2.7699615955352783 }, { "auxiliary_loss_clip": 0.01153806, "auxiliary_loss_mlp": 0.01052131, "balance_loss_clip": 1.05435801, "balance_loss_mlp": 1.03442836, "epoch": 0.19341650383285736, "flos": 20485314334080.0, "grad_norm": 1.6996051239972392, "language_loss": 0.7974773, "learning_rate": 3.726343252048485e-06, "loss": 0.81953669, "num_input_tokens_seen": 69471830, "step": 3217, "time_per_iteration": 2.6788718700408936 }, { "auxiliary_loss_clip": 0.01143281, "auxiliary_loss_mlp": 0.0104914, "balance_loss_clip": 1.05695105, "balance_loss_mlp": 1.02864754, "epoch": 0.19347662708552532, "flos": 17858484817920.0, "grad_norm": 4.708784796317305, "language_loss": 0.6161437, "learning_rate": 3.7261465759753206e-06, "loss": 0.6380679, "num_input_tokens_seen": 69489320, "step": 3218, "time_per_iteration": 4.352849960327148 }, { "auxiliary_loss_clip": 0.01157355, "auxiliary_loss_mlp": 0.01047211, "balance_loss_clip": 1.05723107, "balance_loss_mlp": 1.02873373, "epoch": 0.1935367503381933, "flos": 18187247024640.0, "grad_norm": 1.9724785552136583, "language_loss": 0.80345452, "learning_rate": 3.7259498344465053e-06, "loss": 0.82550013, "num_input_tokens_seen": 69506665, "step": 3219, "time_per_iteration": 4.1739161014556885 }, { "auxiliary_loss_clip": 0.01104687, "auxiliary_loss_mlp": 0.01047672, "balance_loss_clip": 1.05145359, "balance_loss_mlp": 1.02819324, "epoch": 0.19359687359086128, "flos": 15957122290560.0, "grad_norm": 2.7508533279024077, "language_loss": 0.85693008, "learning_rate": 3.7257530274694993e-06, "loss": 0.87845367, "num_input_tokens_seen": 69523835, "step": 3220, "time_per_iteration": 2.777284622192383 }, { "auxiliary_loss_clip": 0.01149581, "auxiliary_loss_mlp": 0.01041747, "balance_loss_clip": 1.05441856, "balance_loss_mlp": 1.02511764, "epoch": 0.19365699684352924, "flos": 21215198695680.0, "grad_norm": 2.05545450883527, "language_loss": 0.84637755, "learning_rate": 3.725556155051766e-06, "loss": 0.86829084, "num_input_tokens_seen": 69542620, "step": 3221, "time_per_iteration": 4.224115371704102 }, { "auxiliary_loss_clip": 0.01143661, "auxiliary_loss_mlp": 0.01044558, "balance_loss_clip": 1.05466259, "balance_loss_mlp": 1.02730846, "epoch": 0.1937171200961972, "flos": 17311098481920.0, "grad_norm": 2.658004231066563, "language_loss": 0.86087942, "learning_rate": 3.7253592172007702e-06, "loss": 0.8827616, "num_input_tokens_seen": 69561130, "step": 3222, "time_per_iteration": 2.6400530338287354 }, { "auxiliary_loss_clip": 0.01069453, "auxiliary_loss_mlp": 0.01045281, "balance_loss_clip": 1.04206085, "balance_loss_mlp": 1.02599275, "epoch": 0.19377724334886517, "flos": 22635968227200.0, "grad_norm": 1.8604116943694204, "language_loss": 0.78510809, "learning_rate": 3.72516221392398e-06, "loss": 0.8062554, "num_input_tokens_seen": 69580425, "step": 3223, "time_per_iteration": 2.9685652256011963 }, { "auxiliary_loss_clip": 0.01146062, "auxiliary_loss_mlp": 0.01046815, "balance_loss_clip": 1.05697751, "balance_loss_mlp": 1.02819431, "epoch": 0.19383736660153314, "flos": 15077813351040.0, "grad_norm": 1.8958208586464897, "language_loss": 0.75391948, "learning_rate": 3.7249651452288653e-06, "loss": 0.77584827, "num_input_tokens_seen": 69597085, "step": 3224, "time_per_iteration": 2.665294885635376 }, { "auxiliary_loss_clip": 0.01102293, "auxiliary_loss_mlp": 0.01050181, "balance_loss_clip": 1.04728186, "balance_loss_mlp": 1.02927208, "epoch": 0.1938974898542011, "flos": 47119934350080.0, "grad_norm": 3.358076005999295, "language_loss": 0.71180636, "learning_rate": 3.7247680111229e-06, "loss": 0.73333108, "num_input_tokens_seen": 69618885, "step": 3225, "time_per_iteration": 2.997511863708496 }, { "auxiliary_loss_clip": 0.0112035, "auxiliary_loss_mlp": 0.01053167, "balance_loss_clip": 1.0519309, "balance_loss_mlp": 1.03480864, "epoch": 0.19395761310686907, "flos": 25812554376960.0, "grad_norm": 2.42331686427639, "language_loss": 0.69379079, "learning_rate": 3.7245708116135585e-06, "loss": 0.71552593, "num_input_tokens_seen": 69638200, "step": 3226, "time_per_iteration": 2.746338129043579 }, { "auxiliary_loss_clip": 0.01126783, "auxiliary_loss_mlp": 0.01042276, "balance_loss_clip": 1.05692983, "balance_loss_mlp": 1.02264214, "epoch": 0.19401773635953706, "flos": 23039604334080.0, "grad_norm": 2.1006513764454864, "language_loss": 0.76236808, "learning_rate": 3.7243735467083193e-06, "loss": 0.78405869, "num_input_tokens_seen": 69657550, "step": 3227, "time_per_iteration": 2.760087728500366 }, { "auxiliary_loss_clip": 0.01117794, "auxiliary_loss_mlp": 0.010438, "balance_loss_clip": 1.05304587, "balance_loss_mlp": 1.0256561, "epoch": 0.19407785961220503, "flos": 15920780705280.0, "grad_norm": 2.8268368707906397, "language_loss": 0.69577461, "learning_rate": 3.724176216414662e-06, "loss": 0.71739054, "num_input_tokens_seen": 69675005, "step": 3228, "time_per_iteration": 2.6779348850250244 }, { "auxiliary_loss_clip": 0.01148199, "auxiliary_loss_mlp": 0.01042315, "balance_loss_clip": 1.05775642, "balance_loss_mlp": 1.02445757, "epoch": 0.194137982864873, "flos": 25921722787200.0, "grad_norm": 1.7694943420266864, "language_loss": 0.74160898, "learning_rate": 3.72397882074007e-06, "loss": 0.76351416, "num_input_tokens_seen": 69696455, "step": 3229, "time_per_iteration": 2.7229623794555664 }, { "auxiliary_loss_clip": 0.01119678, "auxiliary_loss_mlp": 0.01044155, "balance_loss_clip": 1.05435359, "balance_loss_mlp": 1.0262022, "epoch": 0.19419810611754096, "flos": 13261344618240.0, "grad_norm": 1.9766126324167548, "language_loss": 0.65722096, "learning_rate": 3.7237813596920285e-06, "loss": 0.67885935, "num_input_tokens_seen": 69714245, "step": 3230, "time_per_iteration": 2.740324020385742 }, { "auxiliary_loss_clip": 0.01124671, "auxiliary_loss_mlp": 0.00776003, "balance_loss_clip": 1.05223823, "balance_loss_mlp": 1.00081468, "epoch": 0.19425822937020892, "flos": 15705568368000.0, "grad_norm": 1.9307338208311895, "language_loss": 0.82042694, "learning_rate": 3.7235838332780254e-06, "loss": 0.83943367, "num_input_tokens_seen": 69731515, "step": 3231, "time_per_iteration": 2.7453513145446777 }, { "auxiliary_loss_clip": 0.0113141, "auxiliary_loss_mlp": 0.01042332, "balance_loss_clip": 1.05393946, "balance_loss_mlp": 1.02220988, "epoch": 0.1943183526228769, "flos": 23105392093440.0, "grad_norm": 10.866686758212083, "language_loss": 0.87038374, "learning_rate": 3.72338624150555e-06, "loss": 0.89212114, "num_input_tokens_seen": 69748885, "step": 3232, "time_per_iteration": 2.7575178146362305 }, { "auxiliary_loss_clip": 0.01100451, "auxiliary_loss_mlp": 0.01050878, "balance_loss_clip": 1.05029583, "balance_loss_mlp": 1.03102958, "epoch": 0.19437847587554485, "flos": 24712610146560.0, "grad_norm": 2.531838729905544, "language_loss": 0.85189134, "learning_rate": 3.723188584382096e-06, "loss": 0.87340462, "num_input_tokens_seen": 69767540, "step": 3233, "time_per_iteration": 2.8617444038391113 }, { "auxiliary_loss_clip": 0.01149478, "auxiliary_loss_mlp": 0.01054519, "balance_loss_clip": 1.0574832, "balance_loss_mlp": 1.0357672, "epoch": 0.19443859912821285, "flos": 23116130259840.0, "grad_norm": 1.7408859410354203, "language_loss": 0.89099532, "learning_rate": 3.722990861915158e-06, "loss": 0.91303527, "num_input_tokens_seen": 69789340, "step": 3234, "time_per_iteration": 2.7648239135742188 }, { "auxiliary_loss_clip": 0.01135157, "auxiliary_loss_mlp": 0.01044708, "balance_loss_clip": 1.05003643, "balance_loss_mlp": 1.02544403, "epoch": 0.1944987223808808, "flos": 15084385539840.0, "grad_norm": 2.4074482975555926, "language_loss": 0.78673434, "learning_rate": 3.722793074112234e-06, "loss": 0.80853301, "num_input_tokens_seen": 69806470, "step": 3235, "time_per_iteration": 2.76930832862854 }, { "auxiliary_loss_clip": 0.01136497, "auxiliary_loss_mlp": 0.01046749, "balance_loss_clip": 1.0580672, "balance_loss_mlp": 1.0293448, "epoch": 0.19455884563354878, "flos": 17126876603520.0, "grad_norm": 2.2511193258734354, "language_loss": 0.79391634, "learning_rate": 3.7225952209808233e-06, "loss": 0.81574875, "num_input_tokens_seen": 69822655, "step": 3236, "time_per_iteration": 2.7060179710388184 }, { "auxiliary_loss_clip": 0.01156991, "auxiliary_loss_mlp": 0.01044638, "balance_loss_clip": 1.05862045, "balance_loss_mlp": 1.02482522, "epoch": 0.19461896888621674, "flos": 20193396503040.0, "grad_norm": 2.1553329609131713, "language_loss": 0.76224017, "learning_rate": 3.72239730252843e-06, "loss": 0.78425646, "num_input_tokens_seen": 69841895, "step": 3237, "time_per_iteration": 2.642235040664673 }, { "auxiliary_loss_clip": 0.01158804, "auxiliary_loss_mlp": 0.01051059, "balance_loss_clip": 1.05648041, "balance_loss_mlp": 1.03289127, "epoch": 0.1946790921388847, "flos": 25301365971840.0, "grad_norm": 1.5204653275468003, "language_loss": 0.74828202, "learning_rate": 3.7221993187625583e-06, "loss": 0.77038062, "num_input_tokens_seen": 69862220, "step": 3238, "time_per_iteration": 2.6618688106536865 }, { "auxiliary_loss_clip": 0.01108331, "auxiliary_loss_mlp": 0.01046572, "balance_loss_clip": 1.04992437, "balance_loss_mlp": 1.02791595, "epoch": 0.19473921539155267, "flos": 20193396503040.0, "grad_norm": 3.1324225641798518, "language_loss": 0.734164, "learning_rate": 3.7220012696907155e-06, "loss": 0.75571299, "num_input_tokens_seen": 69881830, "step": 3239, "time_per_iteration": 2.7637152671813965 }, { "auxiliary_loss_clip": 0.01132567, "auxiliary_loss_mlp": 0.01047988, "balance_loss_clip": 1.05458641, "balance_loss_mlp": 1.02947509, "epoch": 0.19479933864422067, "flos": 20887549810560.0, "grad_norm": 2.155392951393246, "language_loss": 0.73291272, "learning_rate": 3.721803155320412e-06, "loss": 0.7547183, "num_input_tokens_seen": 69900515, "step": 3240, "time_per_iteration": 2.6980888843536377 }, { "auxiliary_loss_clip": 0.01131601, "auxiliary_loss_mlp": 0.0103943, "balance_loss_clip": 1.05846488, "balance_loss_mlp": 1.02208555, "epoch": 0.19485946189688863, "flos": 23295072839040.0, "grad_norm": 5.847648280625993, "language_loss": 0.65809447, "learning_rate": 3.7216049756591606e-06, "loss": 0.6798048, "num_input_tokens_seen": 69920060, "step": 3241, "time_per_iteration": 2.659707546234131 }, { "auxiliary_loss_clip": 0.01128971, "auxiliary_loss_mlp": 0.01048707, "balance_loss_clip": 1.05226684, "balance_loss_mlp": 1.03039646, "epoch": 0.1949195851495566, "flos": 23295036925440.0, "grad_norm": 1.4408225707306088, "language_loss": 0.82747853, "learning_rate": 3.7214067307144754e-06, "loss": 0.84925532, "num_input_tokens_seen": 69939820, "step": 3242, "time_per_iteration": 2.7137632369995117 }, { "auxiliary_loss_clip": 0.01077632, "auxiliary_loss_mlp": 0.01014225, "balance_loss_clip": 1.04083347, "balance_loss_mlp": 1.01131678, "epoch": 0.19497970840222456, "flos": 64962871557120.0, "grad_norm": 0.853263603243422, "language_loss": 0.57500821, "learning_rate": 3.721208420493875e-06, "loss": 0.59592682, "num_input_tokens_seen": 70002145, "step": 3243, "time_per_iteration": 3.1446309089660645 }, { "auxiliary_loss_clip": 0.01138548, "auxiliary_loss_mlp": 0.01050428, "balance_loss_clip": 1.05331421, "balance_loss_mlp": 1.02988815, "epoch": 0.19503983165489253, "flos": 19644717277440.0, "grad_norm": 7.2345723863132, "language_loss": 0.83789021, "learning_rate": 3.7210100450048784e-06, "loss": 0.85977995, "num_input_tokens_seen": 70020510, "step": 3244, "time_per_iteration": 2.6194229125976562 }, { "auxiliary_loss_clip": 0.01143261, "auxiliary_loss_mlp": 0.01046223, "balance_loss_clip": 1.05732584, "balance_loss_mlp": 1.02869976, "epoch": 0.1950999549075605, "flos": 21141976821120.0, "grad_norm": 2.0710390949438837, "language_loss": 0.7739507, "learning_rate": 3.7208116042550088e-06, "loss": 0.79584551, "num_input_tokens_seen": 70040760, "step": 3245, "time_per_iteration": 2.6684374809265137 }, { "auxiliary_loss_clip": 0.01142874, "auxiliary_loss_mlp": 0.01043114, "balance_loss_clip": 1.05566645, "balance_loss_mlp": 1.02431464, "epoch": 0.19516007816022846, "flos": 20884820376960.0, "grad_norm": 2.1010289547443133, "language_loss": 0.83988321, "learning_rate": 3.7206130982517906e-06, "loss": 0.86174309, "num_input_tokens_seen": 70058720, "step": 3246, "time_per_iteration": 2.6595354080200195 }, { "auxiliary_loss_clip": 0.0114599, "auxiliary_loss_mlp": 0.00776442, "balance_loss_clip": 1.05517101, "balance_loss_mlp": 1.00080454, "epoch": 0.19522020141289645, "flos": 16910515031040.0, "grad_norm": 3.3581015873305438, "language_loss": 0.76840878, "learning_rate": 3.7204145270027514e-06, "loss": 0.78763306, "num_input_tokens_seen": 70076470, "step": 3247, "time_per_iteration": 2.7777793407440186 }, { "auxiliary_loss_clip": 0.01121778, "auxiliary_loss_mlp": 0.01043977, "balance_loss_clip": 1.05689096, "balance_loss_mlp": 1.02651262, "epoch": 0.19528032466556441, "flos": 26724829023360.0, "grad_norm": 1.8981807103962522, "language_loss": 0.75459039, "learning_rate": 3.720215890515421e-06, "loss": 0.77624786, "num_input_tokens_seen": 70096220, "step": 3248, "time_per_iteration": 2.8088901042938232 }, { "auxiliary_loss_clip": 0.01156017, "auxiliary_loss_mlp": 0.01048303, "balance_loss_clip": 1.05548215, "balance_loss_mlp": 1.03008783, "epoch": 0.19534044791823238, "flos": 21032808410880.0, "grad_norm": 2.7209722336942135, "language_loss": 0.77774823, "learning_rate": 3.7200171887973316e-06, "loss": 0.79979146, "num_input_tokens_seen": 70114800, "step": 3249, "time_per_iteration": 2.610877752304077 }, { "auxiliary_loss_clip": 0.01148434, "auxiliary_loss_mlp": 0.01050332, "balance_loss_clip": 1.05689144, "balance_loss_mlp": 1.03299928, "epoch": 0.19540057117090034, "flos": 22344050396160.0, "grad_norm": 1.5551573885822045, "language_loss": 0.73118901, "learning_rate": 3.7198184218560176e-06, "loss": 0.75317669, "num_input_tokens_seen": 70134930, "step": 3250, "time_per_iteration": 2.5901567935943604 }, { "auxiliary_loss_clip": 0.01101628, "auxiliary_loss_mlp": 0.01046467, "balance_loss_clip": 1.05080378, "balance_loss_mlp": 1.02876413, "epoch": 0.1954606944235683, "flos": 20301631159680.0, "grad_norm": 2.030501302548557, "language_loss": 0.79203367, "learning_rate": 3.719619589699017e-06, "loss": 0.81351459, "num_input_tokens_seen": 70152045, "step": 3251, "time_per_iteration": 2.6619749069213867 }, { "auxiliary_loss_clip": 0.0115825, "auxiliary_loss_mlp": 0.01044132, "balance_loss_clip": 1.05741858, "balance_loss_mlp": 1.02606022, "epoch": 0.19552081767623627, "flos": 17346865449600.0, "grad_norm": 7.451515078679223, "language_loss": 0.83871722, "learning_rate": 3.7194206923338695e-06, "loss": 0.86074108, "num_input_tokens_seen": 70169240, "step": 3252, "time_per_iteration": 2.5029656887054443 }, { "auxiliary_loss_clip": 0.01142752, "auxiliary_loss_mlp": 0.01057294, "balance_loss_clip": 1.05278862, "balance_loss_mlp": 1.03518057, "epoch": 0.19558094092890424, "flos": 31977626129280.0, "grad_norm": 1.7140417843701068, "language_loss": 0.73995864, "learning_rate": 3.719221729768117e-06, "loss": 0.76195908, "num_input_tokens_seen": 70192690, "step": 3253, "time_per_iteration": 2.609117269515991 }, { "auxiliary_loss_clip": 0.01102675, "auxiliary_loss_mlp": 0.01046707, "balance_loss_clip": 1.04759037, "balance_loss_mlp": 1.02782381, "epoch": 0.19564106418157223, "flos": 22268889187200.0, "grad_norm": 2.1302159220485675, "language_loss": 0.76167047, "learning_rate": 3.7190227020093037e-06, "loss": 0.78316426, "num_input_tokens_seen": 70209685, "step": 3254, "time_per_iteration": 4.174965858459473 }, { "auxiliary_loss_clip": 0.01043127, "auxiliary_loss_mlp": 0.01006966, "balance_loss_clip": 1.04737842, "balance_loss_mlp": 1.0036757, "epoch": 0.1957011874342402, "flos": 54364554385920.0, "grad_norm": 0.84452007287803, "language_loss": 0.55275303, "learning_rate": 3.7188236090649774e-06, "loss": 0.57325399, "num_input_tokens_seen": 70265050, "step": 3255, "time_per_iteration": 3.2241716384887695 }, { "auxiliary_loss_clip": 0.01133721, "auxiliary_loss_mlp": 0.01041696, "balance_loss_clip": 1.0557251, "balance_loss_mlp": 1.02349281, "epoch": 0.19576131068690816, "flos": 16506699356160.0, "grad_norm": 2.6103802859468392, "language_loss": 0.70870697, "learning_rate": 3.718624450942688e-06, "loss": 0.73046112, "num_input_tokens_seen": 70281830, "step": 3256, "time_per_iteration": 2.641296148300171 }, { "auxiliary_loss_clip": 0.01152768, "auxiliary_loss_mlp": 0.01042867, "balance_loss_clip": 1.0544858, "balance_loss_mlp": 1.02523613, "epoch": 0.19582143393957613, "flos": 14719676797440.0, "grad_norm": 2.649319646209249, "language_loss": 0.80722409, "learning_rate": 3.718425227649987e-06, "loss": 0.82918048, "num_input_tokens_seen": 70297420, "step": 3257, "time_per_iteration": 4.258259057998657 }, { "auxiliary_loss_clip": 0.01106644, "auxiliary_loss_mlp": 0.01043385, "balance_loss_clip": 1.05470431, "balance_loss_mlp": 1.02601588, "epoch": 0.1958815571922441, "flos": 24425504737920.0, "grad_norm": 6.015808523610408, "language_loss": 0.75124931, "learning_rate": 3.7182259391944292e-06, "loss": 0.77274966, "num_input_tokens_seen": 70319210, "step": 3258, "time_per_iteration": 4.386433362960815 }, { "auxiliary_loss_clip": 0.01082287, "auxiliary_loss_mlp": 0.01044148, "balance_loss_clip": 1.04533339, "balance_loss_mlp": 1.0237875, "epoch": 0.19594168044491206, "flos": 24900279730560.0, "grad_norm": 1.8034996675319444, "language_loss": 0.73872411, "learning_rate": 3.7180265855835714e-06, "loss": 0.75998843, "num_input_tokens_seen": 70339045, "step": 3259, "time_per_iteration": 2.815469264984131 }, { "auxiliary_loss_clip": 0.01131793, "auxiliary_loss_mlp": 0.01043364, "balance_loss_clip": 1.05167735, "balance_loss_mlp": 1.02392125, "epoch": 0.19600180369758005, "flos": 12057008486400.0, "grad_norm": 2.2096667980592, "language_loss": 0.77053022, "learning_rate": 3.7178271668249735e-06, "loss": 0.79228187, "num_input_tokens_seen": 70356505, "step": 3260, "time_per_iteration": 4.2817702293396 }, { "auxiliary_loss_clip": 0.01148118, "auxiliary_loss_mlp": 0.01043761, "balance_loss_clip": 1.0551343, "balance_loss_mlp": 1.0248661, "epoch": 0.19606192695024802, "flos": 20850202644480.0, "grad_norm": 5.605178759176999, "language_loss": 0.82261205, "learning_rate": 3.7176276829261975e-06, "loss": 0.84453082, "num_input_tokens_seen": 70375410, "step": 3261, "time_per_iteration": 2.673092842102051 }, { "auxiliary_loss_clip": 0.01121379, "auxiliary_loss_mlp": 0.01044043, "balance_loss_clip": 1.0550617, "balance_loss_mlp": 1.02488637, "epoch": 0.19612205020291598, "flos": 28475509996800.0, "grad_norm": 1.8492209450679535, "language_loss": 0.76671481, "learning_rate": 3.717428133894807e-06, "loss": 0.78836906, "num_input_tokens_seen": 70396315, "step": 3262, "time_per_iteration": 2.803938150405884 }, { "auxiliary_loss_clip": 0.01148893, "auxiliary_loss_mlp": 0.01047259, "balance_loss_clip": 1.05960584, "balance_loss_mlp": 1.02950907, "epoch": 0.19618217345558395, "flos": 25556618995200.0, "grad_norm": 1.7278621785184562, "language_loss": 0.8668195, "learning_rate": 3.71722851973837e-06, "loss": 0.88878107, "num_input_tokens_seen": 70417945, "step": 3263, "time_per_iteration": 2.6677918434143066 }, { "auxiliary_loss_clip": 0.0113123, "auxiliary_loss_mlp": 0.01042546, "balance_loss_clip": 1.05328059, "balance_loss_mlp": 1.02505815, "epoch": 0.1962422967082519, "flos": 25264413855360.0, "grad_norm": 3.447639973868791, "language_loss": 0.73775035, "learning_rate": 3.717028840464455e-06, "loss": 0.75948811, "num_input_tokens_seen": 70438690, "step": 3264, "time_per_iteration": 2.6973094940185547 }, { "auxiliary_loss_clip": 0.01144053, "auxiliary_loss_mlp": 0.01049918, "balance_loss_clip": 1.05736756, "balance_loss_mlp": 1.03223944, "epoch": 0.19630241996091988, "flos": 18807352444800.0, "grad_norm": 2.4424358562200927, "language_loss": 0.78513813, "learning_rate": 3.7168290960806344e-06, "loss": 0.80707777, "num_input_tokens_seen": 70455385, "step": 3265, "time_per_iteration": 2.625739336013794 }, { "auxiliary_loss_clip": 0.01031434, "auxiliary_loss_mlp": 0.01002481, "balance_loss_clip": 1.03386986, "balance_loss_mlp": 0.99983466, "epoch": 0.19636254321358784, "flos": 62321137896960.0, "grad_norm": 0.7932330660809486, "language_loss": 0.53389955, "learning_rate": 3.716629286594483e-06, "loss": 0.55423868, "num_input_tokens_seen": 70514280, "step": 3266, "time_per_iteration": 3.2586586475372314 }, { "auxiliary_loss_clip": 0.01124628, "auxiliary_loss_mlp": 0.00776501, "balance_loss_clip": 1.04957044, "balance_loss_mlp": 1.00080895, "epoch": 0.19642266646625584, "flos": 21069329564160.0, "grad_norm": 2.0008611208986133, "language_loss": 0.80109024, "learning_rate": 3.7164294120135767e-06, "loss": 0.8201015, "num_input_tokens_seen": 70531800, "step": 3267, "time_per_iteration": 2.678537368774414 }, { "auxiliary_loss_clip": 0.01130982, "auxiliary_loss_mlp": 0.01043983, "balance_loss_clip": 1.05263019, "balance_loss_mlp": 1.02660179, "epoch": 0.1964827897189238, "flos": 14538651229440.0, "grad_norm": 1.9909459598185588, "language_loss": 0.86758262, "learning_rate": 3.7162294723454953e-06, "loss": 0.88933229, "num_input_tokens_seen": 70550615, "step": 3268, "time_per_iteration": 2.6949849128723145 }, { "auxiliary_loss_clip": 0.01099432, "auxiliary_loss_mlp": 0.01041621, "balance_loss_clip": 1.04954004, "balance_loss_mlp": 1.02408528, "epoch": 0.19654291297159177, "flos": 19244636616960.0, "grad_norm": 2.2632495429204127, "language_loss": 0.68785441, "learning_rate": 3.7160294675978197e-06, "loss": 0.70926493, "num_input_tokens_seen": 70568690, "step": 3269, "time_per_iteration": 2.770078182220459 }, { "auxiliary_loss_clip": 0.01116538, "auxiliary_loss_mlp": 0.01052319, "balance_loss_clip": 1.05113554, "balance_loss_mlp": 1.03330541, "epoch": 0.19660303622425973, "flos": 25775710001280.0, "grad_norm": 7.1863103423452355, "language_loss": 0.80241841, "learning_rate": 3.715829397778135e-06, "loss": 0.82410699, "num_input_tokens_seen": 70588665, "step": 3270, "time_per_iteration": 2.7294864654541016 }, { "auxiliary_loss_clip": 0.01139501, "auxiliary_loss_mlp": 0.01045694, "balance_loss_clip": 1.05189824, "balance_loss_mlp": 1.02833724, "epoch": 0.1966631594769277, "flos": 20595093275520.0, "grad_norm": 1.9668649321541274, "language_loss": 0.83912349, "learning_rate": 3.715629262894028e-06, "loss": 0.86097538, "num_input_tokens_seen": 70606900, "step": 3271, "time_per_iteration": 2.640235662460327 }, { "auxiliary_loss_clip": 0.01139368, "auxiliary_loss_mlp": 0.01051303, "balance_loss_clip": 1.05468225, "balance_loss_mlp": 1.0332067, "epoch": 0.19672328272959566, "flos": 23623188600960.0, "grad_norm": 1.9968416702279483, "language_loss": 0.79902714, "learning_rate": 3.715429062953087e-06, "loss": 0.82093388, "num_input_tokens_seen": 70625955, "step": 3272, "time_per_iteration": 2.636629343032837 }, { "auxiliary_loss_clip": 0.01124328, "auxiliary_loss_mlp": 0.01058493, "balance_loss_clip": 1.05192566, "balance_loss_mlp": 1.03715479, "epoch": 0.19678340598226365, "flos": 23110922787840.0, "grad_norm": 1.7302013075823783, "language_loss": 0.80942369, "learning_rate": 3.7152287979629043e-06, "loss": 0.83125186, "num_input_tokens_seen": 70646090, "step": 3273, "time_per_iteration": 2.6967809200286865 }, { "auxiliary_loss_clip": 0.01144024, "auxiliary_loss_mlp": 0.01054564, "balance_loss_clip": 1.05456042, "balance_loss_mlp": 1.03655195, "epoch": 0.19684352923493162, "flos": 24534852716160.0, "grad_norm": 2.225126358921887, "language_loss": 0.77984649, "learning_rate": 3.7150284679310735e-06, "loss": 0.80183232, "num_input_tokens_seen": 70666065, "step": 3274, "time_per_iteration": 2.6808643341064453 }, { "auxiliary_loss_clip": 0.01141267, "auxiliary_loss_mlp": 0.01046445, "balance_loss_clip": 1.05480242, "balance_loss_mlp": 1.02840877, "epoch": 0.19690365248759958, "flos": 21796448578560.0, "grad_norm": 2.318697297640889, "language_loss": 0.81433225, "learning_rate": 3.7148280728651914e-06, "loss": 0.8362093, "num_input_tokens_seen": 70681580, "step": 3275, "time_per_iteration": 2.672672986984253 }, { "auxiliary_loss_clip": 0.01115756, "auxiliary_loss_mlp": 0.01045314, "balance_loss_clip": 1.05148947, "balance_loss_mlp": 1.02686024, "epoch": 0.19696377574026755, "flos": 19056643810560.0, "grad_norm": 2.4665004531377166, "language_loss": 0.80909657, "learning_rate": 3.7146276127728563e-06, "loss": 0.83070731, "num_input_tokens_seen": 70697745, "step": 3276, "time_per_iteration": 2.726970672607422 }, { "auxiliary_loss_clip": 0.01142619, "auxiliary_loss_mlp": 0.01043042, "balance_loss_clip": 1.05443609, "balance_loss_mlp": 1.02491045, "epoch": 0.19702389899293551, "flos": 22820656982400.0, "grad_norm": 2.17541075016206, "language_loss": 0.89113599, "learning_rate": 3.7144270876616713e-06, "loss": 0.9129926, "num_input_tokens_seen": 70715110, "step": 3277, "time_per_iteration": 2.6738827228546143 }, { "auxiliary_loss_clip": 0.01103709, "auxiliary_loss_mlp": 0.01048433, "balance_loss_clip": 1.04638815, "balance_loss_mlp": 1.02864444, "epoch": 0.19708402224560348, "flos": 22894237992960.0, "grad_norm": 2.640727897616601, "language_loss": 0.62070847, "learning_rate": 3.714226497539239e-06, "loss": 0.64222991, "num_input_tokens_seen": 70734715, "step": 3278, "time_per_iteration": 2.7382938861846924 }, { "auxiliary_loss_clip": 0.01115303, "auxiliary_loss_mlp": 0.0105759, "balance_loss_clip": 1.05033016, "balance_loss_mlp": 1.03793263, "epoch": 0.19714414549827144, "flos": 25662519267840.0, "grad_norm": 1.930104581155035, "language_loss": 0.73606467, "learning_rate": 3.714025842413166e-06, "loss": 0.75779366, "num_input_tokens_seen": 70752650, "step": 3279, "time_per_iteration": 2.8123648166656494 }, { "auxiliary_loss_clip": 0.0114648, "auxiliary_loss_mlp": 0.01042853, "balance_loss_clip": 1.05422091, "balance_loss_mlp": 1.02567458, "epoch": 0.19720426875093944, "flos": 23915824704000.0, "grad_norm": 1.7034036878345749, "language_loss": 0.82685816, "learning_rate": 3.713825122291061e-06, "loss": 0.84875143, "num_input_tokens_seen": 70772365, "step": 3280, "time_per_iteration": 2.7000861167907715 }, { "auxiliary_loss_clip": 0.01106655, "auxiliary_loss_mlp": 0.01048884, "balance_loss_clip": 1.04887283, "balance_loss_mlp": 1.03071654, "epoch": 0.1972643920036074, "flos": 13881952828800.0, "grad_norm": 2.435959864664923, "language_loss": 0.78173983, "learning_rate": 3.713624337180536e-06, "loss": 0.80329525, "num_input_tokens_seen": 70790340, "step": 3281, "time_per_iteration": 2.7017247676849365 }, { "auxiliary_loss_clip": 0.01125353, "auxiliary_loss_mlp": 0.0104135, "balance_loss_clip": 1.05461836, "balance_loss_mlp": 1.02519727, "epoch": 0.19732451525627537, "flos": 19863592801920.0, "grad_norm": 1.7390973872526612, "language_loss": 0.79777479, "learning_rate": 3.7134234870892045e-06, "loss": 0.8194418, "num_input_tokens_seen": 70809295, "step": 3282, "time_per_iteration": 2.7064146995544434 }, { "auxiliary_loss_clip": 0.01112073, "auxiliary_loss_mlp": 0.01043047, "balance_loss_clip": 1.05485284, "balance_loss_mlp": 1.02538049, "epoch": 0.19738463850894333, "flos": 24973429777920.0, "grad_norm": 2.512566515566025, "language_loss": 0.7192747, "learning_rate": 3.7132225720246826e-06, "loss": 0.74082589, "num_input_tokens_seen": 70828765, "step": 3283, "time_per_iteration": 2.775297164916992 }, { "auxiliary_loss_clip": 0.01137498, "auxiliary_loss_mlp": 0.01043438, "balance_loss_clip": 1.05320621, "balance_loss_mlp": 1.02665281, "epoch": 0.1974447617616113, "flos": 18368883123840.0, "grad_norm": 1.8864815757917637, "language_loss": 0.78981179, "learning_rate": 3.7130215919945886e-06, "loss": 0.81162113, "num_input_tokens_seen": 70846805, "step": 3284, "time_per_iteration": 2.6344916820526123 }, { "auxiliary_loss_clip": 0.01126512, "auxiliary_loss_mlp": 0.00776821, "balance_loss_clip": 1.05065584, "balance_loss_mlp": 1.00114048, "epoch": 0.19750488501427926, "flos": 22892945103360.0, "grad_norm": 2.1903874509936982, "language_loss": 0.86317503, "learning_rate": 3.7128205470065445e-06, "loss": 0.88220835, "num_input_tokens_seen": 70863805, "step": 3285, "time_per_iteration": 2.725186586380005 }, { "auxiliary_loss_clip": 0.01115791, "auxiliary_loss_mlp": 0.01044707, "balance_loss_clip": 1.05167055, "balance_loss_mlp": 1.02658761, "epoch": 0.19756500826694723, "flos": 21871502046720.0, "grad_norm": 2.208260347555195, "language_loss": 0.88770825, "learning_rate": 3.712619437068174e-06, "loss": 0.90931326, "num_input_tokens_seen": 70882660, "step": 3286, "time_per_iteration": 2.6819698810577393 }, { "auxiliary_loss_clip": 0.01118742, "auxiliary_loss_mlp": 0.01052526, "balance_loss_clip": 1.05227792, "balance_loss_mlp": 1.03016233, "epoch": 0.19762513151961522, "flos": 15158972131200.0, "grad_norm": 2.0768117117784874, "language_loss": 0.77941382, "learning_rate": 3.712418262187102e-06, "loss": 0.80112648, "num_input_tokens_seen": 70898765, "step": 3287, "time_per_iteration": 2.641193389892578 }, { "auxiliary_loss_clip": 0.01127955, "auxiliary_loss_mlp": 0.01047337, "balance_loss_clip": 1.0526104, "balance_loss_mlp": 1.02849019, "epoch": 0.1976852547722832, "flos": 16979175878400.0, "grad_norm": 2.061421898899755, "language_loss": 0.80853081, "learning_rate": 3.7122170223709584e-06, "loss": 0.83028376, "num_input_tokens_seen": 70916370, "step": 3288, "time_per_iteration": 2.625068426132202 }, { "auxiliary_loss_clip": 0.01132408, "auxiliary_loss_mlp": 0.01048194, "balance_loss_clip": 1.05143857, "balance_loss_mlp": 1.03045535, "epoch": 0.19774537802495115, "flos": 20302924049280.0, "grad_norm": 2.345717890688315, "language_loss": 0.7317158, "learning_rate": 3.712015717627374e-06, "loss": 0.75352174, "num_input_tokens_seen": 70934870, "step": 3289, "time_per_iteration": 2.6319406032562256 }, { "auxiliary_loss_clip": 0.01133413, "auxiliary_loss_mlp": 0.01045224, "balance_loss_clip": 1.05575252, "balance_loss_mlp": 1.02678204, "epoch": 0.19780550127761912, "flos": 27235478724480.0, "grad_norm": 1.9087552003653308, "language_loss": 0.79608113, "learning_rate": 3.7118143479639813e-06, "loss": 0.81786746, "num_input_tokens_seen": 70955140, "step": 3290, "time_per_iteration": 2.706570863723755 }, { "auxiliary_loss_clip": 0.01049926, "auxiliary_loss_mlp": 0.0101105, "balance_loss_clip": 1.0327636, "balance_loss_mlp": 1.00853467, "epoch": 0.19786562453028708, "flos": 63550972684800.0, "grad_norm": 0.8952067644857119, "language_loss": 0.60318571, "learning_rate": 3.711612913388418e-06, "loss": 0.62379545, "num_input_tokens_seen": 71012005, "step": 3291, "time_per_iteration": 3.2849009037017822 }, { "auxiliary_loss_clip": 0.01158891, "auxiliary_loss_mlp": 0.01040785, "balance_loss_clip": 1.05417156, "balance_loss_mlp": 1.02088892, "epoch": 0.19792574778295505, "flos": 26286647011200.0, "grad_norm": 1.932789926440358, "language_loss": 0.81595641, "learning_rate": 3.7114114139083204e-06, "loss": 0.83795315, "num_input_tokens_seen": 71031140, "step": 3292, "time_per_iteration": 2.6751551628112793 }, { "auxiliary_loss_clip": 0.01119797, "auxiliary_loss_mlp": 0.00778082, "balance_loss_clip": 1.05296063, "balance_loss_mlp": 1.00086236, "epoch": 0.19798587103562304, "flos": 19938107566080.0, "grad_norm": 2.409042629875397, "language_loss": 0.81013, "learning_rate": 3.7112098495313313e-06, "loss": 0.82910883, "num_input_tokens_seen": 71050250, "step": 3293, "time_per_iteration": 4.3039703369140625 }, { "auxiliary_loss_clip": 0.01137316, "auxiliary_loss_mlp": 0.01052434, "balance_loss_clip": 1.05370128, "balance_loss_mlp": 1.03277683, "epoch": 0.198045994288291, "flos": 20120282369280.0, "grad_norm": 1.8764131105986912, "language_loss": 0.61480314, "learning_rate": 3.711008220265093e-06, "loss": 0.63670063, "num_input_tokens_seen": 71068665, "step": 3294, "time_per_iteration": 2.671241044998169 }, { "auxiliary_loss_clip": 0.01132208, "auxiliary_loss_mlp": 0.01039978, "balance_loss_clip": 1.05456376, "balance_loss_mlp": 1.02201271, "epoch": 0.19810611754095897, "flos": 17967653228160.0, "grad_norm": 2.0334748560156393, "language_loss": 0.87313825, "learning_rate": 3.710806526117251e-06, "loss": 0.89486015, "num_input_tokens_seen": 71085320, "step": 3295, "time_per_iteration": 2.659680128097534 }, { "auxiliary_loss_clip": 0.01113106, "auxiliary_loss_mlp": 0.01050184, "balance_loss_clip": 1.05079484, "balance_loss_mlp": 1.03256536, "epoch": 0.19816624079362694, "flos": 15084996071040.0, "grad_norm": 2.5215255479345067, "language_loss": 0.80839241, "learning_rate": 3.7106047670954544e-06, "loss": 0.83002532, "num_input_tokens_seen": 71102020, "step": 3296, "time_per_iteration": 4.299339294433594 }, { "auxiliary_loss_clip": 0.01123906, "auxiliary_loss_mlp": 0.01045438, "balance_loss_clip": 1.05233586, "balance_loss_mlp": 1.02522039, "epoch": 0.1982263640462949, "flos": 24900315644160.0, "grad_norm": 2.528943220563754, "language_loss": 0.68126047, "learning_rate": 3.710402943207354e-06, "loss": 0.70295388, "num_input_tokens_seen": 71123390, "step": 3297, "time_per_iteration": 4.258284091949463 }, { "auxiliary_loss_clip": 0.01153129, "auxiliary_loss_mlp": 0.01037574, "balance_loss_clip": 1.05660713, "balance_loss_mlp": 1.02031219, "epoch": 0.19828648729896287, "flos": 20376181837440.0, "grad_norm": 1.9083451106828888, "language_loss": 0.81310993, "learning_rate": 3.7102010544606016e-06, "loss": 0.83501697, "num_input_tokens_seen": 71141800, "step": 3298, "time_per_iteration": 2.6156656742095947 }, { "auxiliary_loss_clip": 0.01137409, "auxiliary_loss_mlp": 0.01042227, "balance_loss_clip": 1.0573976, "balance_loss_mlp": 1.02159238, "epoch": 0.19834661055163083, "flos": 18880035615360.0, "grad_norm": 1.8996943203321497, "language_loss": 0.85154539, "learning_rate": 3.7099991008628544e-06, "loss": 0.87334174, "num_input_tokens_seen": 71159505, "step": 3299, "time_per_iteration": 2.6749041080474854 }, { "auxiliary_loss_clip": 0.01036953, "auxiliary_loss_mlp": 0.01013935, "balance_loss_clip": 1.02875936, "balance_loss_mlp": 1.01106215, "epoch": 0.19840673380429882, "flos": 60259184640000.0, "grad_norm": 0.82907550606663, "language_loss": 0.53206414, "learning_rate": 3.7097970824217706e-06, "loss": 0.55257303, "num_input_tokens_seen": 71223265, "step": 3300, "time_per_iteration": 4.83857798576355 }, { "auxiliary_loss_clip": 0.01105122, "auxiliary_loss_mlp": 0.01064471, "balance_loss_clip": 1.04748702, "balance_loss_mlp": 1.0410459, "epoch": 0.1984668570569668, "flos": 19902017376000.0, "grad_norm": 316.1702389408657, "language_loss": 0.73014295, "learning_rate": 3.7095949991450093e-06, "loss": 0.75183886, "num_input_tokens_seen": 71242385, "step": 3301, "time_per_iteration": 2.700654983520508 }, { "auxiliary_loss_clip": 0.01118926, "auxiliary_loss_mlp": 0.01044315, "balance_loss_clip": 1.05295372, "balance_loss_mlp": 1.02619529, "epoch": 0.19852698030963475, "flos": 15630766295040.0, "grad_norm": 2.410718710355122, "language_loss": 0.88264418, "learning_rate": 3.709392851040235e-06, "loss": 0.90427655, "num_input_tokens_seen": 71258990, "step": 3302, "time_per_iteration": 2.7190146446228027 }, { "auxiliary_loss_clip": 0.01118067, "auxiliary_loss_mlp": 0.01045078, "balance_loss_clip": 1.05155802, "balance_loss_mlp": 1.02661204, "epoch": 0.19858710356230272, "flos": 43143007311360.0, "grad_norm": 2.210364764996701, "language_loss": 0.73592931, "learning_rate": 3.709190638115111e-06, "loss": 0.75756073, "num_input_tokens_seen": 71282770, "step": 3303, "time_per_iteration": 2.9379186630249023 }, { "auxiliary_loss_clip": 0.01143275, "auxiliary_loss_mlp": 0.01048515, "balance_loss_clip": 1.05491257, "balance_loss_mlp": 1.03002524, "epoch": 0.19864722681497068, "flos": 35144084643840.0, "grad_norm": 1.9482807590384623, "language_loss": 0.75103521, "learning_rate": 3.7089883603773084e-06, "loss": 0.77295315, "num_input_tokens_seen": 71301410, "step": 3304, "time_per_iteration": 2.743474245071411 }, { "auxiliary_loss_clip": 0.01133571, "auxiliary_loss_mlp": 0.01034983, "balance_loss_clip": 1.05309725, "balance_loss_mlp": 1.01710188, "epoch": 0.19870735006763865, "flos": 19426200888960.0, "grad_norm": 1.8722016114425952, "language_loss": 0.8628391, "learning_rate": 3.7087860178344955e-06, "loss": 0.8845247, "num_input_tokens_seen": 71319670, "step": 3305, "time_per_iteration": 2.7129390239715576 }, { "auxiliary_loss_clip": 0.01128329, "auxiliary_loss_mlp": 0.01044081, "balance_loss_clip": 1.04770195, "balance_loss_mlp": 1.02603281, "epoch": 0.19876747332030664, "flos": 23547380947200.0, "grad_norm": 2.9829227362861106, "language_loss": 0.68476367, "learning_rate": 3.7085836104943445e-06, "loss": 0.70648777, "num_input_tokens_seen": 71339850, "step": 3306, "time_per_iteration": 2.7083208560943604 }, { "auxiliary_loss_clip": 0.01119386, "auxiliary_loss_mlp": 0.01038782, "balance_loss_clip": 1.04822719, "balance_loss_mlp": 1.02168787, "epoch": 0.1988275965729746, "flos": 19829406032640.0, "grad_norm": 1.683647244561179, "language_loss": 0.76433122, "learning_rate": 3.7083811383645332e-06, "loss": 0.78591287, "num_input_tokens_seen": 71359795, "step": 3307, "time_per_iteration": 2.728661298751831 }, { "auxiliary_loss_clip": 0.01157548, "auxiliary_loss_mlp": 0.01044665, "balance_loss_clip": 1.05895782, "balance_loss_mlp": 1.02714145, "epoch": 0.19888771982564257, "flos": 23513625141120.0, "grad_norm": 2.438172575069382, "language_loss": 0.75991976, "learning_rate": 3.708178601452737e-06, "loss": 0.78194201, "num_input_tokens_seen": 71378885, "step": 3308, "time_per_iteration": 2.6580557823181152 }, { "auxiliary_loss_clip": 0.01107283, "auxiliary_loss_mlp": 0.01041656, "balance_loss_clip": 1.05453563, "balance_loss_mlp": 1.02307141, "epoch": 0.19894784307831054, "flos": 18150510389760.0, "grad_norm": 1.928689575161362, "language_loss": 0.76043576, "learning_rate": 3.7079759997666374e-06, "loss": 0.7819252, "num_input_tokens_seen": 71397285, "step": 3309, "time_per_iteration": 2.77226185798645 }, { "auxiliary_loss_clip": 0.0114115, "auxiliary_loss_mlp": 0.01045061, "balance_loss_clip": 1.05222607, "balance_loss_mlp": 1.02592754, "epoch": 0.1990079663309785, "flos": 24276044246400.0, "grad_norm": 75.17312936609292, "language_loss": 0.87855697, "learning_rate": 3.707773333313917e-06, "loss": 0.90041906, "num_input_tokens_seen": 71415775, "step": 3310, "time_per_iteration": 2.6789662837982178 }, { "auxiliary_loss_clip": 0.01153037, "auxiliary_loss_mlp": 0.01039864, "balance_loss_clip": 1.05415869, "balance_loss_mlp": 1.02139854, "epoch": 0.19906808958364647, "flos": 34897666366080.0, "grad_norm": 2.3155756588664342, "language_loss": 0.63650048, "learning_rate": 3.70757060210226e-06, "loss": 0.6584295, "num_input_tokens_seen": 71437315, "step": 3311, "time_per_iteration": 2.7604620456695557 }, { "auxiliary_loss_clip": 0.01115133, "auxiliary_loss_mlp": 0.01043871, "balance_loss_clip": 1.04763019, "balance_loss_mlp": 1.02501202, "epoch": 0.19912821283631443, "flos": 24024885373440.0, "grad_norm": 3.8064295514597717, "language_loss": 0.74542546, "learning_rate": 3.707367806139355e-06, "loss": 0.76701546, "num_input_tokens_seen": 71456320, "step": 3312, "time_per_iteration": 2.796475410461426 }, { "auxiliary_loss_clip": 0.01141587, "auxiliary_loss_mlp": 0.01037435, "balance_loss_clip": 1.05358124, "balance_loss_mlp": 1.02017355, "epoch": 0.19918833608898243, "flos": 19859031774720.0, "grad_norm": 2.2312990164825943, "language_loss": 0.84033173, "learning_rate": 3.7071649454328915e-06, "loss": 0.86212194, "num_input_tokens_seen": 71475360, "step": 3313, "time_per_iteration": 2.6044952869415283 }, { "auxiliary_loss_clip": 0.01146797, "auxiliary_loss_mlp": 0.01042166, "balance_loss_clip": 1.05695391, "balance_loss_mlp": 1.02422476, "epoch": 0.1992484593416504, "flos": 29095794984960.0, "grad_norm": 3.856678450124864, "language_loss": 0.810305, "learning_rate": 3.7069620199905625e-06, "loss": 0.83219463, "num_input_tokens_seen": 71496155, "step": 3314, "time_per_iteration": 2.68841814994812 }, { "auxiliary_loss_clip": 0.01112846, "auxiliary_loss_mlp": 0.01043677, "balance_loss_clip": 1.04617178, "balance_loss_mlp": 1.02643955, "epoch": 0.19930858259431836, "flos": 23295001011840.0, "grad_norm": 1.4822079401394097, "language_loss": 0.87391549, "learning_rate": 3.7067590298200627e-06, "loss": 0.89548075, "num_input_tokens_seen": 71517295, "step": 3315, "time_per_iteration": 2.720093011856079 }, { "auxiliary_loss_clip": 0.0111589, "auxiliary_loss_mlp": 0.00777002, "balance_loss_clip": 1.04992676, "balance_loss_mlp": 1.00093687, "epoch": 0.19936870584698632, "flos": 25378825651200.0, "grad_norm": 1.7805516248937883, "language_loss": 0.70957202, "learning_rate": 3.7065559749290892e-06, "loss": 0.72850096, "num_input_tokens_seen": 71540000, "step": 3316, "time_per_iteration": 2.850100517272949 }, { "auxiliary_loss_clip": 0.01019745, "auxiliary_loss_mlp": 0.01012504, "balance_loss_clip": 1.03032303, "balance_loss_mlp": 1.01003671, "epoch": 0.1994288290996543, "flos": 62168053109760.0, "grad_norm": 0.8326978726055106, "language_loss": 0.66287398, "learning_rate": 3.706352855325342e-06, "loss": 0.68319643, "num_input_tokens_seen": 71607880, "step": 3317, "time_per_iteration": 3.425114870071411 }, { "auxiliary_loss_clip": 0.01148059, "auxiliary_loss_mlp": 0.01048913, "balance_loss_clip": 1.05397809, "balance_loss_mlp": 1.02964854, "epoch": 0.19948895235232225, "flos": 19025832919680.0, "grad_norm": 2.282515690517884, "language_loss": 0.74494618, "learning_rate": 3.7061496710165233e-06, "loss": 0.76691592, "num_input_tokens_seen": 71625695, "step": 3318, "time_per_iteration": 2.6815896034240723 }, { "auxiliary_loss_clip": 0.01114942, "auxiliary_loss_mlp": 0.01044681, "balance_loss_clip": 1.04767084, "balance_loss_mlp": 1.02786088, "epoch": 0.19954907560499022, "flos": 37815803182080.0, "grad_norm": 1.8966456913695608, "language_loss": 0.78894758, "learning_rate": 3.7059464220103385e-06, "loss": 0.81054389, "num_input_tokens_seen": 71648520, "step": 3319, "time_per_iteration": 2.847911834716797 }, { "auxiliary_loss_clip": 0.01134557, "auxiliary_loss_mlp": 0.01042988, "balance_loss_clip": 1.05354095, "balance_loss_mlp": 1.02312756, "epoch": 0.1996091988576582, "flos": 49565199594240.0, "grad_norm": 2.1348540211051197, "language_loss": 0.76006937, "learning_rate": 3.7057431083144945e-06, "loss": 0.78184479, "num_input_tokens_seen": 71672185, "step": 3320, "time_per_iteration": 2.9324615001678467 }, { "auxiliary_loss_clip": 0.01120226, "auxiliary_loss_mlp": 0.01042998, "balance_loss_clip": 1.05083311, "balance_loss_mlp": 1.02496171, "epoch": 0.19966932211032618, "flos": 22635788659200.0, "grad_norm": 2.2436863685702546, "language_loss": 0.80077857, "learning_rate": 3.705539729936701e-06, "loss": 0.82241082, "num_input_tokens_seen": 71692890, "step": 3321, "time_per_iteration": 2.7534186840057373 }, { "auxiliary_loss_clip": 0.01033096, "auxiliary_loss_mlp": 0.01011167, "balance_loss_clip": 1.02391553, "balance_loss_mlp": 1.00828266, "epoch": 0.19972944536299414, "flos": 54082117745280.0, "grad_norm": 0.874673110280983, "language_loss": 0.65145189, "learning_rate": 3.7053362868846696e-06, "loss": 0.67189455, "num_input_tokens_seen": 71745815, "step": 3322, "time_per_iteration": 3.0398683547973633 }, { "auxiliary_loss_clip": 0.01039999, "auxiliary_loss_mlp": 0.01007775, "balance_loss_clip": 1.02971482, "balance_loss_mlp": 1.00479472, "epoch": 0.1997895686156621, "flos": 69355031817600.0, "grad_norm": 0.7915334307535052, "language_loss": 0.56919783, "learning_rate": 3.7051327791661153e-06, "loss": 0.58967561, "num_input_tokens_seen": 71806915, "step": 3323, "time_per_iteration": 3.2814581394195557 }, { "auxiliary_loss_clip": 0.01131487, "auxiliary_loss_mlp": 0.00776139, "balance_loss_clip": 1.05244064, "balance_loss_mlp": 1.00085235, "epoch": 0.19984969186833007, "flos": 18552063507840.0, "grad_norm": 1.8766856730809967, "language_loss": 0.80573648, "learning_rate": 3.7049292067887555e-06, "loss": 0.82481277, "num_input_tokens_seen": 71824645, "step": 3324, "time_per_iteration": 2.66456937789917 }, { "auxiliary_loss_clip": 0.01132572, "auxiliary_loss_mlp": 0.01050254, "balance_loss_clip": 1.04625165, "balance_loss_mlp": 1.03027487, "epoch": 0.19990981512099804, "flos": 26429678968320.0, "grad_norm": 2.4535669107623486, "language_loss": 0.53931105, "learning_rate": 3.7047255697603092e-06, "loss": 0.56113935, "num_input_tokens_seen": 71845125, "step": 3325, "time_per_iteration": 2.696556329727173 }, { "auxiliary_loss_clip": 0.01130165, "auxiliary_loss_mlp": 0.01050725, "balance_loss_clip": 1.05065942, "balance_loss_mlp": 1.03328443, "epoch": 0.19996993837366603, "flos": 16325997010560.0, "grad_norm": 2.1570763946475187, "language_loss": 0.86074936, "learning_rate": 3.7045218680884984e-06, "loss": 0.88255823, "num_input_tokens_seen": 71863500, "step": 3326, "time_per_iteration": 2.7167885303497314 }, { "auxiliary_loss_clip": 0.0115173, "auxiliary_loss_mlp": 0.01042065, "balance_loss_clip": 1.05427039, "balance_loss_mlp": 1.02511311, "epoch": 0.200030061626334, "flos": 20844169159680.0, "grad_norm": 2.0419576492150395, "language_loss": 0.71793801, "learning_rate": 3.7043181017810476e-06, "loss": 0.73987597, "num_input_tokens_seen": 71881845, "step": 3327, "time_per_iteration": 2.6097662448883057 }, { "auxiliary_loss_clip": 0.01131035, "auxiliary_loss_mlp": 0.01052756, "balance_loss_clip": 1.05146813, "balance_loss_mlp": 1.03290796, "epoch": 0.20009018487900196, "flos": 23762629198080.0, "grad_norm": 1.8948781463857982, "language_loss": 0.7668376, "learning_rate": 3.7041142708456833e-06, "loss": 0.78867549, "num_input_tokens_seen": 71900940, "step": 3328, "time_per_iteration": 2.6869349479675293 }, { "auxiliary_loss_clip": 0.01118681, "auxiliary_loss_mlp": 0.01044603, "balance_loss_clip": 1.04693103, "balance_loss_mlp": 1.02799726, "epoch": 0.20015030813166992, "flos": 28111555440000.0, "grad_norm": 2.0833377369651984, "language_loss": 0.69400644, "learning_rate": 3.7039103752901353e-06, "loss": 0.71563935, "num_input_tokens_seen": 71921925, "step": 3329, "time_per_iteration": 2.844280481338501 }, { "auxiliary_loss_clip": 0.01107384, "auxiliary_loss_mlp": 0.01069575, "balance_loss_clip": 1.04727411, "balance_loss_mlp": 1.04641271, "epoch": 0.2002104313843379, "flos": 26067160955520.0, "grad_norm": 3.099532194576676, "language_loss": 0.81395614, "learning_rate": 3.7037064151221353e-06, "loss": 0.83572567, "num_input_tokens_seen": 71941855, "step": 3330, "time_per_iteration": 2.841885566711426 }, { "auxiliary_loss_clip": 0.01137825, "auxiliary_loss_mlp": 0.01048123, "balance_loss_clip": 1.05147684, "balance_loss_mlp": 1.02977705, "epoch": 0.20027055463700585, "flos": 22966633854720.0, "grad_norm": 2.224132696455658, "language_loss": 0.76606882, "learning_rate": 3.703502390349417e-06, "loss": 0.78792834, "num_input_tokens_seen": 71960915, "step": 3331, "time_per_iteration": 2.7007360458374023 }, { "auxiliary_loss_clip": 0.01093521, "auxiliary_loss_mlp": 0.01069739, "balance_loss_clip": 1.04292202, "balance_loss_mlp": 1.04851985, "epoch": 0.20033067788967382, "flos": 17165660313600.0, "grad_norm": 2.044808670508971, "language_loss": 0.79330826, "learning_rate": 3.7032983009797176e-06, "loss": 0.81494087, "num_input_tokens_seen": 71979220, "step": 3332, "time_per_iteration": 4.518973112106323 }, { "auxiliary_loss_clip": 0.01046467, "auxiliary_loss_mlp": 0.010754, "balance_loss_clip": 1.02134657, "balance_loss_mlp": 1.07303989, "epoch": 0.2003908011423418, "flos": 60825566292480.0, "grad_norm": 0.9607431077817938, "language_loss": 0.61968678, "learning_rate": 3.703094147020776e-06, "loss": 0.64090544, "num_input_tokens_seen": 72033950, "step": 3333, "time_per_iteration": 3.074782371520996 }, { "auxiliary_loss_clip": 0.01112058, "auxiliary_loss_mlp": 0.00777645, "balance_loss_clip": 1.04686844, "balance_loss_mlp": 1.00099933, "epoch": 0.20045092439500978, "flos": 24206234163840.0, "grad_norm": 2.9954165903614447, "language_loss": 0.81385547, "learning_rate": 3.7028899284803334e-06, "loss": 0.83275253, "num_input_tokens_seen": 72051395, "step": 3334, "time_per_iteration": 4.270732641220093 }, { "auxiliary_loss_clip": 0.01096467, "auxiliary_loss_mlp": 0.01058699, "balance_loss_clip": 1.04709518, "balance_loss_mlp": 1.03889799, "epoch": 0.20051104764767774, "flos": 29387605075200.0, "grad_norm": 2.9016061168315703, "language_loss": 0.74238038, "learning_rate": 3.702685645366134e-06, "loss": 0.76393211, "num_input_tokens_seen": 72071305, "step": 3335, "time_per_iteration": 4.376626491546631 }, { "auxiliary_loss_clip": 0.01149242, "auxiliary_loss_mlp": 0.01059851, "balance_loss_clip": 1.05611062, "balance_loss_mlp": 1.04120684, "epoch": 0.2005711709003457, "flos": 23513804709120.0, "grad_norm": 1.700795836589561, "language_loss": 0.79981416, "learning_rate": 3.7024812976859243e-06, "loss": 0.82190514, "num_input_tokens_seen": 72090165, "step": 3336, "time_per_iteration": 2.7031586170196533 }, { "auxiliary_loss_clip": 0.01116655, "auxiliary_loss_mlp": 0.01048065, "balance_loss_clip": 1.04808092, "balance_loss_mlp": 1.0272038, "epoch": 0.20063129415301367, "flos": 22523388024960.0, "grad_norm": 2.0182523905302157, "language_loss": 0.7761423, "learning_rate": 3.7022768854474532e-06, "loss": 0.79778945, "num_input_tokens_seen": 72107210, "step": 3337, "time_per_iteration": 2.6990835666656494 }, { "auxiliary_loss_clip": 0.01158617, "auxiliary_loss_mlp": 0.01045618, "balance_loss_clip": 1.05752003, "balance_loss_mlp": 1.02631783, "epoch": 0.20069141740568164, "flos": 25958243940480.0, "grad_norm": 2.232061800350416, "language_loss": 0.69108742, "learning_rate": 3.7020724086584724e-06, "loss": 0.71312982, "num_input_tokens_seen": 72126315, "step": 3338, "time_per_iteration": 2.6827659606933594 }, { "auxiliary_loss_clip": 0.01117671, "auxiliary_loss_mlp": 0.01053755, "balance_loss_clip": 1.04930723, "balance_loss_mlp": 1.03543282, "epoch": 0.2007515406583496, "flos": 24790608529920.0, "grad_norm": 2.685005372503905, "language_loss": 0.68898237, "learning_rate": 3.701867867326735e-06, "loss": 0.71069658, "num_input_tokens_seen": 72146470, "step": 3339, "time_per_iteration": 4.430418014526367 }, { "auxiliary_loss_clip": 0.01123098, "auxiliary_loss_mlp": 0.01041763, "balance_loss_clip": 1.05656064, "balance_loss_mlp": 1.02408433, "epoch": 0.2008116639110176, "flos": 37925582123520.0, "grad_norm": 2.0597617887640607, "language_loss": 0.66606021, "learning_rate": 3.7016632614599974e-06, "loss": 0.6877088, "num_input_tokens_seen": 72166600, "step": 3340, "time_per_iteration": 3.0020461082458496 }, { "auxiliary_loss_clip": 0.01145166, "auxiliary_loss_mlp": 0.01036815, "balance_loss_clip": 1.05326021, "balance_loss_mlp": 1.01712155, "epoch": 0.20087178716368556, "flos": 20740531443840.0, "grad_norm": 6.669810478748975, "language_loss": 0.74554622, "learning_rate": 3.701458591066019e-06, "loss": 0.76736599, "num_input_tokens_seen": 72185160, "step": 3341, "time_per_iteration": 2.762573480606079 }, { "auxiliary_loss_clip": 0.01110242, "auxiliary_loss_mlp": 0.01044424, "balance_loss_clip": 1.04981375, "balance_loss_mlp": 1.02595794, "epoch": 0.20093191041635353, "flos": 23842279607040.0, "grad_norm": 7.177474445031109, "language_loss": 0.71779013, "learning_rate": 3.70125385615256e-06, "loss": 0.73933673, "num_input_tokens_seen": 72205160, "step": 3342, "time_per_iteration": 2.7128167152404785 }, { "auxiliary_loss_clip": 0.01114025, "auxiliary_loss_mlp": 0.01045057, "balance_loss_clip": 1.05036438, "balance_loss_mlp": 1.02749765, "epoch": 0.2009920336690215, "flos": 21792067119360.0, "grad_norm": 2.3652416151608873, "language_loss": 0.72892809, "learning_rate": 3.701049056727384e-06, "loss": 0.75051892, "num_input_tokens_seen": 72223555, "step": 3343, "time_per_iteration": 2.8155410289764404 }, { "auxiliary_loss_clip": 0.01113341, "auxiliary_loss_mlp": 0.01046556, "balance_loss_clip": 1.04568779, "balance_loss_mlp": 1.02762532, "epoch": 0.20105215692168946, "flos": 26359222440960.0, "grad_norm": 2.2972411099560195, "language_loss": 0.80645263, "learning_rate": 3.7008441927982574e-06, "loss": 0.82805163, "num_input_tokens_seen": 72242465, "step": 3344, "time_per_iteration": 2.780198335647583 }, { "auxiliary_loss_clip": 0.01155099, "auxiliary_loss_mlp": 0.01045938, "balance_loss_clip": 1.05386972, "balance_loss_mlp": 1.02773499, "epoch": 0.20111228017435742, "flos": 18807280617600.0, "grad_norm": 2.2640230255386125, "language_loss": 0.83114576, "learning_rate": 3.700639264372948e-06, "loss": 0.85315621, "num_input_tokens_seen": 72260655, "step": 3345, "time_per_iteration": 2.6209781169891357 }, { "auxiliary_loss_clip": 0.01093716, "auxiliary_loss_mlp": 0.01041329, "balance_loss_clip": 1.04619193, "balance_loss_mlp": 1.02492619, "epoch": 0.20117240342702541, "flos": 19975059682560.0, "grad_norm": 1.7610524328763844, "language_loss": 0.67947632, "learning_rate": 3.7004342714592283e-06, "loss": 0.70082676, "num_input_tokens_seen": 72279055, "step": 3346, "time_per_iteration": 2.692222833633423 }, { "auxiliary_loss_clip": 0.01114086, "auxiliary_loss_mlp": 0.01048128, "balance_loss_clip": 1.04710329, "balance_loss_mlp": 1.03028262, "epoch": 0.20123252667969338, "flos": 23142703345920.0, "grad_norm": 2.3067659385334958, "language_loss": 0.72993439, "learning_rate": 3.70022921406487e-06, "loss": 0.75155658, "num_input_tokens_seen": 72297895, "step": 3347, "time_per_iteration": 2.7501564025878906 }, { "auxiliary_loss_clip": 0.01142236, "auxiliary_loss_mlp": 0.01047715, "balance_loss_clip": 1.05465829, "balance_loss_mlp": 1.03122878, "epoch": 0.20129264993236134, "flos": 23221671396480.0, "grad_norm": 1.5798788242702444, "language_loss": 0.86869538, "learning_rate": 3.70002409219765e-06, "loss": 0.8905949, "num_input_tokens_seen": 72318385, "step": 3348, "time_per_iteration": 2.688606023788452 }, { "auxiliary_loss_clip": 0.01099793, "auxiliary_loss_mlp": 0.01045183, "balance_loss_clip": 1.04737949, "balance_loss_mlp": 1.02587092, "epoch": 0.2013527731850293, "flos": 21871466133120.0, "grad_norm": 1.8024729376762028, "language_loss": 0.71082795, "learning_rate": 3.699818905865346e-06, "loss": 0.73227775, "num_input_tokens_seen": 72338235, "step": 3349, "time_per_iteration": 2.8423163890838623 }, { "auxiliary_loss_clip": 0.01119982, "auxiliary_loss_mlp": 0.01044662, "balance_loss_clip": 1.0504061, "balance_loss_mlp": 1.02520752, "epoch": 0.20141289643769728, "flos": 18040803275520.0, "grad_norm": 1.7324672298731074, "language_loss": 0.71324664, "learning_rate": 3.6996136550757377e-06, "loss": 0.73489314, "num_input_tokens_seen": 72357825, "step": 3350, "time_per_iteration": 2.7691454887390137 }, { "auxiliary_loss_clip": 0.01126392, "auxiliary_loss_mlp": 0.01043835, "balance_loss_clip": 1.0497458, "balance_loss_mlp": 1.02312887, "epoch": 0.20147301969036524, "flos": 23951412103680.0, "grad_norm": 2.3965463087123107, "language_loss": 0.76391226, "learning_rate": 3.69940833983661e-06, "loss": 0.78561449, "num_input_tokens_seen": 72376335, "step": 3351, "time_per_iteration": 2.701244592666626 }, { "auxiliary_loss_clip": 0.01134085, "auxiliary_loss_mlp": 0.01047695, "balance_loss_clip": 1.05303741, "balance_loss_mlp": 1.02840734, "epoch": 0.2015331429430332, "flos": 25588471380480.0, "grad_norm": 1.5574195085232978, "language_loss": 0.80808926, "learning_rate": 3.699202960155748e-06, "loss": 0.82990712, "num_input_tokens_seen": 72395440, "step": 3352, "time_per_iteration": 2.707792043685913 }, { "auxiliary_loss_clip": 0.011457, "auxiliary_loss_mlp": 0.01042883, "balance_loss_clip": 1.05415952, "balance_loss_mlp": 1.0244298, "epoch": 0.2015932661957012, "flos": 26724972677760.0, "grad_norm": 1.9831574274346238, "language_loss": 0.80594563, "learning_rate": 3.6989975160409396e-06, "loss": 0.82783151, "num_input_tokens_seen": 72414670, "step": 3353, "time_per_iteration": 2.675960063934326 }, { "auxiliary_loss_clip": 0.01126272, "auxiliary_loss_mlp": 0.01045978, "balance_loss_clip": 1.05195928, "balance_loss_mlp": 1.02787042, "epoch": 0.20165338944836916, "flos": 15633136592640.0, "grad_norm": 2.0684163707657763, "language_loss": 0.90046668, "learning_rate": 3.6987920074999747e-06, "loss": 0.92218912, "num_input_tokens_seen": 72432210, "step": 3354, "time_per_iteration": 2.6648361682891846 }, { "auxiliary_loss_clip": 0.0104514, "auxiliary_loss_mlp": 0.0075774, "balance_loss_clip": 1.0285337, "balance_loss_mlp": 1.00170481, "epoch": 0.20171351270103713, "flos": 57912529207680.0, "grad_norm": 0.8264169258847935, "language_loss": 0.55863291, "learning_rate": 3.6985864345406465e-06, "loss": 0.57666171, "num_input_tokens_seen": 72489225, "step": 3355, "time_per_iteration": 3.155352830886841 }, { "auxiliary_loss_clip": 0.01127799, "auxiliary_loss_mlp": 0.00776255, "balance_loss_clip": 1.05133796, "balance_loss_mlp": 1.00109434, "epoch": 0.2017736359537051, "flos": 20814363849600.0, "grad_norm": 1.8367443502770229, "language_loss": 0.84333616, "learning_rate": 3.698380797170751e-06, "loss": 0.86237669, "num_input_tokens_seen": 72508715, "step": 3356, "time_per_iteration": 2.754645586013794 }, { "auxiliary_loss_clip": 0.01127514, "auxiliary_loss_mlp": 0.01052066, "balance_loss_clip": 1.04904747, "balance_loss_mlp": 1.02811635, "epoch": 0.20183375920637306, "flos": 17092043389440.0, "grad_norm": 3.2349249330618504, "language_loss": 0.70046175, "learning_rate": 3.698175095398085e-06, "loss": 0.72225749, "num_input_tokens_seen": 72525135, "step": 3357, "time_per_iteration": 2.6905863285064697 }, { "auxiliary_loss_clip": 0.0113535, "auxiliary_loss_mlp": 0.01044956, "balance_loss_clip": 1.05209541, "balance_loss_mlp": 1.02590632, "epoch": 0.20189388245904102, "flos": 18661339658880.0, "grad_norm": 2.41944886120848, "language_loss": 0.7169627, "learning_rate": 3.6979693292304493e-06, "loss": 0.73876572, "num_input_tokens_seen": 72543690, "step": 3358, "time_per_iteration": 2.696295738220215 }, { "auxiliary_loss_clip": 0.01139673, "auxiliary_loss_mlp": 0.01052145, "balance_loss_clip": 1.05050206, "balance_loss_mlp": 1.03496706, "epoch": 0.20195400571170902, "flos": 16797539779200.0, "grad_norm": 2.6870341127491675, "language_loss": 0.83242267, "learning_rate": 3.6977634986756463e-06, "loss": 0.85434085, "num_input_tokens_seen": 72560725, "step": 3359, "time_per_iteration": 2.6779677867889404 }, { "auxiliary_loss_clip": 0.01052166, "auxiliary_loss_mlp": 0.01026452, "balance_loss_clip": 1.02534354, "balance_loss_mlp": 1.02345943, "epoch": 0.20201412896437698, "flos": 67174716268800.0, "grad_norm": 0.8259567660078829, "language_loss": 0.58980465, "learning_rate": 3.697557603741482e-06, "loss": 0.61059082, "num_input_tokens_seen": 72621940, "step": 3360, "time_per_iteration": 3.1175289154052734 }, { "auxiliary_loss_clip": 0.01096543, "auxiliary_loss_mlp": 0.01051237, "balance_loss_clip": 1.05081403, "balance_loss_mlp": 1.03154337, "epoch": 0.20207425221704495, "flos": 21325013550720.0, "grad_norm": 2.668010943284884, "language_loss": 0.63219774, "learning_rate": 3.697351644435763e-06, "loss": 0.65367556, "num_input_tokens_seen": 72639135, "step": 3361, "time_per_iteration": 2.7732017040252686 }, { "auxiliary_loss_clip": 0.01119862, "auxiliary_loss_mlp": 0.01069748, "balance_loss_clip": 1.04988885, "balance_loss_mlp": 1.05035317, "epoch": 0.2021343754697129, "flos": 22527158952960.0, "grad_norm": 1.9150118782569074, "language_loss": 0.75946522, "learning_rate": 3.6971456207662993e-06, "loss": 0.78136134, "num_input_tokens_seen": 72658525, "step": 3362, "time_per_iteration": 2.755686044692993 }, { "auxiliary_loss_clip": 0.01139499, "auxiliary_loss_mlp": 0.00777827, "balance_loss_clip": 1.05068207, "balance_loss_mlp": 1.0011797, "epoch": 0.20219449872238088, "flos": 19062785036160.0, "grad_norm": 2.043450343479612, "language_loss": 0.76542944, "learning_rate": 3.6969395327409035e-06, "loss": 0.78460264, "num_input_tokens_seen": 72678085, "step": 3363, "time_per_iteration": 2.788773775100708 }, { "auxiliary_loss_clip": 0.01143235, "auxiliary_loss_mlp": 0.01068217, "balance_loss_clip": 1.05241406, "balance_loss_mlp": 1.0511229, "epoch": 0.20225462197504884, "flos": 24717027519360.0, "grad_norm": 1.8380065969237507, "language_loss": 0.75088942, "learning_rate": 3.696733380367391e-06, "loss": 0.773004, "num_input_tokens_seen": 72698695, "step": 3364, "time_per_iteration": 2.7484803199768066 }, { "auxiliary_loss_clip": 0.01111683, "auxiliary_loss_mlp": 0.01065374, "balance_loss_clip": 1.05202723, "balance_loss_mlp": 1.04583549, "epoch": 0.2023147452277168, "flos": 22018304931840.0, "grad_norm": 2.1478979049108395, "language_loss": 0.71917796, "learning_rate": 3.6965271636535783e-06, "loss": 0.7409485, "num_input_tokens_seen": 72717880, "step": 3365, "time_per_iteration": 2.770939350128174 }, { "auxiliary_loss_clip": 0.01110149, "auxiliary_loss_mlp": 0.01064133, "balance_loss_clip": 1.04989934, "balance_loss_mlp": 1.04559648, "epoch": 0.2023748684803848, "flos": 17745365911680.0, "grad_norm": 2.2136098995040228, "language_loss": 0.85318875, "learning_rate": 3.696320882607286e-06, "loss": 0.87493157, "num_input_tokens_seen": 72736410, "step": 3366, "time_per_iteration": 2.717759609222412 }, { "auxiliary_loss_clip": 0.01116913, "auxiliary_loss_mlp": 0.0106476, "balance_loss_clip": 1.050488, "balance_loss_mlp": 1.04605615, "epoch": 0.20243499173305277, "flos": 31138932493440.0, "grad_norm": 2.048733189447585, "language_loss": 0.69766563, "learning_rate": 3.696114537236335e-06, "loss": 0.71948242, "num_input_tokens_seen": 72758295, "step": 3367, "time_per_iteration": 2.788444995880127 }, { "auxiliary_loss_clip": 0.01144949, "auxiliary_loss_mlp": 0.01060722, "balance_loss_clip": 1.04997301, "balance_loss_mlp": 1.03857303, "epoch": 0.20249511498572073, "flos": 33839235279360.0, "grad_norm": 1.942153338299175, "language_loss": 0.68162113, "learning_rate": 3.6959081275485512e-06, "loss": 0.70367789, "num_input_tokens_seen": 72782495, "step": 3368, "time_per_iteration": 2.7339746952056885 }, { "auxiliary_loss_clip": 0.01123527, "auxiliary_loss_mlp": 0.01063426, "balance_loss_clip": 1.0543493, "balance_loss_mlp": 1.04405439, "epoch": 0.2025552382383887, "flos": 21215629658880.0, "grad_norm": 1.8860162071579365, "language_loss": 0.77298439, "learning_rate": 3.6957016535517615e-06, "loss": 0.79485393, "num_input_tokens_seen": 72801885, "step": 3369, "time_per_iteration": 2.739088535308838 }, { "auxiliary_loss_clip": 0.01136965, "auxiliary_loss_mlp": 0.01071822, "balance_loss_clip": 1.05140853, "balance_loss_mlp": 1.05315351, "epoch": 0.20261536149105666, "flos": 14647388676480.0, "grad_norm": 2.9806431283259354, "language_loss": 0.65055734, "learning_rate": 3.695495115253795e-06, "loss": 0.67264521, "num_input_tokens_seen": 72816990, "step": 3370, "time_per_iteration": 2.7082977294921875 }, { "auxiliary_loss_clip": 0.0105828, "auxiliary_loss_mlp": 0.01019528, "balance_loss_clip": 1.03235602, "balance_loss_mlp": 1.01690567, "epoch": 0.20267548474372463, "flos": 66783649921920.0, "grad_norm": 0.678414814309544, "language_loss": 0.58126765, "learning_rate": 3.6952885126624834e-06, "loss": 0.60204571, "num_input_tokens_seen": 72879240, "step": 3371, "time_per_iteration": 4.805691242218018 }, { "auxiliary_loss_clip": 0.01117624, "auxiliary_loss_mlp": 0.01050757, "balance_loss_clip": 1.04833245, "balance_loss_mlp": 1.0329231, "epoch": 0.2027356079963926, "flos": 24680793674880.0, "grad_norm": 2.167047343870177, "language_loss": 0.91830015, "learning_rate": 3.6950818457856617e-06, "loss": 0.9399839, "num_input_tokens_seen": 72899030, "step": 3372, "time_per_iteration": 4.306687831878662 }, { "auxiliary_loss_clip": 0.01137734, "auxiliary_loss_mlp": 0.01057192, "balance_loss_clip": 1.05065978, "balance_loss_mlp": 1.03598428, "epoch": 0.20279573124906058, "flos": 26392762765440.0, "grad_norm": 2.1240220719821195, "language_loss": 0.78505349, "learning_rate": 3.694875114631167e-06, "loss": 0.80700278, "num_input_tokens_seen": 72919190, "step": 3373, "time_per_iteration": 4.223219394683838 }, { "auxiliary_loss_clip": 0.01091396, "auxiliary_loss_mlp": 0.01058555, "balance_loss_clip": 1.04464257, "balance_loss_mlp": 1.03719246, "epoch": 0.20285585450172855, "flos": 33799984692480.0, "grad_norm": 2.5403716567908745, "language_loss": 0.71275264, "learning_rate": 3.6946683192068377e-06, "loss": 0.7342521, "num_input_tokens_seen": 72939720, "step": 3374, "time_per_iteration": 2.853079319000244 }, { "auxiliary_loss_clip": 0.01042818, "auxiliary_loss_mlp": 0.01010518, "balance_loss_clip": 1.02580416, "balance_loss_mlp": 1.00797904, "epoch": 0.20291597775439651, "flos": 71164823598720.0, "grad_norm": 0.9711663240936556, "language_loss": 0.62466931, "learning_rate": 3.694461459520516e-06, "loss": 0.64520264, "num_input_tokens_seen": 73000015, "step": 3375, "time_per_iteration": 3.2016799449920654 }, { "auxiliary_loss_clip": 0.01153133, "auxiliary_loss_mlp": 0.01048539, "balance_loss_clip": 1.05278802, "balance_loss_mlp": 1.03021622, "epoch": 0.20297610100706448, "flos": 19494287118720.0, "grad_norm": 1.613636998778186, "language_loss": 0.82316196, "learning_rate": 3.6942545355800463e-06, "loss": 0.84517872, "num_input_tokens_seen": 73017675, "step": 3376, "time_per_iteration": 2.6073458194732666 }, { "auxiliary_loss_clip": 0.01142412, "auxiliary_loss_mlp": 0.01038523, "balance_loss_clip": 1.0506475, "balance_loss_mlp": 1.01912737, "epoch": 0.20303622425973245, "flos": 25044245441280.0, "grad_norm": 2.0454517065820026, "language_loss": 0.81243992, "learning_rate": 3.6940475473932743e-06, "loss": 0.83424926, "num_input_tokens_seen": 73036135, "step": 3377, "time_per_iteration": 2.6802914142608643 }, { "auxiliary_loss_clip": 0.01127133, "auxiliary_loss_mlp": 0.01049784, "balance_loss_clip": 1.05416846, "balance_loss_mlp": 1.03053212, "epoch": 0.2030963475124004, "flos": 21979988098560.0, "grad_norm": 1.9719049052811064, "language_loss": 0.76726258, "learning_rate": 3.69384049496805e-06, "loss": 0.78903174, "num_input_tokens_seen": 73054075, "step": 3378, "time_per_iteration": 2.7052531242370605 }, { "auxiliary_loss_clip": 0.01087342, "auxiliary_loss_mlp": 0.01049115, "balance_loss_clip": 1.04531622, "balance_loss_mlp": 1.02726364, "epoch": 0.2031564707650684, "flos": 19500392430720.0, "grad_norm": 2.0079998756584017, "language_loss": 0.7982831, "learning_rate": 3.6936333783122242e-06, "loss": 0.81964767, "num_input_tokens_seen": 73073530, "step": 3379, "time_per_iteration": 4.379331588745117 }, { "auxiliary_loss_clip": 0.01139431, "auxiliary_loss_mlp": 0.01039085, "balance_loss_clip": 1.05384874, "balance_loss_mlp": 1.02164412, "epoch": 0.20321659401773637, "flos": 22747075971840.0, "grad_norm": 1.5868581768713355, "language_loss": 0.86639273, "learning_rate": 3.6934261974336505e-06, "loss": 0.88817787, "num_input_tokens_seen": 73092820, "step": 3380, "time_per_iteration": 2.7405402660369873 }, { "auxiliary_loss_clip": 0.01156702, "auxiliary_loss_mlp": 0.01053775, "balance_loss_clip": 1.05730438, "balance_loss_mlp": 1.03507149, "epoch": 0.20327671727040433, "flos": 22455840499200.0, "grad_norm": 2.063467458189152, "language_loss": 0.74637043, "learning_rate": 3.693218952340186e-06, "loss": 0.76847517, "num_input_tokens_seen": 73113385, "step": 3381, "time_per_iteration": 2.6237549781799316 }, { "auxiliary_loss_clip": 0.01118794, "auxiliary_loss_mlp": 0.01042351, "balance_loss_clip": 1.04590273, "balance_loss_mlp": 1.02289653, "epoch": 0.2033368405230723, "flos": 19535010163200.0, "grad_norm": 1.6994666268173182, "language_loss": 0.79167414, "learning_rate": 3.6930116430396895e-06, "loss": 0.81328559, "num_input_tokens_seen": 73131195, "step": 3382, "time_per_iteration": 2.6707420349121094 }, { "auxiliary_loss_clip": 0.01113758, "auxiliary_loss_mlp": 0.00779415, "balance_loss_clip": 1.0459373, "balance_loss_mlp": 1.00091934, "epoch": 0.20339696377574026, "flos": 13809233744640.0, "grad_norm": 1.9483404178521286, "language_loss": 0.8042953, "learning_rate": 3.6928042695400214e-06, "loss": 0.82322699, "num_input_tokens_seen": 73148850, "step": 3383, "time_per_iteration": 2.7859487533569336 }, { "auxiliary_loss_clip": 0.01100731, "auxiliary_loss_mlp": 0.01046151, "balance_loss_clip": 1.04473877, "balance_loss_mlp": 1.02621913, "epoch": 0.20345708702840823, "flos": 20339409288960.0, "grad_norm": 3.0507793260875693, "language_loss": 0.74539214, "learning_rate": 3.6925968318490464e-06, "loss": 0.76686096, "num_input_tokens_seen": 73166775, "step": 3384, "time_per_iteration": 2.802645206451416 }, { "auxiliary_loss_clip": 0.0114772, "auxiliary_loss_mlp": 0.01042851, "balance_loss_clip": 1.05207324, "balance_loss_mlp": 1.02232289, "epoch": 0.2035172102810762, "flos": 20333950421760.0, "grad_norm": 7.661095363155204, "language_loss": 0.76801658, "learning_rate": 3.6923893299746293e-06, "loss": 0.7899223, "num_input_tokens_seen": 73183215, "step": 3385, "time_per_iteration": 2.823343515396118 }, { "auxiliary_loss_clip": 0.01107407, "auxiliary_loss_mlp": 0.01063941, "balance_loss_clip": 1.04730904, "balance_loss_mlp": 1.04331779, "epoch": 0.2035773335337442, "flos": 23330983461120.0, "grad_norm": 41.05937457193927, "language_loss": 0.68458641, "learning_rate": 3.692181763924639e-06, "loss": 0.70629984, "num_input_tokens_seen": 73203290, "step": 3386, "time_per_iteration": 2.830810546875 }, { "auxiliary_loss_clip": 0.01104248, "auxiliary_loss_mlp": 0.01064893, "balance_loss_clip": 1.04774165, "balance_loss_mlp": 1.04379284, "epoch": 0.20363745678641215, "flos": 28330287310080.0, "grad_norm": 3.4161658794101384, "language_loss": 0.80985248, "learning_rate": 3.691974133706947e-06, "loss": 0.83154386, "num_input_tokens_seen": 73226185, "step": 3387, "time_per_iteration": 2.8204662799835205 }, { "auxiliary_loss_clip": 0.0112504, "auxiliary_loss_mlp": 0.01049361, "balance_loss_clip": 1.05224109, "balance_loss_mlp": 1.03000104, "epoch": 0.20369758003908012, "flos": 18915658928640.0, "grad_norm": 2.703878094865874, "language_loss": 0.7988956, "learning_rate": 3.6917664393294262e-06, "loss": 0.82063961, "num_input_tokens_seen": 73243300, "step": 3388, "time_per_iteration": 2.687053918838501 }, { "auxiliary_loss_clip": 0.01157403, "auxiliary_loss_mlp": 0.01048089, "balance_loss_clip": 1.05471182, "balance_loss_mlp": 1.0281812, "epoch": 0.20375770329174808, "flos": 19206499351680.0, "grad_norm": 1.8133180655285324, "language_loss": 0.7184962, "learning_rate": 3.6915586807999527e-06, "loss": 0.74055111, "num_input_tokens_seen": 73261490, "step": 3389, "time_per_iteration": 2.614321708679199 }, { "auxiliary_loss_clip": 0.01141855, "auxiliary_loss_mlp": 0.01054311, "balance_loss_clip": 1.05387521, "balance_loss_mlp": 1.0351541, "epoch": 0.20381782654441605, "flos": 19391008538880.0, "grad_norm": 1.8982692343761227, "language_loss": 0.87280858, "learning_rate": 3.691350858126404e-06, "loss": 0.89477026, "num_input_tokens_seen": 73280180, "step": 3390, "time_per_iteration": 2.6770312786102295 }, { "auxiliary_loss_clip": 0.01125093, "auxiliary_loss_mlp": 0.01052498, "balance_loss_clip": 1.05142403, "balance_loss_mlp": 1.03129053, "epoch": 0.203877949797084, "flos": 24827704300800.0, "grad_norm": 2.3308941901233355, "language_loss": 0.71194077, "learning_rate": 3.691142971316662e-06, "loss": 0.73371667, "num_input_tokens_seen": 73300680, "step": 3391, "time_per_iteration": 2.7198221683502197 }, { "auxiliary_loss_clip": 0.01120121, "auxiliary_loss_mlp": 0.01051383, "balance_loss_clip": 1.05222178, "balance_loss_mlp": 1.0318923, "epoch": 0.20393807304975198, "flos": 18003707504640.0, "grad_norm": 2.4765720957839217, "language_loss": 0.86745828, "learning_rate": 3.6909350203786086e-06, "loss": 0.88917333, "num_input_tokens_seen": 73316760, "step": 3392, "time_per_iteration": 2.6961052417755127 }, { "auxiliary_loss_clip": 0.01145712, "auxiliary_loss_mlp": 0.01051212, "balance_loss_clip": 1.05204964, "balance_loss_mlp": 1.03236461, "epoch": 0.20399819630241997, "flos": 24206988349440.0, "grad_norm": 1.665333238668028, "language_loss": 0.80659354, "learning_rate": 3.69072700532013e-06, "loss": 0.82856286, "num_input_tokens_seen": 73339385, "step": 3393, "time_per_iteration": 2.6883490085601807 }, { "auxiliary_loss_clip": 0.01123025, "auxiliary_loss_mlp": 0.010424, "balance_loss_clip": 1.04751348, "balance_loss_mlp": 1.02385163, "epoch": 0.20405831955508794, "flos": 20777124424320.0, "grad_norm": 1.8745864895680615, "language_loss": 0.86126244, "learning_rate": 3.6905189261491137e-06, "loss": 0.88291663, "num_input_tokens_seen": 73357235, "step": 3394, "time_per_iteration": 2.758887767791748 }, { "auxiliary_loss_clip": 0.0114219, "auxiliary_loss_mlp": 0.01049288, "balance_loss_clip": 1.05699492, "balance_loss_mlp": 1.03088212, "epoch": 0.2041184428077559, "flos": 15486908325120.0, "grad_norm": 2.5133342949273416, "language_loss": 0.83761692, "learning_rate": 3.69031078287345e-06, "loss": 0.85953164, "num_input_tokens_seen": 73374435, "step": 3395, "time_per_iteration": 2.6468729972839355 }, { "auxiliary_loss_clip": 0.01145796, "auxiliary_loss_mlp": 0.01039804, "balance_loss_clip": 1.05311751, "balance_loss_mlp": 1.0200156, "epoch": 0.20417856606042387, "flos": 15588463052160.0, "grad_norm": 2.8477422591662376, "language_loss": 0.83736277, "learning_rate": 3.690102575501033e-06, "loss": 0.85921878, "num_input_tokens_seen": 73391025, "step": 3396, "time_per_iteration": 2.6296958923339844 }, { "auxiliary_loss_clip": 0.01112843, "auxiliary_loss_mlp": 0.01045334, "balance_loss_clip": 1.04787922, "balance_loss_mlp": 1.02616525, "epoch": 0.20423868931309183, "flos": 24279348297600.0, "grad_norm": 2.1192113228666303, "language_loss": 0.77199841, "learning_rate": 3.6898943040397556e-06, "loss": 0.79358017, "num_input_tokens_seen": 73409270, "step": 3397, "time_per_iteration": 2.776784896850586 }, { "auxiliary_loss_clip": 0.01128614, "auxiliary_loss_mlp": 0.01050131, "balance_loss_clip": 1.05143905, "balance_loss_mlp": 1.03264332, "epoch": 0.2042988125657598, "flos": 18614870438400.0, "grad_norm": 3.16091809956727, "language_loss": 0.8791461, "learning_rate": 3.689685968497518e-06, "loss": 0.9009335, "num_input_tokens_seen": 73425225, "step": 3398, "time_per_iteration": 2.6866374015808105 }, { "auxiliary_loss_clip": 0.01126796, "auxiliary_loss_mlp": 0.01052169, "balance_loss_clip": 1.05476117, "balance_loss_mlp": 1.03316689, "epoch": 0.2043589358184278, "flos": 17851230270720.0, "grad_norm": 2.139785862197821, "language_loss": 0.78045064, "learning_rate": 3.6894775688822186e-06, "loss": 0.80224031, "num_input_tokens_seen": 73440940, "step": 3399, "time_per_iteration": 2.6545825004577637 }, { "auxiliary_loss_clip": 0.01144155, "auxiliary_loss_mlp": 0.01042424, "balance_loss_clip": 1.05252838, "balance_loss_mlp": 1.02299261, "epoch": 0.20441905907109575, "flos": 21435223455360.0, "grad_norm": 3.6374157446104802, "language_loss": 0.76563728, "learning_rate": 3.6892691052017603e-06, "loss": 0.787503, "num_input_tokens_seen": 73458805, "step": 3400, "time_per_iteration": 2.7279481887817383 }, { "auxiliary_loss_clip": 0.01121071, "auxiliary_loss_mlp": 0.00776799, "balance_loss_clip": 1.05304742, "balance_loss_mlp": 1.00072634, "epoch": 0.20447918232376372, "flos": 27707703851520.0, "grad_norm": 1.8758513970592474, "language_loss": 0.79382575, "learning_rate": 3.6890605774640487e-06, "loss": 0.81280446, "num_input_tokens_seen": 73479380, "step": 3401, "time_per_iteration": 2.7918031215667725 }, { "auxiliary_loss_clip": 0.01131319, "auxiliary_loss_mlp": 0.01044892, "balance_loss_clip": 1.0484674, "balance_loss_mlp": 1.02540183, "epoch": 0.20453930557643168, "flos": 30524214113280.0, "grad_norm": 2.2159471948141034, "language_loss": 0.69798994, "learning_rate": 3.688851985676991e-06, "loss": 0.71975207, "num_input_tokens_seen": 73505105, "step": 3402, "time_per_iteration": 2.79670786857605 }, { "auxiliary_loss_clip": 0.01120554, "auxiliary_loss_mlp": 0.01043946, "balance_loss_clip": 1.05060196, "balance_loss_mlp": 1.02439535, "epoch": 0.20459942882909965, "flos": 18987767481600.0, "grad_norm": 1.7908768446457861, "language_loss": 0.81114817, "learning_rate": 3.688643329848496e-06, "loss": 0.83279312, "num_input_tokens_seen": 73523700, "step": 3403, "time_per_iteration": 2.70182728767395 }, { "auxiliary_loss_clip": 0.01144248, "auxiliary_loss_mlp": 0.01041199, "balance_loss_clip": 1.05348516, "balance_loss_mlp": 1.02295971, "epoch": 0.20465955208176762, "flos": 20339050152960.0, "grad_norm": 2.511955552730785, "language_loss": 0.83403814, "learning_rate": 3.6884346099864772e-06, "loss": 0.8558926, "num_input_tokens_seen": 73542625, "step": 3404, "time_per_iteration": 2.630807399749756 }, { "auxiliary_loss_clip": 0.01138937, "auxiliary_loss_mlp": 0.01048101, "balance_loss_clip": 1.04838705, "balance_loss_mlp": 1.0292058, "epoch": 0.20471967533443558, "flos": 21251288885760.0, "grad_norm": 1.7149716538767368, "language_loss": 0.86209136, "learning_rate": 3.6882258260988487e-06, "loss": 0.88396174, "num_input_tokens_seen": 73561450, "step": 3405, "time_per_iteration": 2.6076929569244385 }, { "auxiliary_loss_clip": 0.01116224, "auxiliary_loss_mlp": 0.0104429, "balance_loss_clip": 1.05039132, "balance_loss_mlp": 1.02621806, "epoch": 0.20477979858710357, "flos": 14501555458560.0, "grad_norm": 2.1633598971137435, "language_loss": 0.84356105, "learning_rate": 3.6880169781935276e-06, "loss": 0.86516619, "num_input_tokens_seen": 73577155, "step": 3406, "time_per_iteration": 2.768890142440796 }, { "auxiliary_loss_clip": 0.01152751, "auxiliary_loss_mlp": 0.01039548, "balance_loss_clip": 1.0542599, "balance_loss_mlp": 1.02191663, "epoch": 0.20483992183977154, "flos": 11400310085760.0, "grad_norm": 2.4892039461455675, "language_loss": 0.67453218, "learning_rate": 3.6878080662784336e-06, "loss": 0.69645512, "num_input_tokens_seen": 73594900, "step": 3407, "time_per_iteration": 2.5661377906799316 }, { "auxiliary_loss_clip": 0.0115175, "auxiliary_loss_mlp": 0.01050505, "balance_loss_clip": 1.05328465, "balance_loss_mlp": 1.03294516, "epoch": 0.2049000450924395, "flos": 19060271084160.0, "grad_norm": 2.4363182538361285, "language_loss": 0.84214294, "learning_rate": 3.6875990903614886e-06, "loss": 0.86416554, "num_input_tokens_seen": 73613810, "step": 3408, "time_per_iteration": 2.585186004638672 }, { "auxiliary_loss_clip": 0.01154901, "auxiliary_loss_mlp": 0.01042295, "balance_loss_clip": 1.0536257, "balance_loss_mlp": 1.02471161, "epoch": 0.20496016834510747, "flos": 14574561851520.0, "grad_norm": 2.317815935455145, "language_loss": 0.63898516, "learning_rate": 3.6873900504506166e-06, "loss": 0.6609571, "num_input_tokens_seen": 73631495, "step": 3409, "time_per_iteration": 2.5877959728240967 }, { "auxiliary_loss_clip": 0.0113795, "auxiliary_loss_mlp": 0.01042481, "balance_loss_clip": 1.04903567, "balance_loss_mlp": 1.02409852, "epoch": 0.20502029159777543, "flos": 22126647329280.0, "grad_norm": 1.3925959707869588, "language_loss": 0.80547982, "learning_rate": 3.687180946553745e-06, "loss": 0.8272841, "num_input_tokens_seen": 73652840, "step": 3410, "time_per_iteration": 4.1697752475738525 }, { "auxiliary_loss_clip": 0.01099823, "auxiliary_loss_mlp": 0.01046015, "balance_loss_clip": 1.05186486, "balance_loss_mlp": 1.02820492, "epoch": 0.2050804148504434, "flos": 25367907916800.0, "grad_norm": 2.407452066099965, "language_loss": 0.75804615, "learning_rate": 3.686971778678803e-06, "loss": 0.77950454, "num_input_tokens_seen": 73672150, "step": 3411, "time_per_iteration": 2.8072102069854736 }, { "auxiliary_loss_clip": 0.0113879, "auxiliary_loss_mlp": 0.01046868, "balance_loss_clip": 1.05501246, "balance_loss_mlp": 1.02887905, "epoch": 0.2051405381031114, "flos": 23620171858560.0, "grad_norm": 2.4936494073109445, "language_loss": 0.73356283, "learning_rate": 3.686762546833722e-06, "loss": 0.75541937, "num_input_tokens_seen": 73691940, "step": 3412, "time_per_iteration": 5.778446912765503 }, { "auxiliary_loss_clip": 0.01127692, "auxiliary_loss_mlp": 0.01057937, "balance_loss_clip": 1.04926813, "balance_loss_mlp": 1.03748107, "epoch": 0.20520066135577936, "flos": 19565533745280.0, "grad_norm": 2.3541654180764353, "language_loss": 0.77958596, "learning_rate": 3.6865532510264362e-06, "loss": 0.80144227, "num_input_tokens_seen": 73709080, "step": 3413, "time_per_iteration": 2.6457245349884033 }, { "auxiliary_loss_clip": 0.0110869, "auxiliary_loss_mlp": 0.01047866, "balance_loss_clip": 1.04991519, "balance_loss_mlp": 1.02862608, "epoch": 0.20526078460844732, "flos": 17676345928320.0, "grad_norm": 2.4834314093653673, "language_loss": 0.85112405, "learning_rate": 3.6863438912648823e-06, "loss": 0.8726896, "num_input_tokens_seen": 73727670, "step": 3414, "time_per_iteration": 2.7343668937683105 }, { "auxiliary_loss_clip": 0.01140219, "auxiliary_loss_mlp": 0.01039448, "balance_loss_clip": 1.05012155, "balance_loss_mlp": 1.02118468, "epoch": 0.2053209078611153, "flos": 21500328856320.0, "grad_norm": 2.0410772094937433, "language_loss": 0.80372798, "learning_rate": 3.6861344675569986e-06, "loss": 0.82552463, "num_input_tokens_seen": 73747170, "step": 3415, "time_per_iteration": 2.6669082641601562 }, { "auxiliary_loss_clip": 0.01087022, "auxiliary_loss_mlp": 0.01042771, "balance_loss_clip": 1.04786301, "balance_loss_mlp": 1.02643943, "epoch": 0.20538103111378325, "flos": 25663524848640.0, "grad_norm": 1.941742032659622, "language_loss": 0.72958827, "learning_rate": 3.6859249799107275e-06, "loss": 0.75088626, "num_input_tokens_seen": 73767690, "step": 3416, "time_per_iteration": 2.892782211303711 }, { "auxiliary_loss_clip": 0.01145149, "auxiliary_loss_mlp": 0.01044328, "balance_loss_clip": 1.05453372, "balance_loss_mlp": 1.02577877, "epoch": 0.20544115436645122, "flos": 23148952312320.0, "grad_norm": 2.508583707985938, "language_loss": 0.78741407, "learning_rate": 3.6857154283340115e-06, "loss": 0.80930889, "num_input_tokens_seen": 73786900, "step": 3417, "time_per_iteration": 2.7298929691314697 }, { "auxiliary_loss_clip": 0.01145459, "auxiliary_loss_mlp": 0.0104683, "balance_loss_clip": 1.0536468, "balance_loss_mlp": 1.02819777, "epoch": 0.20550127761911918, "flos": 19390433921280.0, "grad_norm": 2.4305498920504043, "language_loss": 0.8729043, "learning_rate": 3.685505812834798e-06, "loss": 0.89482725, "num_input_tokens_seen": 73804515, "step": 3418, "time_per_iteration": 4.382033109664917 }, { "auxiliary_loss_clip": 0.01140182, "auxiliary_loss_mlp": 0.01046543, "balance_loss_clip": 1.05682349, "balance_loss_mlp": 1.02776778, "epoch": 0.20556140087178718, "flos": 22893124671360.0, "grad_norm": 14.690715253896212, "language_loss": 0.62538671, "learning_rate": 3.685296133421035e-06, "loss": 0.64725399, "num_input_tokens_seen": 73822910, "step": 3419, "time_per_iteration": 2.7318668365478516 }, { "auxiliary_loss_clip": 0.01139691, "auxiliary_loss_mlp": 0.01046928, "balance_loss_clip": 1.05550981, "balance_loss_mlp": 1.02651954, "epoch": 0.20562152412445514, "flos": 19789652655360.0, "grad_norm": 1.8153871521224594, "language_loss": 0.86339438, "learning_rate": 3.685086390100674e-06, "loss": 0.88526058, "num_input_tokens_seen": 73841160, "step": 3420, "time_per_iteration": 2.723606824874878 }, { "auxiliary_loss_clip": 0.01104401, "auxiliary_loss_mlp": 0.00780617, "balance_loss_clip": 1.04621911, "balance_loss_mlp": 1.00071514, "epoch": 0.2056816473771231, "flos": 31501989210240.0, "grad_norm": 2.3982854973621954, "language_loss": 0.7127136, "learning_rate": 3.684876582881668e-06, "loss": 0.73156381, "num_input_tokens_seen": 73862795, "step": 3421, "time_per_iteration": 2.8138315677642822 }, { "auxiliary_loss_clip": 0.01153254, "auxiliary_loss_mlp": 0.01039984, "balance_loss_clip": 1.05382609, "balance_loss_mlp": 1.02160168, "epoch": 0.20574177062979107, "flos": 23258372117760.0, "grad_norm": 6.231519820465981, "language_loss": 0.70559299, "learning_rate": 3.6846667117719732e-06, "loss": 0.72752541, "num_input_tokens_seen": 73881525, "step": 3422, "time_per_iteration": 2.6411848068237305 }, { "auxiliary_loss_clip": 0.01062123, "auxiliary_loss_mlp": 0.01005097, "balance_loss_clip": 1.03459418, "balance_loss_mlp": 1.00220013, "epoch": 0.20580189388245904, "flos": 70312518708480.0, "grad_norm": 0.740118932422812, "language_loss": 0.55461621, "learning_rate": 3.684456776779548e-06, "loss": 0.57528841, "num_input_tokens_seen": 73937775, "step": 3423, "time_per_iteration": 3.259685516357422 }, { "auxiliary_loss_clip": 0.01104389, "auxiliary_loss_mlp": 0.01039296, "balance_loss_clip": 1.04975653, "balance_loss_mlp": 1.02089024, "epoch": 0.205862017135127, "flos": 30737846252160.0, "grad_norm": 1.9242047681435088, "language_loss": 0.71910381, "learning_rate": 3.684246777912353e-06, "loss": 0.74054068, "num_input_tokens_seen": 73958250, "step": 3424, "time_per_iteration": 2.800283432006836 }, { "auxiliary_loss_clip": 0.01125916, "auxiliary_loss_mlp": 0.00777945, "balance_loss_clip": 1.05704927, "balance_loss_mlp": 1.00086677, "epoch": 0.20592214038779497, "flos": 21324546673920.0, "grad_norm": 1.6235965502825092, "language_loss": 0.74980927, "learning_rate": 3.684036715178351e-06, "loss": 0.76884782, "num_input_tokens_seen": 73977775, "step": 3425, "time_per_iteration": 2.751030206680298 }, { "auxiliary_loss_clip": 0.01104665, "auxiliary_loss_mlp": 0.01058685, "balance_loss_clip": 1.05047321, "balance_loss_mlp": 1.03983784, "epoch": 0.20598226364046296, "flos": 22891652213760.0, "grad_norm": 1.7765616723027935, "language_loss": 0.87936616, "learning_rate": 3.683826588585508e-06, "loss": 0.90099961, "num_input_tokens_seen": 73996590, "step": 3426, "time_per_iteration": 2.8539180755615234 }, { "auxiliary_loss_clip": 0.01144422, "auxiliary_loss_mlp": 0.01045493, "balance_loss_clip": 1.05773449, "balance_loss_mlp": 1.0281601, "epoch": 0.20604238689313092, "flos": 23878549365120.0, "grad_norm": 1.836530467647624, "language_loss": 0.76435733, "learning_rate": 3.6836163981417926e-06, "loss": 0.78625643, "num_input_tokens_seen": 74015935, "step": 3427, "time_per_iteration": 2.7024967670440674 }, { "auxiliary_loss_clip": 0.01159387, "auxiliary_loss_mlp": 0.01050023, "balance_loss_clip": 1.0577209, "balance_loss_mlp": 1.03185558, "epoch": 0.2061025101457989, "flos": 22491535639680.0, "grad_norm": 2.7350574840199964, "language_loss": 0.74176943, "learning_rate": 3.683406143855174e-06, "loss": 0.76386356, "num_input_tokens_seen": 74036575, "step": 3428, "time_per_iteration": 2.593151569366455 }, { "auxiliary_loss_clip": 0.01132797, "auxiliary_loss_mlp": 0.01046534, "balance_loss_clip": 1.05232322, "balance_loss_mlp": 1.0274843, "epoch": 0.20616263339846685, "flos": 22778928357120.0, "grad_norm": 3.829070534376961, "language_loss": 0.73316109, "learning_rate": 3.6831958257336256e-06, "loss": 0.75495446, "num_input_tokens_seen": 74055365, "step": 3429, "time_per_iteration": 2.7357261180877686 }, { "auxiliary_loss_clip": 0.01144108, "auxiliary_loss_mlp": 0.01049081, "balance_loss_clip": 1.05838966, "balance_loss_mlp": 1.03030515, "epoch": 0.20622275665113482, "flos": 20882198684160.0, "grad_norm": 2.201354934958512, "language_loss": 0.85586745, "learning_rate": 3.6829854437851237e-06, "loss": 0.87779927, "num_input_tokens_seen": 74074875, "step": 3430, "time_per_iteration": 2.658486843109131 }, { "auxiliary_loss_clip": 0.01088509, "auxiliary_loss_mlp": 0.01053254, "balance_loss_clip": 1.04814601, "balance_loss_mlp": 1.03387105, "epoch": 0.20628287990380278, "flos": 19354415558400.0, "grad_norm": 1.8292569880077065, "language_loss": 0.68859613, "learning_rate": 3.6827749980176444e-06, "loss": 0.71001375, "num_input_tokens_seen": 74094505, "step": 3431, "time_per_iteration": 2.811061143875122 }, { "auxiliary_loss_clip": 0.01027012, "auxiliary_loss_mlp": 0.01012446, "balance_loss_clip": 1.03099978, "balance_loss_mlp": 1.00976419, "epoch": 0.20634300315647078, "flos": 71517932248320.0, "grad_norm": 0.8066063325789609, "language_loss": 0.60172188, "learning_rate": 3.6825644884391693e-06, "loss": 0.62211645, "num_input_tokens_seen": 74158500, "step": 3432, "time_per_iteration": 3.415828227996826 }, { "auxiliary_loss_clip": 0.01146488, "auxiliary_loss_mlp": 0.01044703, "balance_loss_clip": 1.0583806, "balance_loss_mlp": 1.02669072, "epoch": 0.20640312640913874, "flos": 21723944976000.0, "grad_norm": 2.5535613418278116, "language_loss": 0.72622889, "learning_rate": 3.682353915057679e-06, "loss": 0.74814081, "num_input_tokens_seen": 74176685, "step": 3433, "time_per_iteration": 2.715195655822754 }, { "auxiliary_loss_clip": 0.0109694, "auxiliary_loss_mlp": 0.01050867, "balance_loss_clip": 1.04781306, "balance_loss_mlp": 1.03019655, "epoch": 0.2064632496618067, "flos": 20554621626240.0, "grad_norm": 2.096486283687917, "language_loss": 0.87233114, "learning_rate": 3.6821432778811604e-06, "loss": 0.8938092, "num_input_tokens_seen": 74194935, "step": 3434, "time_per_iteration": 2.7781460285186768 }, { "auxiliary_loss_clip": 0.01151381, "auxiliary_loss_mlp": 0.01045497, "balance_loss_clip": 1.05561388, "balance_loss_mlp": 1.02719867, "epoch": 0.20652337291447467, "flos": 29823273135360.0, "grad_norm": 1.7621185839090663, "language_loss": 0.69533503, "learning_rate": 3.6819325769176004e-06, "loss": 0.71730381, "num_input_tokens_seen": 74215400, "step": 3435, "time_per_iteration": 2.7425992488861084 }, { "auxiliary_loss_clip": 0.01127853, "auxiliary_loss_mlp": 0.01045604, "balance_loss_clip": 1.05583, "balance_loss_mlp": 1.02672172, "epoch": 0.20658349616714264, "flos": 26213640618240.0, "grad_norm": 30.077934868422773, "language_loss": 0.89116997, "learning_rate": 3.681721812174988e-06, "loss": 0.91290456, "num_input_tokens_seen": 74234090, "step": 3436, "time_per_iteration": 2.7460577487945557 }, { "auxiliary_loss_clip": 0.01118033, "auxiliary_loss_mlp": 0.01041557, "balance_loss_clip": 1.05178559, "balance_loss_mlp": 1.02168477, "epoch": 0.2066436194198106, "flos": 25994370044160.0, "grad_norm": 1.7370712778981523, "language_loss": 0.77330887, "learning_rate": 3.6815109836613163e-06, "loss": 0.79490477, "num_input_tokens_seen": 74253345, "step": 3437, "time_per_iteration": 2.7507588863372803 }, { "auxiliary_loss_clip": 0.01144607, "auxiliary_loss_mlp": 0.01040376, "balance_loss_clip": 1.05298507, "balance_loss_mlp": 1.02323389, "epoch": 0.20670374267247857, "flos": 21361067827200.0, "grad_norm": 1.8326742989814773, "language_loss": 0.77813125, "learning_rate": 3.6813000913845795e-06, "loss": 0.799981, "num_input_tokens_seen": 74271615, "step": 3438, "time_per_iteration": 2.7624385356903076 }, { "auxiliary_loss_clip": 0.01063811, "auxiliary_loss_mlp": 0.01002308, "balance_loss_clip": 1.03603387, "balance_loss_mlp": 0.9995541, "epoch": 0.20676386592514656, "flos": 66383281952640.0, "grad_norm": 0.8298524953876073, "language_loss": 0.67093015, "learning_rate": 3.6810891353527747e-06, "loss": 0.69159138, "num_input_tokens_seen": 74331390, "step": 3439, "time_per_iteration": 3.2026216983795166 }, { "auxiliary_loss_clip": 0.01148913, "auxiliary_loss_mlp": 0.01041213, "balance_loss_clip": 1.05590546, "balance_loss_mlp": 1.02299786, "epoch": 0.20682398917781453, "flos": 17274577328640.0, "grad_norm": 1.9537104709510729, "language_loss": 0.83907467, "learning_rate": 3.6808781155739014e-06, "loss": 0.86097592, "num_input_tokens_seen": 74347335, "step": 3440, "time_per_iteration": 2.6949758529663086 }, { "auxiliary_loss_clip": 0.01147739, "auxiliary_loss_mlp": 0.01041939, "balance_loss_clip": 1.05509627, "balance_loss_mlp": 1.02458239, "epoch": 0.2068841124304825, "flos": 18077288515200.0, "grad_norm": 1.8008884636634683, "language_loss": 0.84828413, "learning_rate": 3.6806670320559614e-06, "loss": 0.8701809, "num_input_tokens_seen": 74366310, "step": 3441, "time_per_iteration": 2.6440463066101074 }, { "auxiliary_loss_clip": 0.01110175, "auxiliary_loss_mlp": 0.01048552, "balance_loss_clip": 1.05599904, "balance_loss_mlp": 1.03050399, "epoch": 0.20694423568315046, "flos": 27347017432320.0, "grad_norm": 1.7415147413468661, "language_loss": 0.85854685, "learning_rate": 3.680455884806959e-06, "loss": 0.88013411, "num_input_tokens_seen": 74387100, "step": 3442, "time_per_iteration": 2.8222689628601074 }, { "auxiliary_loss_clip": 0.01078025, "auxiliary_loss_mlp": 0.01050799, "balance_loss_clip": 1.05186844, "balance_loss_mlp": 1.03095019, "epoch": 0.20700435893581842, "flos": 20229845829120.0, "grad_norm": 1.9775081815037283, "language_loss": 0.73038852, "learning_rate": 3.6802446738349014e-06, "loss": 0.75167674, "num_input_tokens_seen": 74404460, "step": 3443, "time_per_iteration": 2.8044140338897705 }, { "auxiliary_loss_clip": 0.01127625, "auxiliary_loss_mlp": 0.00776303, "balance_loss_clip": 1.05408895, "balance_loss_mlp": 1.00079513, "epoch": 0.2070644821884864, "flos": 20631111638400.0, "grad_norm": 1.84636320729986, "language_loss": 0.85586846, "learning_rate": 3.680033399147797e-06, "loss": 0.87490773, "num_input_tokens_seen": 74423790, "step": 3444, "time_per_iteration": 2.7582647800445557 }, { "auxiliary_loss_clip": 0.01036759, "auxiliary_loss_mlp": 0.01007145, "balance_loss_clip": 1.03905272, "balance_loss_mlp": 1.0042963, "epoch": 0.20712460544115438, "flos": 65941077617280.0, "grad_norm": 0.6999396122177431, "language_loss": 0.57092249, "learning_rate": 3.6798220607536585e-06, "loss": 0.59136152, "num_input_tokens_seen": 74488130, "step": 3445, "time_per_iteration": 3.249602794647217 }, { "auxiliary_loss_clip": 0.01152738, "auxiliary_loss_mlp": 0.00776634, "balance_loss_clip": 1.0538106, "balance_loss_mlp": 1.00088191, "epoch": 0.20718472869382235, "flos": 19425734012160.0, "grad_norm": 1.6453630130444594, "language_loss": 0.78469276, "learning_rate": 3.6796106586604987e-06, "loss": 0.80398649, "num_input_tokens_seen": 74506720, "step": 3446, "time_per_iteration": 2.6341898441314697 }, { "auxiliary_loss_clip": 0.01151445, "auxiliary_loss_mlp": 0.01043774, "balance_loss_clip": 1.05439711, "balance_loss_mlp": 1.02297151, "epoch": 0.2072448519464903, "flos": 24499049834880.0, "grad_norm": 2.013256457797304, "language_loss": 0.63031304, "learning_rate": 3.679399192876334e-06, "loss": 0.65226525, "num_input_tokens_seen": 74525330, "step": 3447, "time_per_iteration": 2.6912922859191895 }, { "auxiliary_loss_clip": 0.01103828, "auxiliary_loss_mlp": 0.01058453, "balance_loss_clip": 1.04668319, "balance_loss_mlp": 1.03828287, "epoch": 0.20730497519915828, "flos": 23075694524160.0, "grad_norm": 1.7423220349735584, "language_loss": 0.86291325, "learning_rate": 3.679187663409184e-06, "loss": 0.88453603, "num_input_tokens_seen": 74544535, "step": 3448, "time_per_iteration": 2.787576675415039 }, { "auxiliary_loss_clip": 0.01128629, "auxiliary_loss_mlp": 0.01045151, "balance_loss_clip": 1.049932, "balance_loss_mlp": 1.02556467, "epoch": 0.20736509845182624, "flos": 21069042255360.0, "grad_norm": 3.8253504349982044, "language_loss": 0.75264204, "learning_rate": 3.6789760702670696e-06, "loss": 0.77437979, "num_input_tokens_seen": 74562300, "step": 3449, "time_per_iteration": 4.354467391967773 }, { "auxiliary_loss_clip": 0.01141162, "auxiliary_loss_mlp": 0.01050212, "balance_loss_clip": 1.0534308, "balance_loss_mlp": 1.03073323, "epoch": 0.2074252217044942, "flos": 17633288499840.0, "grad_norm": 2.156163289660715, "language_loss": 0.76558924, "learning_rate": 3.6787644134580134e-06, "loss": 0.787503, "num_input_tokens_seen": 74580080, "step": 3450, "time_per_iteration": 2.7020533084869385 }, { "auxiliary_loss_clip": 0.01128554, "auxiliary_loss_mlp": 0.01044182, "balance_loss_clip": 1.05234683, "balance_loss_mlp": 1.02522802, "epoch": 0.20748534495716217, "flos": 23546985897600.0, "grad_norm": 1.6446708221415856, "language_loss": 0.82074821, "learning_rate": 3.6785526929900436e-06, "loss": 0.84247565, "num_input_tokens_seen": 74598980, "step": 3451, "time_per_iteration": 2.7753186225891113 }, { "auxiliary_loss_clip": 0.01064426, "auxiliary_loss_mlp": 0.01003577, "balance_loss_clip": 1.02722275, "balance_loss_mlp": 1.00099015, "epoch": 0.20754546820983016, "flos": 52252935598080.0, "grad_norm": 0.793594031040259, "language_loss": 0.56562752, "learning_rate": 3.6783409088711875e-06, "loss": 0.58630753, "num_input_tokens_seen": 74655275, "step": 3452, "time_per_iteration": 6.257205963134766 }, { "auxiliary_loss_clip": 0.01124123, "auxiliary_loss_mlp": 0.00776806, "balance_loss_clip": 1.05206704, "balance_loss_mlp": 1.0008918, "epoch": 0.20760559146249813, "flos": 20412379768320.0, "grad_norm": 2.245823129763223, "language_loss": 0.88341558, "learning_rate": 3.6781290611094755e-06, "loss": 0.90242493, "num_input_tokens_seen": 74674560, "step": 3453, "time_per_iteration": 2.7009050846099854 }, { "auxiliary_loss_clip": 0.01146287, "auxiliary_loss_mlp": 0.01044217, "balance_loss_clip": 1.05471313, "balance_loss_mlp": 1.02521539, "epoch": 0.2076657147151661, "flos": 23186012169600.0, "grad_norm": 2.2325669459725574, "language_loss": 0.79920429, "learning_rate": 3.6779171497129407e-06, "loss": 0.82110935, "num_input_tokens_seen": 74694500, "step": 3454, "time_per_iteration": 2.7080893516540527 }, { "auxiliary_loss_clip": 0.01104984, "auxiliary_loss_mlp": 0.00777717, "balance_loss_clip": 1.04356718, "balance_loss_mlp": 1.0007751, "epoch": 0.20772583796783406, "flos": 18293219124480.0, "grad_norm": 3.601668384502942, "language_loss": 0.76601356, "learning_rate": 3.6777051746896202e-06, "loss": 0.78484058, "num_input_tokens_seen": 74710485, "step": 3455, "time_per_iteration": 2.6733248233795166 }, { "auxiliary_loss_clip": 0.01115407, "auxiliary_loss_mlp": 0.01050321, "balance_loss_clip": 1.04759336, "balance_loss_mlp": 1.0326066, "epoch": 0.20778596122050202, "flos": 17602800831360.0, "grad_norm": 1.908671081537558, "language_loss": 0.80200219, "learning_rate": 3.6774931360475516e-06, "loss": 0.82365942, "num_input_tokens_seen": 74727450, "step": 3456, "time_per_iteration": 2.6950278282165527 }, { "auxiliary_loss_clip": 0.01112832, "auxiliary_loss_mlp": 0.00777675, "balance_loss_clip": 1.05166578, "balance_loss_mlp": 1.00099969, "epoch": 0.20784608447317, "flos": 23805578885760.0, "grad_norm": 2.135320694722552, "language_loss": 0.78070557, "learning_rate": 3.6772810337947745e-06, "loss": 0.79961067, "num_input_tokens_seen": 74746725, "step": 3457, "time_per_iteration": 4.381137132644653 }, { "auxiliary_loss_clip": 0.01082177, "auxiliary_loss_mlp": 0.01058291, "balance_loss_clip": 1.04310393, "balance_loss_mlp": 1.03651094, "epoch": 0.20790620772583795, "flos": 17639286071040.0, "grad_norm": 1.7652855773158553, "language_loss": 0.8360287, "learning_rate": 3.677068867939333e-06, "loss": 0.85743344, "num_input_tokens_seen": 74765255, "step": 3458, "time_per_iteration": 2.7332653999328613 }, { "auxiliary_loss_clip": 0.01140275, "auxiliary_loss_mlp": 0.0077698, "balance_loss_clip": 1.05156302, "balance_loss_mlp": 1.00095606, "epoch": 0.20796633097850595, "flos": 27673481168640.0, "grad_norm": 11.883071119862361, "language_loss": 0.75769317, "learning_rate": 3.676856638489272e-06, "loss": 0.77686572, "num_input_tokens_seen": 74785710, "step": 3459, "time_per_iteration": 2.705026626586914 }, { "auxiliary_loss_clip": 0.01089168, "auxiliary_loss_mlp": 0.01038825, "balance_loss_clip": 1.04769015, "balance_loss_mlp": 1.02081251, "epoch": 0.2080264542311739, "flos": 19245606284160.0, "grad_norm": 2.1071303009051428, "language_loss": 0.77105331, "learning_rate": 3.6766443454526382e-06, "loss": 0.79233319, "num_input_tokens_seen": 74804490, "step": 3460, "time_per_iteration": 2.749965190887451 }, { "auxiliary_loss_clip": 0.0109477, "auxiliary_loss_mlp": 0.01047592, "balance_loss_clip": 1.04938984, "balance_loss_mlp": 1.02838707, "epoch": 0.20808657748384188, "flos": 27525924097920.0, "grad_norm": 9.5480036120023, "language_loss": 0.75802225, "learning_rate": 3.6764319888374836e-06, "loss": 0.77944589, "num_input_tokens_seen": 74826340, "step": 3461, "time_per_iteration": 2.7929086685180664 }, { "auxiliary_loss_clip": 0.01124748, "auxiliary_loss_mlp": 0.01041543, "balance_loss_clip": 1.04610133, "balance_loss_mlp": 1.02203989, "epoch": 0.20814670073650984, "flos": 26906931999360.0, "grad_norm": 2.001927586001653, "language_loss": 0.8848443, "learning_rate": 3.6762195686518604e-06, "loss": 0.90650725, "num_input_tokens_seen": 74844960, "step": 3462, "time_per_iteration": 2.7031619548797607 }, { "auxiliary_loss_clip": 0.01023861, "auxiliary_loss_mlp": 0.00757905, "balance_loss_clip": 1.02540636, "balance_loss_mlp": 1.00168896, "epoch": 0.2082068239891778, "flos": 70175735717760.0, "grad_norm": 0.7622558664505636, "language_loss": 0.59010452, "learning_rate": 3.6760070849038226e-06, "loss": 0.6079222, "num_input_tokens_seen": 74909075, "step": 3463, "time_per_iteration": 3.4111485481262207 }, { "auxiliary_loss_clip": 0.01132553, "auxiliary_loss_mlp": 0.01047591, "balance_loss_clip": 1.04893148, "balance_loss_mlp": 1.02866018, "epoch": 0.20826694724184577, "flos": 24608074590720.0, "grad_norm": 2.6002828602708283, "language_loss": 0.66744608, "learning_rate": 3.675794537601429e-06, "loss": 0.68924749, "num_input_tokens_seen": 74928125, "step": 3464, "time_per_iteration": 2.718229293823242 }, { "auxiliary_loss_clip": 0.0112374, "auxiliary_loss_mlp": 0.0104712, "balance_loss_clip": 1.05101657, "balance_loss_mlp": 1.02755797, "epoch": 0.20832707049451377, "flos": 12892829034240.0, "grad_norm": 2.9384916482598205, "language_loss": 0.84044278, "learning_rate": 3.6755819267527373e-06, "loss": 0.86215138, "num_input_tokens_seen": 74945090, "step": 3465, "time_per_iteration": 2.732109546661377 }, { "auxiliary_loss_clip": 0.01096712, "auxiliary_loss_mlp": 0.01040605, "balance_loss_clip": 1.04373813, "balance_loss_mlp": 1.02221096, "epoch": 0.20838719374718173, "flos": 22198827709440.0, "grad_norm": 2.576139197384499, "language_loss": 0.81923312, "learning_rate": 3.6753692523658113e-06, "loss": 0.84060633, "num_input_tokens_seen": 74963630, "step": 3466, "time_per_iteration": 2.7758567333221436 }, { "auxiliary_loss_clip": 0.01140158, "auxiliary_loss_mlp": 0.01044188, "balance_loss_clip": 1.05322194, "balance_loss_mlp": 1.02787983, "epoch": 0.2084473169998497, "flos": 15158648908800.0, "grad_norm": 4.780862188541671, "language_loss": 0.82008922, "learning_rate": 3.675156514448716e-06, "loss": 0.84193271, "num_input_tokens_seen": 74981875, "step": 3467, "time_per_iteration": 2.5788159370422363 }, { "auxiliary_loss_clip": 0.01149826, "auxiliary_loss_mlp": 0.01040027, "balance_loss_clip": 1.05362797, "balance_loss_mlp": 1.02265835, "epoch": 0.20850744025251766, "flos": 17456788045440.0, "grad_norm": 2.009157691583003, "language_loss": 0.82178962, "learning_rate": 3.674943713009518e-06, "loss": 0.84368813, "num_input_tokens_seen": 74999155, "step": 3468, "time_per_iteration": 2.5874218940734863 }, { "auxiliary_loss_clip": 0.01143942, "auxiliary_loss_mlp": 0.01048537, "balance_loss_clip": 1.05300629, "balance_loss_mlp": 1.02774715, "epoch": 0.20856756350518563, "flos": 25698968593920.0, "grad_norm": 2.0793964386868584, "language_loss": 0.90328556, "learning_rate": 3.6747308480562856e-06, "loss": 0.92521036, "num_input_tokens_seen": 75017850, "step": 3469, "time_per_iteration": 2.6595447063446045 }, { "auxiliary_loss_clip": 0.01125181, "auxiliary_loss_mlp": 0.0104984, "balance_loss_clip": 1.05548537, "balance_loss_mlp": 1.03175592, "epoch": 0.2086276867578536, "flos": 37889060970240.0, "grad_norm": 1.9058635967771913, "language_loss": 0.76809812, "learning_rate": 3.674517919597092e-06, "loss": 0.78984833, "num_input_tokens_seen": 75039270, "step": 3470, "time_per_iteration": 2.908046245574951 }, { "auxiliary_loss_clip": 0.01133446, "auxiliary_loss_mlp": 0.01047618, "balance_loss_clip": 1.0551517, "balance_loss_mlp": 1.02942634, "epoch": 0.20868781001052156, "flos": 25557049958400.0, "grad_norm": 2.301093296435647, "language_loss": 0.75801277, "learning_rate": 3.674304927640011e-06, "loss": 0.77982342, "num_input_tokens_seen": 75059350, "step": 3471, "time_per_iteration": 2.713533401489258 }, { "auxiliary_loss_clip": 0.01123818, "auxiliary_loss_mlp": 0.01053513, "balance_loss_clip": 1.04961812, "balance_loss_mlp": 1.03384328, "epoch": 0.20874793326318955, "flos": 27529192235520.0, "grad_norm": 2.366290140730035, "language_loss": 0.75703716, "learning_rate": 3.67409187219312e-06, "loss": 0.77881044, "num_input_tokens_seen": 75080150, "step": 3472, "time_per_iteration": 2.785034656524658 }, { "auxiliary_loss_clip": 0.01140589, "auxiliary_loss_mlp": 0.01046494, "balance_loss_clip": 1.05084538, "balance_loss_mlp": 1.02854145, "epoch": 0.20880805651585752, "flos": 18548795370240.0, "grad_norm": 7.277377921302429, "language_loss": 0.84276807, "learning_rate": 3.6738787532644966e-06, "loss": 0.86463886, "num_input_tokens_seen": 75097920, "step": 3473, "time_per_iteration": 2.6236281394958496 }, { "auxiliary_loss_clip": 0.01057043, "auxiliary_loss_mlp": 0.01037704, "balance_loss_clip": 1.05363917, "balance_loss_mlp": 1.03434241, "epoch": 0.20886817976852548, "flos": 65946644225280.0, "grad_norm": 0.9045809123115837, "language_loss": 0.63652557, "learning_rate": 3.6736655708622235e-06, "loss": 0.65747303, "num_input_tokens_seen": 75152410, "step": 3474, "time_per_iteration": 3.1946537494659424 }, { "auxiliary_loss_clip": 0.0113535, "auxiliary_loss_mlp": 0.01045984, "balance_loss_clip": 1.05276895, "balance_loss_mlp": 1.02782845, "epoch": 0.20892830302119345, "flos": 36539178929280.0, "grad_norm": 3.2311626254468795, "language_loss": 0.69970965, "learning_rate": 3.6734523249943844e-06, "loss": 0.72152305, "num_input_tokens_seen": 75173265, "step": 3475, "time_per_iteration": 2.7967529296875 }, { "auxiliary_loss_clip": 0.01158022, "auxiliary_loss_mlp": 0.01046944, "balance_loss_clip": 1.05606794, "balance_loss_mlp": 1.02862167, "epoch": 0.2089884262738614, "flos": 20956749361920.0, "grad_norm": 1.9789108228051473, "language_loss": 0.70372891, "learning_rate": 3.673239015669065e-06, "loss": 0.72577858, "num_input_tokens_seen": 75193640, "step": 3476, "time_per_iteration": 2.629687786102295 }, { "auxiliary_loss_clip": 0.01131765, "auxiliary_loss_mlp": 0.01045236, "balance_loss_clip": 1.05439556, "balance_loss_mlp": 1.02722347, "epoch": 0.20904854952652938, "flos": 22784028088320.0, "grad_norm": 2.3868812434184603, "language_loss": 0.89227062, "learning_rate": 3.6730256428943544e-06, "loss": 0.91404068, "num_input_tokens_seen": 75212545, "step": 3477, "time_per_iteration": 2.7574357986450195 }, { "auxiliary_loss_clip": 0.01092922, "auxiliary_loss_mlp": 0.01046119, "balance_loss_clip": 1.045825, "balance_loss_mlp": 1.02737951, "epoch": 0.20910867277919734, "flos": 27303277645440.0, "grad_norm": 2.6092415644893814, "language_loss": 0.67816859, "learning_rate": 3.672812206678344e-06, "loss": 0.69955903, "num_input_tokens_seen": 75230865, "step": 3478, "time_per_iteration": 2.7929017543792725 }, { "auxiliary_loss_clip": 0.01094689, "auxiliary_loss_mlp": 0.01042766, "balance_loss_clip": 1.04024661, "balance_loss_mlp": 1.02308464, "epoch": 0.20916879603186533, "flos": 14319237000960.0, "grad_norm": 4.056245481336458, "language_loss": 0.84239435, "learning_rate": 3.672598707029127e-06, "loss": 0.86376888, "num_input_tokens_seen": 75248285, "step": 3479, "time_per_iteration": 2.743544816970825 }, { "auxiliary_loss_clip": 0.01111533, "auxiliary_loss_mlp": 0.01050991, "balance_loss_clip": 1.04863191, "balance_loss_mlp": 1.03028417, "epoch": 0.2092289192845333, "flos": 22273019251200.0, "grad_norm": 9.599906344578406, "language_loss": 0.74294043, "learning_rate": 3.6723851439548003e-06, "loss": 0.76456571, "num_input_tokens_seen": 75266310, "step": 3480, "time_per_iteration": 2.7278034687042236 }, { "auxiliary_loss_clip": 0.01107791, "auxiliary_loss_mlp": 0.01038756, "balance_loss_clip": 1.04748154, "balance_loss_mlp": 1.02226901, "epoch": 0.20928904253720126, "flos": 14830712714880.0, "grad_norm": 2.178942595840573, "language_loss": 0.75664043, "learning_rate": 3.67217151746346e-06, "loss": 0.77810597, "num_input_tokens_seen": 75284175, "step": 3481, "time_per_iteration": 2.71073842048645 }, { "auxiliary_loss_clip": 0.01090021, "auxiliary_loss_mlp": 0.01046234, "balance_loss_clip": 1.04561555, "balance_loss_mlp": 1.02727938, "epoch": 0.20934916578986923, "flos": 23259162216960.0, "grad_norm": 1.816378391984801, "language_loss": 0.8517971, "learning_rate": 3.671957827563209e-06, "loss": 0.87315965, "num_input_tokens_seen": 75303465, "step": 3482, "time_per_iteration": 2.8777174949645996 }, { "auxiliary_loss_clip": 0.01099298, "auxiliary_loss_mlp": 0.01046228, "balance_loss_clip": 1.05039477, "balance_loss_mlp": 1.02817941, "epoch": 0.2094092890425372, "flos": 32014398677760.0, "grad_norm": 1.802490425012806, "language_loss": 0.70550174, "learning_rate": 3.6717440742621494e-06, "loss": 0.72695696, "num_input_tokens_seen": 75325290, "step": 3483, "time_per_iteration": 2.8599836826324463 }, { "auxiliary_loss_clip": 0.01127333, "auxiliary_loss_mlp": 0.01048954, "balance_loss_clip": 1.05204535, "balance_loss_mlp": 1.03082263, "epoch": 0.20946941229520516, "flos": 20010647082240.0, "grad_norm": 1.9649551735344426, "language_loss": 0.74867833, "learning_rate": 3.6715302575683865e-06, "loss": 0.77044123, "num_input_tokens_seen": 75343895, "step": 3484, "time_per_iteration": 2.655538320541382 }, { "auxiliary_loss_clip": 0.01117623, "auxiliary_loss_mlp": 0.01046902, "balance_loss_clip": 1.0514648, "balance_loss_mlp": 1.0274353, "epoch": 0.20952953554787315, "flos": 30740072895360.0, "grad_norm": 1.6308141537991403, "language_loss": 0.70815694, "learning_rate": 3.6713163774900292e-06, "loss": 0.72980225, "num_input_tokens_seen": 75367100, "step": 3485, "time_per_iteration": 2.744417667388916 }, { "auxiliary_loss_clip": 0.01083098, "auxiliary_loss_mlp": 0.00777163, "balance_loss_clip": 1.0433619, "balance_loss_mlp": 1.00097859, "epoch": 0.20958965880054112, "flos": 27049209770880.0, "grad_norm": 2.030771632516388, "language_loss": 0.83274543, "learning_rate": 3.6711024340351875e-06, "loss": 0.85134804, "num_input_tokens_seen": 75389925, "step": 3486, "time_per_iteration": 2.742042303085327 }, { "auxiliary_loss_clip": 0.01140212, "auxiliary_loss_mlp": 0.01048337, "balance_loss_clip": 1.05242062, "balance_loss_mlp": 1.03115916, "epoch": 0.20964978205320908, "flos": 34204123589760.0, "grad_norm": 1.6926372989653347, "language_loss": 0.87134725, "learning_rate": 3.6708884272119737e-06, "loss": 0.89323276, "num_input_tokens_seen": 75408575, "step": 3487, "time_per_iteration": 2.708331346511841 }, { "auxiliary_loss_clip": 0.01112214, "auxiliary_loss_mlp": 0.01041678, "balance_loss_clip": 1.04791641, "balance_loss_mlp": 1.0228194, "epoch": 0.20970990530587705, "flos": 23477391296640.0, "grad_norm": 4.471143750410675, "language_loss": 0.72291327, "learning_rate": 3.670674357028504e-06, "loss": 0.74445224, "num_input_tokens_seen": 75427155, "step": 3488, "time_per_iteration": 4.250715970993042 }, { "auxiliary_loss_clip": 0.01121403, "auxiliary_loss_mlp": 0.01037296, "balance_loss_clip": 1.05096245, "balance_loss_mlp": 1.02014148, "epoch": 0.209770028558545, "flos": 18551452976640.0, "grad_norm": 2.6694226497987437, "language_loss": 0.79665899, "learning_rate": 3.6704602234928945e-06, "loss": 0.81824595, "num_input_tokens_seen": 75444450, "step": 3489, "time_per_iteration": 2.6926958560943604 }, { "auxiliary_loss_clip": 0.01152639, "auxiliary_loss_mlp": 0.01045785, "balance_loss_clip": 1.05325401, "balance_loss_mlp": 1.02875018, "epoch": 0.20983015181121298, "flos": 21617003208960.0, "grad_norm": 2.022409198347131, "language_loss": 0.72505707, "learning_rate": 3.670246026613266e-06, "loss": 0.74704129, "num_input_tokens_seen": 75462625, "step": 3490, "time_per_iteration": 4.133761644363403 }, { "auxiliary_loss_clip": 0.01122247, "auxiliary_loss_mlp": 0.01050283, "balance_loss_clip": 1.0509479, "balance_loss_mlp": 1.03402328, "epoch": 0.20989027506388094, "flos": 16614718531200.0, "grad_norm": 1.8035978449536252, "language_loss": 0.70332754, "learning_rate": 3.6700317663977415e-06, "loss": 0.72505283, "num_input_tokens_seen": 75480640, "step": 3491, "time_per_iteration": 2.667243003845215 }, { "auxiliary_loss_clip": 0.0113848, "auxiliary_loss_mlp": 0.0077627, "balance_loss_clip": 1.05017376, "balance_loss_mlp": 1.00098944, "epoch": 0.20995039831654894, "flos": 23216823060480.0, "grad_norm": 2.379943808529104, "language_loss": 0.79751909, "learning_rate": 3.669817442854444e-06, "loss": 0.81666666, "num_input_tokens_seen": 75494900, "step": 3492, "time_per_iteration": 4.270704984664917 }, { "auxiliary_loss_clip": 0.01138825, "auxiliary_loss_mlp": 0.00776339, "balance_loss_clip": 1.05182219, "balance_loss_mlp": 1.00108409, "epoch": 0.2100105215692169, "flos": 18147493647360.0, "grad_norm": 2.2783194747149906, "language_loss": 0.86987948, "learning_rate": 3.669603055991502e-06, "loss": 0.88903111, "num_input_tokens_seen": 75513370, "step": 3493, "time_per_iteration": 2.7830448150634766 }, { "auxiliary_loss_clip": 0.01110786, "auxiliary_loss_mlp": 0.01037681, "balance_loss_clip": 1.04520118, "balance_loss_mlp": 1.02105093, "epoch": 0.21007064482188487, "flos": 15961611490560.0, "grad_norm": 6.813030650079402, "language_loss": 0.68622243, "learning_rate": 3.6693886058170455e-06, "loss": 0.70770705, "num_input_tokens_seen": 75532480, "step": 3494, "time_per_iteration": 2.8479061126708984 }, { "auxiliary_loss_clip": 0.01145467, "auxiliary_loss_mlp": 0.01037272, "balance_loss_clip": 1.05302739, "balance_loss_mlp": 1.01998639, "epoch": 0.21013076807455283, "flos": 32234315696640.0, "grad_norm": 1.7516454579581615, "language_loss": 0.78848761, "learning_rate": 3.6691740923392053e-06, "loss": 0.81031501, "num_input_tokens_seen": 75552745, "step": 3495, "time_per_iteration": 2.9313197135925293 }, { "auxiliary_loss_clip": 0.01119614, "auxiliary_loss_mlp": 0.01045108, "balance_loss_clip": 1.04760814, "balance_loss_mlp": 1.02708316, "epoch": 0.2101908913272208, "flos": 23696625957120.0, "grad_norm": 2.1492916784611844, "language_loss": 0.77302933, "learning_rate": 3.668959515566116e-06, "loss": 0.79467654, "num_input_tokens_seen": 75574355, "step": 3496, "time_per_iteration": 4.467881441116333 }, { "auxiliary_loss_clip": 0.01135202, "auxiliary_loss_mlp": 0.01046618, "balance_loss_clip": 1.05169654, "balance_loss_mlp": 1.02839065, "epoch": 0.21025101457988876, "flos": 20375786787840.0, "grad_norm": 2.146148958862047, "language_loss": 0.82076812, "learning_rate": 3.668744875505915e-06, "loss": 0.8425864, "num_input_tokens_seen": 75592215, "step": 3497, "time_per_iteration": 2.683037281036377 }, { "auxiliary_loss_clip": 0.01144559, "auxiliary_loss_mlp": 0.01047188, "balance_loss_clip": 1.05445957, "balance_loss_mlp": 1.02967596, "epoch": 0.21031113783255675, "flos": 25775638174080.0, "grad_norm": 1.732381679276629, "language_loss": 0.67239833, "learning_rate": 3.668530172166741e-06, "loss": 0.69431579, "num_input_tokens_seen": 75610740, "step": 3498, "time_per_iteration": 2.685481548309326 }, { "auxiliary_loss_clip": 0.01121255, "auxiliary_loss_mlp": 0.01044553, "balance_loss_clip": 1.04974794, "balance_loss_mlp": 1.02611172, "epoch": 0.21037126108522472, "flos": 22018197191040.0, "grad_norm": 1.7892967196850054, "language_loss": 0.80832362, "learning_rate": 3.6683154055567352e-06, "loss": 0.82998168, "num_input_tokens_seen": 75631005, "step": 3499, "time_per_iteration": 2.744995355606079 }, { "auxiliary_loss_clip": 0.01139753, "auxiliary_loss_mlp": 0.01039729, "balance_loss_clip": 1.05226696, "balance_loss_mlp": 1.02312946, "epoch": 0.21043138433789269, "flos": 25334403505920.0, "grad_norm": 1.6464696881852638, "language_loss": 0.77983701, "learning_rate": 3.668100575684043e-06, "loss": 0.80163181, "num_input_tokens_seen": 75650655, "step": 3500, "time_per_iteration": 2.7704038619995117 }, { "auxiliary_loss_clip": 0.01129369, "auxiliary_loss_mlp": 0.01042187, "balance_loss_clip": 1.05095315, "balance_loss_mlp": 1.02390063, "epoch": 0.21049150759056065, "flos": 25556654908800.0, "grad_norm": 1.5981262394728393, "language_loss": 0.74450207, "learning_rate": 3.6678856825568094e-06, "loss": 0.76621759, "num_input_tokens_seen": 75669895, "step": 3501, "time_per_iteration": 2.7066893577575684 }, { "auxiliary_loss_clip": 0.01134924, "auxiliary_loss_mlp": 0.01039556, "balance_loss_clip": 1.04989994, "balance_loss_mlp": 1.02227044, "epoch": 0.21055163084322862, "flos": 24495602129280.0, "grad_norm": 1.6188770382514572, "language_loss": 0.75278366, "learning_rate": 3.667670726183183e-06, "loss": 0.77452844, "num_input_tokens_seen": 75689535, "step": 3502, "time_per_iteration": 2.724635124206543 }, { "auxiliary_loss_clip": 0.01098479, "auxiliary_loss_mlp": 0.01040924, "balance_loss_clip": 1.04576206, "balance_loss_mlp": 1.02248216, "epoch": 0.21061175409589658, "flos": 25739045193600.0, "grad_norm": 1.9441266701933382, "language_loss": 0.77188909, "learning_rate": 3.667455706571316e-06, "loss": 0.7932831, "num_input_tokens_seen": 75709265, "step": 3503, "time_per_iteration": 2.7545289993286133 }, { "auxiliary_loss_clip": 0.010957, "auxiliary_loss_mlp": 0.01045911, "balance_loss_clip": 1.04817343, "balance_loss_mlp": 1.02478695, "epoch": 0.21067187734856455, "flos": 18989168112000.0, "grad_norm": 2.256374081289255, "language_loss": 0.78297234, "learning_rate": 3.6672406237293617e-06, "loss": 0.8043884, "num_input_tokens_seen": 75727050, "step": 3504, "time_per_iteration": 2.7454304695129395 }, { "auxiliary_loss_clip": 0.01117408, "auxiliary_loss_mlp": 0.01049815, "balance_loss_clip": 1.0488404, "balance_loss_mlp": 1.03152788, "epoch": 0.21073200060123254, "flos": 24681368292480.0, "grad_norm": 1.5753219052286964, "language_loss": 0.76731002, "learning_rate": 3.6670254776654754e-06, "loss": 0.78898227, "num_input_tokens_seen": 75747175, "step": 3505, "time_per_iteration": 2.7509703636169434 }, { "auxiliary_loss_clip": 0.01120291, "auxiliary_loss_mlp": 0.01052026, "balance_loss_clip": 1.04882348, "balance_loss_mlp": 1.03383446, "epoch": 0.2107921238539005, "flos": 28549342402560.0, "grad_norm": 1.9938386598136906, "language_loss": 0.63933277, "learning_rate": 3.6668102683878163e-06, "loss": 0.66105598, "num_input_tokens_seen": 75767690, "step": 3506, "time_per_iteration": 2.773611545562744 }, { "auxiliary_loss_clip": 0.01138444, "auxiliary_loss_mlp": 0.01050655, "balance_loss_clip": 1.05078697, "balance_loss_mlp": 1.03257108, "epoch": 0.21085224710656847, "flos": 25885848078720.0, "grad_norm": 2.170999698474249, "language_loss": 0.82010436, "learning_rate": 3.6665949959045443e-06, "loss": 0.84199536, "num_input_tokens_seen": 75787255, "step": 3507, "time_per_iteration": 2.6604206562042236 }, { "auxiliary_loss_clip": 0.01136754, "auxiliary_loss_mlp": 0.01043314, "balance_loss_clip": 1.04972744, "balance_loss_mlp": 1.02472949, "epoch": 0.21091237035923643, "flos": 14976294537600.0, "grad_norm": 2.0519706535557414, "language_loss": 0.75213134, "learning_rate": 3.666379660223824e-06, "loss": 0.77393204, "num_input_tokens_seen": 75805890, "step": 3508, "time_per_iteration": 2.7164604663848877 }, { "auxiliary_loss_clip": 0.01154655, "auxiliary_loss_mlp": 0.01036811, "balance_loss_clip": 1.05263913, "balance_loss_mlp": 1.01894128, "epoch": 0.2109724936119044, "flos": 16362518163840.0, "grad_norm": 3.4182125548434112, "language_loss": 0.84984946, "learning_rate": 3.6661642613538192e-06, "loss": 0.87176406, "num_input_tokens_seen": 75821620, "step": 3509, "time_per_iteration": 2.661743402481079 }, { "auxiliary_loss_clip": 0.01120944, "auxiliary_loss_mlp": 0.01044014, "balance_loss_clip": 1.05299115, "balance_loss_mlp": 1.02443957, "epoch": 0.21103261686457236, "flos": 31502492000640.0, "grad_norm": 2.210880078691599, "language_loss": 0.68125075, "learning_rate": 3.6659487993026987e-06, "loss": 0.70290035, "num_input_tokens_seen": 75842490, "step": 3510, "time_per_iteration": 2.7881460189819336 }, { "auxiliary_loss_clip": 0.01152569, "auxiliary_loss_mlp": 0.01046993, "balance_loss_clip": 1.05026078, "balance_loss_mlp": 1.02892137, "epoch": 0.21109274011724033, "flos": 27344072517120.0, "grad_norm": 1.958863999940011, "language_loss": 0.72639364, "learning_rate": 3.6657332740786327e-06, "loss": 0.74838924, "num_input_tokens_seen": 75865985, "step": 3511, "time_per_iteration": 2.6942689418792725 }, { "auxiliary_loss_clip": 0.01066393, "auxiliary_loss_mlp": 0.01041278, "balance_loss_clip": 1.04279399, "balance_loss_mlp": 1.0208931, "epoch": 0.21115286336990832, "flos": 17820383466240.0, "grad_norm": 3.2801391377369686, "language_loss": 0.69354337, "learning_rate": 3.665517685689794e-06, "loss": 0.71462011, "num_input_tokens_seen": 75882745, "step": 3512, "time_per_iteration": 2.8260998725891113 }, { "auxiliary_loss_clip": 0.01140043, "auxiliary_loss_mlp": 0.01050555, "balance_loss_clip": 1.04943943, "balance_loss_mlp": 1.03082585, "epoch": 0.2112129866225763, "flos": 27197987904000.0, "grad_norm": 2.072678482519775, "language_loss": 0.73145646, "learning_rate": 3.6653020341443584e-06, "loss": 0.75336242, "num_input_tokens_seen": 75904305, "step": 3513, "time_per_iteration": 2.9639391899108887 }, { "auxiliary_loss_clip": 0.01121964, "auxiliary_loss_mlp": 0.01038325, "balance_loss_clip": 1.04785061, "balance_loss_mlp": 1.02089679, "epoch": 0.21127310987524425, "flos": 23731279603200.0, "grad_norm": 2.0322171916220086, "language_loss": 0.74422491, "learning_rate": 3.665086319450502e-06, "loss": 0.76582778, "num_input_tokens_seen": 75923710, "step": 3514, "time_per_iteration": 2.7379143238067627 }, { "auxiliary_loss_clip": 0.01136944, "auxiliary_loss_mlp": 0.01038225, "balance_loss_clip": 1.05334568, "balance_loss_mlp": 1.01941383, "epoch": 0.21133323312791222, "flos": 18332505624960.0, "grad_norm": 2.431934297389972, "language_loss": 0.76738697, "learning_rate": 3.6648705416164062e-06, "loss": 0.78913867, "num_input_tokens_seen": 75942625, "step": 3515, "time_per_iteration": 2.6339287757873535 }, { "auxiliary_loss_clip": 0.011289, "auxiliary_loss_mlp": 0.01047482, "balance_loss_clip": 1.05247736, "balance_loss_mlp": 1.0288614, "epoch": 0.21139335638058018, "flos": 17931203902080.0, "grad_norm": 2.7460645413082756, "language_loss": 0.68756706, "learning_rate": 3.6646547006502518e-06, "loss": 0.70933092, "num_input_tokens_seen": 75959930, "step": 3516, "time_per_iteration": 2.6489672660827637 }, { "auxiliary_loss_clip": 0.01118182, "auxiliary_loss_mlp": 0.01049447, "balance_loss_clip": 1.05634522, "balance_loss_mlp": 1.03045666, "epoch": 0.21145347963324815, "flos": 24572092141440.0, "grad_norm": 1.8368744753927078, "language_loss": 0.85010064, "learning_rate": 3.664438796560225e-06, "loss": 0.87177694, "num_input_tokens_seen": 75980335, "step": 3517, "time_per_iteration": 2.745887279510498 }, { "auxiliary_loss_clip": 0.01125904, "auxiliary_loss_mlp": 0.01042813, "balance_loss_clip": 1.04719234, "balance_loss_mlp": 1.02506244, "epoch": 0.21151360288591614, "flos": 35845959375360.0, "grad_norm": 2.246330970109572, "language_loss": 0.63672101, "learning_rate": 3.664222829354512e-06, "loss": 0.65840822, "num_input_tokens_seen": 76002095, "step": 3518, "time_per_iteration": 2.7990219593048096 }, { "auxiliary_loss_clip": 0.01089367, "auxiliary_loss_mlp": 0.01057733, "balance_loss_clip": 1.05040181, "balance_loss_mlp": 1.04001832, "epoch": 0.2115737261385841, "flos": 24641579001600.0, "grad_norm": 2.1349107177710875, "language_loss": 0.89256221, "learning_rate": 3.664006799041303e-06, "loss": 0.91403317, "num_input_tokens_seen": 76020425, "step": 3519, "time_per_iteration": 2.8022944927215576 }, { "auxiliary_loss_clip": 0.01135146, "auxiliary_loss_mlp": 0.01049587, "balance_loss_clip": 1.05320001, "balance_loss_mlp": 1.03140712, "epoch": 0.21163384939125207, "flos": 25226887121280.0, "grad_norm": 1.8050755180524396, "language_loss": 0.81235015, "learning_rate": 3.6637907056287886e-06, "loss": 0.8341974, "num_input_tokens_seen": 76041210, "step": 3520, "time_per_iteration": 2.750988245010376 }, { "auxiliary_loss_clip": 0.01124406, "auxiliary_loss_mlp": 0.01048631, "balance_loss_clip": 1.05111551, "balance_loss_mlp": 1.03095269, "epoch": 0.21169397264392004, "flos": 26067520091520.0, "grad_norm": 1.92815865975435, "language_loss": 0.76254267, "learning_rate": 3.6635745491251642e-06, "loss": 0.78427303, "num_input_tokens_seen": 76062685, "step": 3521, "time_per_iteration": 2.7965810298919678 }, { "auxiliary_loss_clip": 0.0109789, "auxiliary_loss_mlp": 0.01044794, "balance_loss_clip": 1.04872918, "balance_loss_mlp": 1.02841413, "epoch": 0.211754095896588, "flos": 23108265181440.0, "grad_norm": 2.0270933567011302, "language_loss": 0.75752926, "learning_rate": 3.663358329538626e-06, "loss": 0.77895606, "num_input_tokens_seen": 76082300, "step": 3522, "time_per_iteration": 2.8280131816864014 }, { "auxiliary_loss_clip": 0.01153324, "auxiliary_loss_mlp": 0.01053431, "balance_loss_clip": 1.05353725, "balance_loss_mlp": 1.03541851, "epoch": 0.21181421914925597, "flos": 27922341571200.0, "grad_norm": 1.8399634756194385, "language_loss": 0.70481133, "learning_rate": 3.663142046877374e-06, "loss": 0.72687888, "num_input_tokens_seen": 76101135, "step": 3523, "time_per_iteration": 2.6909022331237793 }, { "auxiliary_loss_clip": 0.01139749, "auxiliary_loss_mlp": 0.01054127, "balance_loss_clip": 1.05166054, "balance_loss_mlp": 1.03619766, "epoch": 0.21187434240192393, "flos": 17128636369920.0, "grad_norm": 2.455264594190525, "language_loss": 0.77290082, "learning_rate": 3.6629257011496085e-06, "loss": 0.7948395, "num_input_tokens_seen": 76119320, "step": 3524, "time_per_iteration": 2.6844334602355957 }, { "auxiliary_loss_clip": 0.01132697, "auxiliary_loss_mlp": 0.0104457, "balance_loss_clip": 1.05066419, "balance_loss_mlp": 1.02621162, "epoch": 0.21193446565459192, "flos": 22347318533760.0, "grad_norm": 1.841652047976503, "language_loss": 0.81680572, "learning_rate": 3.6627092923635338e-06, "loss": 0.83857846, "num_input_tokens_seen": 76137445, "step": 3525, "time_per_iteration": 2.71073842048645 }, { "auxiliary_loss_clip": 0.01088536, "auxiliary_loss_mlp": 0.01041509, "balance_loss_clip": 1.04158318, "balance_loss_mlp": 1.02353263, "epoch": 0.2119945889072599, "flos": 27199316707200.0, "grad_norm": 1.867957043941215, "language_loss": 0.75627208, "learning_rate": 3.662492820527356e-06, "loss": 0.77757257, "num_input_tokens_seen": 76159500, "step": 3526, "time_per_iteration": 2.973966598510742 }, { "auxiliary_loss_clip": 0.0115455, "auxiliary_loss_mlp": 0.01041027, "balance_loss_clip": 1.05324817, "balance_loss_mlp": 1.023229, "epoch": 0.21205471215992786, "flos": 20991869884800.0, "grad_norm": 1.8230643924086412, "language_loss": 0.77070421, "learning_rate": 3.662276285649284e-06, "loss": 0.79265994, "num_input_tokens_seen": 76177990, "step": 3527, "time_per_iteration": 2.648961067199707 }, { "auxiliary_loss_clip": 0.01151081, "auxiliary_loss_mlp": 0.0104874, "balance_loss_clip": 1.05143785, "balance_loss_mlp": 1.02977419, "epoch": 0.21211483541259582, "flos": 20777663128320.0, "grad_norm": 2.807984733302778, "language_loss": 0.7815178, "learning_rate": 3.662059687737528e-06, "loss": 0.80351603, "num_input_tokens_seen": 76197125, "step": 3528, "time_per_iteration": 4.401185989379883 }, { "auxiliary_loss_clip": 0.01135768, "auxiliary_loss_mlp": 0.01045736, "balance_loss_clip": 1.04889631, "balance_loss_mlp": 1.02817655, "epoch": 0.21217495866526379, "flos": 18989994124800.0, "grad_norm": 2.1271435469609257, "language_loss": 0.8128866, "learning_rate": 3.6618430268003024e-06, "loss": 0.8347016, "num_input_tokens_seen": 76216215, "step": 3529, "time_per_iteration": 4.309772968292236 }, { "auxiliary_loss_clip": 0.0113319, "auxiliary_loss_mlp": 0.00777373, "balance_loss_clip": 1.04967499, "balance_loss_mlp": 1.00112891, "epoch": 0.21223508191793175, "flos": 20667309569280.0, "grad_norm": 1.9704727824538568, "language_loss": 0.76427567, "learning_rate": 3.6616263028458235e-06, "loss": 0.78338128, "num_input_tokens_seen": 76237010, "step": 3530, "time_per_iteration": 2.7592365741729736 }, { "auxiliary_loss_clip": 0.0115078, "auxiliary_loss_mlp": 0.01047067, "balance_loss_clip": 1.0522244, "balance_loss_mlp": 1.02990103, "epoch": 0.21229520517059972, "flos": 21616464504960.0, "grad_norm": 2.1154933827202274, "language_loss": 0.82973897, "learning_rate": 3.661409515882308e-06, "loss": 0.85171747, "num_input_tokens_seen": 76255965, "step": 3531, "time_per_iteration": 4.168981313705444 }, { "auxiliary_loss_clip": 0.01120152, "auxiliary_loss_mlp": 0.01042697, "balance_loss_clip": 1.04767489, "balance_loss_mlp": 1.02313459, "epoch": 0.2123553284232677, "flos": 13991049411840.0, "grad_norm": 2.335526210972433, "language_loss": 0.73087364, "learning_rate": 3.661192665917977e-06, "loss": 0.75250214, "num_input_tokens_seen": 76272150, "step": 3532, "time_per_iteration": 2.6797189712524414 }, { "auxiliary_loss_clip": 0.01126693, "auxiliary_loss_mlp": 0.01041409, "balance_loss_clip": 1.0539782, "balance_loss_mlp": 1.02269292, "epoch": 0.21241545167593567, "flos": 18296774570880.0, "grad_norm": 6.22254473074881, "language_loss": 0.74268675, "learning_rate": 3.660975752961054e-06, "loss": 0.76436776, "num_input_tokens_seen": 76291425, "step": 3533, "time_per_iteration": 2.741152048110962 }, { "auxiliary_loss_clip": 0.01146682, "auxiliary_loss_mlp": 0.0104438, "balance_loss_clip": 1.05342829, "balance_loss_mlp": 1.0265224, "epoch": 0.21247557492860364, "flos": 34713121265280.0, "grad_norm": 2.0406923816018714, "language_loss": 0.70889592, "learning_rate": 3.6607587770197634e-06, "loss": 0.73080653, "num_input_tokens_seen": 76313975, "step": 3534, "time_per_iteration": 2.8210513591766357 }, { "auxiliary_loss_clip": 0.01133157, "auxiliary_loss_mlp": 0.01043651, "balance_loss_clip": 1.05234385, "balance_loss_mlp": 1.02463722, "epoch": 0.2125356981812716, "flos": 22053820504320.0, "grad_norm": 2.102271516852891, "language_loss": 0.71675557, "learning_rate": 3.6605417381023346e-06, "loss": 0.73852366, "num_input_tokens_seen": 76330955, "step": 3535, "time_per_iteration": 2.804506540298462 }, { "auxiliary_loss_clip": 0.01137461, "auxiliary_loss_mlp": 0.01053804, "balance_loss_clip": 1.05108476, "balance_loss_mlp": 1.03607774, "epoch": 0.21259582143393957, "flos": 28548336821760.0, "grad_norm": 24.01704513629389, "language_loss": 0.70639503, "learning_rate": 3.660324636216996e-06, "loss": 0.72830772, "num_input_tokens_seen": 76352680, "step": 3536, "time_per_iteration": 4.442729473114014 }, { "auxiliary_loss_clip": 0.011554, "auxiliary_loss_mlp": 0.01049939, "balance_loss_clip": 1.05231214, "balance_loss_mlp": 1.03082991, "epoch": 0.21265594468660753, "flos": 20120892900480.0, "grad_norm": 2.2527167001205806, "language_loss": 0.8784188, "learning_rate": 3.660107471371981e-06, "loss": 0.90047216, "num_input_tokens_seen": 76370750, "step": 3537, "time_per_iteration": 2.6365723609924316 }, { "auxiliary_loss_clip": 0.01137536, "auxiliary_loss_mlp": 0.00776226, "balance_loss_clip": 1.04911351, "balance_loss_mlp": 1.00101614, "epoch": 0.21271606793927553, "flos": 23076161400960.0, "grad_norm": 1.8080285651248438, "language_loss": 0.80480909, "learning_rate": 3.659890243575524e-06, "loss": 0.82394671, "num_input_tokens_seen": 76390610, "step": 3538, "time_per_iteration": 2.7403554916381836 }, { "auxiliary_loss_clip": 0.01080631, "auxiliary_loss_mlp": 0.0105169, "balance_loss_clip": 1.04171312, "balance_loss_mlp": 1.03219926, "epoch": 0.2127761911919435, "flos": 26388201738240.0, "grad_norm": 2.705287390300715, "language_loss": 0.86691839, "learning_rate": 3.659672952835863e-06, "loss": 0.88824159, "num_input_tokens_seen": 76408860, "step": 3539, "time_per_iteration": 2.8177876472473145 }, { "auxiliary_loss_clip": 0.01120184, "auxiliary_loss_mlp": 0.01047424, "balance_loss_clip": 1.04577422, "balance_loss_mlp": 1.0295074, "epoch": 0.21283631444461146, "flos": 20228265630720.0, "grad_norm": 5.212413836862573, "language_loss": 0.57756186, "learning_rate": 3.659455599161237e-06, "loss": 0.59923792, "num_input_tokens_seen": 76424980, "step": 3540, "time_per_iteration": 2.786552667617798 }, { "auxiliary_loss_clip": 0.01154193, "auxiliary_loss_mlp": 0.010403, "balance_loss_clip": 1.05276537, "balance_loss_mlp": 1.02131045, "epoch": 0.21289643769727942, "flos": 13516992691200.0, "grad_norm": 2.318388810062464, "language_loss": 0.76114893, "learning_rate": 3.659238182559888e-06, "loss": 0.78309381, "num_input_tokens_seen": 76443135, "step": 3541, "time_per_iteration": 2.646207332611084 }, { "auxiliary_loss_clip": 0.01108241, "auxiliary_loss_mlp": 0.01044876, "balance_loss_clip": 1.0464325, "balance_loss_mlp": 1.02676797, "epoch": 0.2129565609499474, "flos": 24827021942400.0, "grad_norm": 3.508596736579257, "language_loss": 0.69749588, "learning_rate": 3.6590207030400615e-06, "loss": 0.71902704, "num_input_tokens_seen": 76462470, "step": 3542, "time_per_iteration": 2.746612787246704 }, { "auxiliary_loss_clip": 0.01149445, "auxiliary_loss_mlp": 0.01038617, "balance_loss_clip": 1.05146265, "balance_loss_mlp": 1.02160525, "epoch": 0.21301668420261535, "flos": 23659242877440.0, "grad_norm": 2.3488794859192397, "language_loss": 0.75651306, "learning_rate": 3.658803160610004e-06, "loss": 0.77839369, "num_input_tokens_seen": 76481995, "step": 3543, "time_per_iteration": 2.665900230407715 }, { "auxiliary_loss_clip": 0.0112855, "auxiliary_loss_mlp": 0.01042048, "balance_loss_clip": 1.05257249, "balance_loss_mlp": 1.02409506, "epoch": 0.21307680745528332, "flos": 16362805472640.0, "grad_norm": 1.8076409354305347, "language_loss": 0.66981912, "learning_rate": 3.6585855552779634e-06, "loss": 0.6915251, "num_input_tokens_seen": 76500245, "step": 3544, "time_per_iteration": 2.6692638397216797 }, { "auxiliary_loss_clip": 0.01121216, "auxiliary_loss_mlp": 0.01046395, "balance_loss_clip": 1.0480237, "balance_loss_mlp": 1.02897835, "epoch": 0.2131369307079513, "flos": 19099054794240.0, "grad_norm": 1.8644107460894377, "language_loss": 0.70977402, "learning_rate": 3.6583678870521934e-06, "loss": 0.73145014, "num_input_tokens_seen": 76519535, "step": 3545, "time_per_iteration": 2.686939001083374 }, { "auxiliary_loss_clip": 0.01128605, "auxiliary_loss_mlp": 0.01048325, "balance_loss_clip": 1.05368018, "balance_loss_mlp": 1.0300498, "epoch": 0.21319705396061928, "flos": 30372275583360.0, "grad_norm": 1.8809403827144264, "language_loss": 0.72329843, "learning_rate": 3.658150155940946e-06, "loss": 0.74506772, "num_input_tokens_seen": 76542065, "step": 3546, "time_per_iteration": 2.8044040203094482 }, { "auxiliary_loss_clip": 0.01115103, "auxiliary_loss_mlp": 0.01050245, "balance_loss_clip": 1.0539, "balance_loss_mlp": 1.03250647, "epoch": 0.21325717721328724, "flos": 21756192410880.0, "grad_norm": 3.48585993087404, "language_loss": 0.80431038, "learning_rate": 3.657932361952479e-06, "loss": 0.82596385, "num_input_tokens_seen": 76560540, "step": 3547, "time_per_iteration": 2.7981739044189453 }, { "auxiliary_loss_clip": 0.01154388, "auxiliary_loss_mlp": 0.01045355, "balance_loss_clip": 1.05115056, "balance_loss_mlp": 1.02685428, "epoch": 0.2133173004659552, "flos": 28730870760960.0, "grad_norm": 2.460294966859189, "language_loss": 0.7449761, "learning_rate": 3.6577145050950504e-06, "loss": 0.7669735, "num_input_tokens_seen": 76581760, "step": 3548, "time_per_iteration": 2.709476947784424 }, { "auxiliary_loss_clip": 0.01117193, "auxiliary_loss_mlp": 0.01059153, "balance_loss_clip": 1.05099797, "balance_loss_mlp": 1.03938842, "epoch": 0.21337742371862317, "flos": 16837077674880.0, "grad_norm": 2.783715227630402, "language_loss": 0.74218595, "learning_rate": 3.657496585376922e-06, "loss": 0.76394939, "num_input_tokens_seen": 76599940, "step": 3549, "time_per_iteration": 2.751401662826538 }, { "auxiliary_loss_clip": 0.01121431, "auxiliary_loss_mlp": 0.01050546, "balance_loss_clip": 1.05331278, "balance_loss_mlp": 1.03283179, "epoch": 0.21343754697129114, "flos": 24424930120320.0, "grad_norm": 1.8583266555890872, "language_loss": 0.80719978, "learning_rate": 3.657278602806357e-06, "loss": 0.82891953, "num_input_tokens_seen": 76619580, "step": 3550, "time_per_iteration": 2.74678373336792 }, { "auxiliary_loss_clip": 0.01151996, "auxiliary_loss_mlp": 0.01048347, "balance_loss_clip": 1.05428052, "balance_loss_mlp": 1.03147876, "epoch": 0.21349767022395913, "flos": 19277817805440.0, "grad_norm": 1.7548210279469212, "language_loss": 0.88234103, "learning_rate": 3.657060557391621e-06, "loss": 0.90434444, "num_input_tokens_seen": 76638195, "step": 3551, "time_per_iteration": 2.746938705444336 }, { "auxiliary_loss_clip": 0.01151269, "auxiliary_loss_mlp": 0.01048306, "balance_loss_clip": 1.05139017, "balance_loss_mlp": 1.03111625, "epoch": 0.2135577934766271, "flos": 17347547808000.0, "grad_norm": 1.8976063035050816, "language_loss": 0.83877259, "learning_rate": 3.656842449140983e-06, "loss": 0.86076838, "num_input_tokens_seen": 76656695, "step": 3552, "time_per_iteration": 2.616567373275757 }, { "auxiliary_loss_clip": 0.0113626, "auxiliary_loss_mlp": 0.01050705, "balance_loss_clip": 1.04937124, "balance_loss_mlp": 1.0325495, "epoch": 0.21361791672929506, "flos": 24057204635520.0, "grad_norm": 2.604872460919843, "language_loss": 0.76370007, "learning_rate": 3.656624278062713e-06, "loss": 0.78556973, "num_input_tokens_seen": 76677430, "step": 3553, "time_per_iteration": 2.730829954147339 }, { "auxiliary_loss_clip": 0.01142267, "auxiliary_loss_mlp": 0.01046102, "balance_loss_clip": 1.05434144, "balance_loss_mlp": 1.02915072, "epoch": 0.21367803998196302, "flos": 22162306556160.0, "grad_norm": 1.5008078028945642, "language_loss": 0.72580731, "learning_rate": 3.6564060441650843e-06, "loss": 0.74769098, "num_input_tokens_seen": 76697615, "step": 3554, "time_per_iteration": 2.701207399368286 }, { "auxiliary_loss_clip": 0.01097601, "auxiliary_loss_mlp": 0.00776401, "balance_loss_clip": 1.04785013, "balance_loss_mlp": 1.00128174, "epoch": 0.213738163234631, "flos": 20886867452160.0, "grad_norm": 2.0681583889949957, "language_loss": 0.67728174, "learning_rate": 3.6561877474563724e-06, "loss": 0.69602168, "num_input_tokens_seen": 76715685, "step": 3555, "time_per_iteration": 2.76454758644104 }, { "auxiliary_loss_clip": 0.01124456, "auxiliary_loss_mlp": 0.01045031, "balance_loss_clip": 1.06086278, "balance_loss_mlp": 1.02689981, "epoch": 0.21379828648729896, "flos": 28403114135040.0, "grad_norm": 2.155752981705525, "language_loss": 0.64553648, "learning_rate": 3.6559693879448553e-06, "loss": 0.66723132, "num_input_tokens_seen": 76735405, "step": 3556, "time_per_iteration": 2.839993953704834 }, { "auxiliary_loss_clip": 0.01139371, "auxiliary_loss_mlp": 0.01051642, "balance_loss_clip": 1.05236566, "balance_loss_mlp": 1.0331769, "epoch": 0.21385840973996692, "flos": 25479662106240.0, "grad_norm": 1.7378281716746964, "language_loss": 0.72588408, "learning_rate": 3.6557509656388125e-06, "loss": 0.74779421, "num_input_tokens_seen": 76754395, "step": 3557, "time_per_iteration": 2.7678587436676025 }, { "auxiliary_loss_clip": 0.01151319, "auxiliary_loss_mlp": 0.00776703, "balance_loss_clip": 1.0647192, "balance_loss_mlp": 1.00117195, "epoch": 0.2139185329926349, "flos": 28074280101120.0, "grad_norm": 1.8333462571334693, "language_loss": 0.6714859, "learning_rate": 3.655532480546528e-06, "loss": 0.6907661, "num_input_tokens_seen": 76777210, "step": 3558, "time_per_iteration": 2.7584826946258545 }, { "auxiliary_loss_clip": 0.01159331, "auxiliary_loss_mlp": 0.01041115, "balance_loss_clip": 1.0541842, "balance_loss_mlp": 1.02297139, "epoch": 0.21397865624530288, "flos": 19608698914560.0, "grad_norm": 1.8974456617751176, "language_loss": 0.79882181, "learning_rate": 3.655313932676286e-06, "loss": 0.82082617, "num_input_tokens_seen": 76795830, "step": 3559, "time_per_iteration": 2.6918041706085205 }, { "auxiliary_loss_clip": 0.01155068, "auxiliary_loss_mlp": 0.01046018, "balance_loss_clip": 1.05566323, "balance_loss_mlp": 1.0295198, "epoch": 0.21403877949797084, "flos": 24681476033280.0, "grad_norm": 1.8730564704536732, "language_loss": 0.68085694, "learning_rate": 3.655095322036373e-06, "loss": 0.70286781, "num_input_tokens_seen": 76814700, "step": 3560, "time_per_iteration": 2.6445770263671875 }, { "auxiliary_loss_clip": 0.01145074, "auxiliary_loss_mlp": 0.01043706, "balance_loss_clip": 1.0535686, "balance_loss_mlp": 1.02537155, "epoch": 0.2140989027506388, "flos": 19861150677120.0, "grad_norm": 1.8952415763477797, "language_loss": 0.73272544, "learning_rate": 3.65487664863508e-06, "loss": 0.75461322, "num_input_tokens_seen": 76833400, "step": 3561, "time_per_iteration": 2.6568899154663086 }, { "auxiliary_loss_clip": 0.01133795, "auxiliary_loss_mlp": 0.01044555, "balance_loss_clip": 1.05333674, "balance_loss_mlp": 1.02700794, "epoch": 0.21415902600330677, "flos": 19135324552320.0, "grad_norm": 2.1953085541278203, "language_loss": 0.78028738, "learning_rate": 3.654657912480698e-06, "loss": 0.80207092, "num_input_tokens_seen": 76850645, "step": 3562, "time_per_iteration": 2.73655104637146 }, { "auxiliary_loss_clip": 0.01155634, "auxiliary_loss_mlp": 0.01042255, "balance_loss_clip": 1.05661631, "balance_loss_mlp": 1.02457595, "epoch": 0.21421914925597474, "flos": 22272624201600.0, "grad_norm": 3.5245068195694937, "language_loss": 0.84338713, "learning_rate": 3.6544391135815237e-06, "loss": 0.86536604, "num_input_tokens_seen": 76870135, "step": 3563, "time_per_iteration": 2.676630973815918 }, { "auxiliary_loss_clip": 0.01157426, "auxiliary_loss_mlp": 0.01036109, "balance_loss_clip": 1.05830729, "balance_loss_mlp": 1.01957488, "epoch": 0.2142792725086427, "flos": 33875109987840.0, "grad_norm": 1.5172669047015535, "language_loss": 0.76581991, "learning_rate": 3.6542202519458507e-06, "loss": 0.78775525, "num_input_tokens_seen": 76893905, "step": 3564, "time_per_iteration": 2.7504193782806396 }, { "auxiliary_loss_clip": 0.01134427, "auxiliary_loss_mlp": 0.01044002, "balance_loss_clip": 1.06131172, "balance_loss_mlp": 1.02674031, "epoch": 0.2143393957613107, "flos": 19860216923520.0, "grad_norm": 1.7115347614953564, "language_loss": 0.88466394, "learning_rate": 3.654001327581981e-06, "loss": 0.90644825, "num_input_tokens_seen": 76914205, "step": 3565, "time_per_iteration": 2.7911624908447266 }, { "auxiliary_loss_clip": 0.01071735, "auxiliary_loss_mlp": 0.01008336, "balance_loss_clip": 1.05462575, "balance_loss_mlp": 1.0057019, "epoch": 0.21439951901397866, "flos": 68530093090560.0, "grad_norm": 0.8339683756542131, "language_loss": 0.52192736, "learning_rate": 3.653782340498215e-06, "loss": 0.54272807, "num_input_tokens_seen": 76975650, "step": 3566, "time_per_iteration": 3.1801936626434326 }, { "auxiliary_loss_clip": 0.01141614, "auxiliary_loss_mlp": 0.01041326, "balance_loss_clip": 1.05527854, "balance_loss_mlp": 1.02505386, "epoch": 0.21445964226664663, "flos": 19682998197120.0, "grad_norm": 1.8485820369681922, "language_loss": 0.67324477, "learning_rate": 3.6535632907028566e-06, "loss": 0.6950742, "num_input_tokens_seen": 76992615, "step": 3567, "time_per_iteration": 2.6948626041412354 }, { "auxiliary_loss_clip": 0.01123629, "auxiliary_loss_mlp": 0.01045447, "balance_loss_clip": 1.05142832, "balance_loss_mlp": 1.02749455, "epoch": 0.2145197655193146, "flos": 31107259676160.0, "grad_norm": 3.2542445550844317, "language_loss": 0.74213678, "learning_rate": 3.6533441782042126e-06, "loss": 0.76382756, "num_input_tokens_seen": 77017005, "step": 3568, "time_per_iteration": 4.396210670471191 }, { "auxiliary_loss_clip": 0.01140095, "auxiliary_loss_mlp": 0.01050029, "balance_loss_clip": 1.05480075, "balance_loss_mlp": 1.03333998, "epoch": 0.21457988877198256, "flos": 20120785159680.0, "grad_norm": 1.7132363384404574, "language_loss": 0.77343202, "learning_rate": 3.6531250030105917e-06, "loss": 0.79533333, "num_input_tokens_seen": 77034990, "step": 3569, "time_per_iteration": 4.224002122879028 }, { "auxiliary_loss_clip": 0.011511, "auxiliary_loss_mlp": 0.0104435, "balance_loss_clip": 1.05651093, "balance_loss_mlp": 1.02521753, "epoch": 0.21464001202465052, "flos": 18588045957120.0, "grad_norm": 2.6050136504577583, "language_loss": 0.70278227, "learning_rate": 3.6529057651303053e-06, "loss": 0.72473681, "num_input_tokens_seen": 77052610, "step": 3570, "time_per_iteration": 2.668304681777954 }, { "auxiliary_loss_clip": 0.01158856, "auxiliary_loss_mlp": 0.01046783, "balance_loss_clip": 1.05765057, "balance_loss_mlp": 1.02955759, "epoch": 0.21470013527731852, "flos": 21835160461440.0, "grad_norm": 2.5503136440013647, "language_loss": 0.79031628, "learning_rate": 3.6526864645716666e-06, "loss": 0.81237268, "num_input_tokens_seen": 77072475, "step": 3571, "time_per_iteration": 4.066440105438232 }, { "auxiliary_loss_clip": 0.0113831, "auxiliary_loss_mlp": 0.01047146, "balance_loss_clip": 1.05283594, "balance_loss_mlp": 1.02703547, "epoch": 0.21476025852998648, "flos": 17603195880960.0, "grad_norm": 1.9606975528380188, "language_loss": 0.82601345, "learning_rate": 3.652467101342991e-06, "loss": 0.84786803, "num_input_tokens_seen": 77089930, "step": 3572, "time_per_iteration": 2.6096267700195312 }, { "auxiliary_loss_clip": 0.01134964, "auxiliary_loss_mlp": 0.01041355, "balance_loss_clip": 1.05588293, "balance_loss_mlp": 1.02358127, "epoch": 0.21482038178265445, "flos": 24828135264000.0, "grad_norm": 4.1014522432452285, "language_loss": 0.65240026, "learning_rate": 3.652247675452598e-06, "loss": 0.67416352, "num_input_tokens_seen": 77108970, "step": 3573, "time_per_iteration": 2.690986394882202 }, { "auxiliary_loss_clip": 0.01147698, "auxiliary_loss_mlp": 0.01048414, "balance_loss_clip": 1.05253768, "balance_loss_mlp": 1.03140295, "epoch": 0.2148805050353224, "flos": 23258228463360.0, "grad_norm": 2.3397683674355565, "language_loss": 0.75229824, "learning_rate": 3.652028186908807e-06, "loss": 0.77425939, "num_input_tokens_seen": 77126045, "step": 3574, "time_per_iteration": 2.621736526489258 }, { "auxiliary_loss_clip": 0.01138272, "auxiliary_loss_mlp": 0.01041549, "balance_loss_clip": 1.0526228, "balance_loss_mlp": 1.02414417, "epoch": 0.21494062828799038, "flos": 21321098968320.0, "grad_norm": 1.8157113535402463, "language_loss": 0.72179317, "learning_rate": 3.6518086357199416e-06, "loss": 0.74359143, "num_input_tokens_seen": 77144600, "step": 3575, "time_per_iteration": 4.362869501113892 }, { "auxiliary_loss_clip": 0.01126687, "auxiliary_loss_mlp": 0.01041186, "balance_loss_clip": 1.05261374, "balance_loss_mlp": 1.02422237, "epoch": 0.21500075154065834, "flos": 18843334894080.0, "grad_norm": 3.8402092268612216, "language_loss": 0.68255925, "learning_rate": 3.6515890218943277e-06, "loss": 0.70423794, "num_input_tokens_seen": 77162965, "step": 3576, "time_per_iteration": 2.665370225906372 }, { "auxiliary_loss_clip": 0.01138295, "auxiliary_loss_mlp": 0.01049053, "balance_loss_clip": 1.05064976, "balance_loss_mlp": 1.02859676, "epoch": 0.2150608747933263, "flos": 18441997257600.0, "grad_norm": 2.2101409055401566, "language_loss": 0.88707685, "learning_rate": 3.651369345440292e-06, "loss": 0.90895033, "num_input_tokens_seen": 77179960, "step": 3577, "time_per_iteration": 2.655118465423584 }, { "auxiliary_loss_clip": 0.01070337, "auxiliary_loss_mlp": 0.01022454, "balance_loss_clip": 1.0487709, "balance_loss_mlp": 1.01998615, "epoch": 0.2151209980459943, "flos": 66598242894720.0, "grad_norm": 0.8146982557647512, "language_loss": 0.56184745, "learning_rate": 3.6511496063661654e-06, "loss": 0.58277535, "num_input_tokens_seen": 77239500, "step": 3578, "time_per_iteration": 3.2133536338806152 }, { "auxiliary_loss_clip": 0.01144391, "auxiliary_loss_mlp": 0.00775114, "balance_loss_clip": 1.05492067, "balance_loss_mlp": 1.00130272, "epoch": 0.21518112129866226, "flos": 21575885114880.0, "grad_norm": 2.988933296047806, "language_loss": 0.88686001, "learning_rate": 3.6509298046802807e-06, "loss": 0.90605509, "num_input_tokens_seen": 77254680, "step": 3579, "time_per_iteration": 2.6801605224609375 }, { "auxiliary_loss_clip": 0.01143273, "auxiliary_loss_mlp": 0.0104707, "balance_loss_clip": 1.05253708, "balance_loss_mlp": 1.02945101, "epoch": 0.21524124455133023, "flos": 20047635112320.0, "grad_norm": 1.8556029181899094, "language_loss": 0.77953792, "learning_rate": 3.650709940390972e-06, "loss": 0.80144137, "num_input_tokens_seen": 77274060, "step": 3580, "time_per_iteration": 2.6932644844055176 }, { "auxiliary_loss_clip": 0.01145284, "auxiliary_loss_mlp": 0.01043211, "balance_loss_clip": 1.05702484, "balance_loss_mlp": 1.02543712, "epoch": 0.2153013678039982, "flos": 23951807153280.0, "grad_norm": 1.9843281400180077, "language_loss": 0.72948015, "learning_rate": 3.6504900135065775e-06, "loss": 0.75136507, "num_input_tokens_seen": 77293255, "step": 3581, "time_per_iteration": 2.712376117706299 }, { "auxiliary_loss_clip": 0.01138503, "auxiliary_loss_mlp": 0.0104555, "balance_loss_clip": 1.05348194, "balance_loss_mlp": 1.0269891, "epoch": 0.21536149105666616, "flos": 20594841880320.0, "grad_norm": 2.4257233983700113, "language_loss": 0.70726413, "learning_rate": 3.6502700240354357e-06, "loss": 0.72910464, "num_input_tokens_seen": 77312390, "step": 3582, "time_per_iteration": 2.67122220993042 }, { "auxiliary_loss_clip": 0.01154755, "auxiliary_loss_mlp": 0.01040327, "balance_loss_clip": 1.05591798, "balance_loss_mlp": 1.0227195, "epoch": 0.21542161430933413, "flos": 12860042895360.0, "grad_norm": 2.4025311229753363, "language_loss": 0.84906816, "learning_rate": 3.650049971985889e-06, "loss": 0.87101901, "num_input_tokens_seen": 77330985, "step": 3583, "time_per_iteration": 2.6395328044891357 }, { "auxiliary_loss_clip": 0.01133287, "auxiliary_loss_mlp": 0.01047024, "balance_loss_clip": 1.05368245, "balance_loss_mlp": 1.02971518, "epoch": 0.21548173756200212, "flos": 26103933504000.0, "grad_norm": 2.7569743809923533, "language_loss": 0.83223897, "learning_rate": 3.6498298573662824e-06, "loss": 0.85404205, "num_input_tokens_seen": 77350770, "step": 3584, "time_per_iteration": 2.730823040008545 }, { "auxiliary_loss_clip": 0.01118851, "auxiliary_loss_mlp": 0.00774813, "balance_loss_clip": 1.0520674, "balance_loss_mlp": 1.00120699, "epoch": 0.21554186081467008, "flos": 22163779013760.0, "grad_norm": 1.9634031706782962, "language_loss": 0.90054697, "learning_rate": 3.6496096801849625e-06, "loss": 0.9194836, "num_input_tokens_seen": 77370510, "step": 3585, "time_per_iteration": 2.722216844558716 }, { "auxiliary_loss_clip": 0.01145179, "auxiliary_loss_mlp": 0.01045359, "balance_loss_clip": 1.05783939, "balance_loss_mlp": 1.02793026, "epoch": 0.21560198406733805, "flos": 22966741595520.0, "grad_norm": 1.9859337557251673, "language_loss": 0.74663597, "learning_rate": 3.649389440450277e-06, "loss": 0.76854134, "num_input_tokens_seen": 77390645, "step": 3586, "time_per_iteration": 2.7681503295898438 }, { "auxiliary_loss_clip": 0.01120328, "auxiliary_loss_mlp": 0.01046334, "balance_loss_clip": 1.05628061, "balance_loss_mlp": 1.03011, "epoch": 0.215662107320006, "flos": 22784064001920.0, "grad_norm": 2.903090853788092, "language_loss": 0.83029532, "learning_rate": 3.6491691381705804e-06, "loss": 0.85196197, "num_input_tokens_seen": 77409655, "step": 3587, "time_per_iteration": 2.788416624069214 }, { "auxiliary_loss_clip": 0.01109364, "auxiliary_loss_mlp": 0.00776304, "balance_loss_clip": 1.05255485, "balance_loss_mlp": 1.00129569, "epoch": 0.21572223057267398, "flos": 30883859038080.0, "grad_norm": 1.7067147212291012, "language_loss": 0.75593436, "learning_rate": 3.648948773354224e-06, "loss": 0.774791, "num_input_tokens_seen": 77430560, "step": 3588, "time_per_iteration": 2.866584062576294 }, { "auxiliary_loss_clip": 0.01136336, "auxiliary_loss_mlp": 0.01039583, "balance_loss_clip": 1.04921389, "balance_loss_mlp": 1.0224762, "epoch": 0.21578235382534194, "flos": 26910487445760.0, "grad_norm": 1.721393113594195, "language_loss": 0.80745661, "learning_rate": 3.6487283460095643e-06, "loss": 0.82921582, "num_input_tokens_seen": 77455000, "step": 3589, "time_per_iteration": 2.8839404582977295 }, { "auxiliary_loss_clip": 0.01157121, "auxiliary_loss_mlp": 0.010363, "balance_loss_clip": 1.05677748, "balance_loss_mlp": 1.01992083, "epoch": 0.2158424770780099, "flos": 24425720219520.0, "grad_norm": 2.201221744880259, "language_loss": 0.72849286, "learning_rate": 3.648507856144961e-06, "loss": 0.75042707, "num_input_tokens_seen": 77475075, "step": 3590, "time_per_iteration": 2.6692256927490234 }, { "auxiliary_loss_clip": 0.01134591, "auxiliary_loss_mlp": 0.01044904, "balance_loss_clip": 1.05195427, "balance_loss_mlp": 1.02623618, "epoch": 0.2159026003306779, "flos": 23949975559680.0, "grad_norm": 2.25677544320114, "language_loss": 0.8402462, "learning_rate": 3.648287303768775e-06, "loss": 0.86204112, "num_input_tokens_seen": 77495945, "step": 3591, "time_per_iteration": 2.7531416416168213 }, { "auxiliary_loss_clip": 0.01123784, "auxiliary_loss_mlp": 0.01049552, "balance_loss_clip": 1.05391979, "balance_loss_mlp": 1.02972734, "epoch": 0.21596272358334587, "flos": 30040963511040.0, "grad_norm": 2.2410681113576585, "language_loss": 0.69175243, "learning_rate": 3.6480666888893686e-06, "loss": 0.71348578, "num_input_tokens_seen": 77517140, "step": 3592, "time_per_iteration": 2.8716177940368652 }, { "auxiliary_loss_clip": 0.01117322, "auxiliary_loss_mlp": 0.01050667, "balance_loss_clip": 1.04998767, "balance_loss_mlp": 1.03179634, "epoch": 0.21602284683601383, "flos": 20376217751040.0, "grad_norm": 2.3652325886308123, "language_loss": 0.84022737, "learning_rate": 3.647846011515108e-06, "loss": 0.86190724, "num_input_tokens_seen": 77536085, "step": 3593, "time_per_iteration": 2.7185158729553223 }, { "auxiliary_loss_clip": 0.01123006, "auxiliary_loss_mlp": 0.01048394, "balance_loss_clip": 1.05243289, "balance_loss_mlp": 1.029809, "epoch": 0.2160829700886818, "flos": 20777339905920.0, "grad_norm": 4.017970268493579, "language_loss": 0.75192308, "learning_rate": 3.6476252716543625e-06, "loss": 0.77363706, "num_input_tokens_seen": 77553675, "step": 3594, "time_per_iteration": 2.726027011871338 }, { "auxiliary_loss_clip": 0.01140408, "auxiliary_loss_mlp": 0.01044406, "balance_loss_clip": 1.05318236, "balance_loss_mlp": 1.02650058, "epoch": 0.21614309334134976, "flos": 22309755886080.0, "grad_norm": 1.541030891618627, "language_loss": 0.80459857, "learning_rate": 3.6474044693155007e-06, "loss": 0.82644665, "num_input_tokens_seen": 77573360, "step": 3595, "time_per_iteration": 2.66504168510437 }, { "auxiliary_loss_clip": 0.01119754, "auxiliary_loss_mlp": 0.01039521, "balance_loss_clip": 1.05060601, "balance_loss_mlp": 1.02125788, "epoch": 0.21620321659401773, "flos": 19609524927360.0, "grad_norm": 2.1030283577585007, "language_loss": 0.78930759, "learning_rate": 3.647183604506897e-06, "loss": 0.81090033, "num_input_tokens_seen": 77591865, "step": 3596, "time_per_iteration": 2.7159698009490967 }, { "auxiliary_loss_clip": 0.01080261, "auxiliary_loss_mlp": 0.01047978, "balance_loss_clip": 1.04591155, "balance_loss_mlp": 1.03106225, "epoch": 0.2162633398466857, "flos": 18844555956480.0, "grad_norm": 1.6709210997095376, "language_loss": 0.83061242, "learning_rate": 3.6469626772369253e-06, "loss": 0.85189474, "num_input_tokens_seen": 77611600, "step": 3597, "time_per_iteration": 2.79276704788208 }, { "auxiliary_loss_clip": 0.01133147, "auxiliary_loss_mlp": 0.00775626, "balance_loss_clip": 1.05385637, "balance_loss_mlp": 1.00146937, "epoch": 0.21632346309935369, "flos": 18768820129920.0, "grad_norm": 1.6388312470031852, "language_loss": 0.80549502, "learning_rate": 3.6467416875139642e-06, "loss": 0.8245827, "num_input_tokens_seen": 77630665, "step": 3598, "time_per_iteration": 2.6823580265045166 }, { "auxiliary_loss_clip": 0.01123845, "auxiliary_loss_mlp": 0.01051638, "balance_loss_clip": 1.05069876, "balance_loss_mlp": 1.03218365, "epoch": 0.21638358635202165, "flos": 26324173745280.0, "grad_norm": 1.9066675721358164, "language_loss": 0.82023275, "learning_rate": 3.6465206353463934e-06, "loss": 0.84198749, "num_input_tokens_seen": 77650835, "step": 3599, "time_per_iteration": 2.73583722114563 }, { "auxiliary_loss_clip": 0.0110774, "auxiliary_loss_mlp": 0.00775854, "balance_loss_clip": 1.04651821, "balance_loss_mlp": 1.00131536, "epoch": 0.21644370960468962, "flos": 20740854666240.0, "grad_norm": 2.996184273033617, "language_loss": 0.76724887, "learning_rate": 3.6462995207425947e-06, "loss": 0.78608489, "num_input_tokens_seen": 77669000, "step": 3600, "time_per_iteration": 2.695081949234009 }, { "auxiliary_loss_clip": 0.01112458, "auxiliary_loss_mlp": 0.01044855, "balance_loss_clip": 1.04869664, "balance_loss_mlp": 1.02886891, "epoch": 0.21650383285735758, "flos": 23952238116480.0, "grad_norm": 2.259096111885494, "language_loss": 0.80784452, "learning_rate": 3.6460783437109533e-06, "loss": 0.82941765, "num_input_tokens_seen": 77688745, "step": 3601, "time_per_iteration": 2.8094849586486816 }, { "auxiliary_loss_clip": 0.01155408, "auxiliary_loss_mlp": 0.01046912, "balance_loss_clip": 1.0550983, "balance_loss_mlp": 1.02973413, "epoch": 0.21656395611002555, "flos": 23696087253120.0, "grad_norm": 2.558776342313561, "language_loss": 0.83192647, "learning_rate": 3.6458571042598565e-06, "loss": 0.85394967, "num_input_tokens_seen": 77708445, "step": 3602, "time_per_iteration": 2.652876377105713 }, { "auxiliary_loss_clip": 0.0115161, "auxiliary_loss_mlp": 0.0105032, "balance_loss_clip": 1.0525223, "balance_loss_mlp": 1.03286743, "epoch": 0.2166240793626935, "flos": 20666052593280.0, "grad_norm": 1.768938326380195, "language_loss": 0.7449019, "learning_rate": 3.645635802397693e-06, "loss": 0.76692116, "num_input_tokens_seen": 77728465, "step": 3603, "time_per_iteration": 2.619614601135254 }, { "auxiliary_loss_clip": 0.01116481, "auxiliary_loss_mlp": 0.01047384, "balance_loss_clip": 1.04873598, "balance_loss_mlp": 1.02883554, "epoch": 0.2166842026153615, "flos": 21580410228480.0, "grad_norm": 1.6710689829239502, "language_loss": 0.74178421, "learning_rate": 3.645414438132855e-06, "loss": 0.76342291, "num_input_tokens_seen": 77746735, "step": 3604, "time_per_iteration": 2.730182647705078 }, { "auxiliary_loss_clip": 0.01138214, "auxiliary_loss_mlp": 0.01038079, "balance_loss_clip": 1.05246544, "balance_loss_mlp": 1.02124691, "epoch": 0.21674432586802947, "flos": 25629948610560.0, "grad_norm": 1.7167946204354523, "language_loss": 0.7990489, "learning_rate": 3.6451930114737366e-06, "loss": 0.82081187, "num_input_tokens_seen": 77768105, "step": 3605, "time_per_iteration": 2.67668080329895 }, { "auxiliary_loss_clip": 0.01079717, "auxiliary_loss_mlp": 0.01002026, "balance_loss_clip": 1.0400598, "balance_loss_mlp": 0.99942732, "epoch": 0.21680444912069743, "flos": 56417783616000.0, "grad_norm": 0.7112415560884942, "language_loss": 0.5834192, "learning_rate": 3.6449715224287347e-06, "loss": 0.6042366, "num_input_tokens_seen": 77833750, "step": 3606, "time_per_iteration": 3.2736570835113525 }, { "auxiliary_loss_clip": 0.01155294, "auxiliary_loss_mlp": 0.01043491, "balance_loss_clip": 1.05404341, "balance_loss_mlp": 1.02498984, "epoch": 0.2168645723733654, "flos": 23878944414720.0, "grad_norm": 2.2731951350022275, "language_loss": 0.73142302, "learning_rate": 3.644749971006248e-06, "loss": 0.75341088, "num_input_tokens_seen": 77853780, "step": 3607, "time_per_iteration": 4.267899990081787 }, { "auxiliary_loss_clip": 0.01133762, "auxiliary_loss_mlp": 0.01046639, "balance_loss_clip": 1.05282903, "balance_loss_mlp": 1.02789962, "epoch": 0.21692469562603336, "flos": 16946174257920.0, "grad_norm": 2.181379073292718, "language_loss": 0.76540339, "learning_rate": 3.6445283572146765e-06, "loss": 0.78720737, "num_input_tokens_seen": 77872575, "step": 3608, "time_per_iteration": 4.285630464553833 }, { "auxiliary_loss_clip": 0.01080204, "auxiliary_loss_mlp": 0.01047623, "balance_loss_clip": 1.04536235, "balance_loss_mlp": 1.0309217, "epoch": 0.21698481887870133, "flos": 25119047514240.0, "grad_norm": 2.042587105390135, "language_loss": 0.74584132, "learning_rate": 3.6443066810624255e-06, "loss": 0.76711953, "num_input_tokens_seen": 77892700, "step": 3609, "time_per_iteration": 2.802569627761841 }, { "auxiliary_loss_clip": 0.01131798, "auxiliary_loss_mlp": 0.01049353, "balance_loss_clip": 1.05227149, "balance_loss_mlp": 1.03159094, "epoch": 0.2170449421313693, "flos": 17894682748800.0, "grad_norm": 1.9074832440543417, "language_loss": 0.89132321, "learning_rate": 3.6440849425579e-06, "loss": 0.91313475, "num_input_tokens_seen": 77911060, "step": 3610, "time_per_iteration": 4.189727306365967 }, { "auxiliary_loss_clip": 0.01155294, "auxiliary_loss_mlp": 0.01044238, "balance_loss_clip": 1.05534768, "balance_loss_mlp": 1.02649963, "epoch": 0.2171050653840373, "flos": 22638446265600.0, "grad_norm": 2.058717355808165, "language_loss": 0.77779067, "learning_rate": 3.6438631417095095e-06, "loss": 0.79978603, "num_input_tokens_seen": 77929930, "step": 3611, "time_per_iteration": 2.6317896842956543 }, { "auxiliary_loss_clip": 0.01088447, "auxiliary_loss_mlp": 0.01047447, "balance_loss_clip": 1.04764366, "balance_loss_mlp": 1.03026867, "epoch": 0.21716518863670525, "flos": 19499997381120.0, "grad_norm": 2.3883055198257184, "language_loss": 0.63578451, "learning_rate": 3.6436412785256637e-06, "loss": 0.65714347, "num_input_tokens_seen": 77949060, "step": 3612, "time_per_iteration": 2.8771228790283203 }, { "auxiliary_loss_clip": 0.01091118, "auxiliary_loss_mlp": 0.01053996, "balance_loss_clip": 1.04585218, "balance_loss_mlp": 1.03454065, "epoch": 0.21722531188937322, "flos": 19792022952960.0, "grad_norm": 1.801964584441428, "language_loss": 0.75912857, "learning_rate": 3.643419353014776e-06, "loss": 0.78057969, "num_input_tokens_seen": 77967920, "step": 3613, "time_per_iteration": 2.710601568222046 }, { "auxiliary_loss_clip": 0.0110572, "auxiliary_loss_mlp": 0.01051253, "balance_loss_clip": 1.05008733, "balance_loss_mlp": 1.03121352, "epoch": 0.21728543514204118, "flos": 13334386924800.0, "grad_norm": 1.9293696862218277, "language_loss": 0.71047795, "learning_rate": 3.643197365185261e-06, "loss": 0.73204768, "num_input_tokens_seen": 77985330, "step": 3614, "time_per_iteration": 4.407632112503052 }, { "auxiliary_loss_clip": 0.0114355, "auxiliary_loss_mlp": 0.01048776, "balance_loss_clip": 1.05521107, "balance_loss_mlp": 1.0306083, "epoch": 0.21734555839470915, "flos": 15231870783360.0, "grad_norm": 1.7289280951335333, "language_loss": 0.73030001, "learning_rate": 3.6429753150455378e-06, "loss": 0.75222325, "num_input_tokens_seen": 78003105, "step": 3615, "time_per_iteration": 2.6358401775360107 }, { "auxiliary_loss_clip": 0.01145731, "auxiliary_loss_mlp": 0.01046632, "balance_loss_clip": 1.05206716, "balance_loss_mlp": 1.02703404, "epoch": 0.2174056816473771, "flos": 19973982274560.0, "grad_norm": 2.3648922858816976, "language_loss": 0.90127194, "learning_rate": 3.6427532026040263e-06, "loss": 0.92319548, "num_input_tokens_seen": 78019655, "step": 3616, "time_per_iteration": 2.659787178039551 }, { "auxiliary_loss_clip": 0.01103597, "auxiliary_loss_mlp": 0.01040899, "balance_loss_clip": 1.048136, "balance_loss_mlp": 1.02244496, "epoch": 0.21746580490004508, "flos": 16687293960960.0, "grad_norm": 2.928463545610362, "language_loss": 0.81107831, "learning_rate": 3.642531027869148e-06, "loss": 0.83252329, "num_input_tokens_seen": 78036025, "step": 3617, "time_per_iteration": 2.7723491191864014 }, { "auxiliary_loss_clip": 0.01132531, "auxiliary_loss_mlp": 0.01041286, "balance_loss_clip": 1.05330408, "balance_loss_mlp": 1.02382231, "epoch": 0.21752592815271307, "flos": 25772298209280.0, "grad_norm": 1.9251992817215786, "language_loss": 0.75688154, "learning_rate": 3.642308790849329e-06, "loss": 0.77861977, "num_input_tokens_seen": 78055645, "step": 3618, "time_per_iteration": 2.7608227729797363 }, { "auxiliary_loss_clip": 0.01147874, "auxiliary_loss_mlp": 0.01048647, "balance_loss_clip": 1.05600834, "balance_loss_mlp": 1.03045571, "epoch": 0.21758605140538104, "flos": 11254692349440.0, "grad_norm": 2.18435089101569, "language_loss": 0.69099152, "learning_rate": 3.642086491552996e-06, "loss": 0.71295673, "num_input_tokens_seen": 78071660, "step": 3619, "time_per_iteration": 2.671637773513794 }, { "auxiliary_loss_clip": 0.01144421, "auxiliary_loss_mlp": 0.01042659, "balance_loss_clip": 1.05394137, "balance_loss_mlp": 1.02482569, "epoch": 0.217646174658049, "flos": 19242625455360.0, "grad_norm": 4.829425462001391, "language_loss": 0.78716505, "learning_rate": 3.641864129988579e-06, "loss": 0.8090359, "num_input_tokens_seen": 78091265, "step": 3620, "time_per_iteration": 2.7232043743133545 }, { "auxiliary_loss_clip": 0.01148457, "auxiliary_loss_mlp": 0.01042109, "balance_loss_clip": 1.05161178, "balance_loss_mlp": 1.02507412, "epoch": 0.21770629791071697, "flos": 21945083057280.0, "grad_norm": 1.4663479636678602, "language_loss": 0.79966211, "learning_rate": 3.641641706164509e-06, "loss": 0.82156777, "num_input_tokens_seen": 78110095, "step": 3621, "time_per_iteration": 2.6326823234558105 }, { "auxiliary_loss_clip": 0.01143183, "auxiliary_loss_mlp": 0.01035793, "balance_loss_clip": 1.05334592, "balance_loss_mlp": 1.01955688, "epoch": 0.21776642116338493, "flos": 24936764970240.0, "grad_norm": 1.609721344037994, "language_loss": 0.87796915, "learning_rate": 3.641419220089221e-06, "loss": 0.89975888, "num_input_tokens_seen": 78129475, "step": 3622, "time_per_iteration": 2.6864428520202637 }, { "auxiliary_loss_clip": 0.01146899, "auxiliary_loss_mlp": 0.01037591, "balance_loss_clip": 1.05495822, "balance_loss_mlp": 1.01801729, "epoch": 0.2178265444160529, "flos": 17821317219840.0, "grad_norm": 1.856609178217172, "language_loss": 0.77077621, "learning_rate": 3.641196671771152e-06, "loss": 0.79262108, "num_input_tokens_seen": 78146880, "step": 3623, "time_per_iteration": 2.743601083755493 }, { "auxiliary_loss_clip": 0.01121788, "auxiliary_loss_mlp": 0.01052122, "balance_loss_clip": 1.05279899, "balance_loss_mlp": 1.03226197, "epoch": 0.2178866676687209, "flos": 17712902995200.0, "grad_norm": 2.4362835431673036, "language_loss": 0.84600008, "learning_rate": 3.640974061218741e-06, "loss": 0.86773914, "num_input_tokens_seen": 78165065, "step": 3624, "time_per_iteration": 2.7499353885650635 }, { "auxiliary_loss_clip": 0.01139543, "auxiliary_loss_mlp": 0.01057514, "balance_loss_clip": 1.05353129, "balance_loss_mlp": 1.03804684, "epoch": 0.21794679092138886, "flos": 16945851035520.0, "grad_norm": 2.4333310175924905, "language_loss": 0.78037703, "learning_rate": 3.640751388440429e-06, "loss": 0.80234766, "num_input_tokens_seen": 78180005, "step": 3625, "time_per_iteration": 2.6314821243286133 }, { "auxiliary_loss_clip": 0.01061536, "auxiliary_loss_mlp": 0.01003869, "balance_loss_clip": 1.03318405, "balance_loss_mlp": 1.00130582, "epoch": 0.21800691417405682, "flos": 63718566566400.0, "grad_norm": 0.8242097668179436, "language_loss": 0.60701489, "learning_rate": 3.64052865344466e-06, "loss": 0.62766898, "num_input_tokens_seen": 78245350, "step": 3626, "time_per_iteration": 3.257289409637451 }, { "auxiliary_loss_clip": 0.0112643, "auxiliary_loss_mlp": 0.00776719, "balance_loss_clip": 1.05120194, "balance_loss_mlp": 1.00134754, "epoch": 0.21806703742672479, "flos": 21616392677760.0, "grad_norm": 2.2464694521793094, "language_loss": 0.9077245, "learning_rate": 3.6403058562398795e-06, "loss": 0.92675602, "num_input_tokens_seen": 78264165, "step": 3627, "time_per_iteration": 2.6639885902404785 }, { "auxiliary_loss_clip": 0.0109778, "auxiliary_loss_mlp": 0.01043665, "balance_loss_clip": 1.04912198, "balance_loss_mlp": 1.02471113, "epoch": 0.21812716067939275, "flos": 19354882435200.0, "grad_norm": 1.8437472480823303, "language_loss": 0.73480809, "learning_rate": 3.6400829968345365e-06, "loss": 0.75622261, "num_input_tokens_seen": 78283745, "step": 3628, "time_per_iteration": 2.7430238723754883 }, { "auxiliary_loss_clip": 0.01151444, "auxiliary_loss_mlp": 0.01042108, "balance_loss_clip": 1.05143893, "balance_loss_mlp": 1.02391696, "epoch": 0.21818728393206072, "flos": 23548063305600.0, "grad_norm": 2.8127332529660296, "language_loss": 0.77337319, "learning_rate": 3.6398600752370826e-06, "loss": 0.79530871, "num_input_tokens_seen": 78302900, "step": 3629, "time_per_iteration": 2.6468687057495117 }, { "auxiliary_loss_clip": 0.01142447, "auxiliary_loss_mlp": 0.01044137, "balance_loss_clip": 1.0532223, "balance_loss_mlp": 1.02709055, "epoch": 0.21824740718472868, "flos": 30225652266240.0, "grad_norm": 1.7154004506833416, "language_loss": 0.71373391, "learning_rate": 3.63963709145597e-06, "loss": 0.73559982, "num_input_tokens_seen": 78326470, "step": 3630, "time_per_iteration": 2.7334208488464355 }, { "auxiliary_loss_clip": 0.01089422, "auxiliary_loss_mlp": 0.01040838, "balance_loss_clip": 1.04771948, "balance_loss_mlp": 1.02488792, "epoch": 0.21830753043739667, "flos": 26134672567680.0, "grad_norm": 2.4394061962398625, "language_loss": 0.76502508, "learning_rate": 3.6394140454996544e-06, "loss": 0.78632766, "num_input_tokens_seen": 78345810, "step": 3631, "time_per_iteration": 2.9277098178863525 }, { "auxiliary_loss_clip": 0.01153805, "auxiliary_loss_mlp": 0.01036973, "balance_loss_clip": 1.05322635, "balance_loss_mlp": 1.01950908, "epoch": 0.21836765369006464, "flos": 21720712752000.0, "grad_norm": 3.3333075141454556, "language_loss": 0.75291955, "learning_rate": 3.639190937376594e-06, "loss": 0.77482736, "num_input_tokens_seen": 78364085, "step": 3632, "time_per_iteration": 2.666961908340454 }, { "auxiliary_loss_clip": 0.01149425, "auxiliary_loss_mlp": 0.01038996, "balance_loss_clip": 1.05168736, "balance_loss_mlp": 1.02262831, "epoch": 0.2184277769427326, "flos": 19937604775680.0, "grad_norm": 2.135610011090477, "language_loss": 0.83723396, "learning_rate": 3.638967767095249e-06, "loss": 0.85911822, "num_input_tokens_seen": 78381385, "step": 3633, "time_per_iteration": 2.6193437576293945 }, { "auxiliary_loss_clip": 0.0112373, "auxiliary_loss_mlp": 0.01049933, "balance_loss_clip": 1.05514872, "balance_loss_mlp": 1.03280258, "epoch": 0.21848790019540057, "flos": 20340235301760.0, "grad_norm": 1.713148643324746, "language_loss": 0.81381643, "learning_rate": 3.6387445346640823e-06, "loss": 0.83555305, "num_input_tokens_seen": 78400500, "step": 3634, "time_per_iteration": 2.7383267879486084 }, { "auxiliary_loss_clip": 0.01144832, "auxiliary_loss_mlp": 0.01040423, "balance_loss_clip": 1.0548327, "balance_loss_mlp": 1.02263677, "epoch": 0.21854802344806853, "flos": 15450818135040.0, "grad_norm": 1.8988648345390304, "language_loss": 0.74810624, "learning_rate": 3.638521240091558e-06, "loss": 0.76995879, "num_input_tokens_seen": 78418340, "step": 3635, "time_per_iteration": 2.7461390495300293 }, { "auxiliary_loss_clip": 0.01124703, "auxiliary_loss_mlp": 0.01052922, "balance_loss_clip": 1.05011106, "balance_loss_mlp": 1.03524303, "epoch": 0.2186081467007365, "flos": 16320717711360.0, "grad_norm": 2.2147010555825295, "language_loss": 0.88340998, "learning_rate": 3.6382978833861445e-06, "loss": 0.90518618, "num_input_tokens_seen": 78434375, "step": 3636, "time_per_iteration": 2.631352186203003 }, { "auxiliary_loss_clip": 0.01121776, "auxiliary_loss_mlp": 0.00776363, "balance_loss_clip": 1.05596519, "balance_loss_mlp": 1.00133038, "epoch": 0.2186682699534045, "flos": 21689255416320.0, "grad_norm": 2.464516707854487, "language_loss": 0.76037598, "learning_rate": 3.638074464556311e-06, "loss": 0.77935731, "num_input_tokens_seen": 78451735, "step": 3637, "time_per_iteration": 2.823063373565674 }, { "auxiliary_loss_clip": 0.01137371, "auxiliary_loss_mlp": 0.0104323, "balance_loss_clip": 1.05512452, "balance_loss_mlp": 1.02393031, "epoch": 0.21872839320607246, "flos": 17739260599680.0, "grad_norm": 2.6753688852020328, "language_loss": 0.89996254, "learning_rate": 3.63785098361053e-06, "loss": 0.92176855, "num_input_tokens_seen": 78462730, "step": 3638, "time_per_iteration": 2.6404030323028564 }, { "auxiliary_loss_clip": 0.01142035, "auxiliary_loss_mlp": 0.01051888, "balance_loss_clip": 1.0538702, "balance_loss_mlp": 1.03351748, "epoch": 0.21878851645874042, "flos": 18652289431680.0, "grad_norm": 2.4375531856602692, "language_loss": 0.89243078, "learning_rate": 3.637627440557275e-06, "loss": 0.91436994, "num_input_tokens_seen": 78476300, "step": 3639, "time_per_iteration": 2.6214118003845215 }, { "auxiliary_loss_clip": 0.01134092, "auxiliary_loss_mlp": 0.00776277, "balance_loss_clip": 1.05406988, "balance_loss_mlp": 1.00129211, "epoch": 0.2188486397114084, "flos": 25557301353600.0, "grad_norm": 1.9800691484462982, "language_loss": 0.79167712, "learning_rate": 3.637403835405024e-06, "loss": 0.81078082, "num_input_tokens_seen": 78496135, "step": 3640, "time_per_iteration": 2.7559502124786377 }, { "auxiliary_loss_clip": 0.01149345, "auxiliary_loss_mlp": 0.01055855, "balance_loss_clip": 1.05816483, "balance_loss_mlp": 1.03617346, "epoch": 0.21890876296407635, "flos": 17892061056000.0, "grad_norm": 2.2045237000129942, "language_loss": 0.71708757, "learning_rate": 3.637180168162255e-06, "loss": 0.73913956, "num_input_tokens_seen": 78513855, "step": 3641, "time_per_iteration": 2.6673953533172607 }, { "auxiliary_loss_clip": 0.01130115, "auxiliary_loss_mlp": 0.0104373, "balance_loss_clip": 1.05217481, "balance_loss_mlp": 1.02593243, "epoch": 0.21896888621674432, "flos": 17749100926080.0, "grad_norm": 1.9358190088314053, "language_loss": 0.81427026, "learning_rate": 3.63695643883745e-06, "loss": 0.83600873, "num_input_tokens_seen": 78531740, "step": 3642, "time_per_iteration": 2.6722965240478516 }, { "auxiliary_loss_clip": 0.01150265, "auxiliary_loss_mlp": 0.01044184, "balance_loss_clip": 1.05707705, "balance_loss_mlp": 1.02520561, "epoch": 0.21902900946941228, "flos": 23076161400960.0, "grad_norm": 2.2890480980316865, "language_loss": 0.7124145, "learning_rate": 3.6367326474390928e-06, "loss": 0.73435903, "num_input_tokens_seen": 78549600, "step": 3643, "time_per_iteration": 2.6586625576019287 }, { "auxiliary_loss_clip": 0.01156283, "auxiliary_loss_mlp": 0.01046488, "balance_loss_clip": 1.05430686, "balance_loss_mlp": 1.02728367, "epoch": 0.21908913272208028, "flos": 48178545004800.0, "grad_norm": 2.705040309825256, "language_loss": 0.68497038, "learning_rate": 3.6365087939756696e-06, "loss": 0.70699811, "num_input_tokens_seen": 78573350, "step": 3644, "time_per_iteration": 2.835944414138794 }, { "auxiliary_loss_clip": 0.01157461, "auxiliary_loss_mlp": 0.01049851, "balance_loss_clip": 1.05381823, "balance_loss_mlp": 1.03175521, "epoch": 0.21914925597474824, "flos": 22236749493120.0, "grad_norm": 2.498314523319793, "language_loss": 0.77761143, "learning_rate": 3.636284878455669e-06, "loss": 0.79968452, "num_input_tokens_seen": 78591005, "step": 3645, "time_per_iteration": 2.6053528785705566 }, { "auxiliary_loss_clip": 0.01142456, "auxiliary_loss_mlp": 0.01054431, "balance_loss_clip": 1.05606842, "balance_loss_mlp": 1.03732491, "epoch": 0.2192093792274162, "flos": 22125605834880.0, "grad_norm": 3.1951942186566766, "language_loss": 0.82604313, "learning_rate": 3.636060900887582e-06, "loss": 0.84801197, "num_input_tokens_seen": 78610645, "step": 3646, "time_per_iteration": 4.198619842529297 }, { "auxiliary_loss_clip": 0.01141068, "auxiliary_loss_mlp": 0.01040772, "balance_loss_clip": 1.05287766, "balance_loss_mlp": 1.02365351, "epoch": 0.21926950248008417, "flos": 15669442264320.0, "grad_norm": 1.720246481727725, "language_loss": 0.82877636, "learning_rate": 3.635836861279901e-06, "loss": 0.85059476, "num_input_tokens_seen": 78628340, "step": 3647, "time_per_iteration": 4.229920387268066 }, { "auxiliary_loss_clip": 0.0115057, "auxiliary_loss_mlp": 0.01054202, "balance_loss_clip": 1.05145597, "balance_loss_mlp": 1.03685677, "epoch": 0.21932962573275214, "flos": 30262496641920.0, "grad_norm": 1.6932394069108108, "language_loss": 0.72652817, "learning_rate": 3.635612759641123e-06, "loss": 0.74857587, "num_input_tokens_seen": 78649355, "step": 3648, "time_per_iteration": 2.7226104736328125 }, { "auxiliary_loss_clip": 0.01110484, "auxiliary_loss_mlp": 0.01057841, "balance_loss_clip": 1.04757857, "balance_loss_mlp": 1.03643107, "epoch": 0.2193897489854201, "flos": 10780132838400.0, "grad_norm": 3.9115777702699175, "language_loss": 0.74917972, "learning_rate": 3.635388595979745e-06, "loss": 0.77086294, "num_input_tokens_seen": 78664915, "step": 3649, "time_per_iteration": 4.201031446456909 }, { "auxiliary_loss_clip": 0.01138726, "auxiliary_loss_mlp": 0.0105421, "balance_loss_clip": 1.0536499, "balance_loss_mlp": 1.03718746, "epoch": 0.21944987223808807, "flos": 19133313390720.0, "grad_norm": 1.8914434058388716, "language_loss": 0.86353791, "learning_rate": 3.635164370304267e-06, "loss": 0.88546729, "num_input_tokens_seen": 78681475, "step": 3650, "time_per_iteration": 2.6061322689056396 }, { "auxiliary_loss_clip": 0.01130852, "auxiliary_loss_mlp": 0.01052398, "balance_loss_clip": 1.04992914, "balance_loss_mlp": 1.03439701, "epoch": 0.21950999549075606, "flos": 22711093522560.0, "grad_norm": 2.798139483493165, "language_loss": 0.83541161, "learning_rate": 3.6349400826231927e-06, "loss": 0.85724407, "num_input_tokens_seen": 78702300, "step": 3651, "time_per_iteration": 2.7605133056640625 }, { "auxiliary_loss_clip": 0.01143643, "auxiliary_loss_mlp": 0.0105251, "balance_loss_clip": 1.05282581, "balance_loss_mlp": 1.03511763, "epoch": 0.21957011874342403, "flos": 10561329141120.0, "grad_norm": 1.9065881796375543, "language_loss": 0.74475014, "learning_rate": 3.634715732945027e-06, "loss": 0.76671165, "num_input_tokens_seen": 78720230, "step": 3652, "time_per_iteration": 2.597443103790283 }, { "auxiliary_loss_clip": 0.01038431, "auxiliary_loss_mlp": 0.01009267, "balance_loss_clip": 1.0361495, "balance_loss_mlp": 1.0068711, "epoch": 0.219630241996092, "flos": 65747913252480.0, "grad_norm": 0.7482502800744824, "language_loss": 0.51550615, "learning_rate": 3.6344913212782764e-06, "loss": 0.5359832, "num_input_tokens_seen": 78780200, "step": 3653, "time_per_iteration": 3.324497699737549 }, { "auxiliary_loss_clip": 0.01125533, "auxiliary_loss_mlp": 0.01062527, "balance_loss_clip": 1.05436754, "balance_loss_mlp": 1.04470527, "epoch": 0.21969036524875996, "flos": 23696518216320.0, "grad_norm": 1.9578946934595152, "language_loss": 0.75356162, "learning_rate": 3.6342668476314514e-06, "loss": 0.77544224, "num_input_tokens_seen": 78800575, "step": 3654, "time_per_iteration": 4.296064615249634 }, { "auxiliary_loss_clip": 0.01152337, "auxiliary_loss_mlp": 0.01051249, "balance_loss_clip": 1.05944824, "balance_loss_mlp": 1.03376114, "epoch": 0.21975048850142792, "flos": 19640910435840.0, "grad_norm": 1.8387519277823352, "language_loss": 0.72646022, "learning_rate": 3.634042312013064e-06, "loss": 0.74849606, "num_input_tokens_seen": 78819585, "step": 3655, "time_per_iteration": 2.6634860038757324 }, { "auxiliary_loss_clip": 0.01130021, "auxiliary_loss_mlp": 0.01048784, "balance_loss_clip": 1.05423379, "balance_loss_mlp": 1.03071189, "epoch": 0.21981061175409589, "flos": 22448550038400.0, "grad_norm": 1.722985511504472, "language_loss": 0.80795759, "learning_rate": 3.6338177144316276e-06, "loss": 0.82974565, "num_input_tokens_seen": 78837330, "step": 3656, "time_per_iteration": 2.730391502380371 }, { "auxiliary_loss_clip": 0.01124773, "auxiliary_loss_mlp": 0.00776202, "balance_loss_clip": 1.06113994, "balance_loss_mlp": 1.00139225, "epoch": 0.21987073500676388, "flos": 18151049093760.0, "grad_norm": 2.646453773467974, "language_loss": 0.84885842, "learning_rate": 3.63359305489566e-06, "loss": 0.86786819, "num_input_tokens_seen": 78854955, "step": 3657, "time_per_iteration": 2.657607078552246 }, { "auxiliary_loss_clip": 0.01142645, "auxiliary_loss_mlp": 0.01040533, "balance_loss_clip": 1.05631852, "balance_loss_mlp": 1.02260423, "epoch": 0.21993085825943184, "flos": 25626177682560.0, "grad_norm": 2.6990832263195585, "language_loss": 0.80355585, "learning_rate": 3.6333683334136803e-06, "loss": 0.82538766, "num_input_tokens_seen": 78874965, "step": 3658, "time_per_iteration": 2.6584107875823975 }, { "auxiliary_loss_clip": 0.01048937, "auxiliary_loss_mlp": 0.0100499, "balance_loss_clip": 1.03857517, "balance_loss_mlp": 1.00202215, "epoch": 0.2199909815120998, "flos": 70923217743360.0, "grad_norm": 0.7788612160796681, "language_loss": 0.58191586, "learning_rate": 3.6331435499942095e-06, "loss": 0.60245514, "num_input_tokens_seen": 78937740, "step": 3659, "time_per_iteration": 3.3395371437072754 }, { "auxiliary_loss_clip": 0.01111007, "auxiliary_loss_mlp": 0.0105329, "balance_loss_clip": 1.05029392, "balance_loss_mlp": 1.03471744, "epoch": 0.22005110476476777, "flos": 21543529939200.0, "grad_norm": 4.382741616753977, "language_loss": 0.7477597, "learning_rate": 3.632918704645772e-06, "loss": 0.76940262, "num_input_tokens_seen": 78955055, "step": 3660, "time_per_iteration": 2.782975435256958 }, { "auxiliary_loss_clip": 0.01147277, "auxiliary_loss_mlp": 0.01044652, "balance_loss_clip": 1.05691171, "balance_loss_mlp": 1.02653265, "epoch": 0.22011122801743574, "flos": 22054502862720.0, "grad_norm": 1.8856077512582532, "language_loss": 0.81484449, "learning_rate": 3.632693797376893e-06, "loss": 0.83676374, "num_input_tokens_seen": 78974895, "step": 3661, "time_per_iteration": 2.7780110836029053 }, { "auxiliary_loss_clip": 0.01126694, "auxiliary_loss_mlp": 0.01056397, "balance_loss_clip": 1.05167532, "balance_loss_mlp": 1.03800273, "epoch": 0.2201713512701037, "flos": 26687589598080.0, "grad_norm": 1.9746283079458686, "language_loss": 0.73154199, "learning_rate": 3.632468828196102e-06, "loss": 0.75337297, "num_input_tokens_seen": 78994990, "step": 3662, "time_per_iteration": 2.7189040184020996 }, { "auxiliary_loss_clip": 0.0113519, "auxiliary_loss_mlp": 0.01051686, "balance_loss_clip": 1.05718994, "balance_loss_mlp": 1.03555691, "epoch": 0.22023147452277167, "flos": 22162198815360.0, "grad_norm": 2.0576168655035714, "language_loss": 0.78066969, "learning_rate": 3.632243797111929e-06, "loss": 0.80253839, "num_input_tokens_seen": 79014405, "step": 3663, "time_per_iteration": 2.731412410736084 }, { "auxiliary_loss_clip": 0.01142837, "auxiliary_loss_mlp": 0.01063521, "balance_loss_clip": 1.05659413, "balance_loss_mlp": 1.04352939, "epoch": 0.22029159777543966, "flos": 22523280284160.0, "grad_norm": 1.752119258875799, "language_loss": 0.80294079, "learning_rate": 3.632018704132908e-06, "loss": 0.82500434, "num_input_tokens_seen": 79032375, "step": 3664, "time_per_iteration": 2.7043297290802 }, { "auxiliary_loss_clip": 0.01134207, "auxiliary_loss_mlp": 0.01044352, "balance_loss_clip": 1.05424213, "balance_loss_mlp": 1.02474177, "epoch": 0.22035172102810763, "flos": 13042469093760.0, "grad_norm": 3.138103913885462, "language_loss": 0.76388288, "learning_rate": 3.6317935492675742e-06, "loss": 0.78566849, "num_input_tokens_seen": 79049635, "step": 3665, "time_per_iteration": 2.68300199508667 }, { "auxiliary_loss_clip": 0.01128405, "auxiliary_loss_mlp": 0.01053304, "balance_loss_clip": 1.05599689, "balance_loss_mlp": 1.03589976, "epoch": 0.2204118442807756, "flos": 12165817760640.0, "grad_norm": 2.9738702224471583, "language_loss": 0.9800086, "learning_rate": 3.631568332524466e-06, "loss": 1.00182581, "num_input_tokens_seen": 79062890, "step": 3666, "time_per_iteration": 2.702584981918335 }, { "auxiliary_loss_clip": 0.01141573, "auxiliary_loss_mlp": 0.00776689, "balance_loss_clip": 1.05254698, "balance_loss_mlp": 1.00133562, "epoch": 0.22047196753344356, "flos": 40108806673920.0, "grad_norm": 1.894759892223008, "language_loss": 0.80946934, "learning_rate": 3.631343053912122e-06, "loss": 0.82865196, "num_input_tokens_seen": 79085495, "step": 3667, "time_per_iteration": 2.8920814990997314 }, { "auxiliary_loss_clip": 0.01149896, "auxiliary_loss_mlp": 0.01051036, "balance_loss_clip": 1.06145239, "balance_loss_mlp": 1.03161693, "epoch": 0.22053209078611152, "flos": 20701137202560.0, "grad_norm": 1.8771463594277091, "language_loss": 0.7736783, "learning_rate": 3.631117713439087e-06, "loss": 0.79568756, "num_input_tokens_seen": 79101820, "step": 3668, "time_per_iteration": 2.6733500957489014 }, { "auxiliary_loss_clip": 0.01143618, "auxiliary_loss_mlp": 0.01047462, "balance_loss_clip": 1.05955744, "balance_loss_mlp": 1.02972412, "epoch": 0.2205922140387795, "flos": 24716309247360.0, "grad_norm": 1.7809066581326154, "language_loss": 0.71624571, "learning_rate": 3.630892311113904e-06, "loss": 0.7381565, "num_input_tokens_seen": 79123320, "step": 3669, "time_per_iteration": 2.7298974990844727 }, { "auxiliary_loss_clip": 0.01155448, "auxiliary_loss_mlp": 0.01039044, "balance_loss_clip": 1.0544126, "balance_loss_mlp": 1.0217346, "epoch": 0.22065233729144745, "flos": 23477247642240.0, "grad_norm": 2.1257290130035082, "language_loss": 0.85160267, "learning_rate": 3.6306668469451215e-06, "loss": 0.87354761, "num_input_tokens_seen": 79141615, "step": 3670, "time_per_iteration": 2.6624948978424072 }, { "auxiliary_loss_clip": 0.01137906, "auxiliary_loss_mlp": 0.01042298, "balance_loss_clip": 1.05475712, "balance_loss_mlp": 1.02376091, "epoch": 0.22071246054411545, "flos": 35225566646400.0, "grad_norm": 1.8008957470192373, "language_loss": 0.76928926, "learning_rate": 3.6304413209412886e-06, "loss": 0.79109132, "num_input_tokens_seen": 79164910, "step": 3671, "time_per_iteration": 2.7914648056030273 }, { "auxiliary_loss_clip": 0.01126159, "auxiliary_loss_mlp": 0.01040764, "balance_loss_clip": 1.05423856, "balance_loss_mlp": 1.02281129, "epoch": 0.2207725837967834, "flos": 18150294908160.0, "grad_norm": 2.015071454696955, "language_loss": 0.80643147, "learning_rate": 3.6302157331109573e-06, "loss": 0.82810068, "num_input_tokens_seen": 79179685, "step": 3672, "time_per_iteration": 2.674381732940674 }, { "auxiliary_loss_clip": 0.01149005, "auxiliary_loss_mlp": 0.01047239, "balance_loss_clip": 1.05706501, "balance_loss_mlp": 1.02992952, "epoch": 0.22083270704945138, "flos": 20479675898880.0, "grad_norm": 2.222038104071356, "language_loss": 0.73278964, "learning_rate": 3.629990083462682e-06, "loss": 0.75475204, "num_input_tokens_seen": 79196285, "step": 3673, "time_per_iteration": 2.6856846809387207 }, { "auxiliary_loss_clip": 0.01121745, "auxiliary_loss_mlp": 0.01044908, "balance_loss_clip": 1.05473876, "balance_loss_mlp": 1.02608538, "epoch": 0.22089283030211934, "flos": 34125801984000.0, "grad_norm": 1.9530426336903413, "language_loss": 0.76384282, "learning_rate": 3.6297643720050203e-06, "loss": 0.78550935, "num_input_tokens_seen": 79216060, "step": 3674, "time_per_iteration": 2.816190242767334 }, { "auxiliary_loss_clip": 0.01156134, "auxiliary_loss_mlp": 0.01047969, "balance_loss_clip": 1.05650616, "balance_loss_mlp": 1.02850175, "epoch": 0.2209529535547873, "flos": 18077216688000.0, "grad_norm": 2.045565300481816, "language_loss": 0.74367136, "learning_rate": 3.6295385987465293e-06, "loss": 0.76571238, "num_input_tokens_seen": 79235145, "step": 3675, "time_per_iteration": 2.69748592376709 }, { "auxiliary_loss_clip": 0.01155113, "auxiliary_loss_mlp": 0.01045626, "balance_loss_clip": 1.05442023, "balance_loss_mlp": 1.02800727, "epoch": 0.22101307680745527, "flos": 27235335070080.0, "grad_norm": 1.898816078558846, "language_loss": 0.79801333, "learning_rate": 3.629312763695772e-06, "loss": 0.82002068, "num_input_tokens_seen": 79256960, "step": 3676, "time_per_iteration": 2.6792948246002197 }, { "auxiliary_loss_clip": 0.01133095, "auxiliary_loss_mlp": 0.01049823, "balance_loss_clip": 1.05366707, "balance_loss_mlp": 1.03257358, "epoch": 0.22107320006012326, "flos": 16543256423040.0, "grad_norm": 2.1537198076644954, "language_loss": 0.75327688, "learning_rate": 3.6290868668613107e-06, "loss": 0.77510607, "num_input_tokens_seen": 79274860, "step": 3677, "time_per_iteration": 2.781393527984619 }, { "auxiliary_loss_clip": 0.0111612, "auxiliary_loss_mlp": 0.01050059, "balance_loss_clip": 1.04986429, "balance_loss_mlp": 1.03212988, "epoch": 0.22113332331279123, "flos": 22054466949120.0, "grad_norm": 1.7875463894855461, "language_loss": 0.83287871, "learning_rate": 3.628860908251712e-06, "loss": 0.85454059, "num_input_tokens_seen": 79294005, "step": 3678, "time_per_iteration": 2.752838611602783 }, { "auxiliary_loss_clip": 0.01094052, "auxiliary_loss_mlp": 0.01058605, "balance_loss_clip": 1.04951406, "balance_loss_mlp": 1.03992522, "epoch": 0.2211934465654592, "flos": 26612787525120.0, "grad_norm": 1.6742153249136704, "language_loss": 0.89135075, "learning_rate": 3.6286348878755452e-06, "loss": 0.91287732, "num_input_tokens_seen": 79314005, "step": 3679, "time_per_iteration": 2.8282527923583984 }, { "auxiliary_loss_clip": 0.01147641, "auxiliary_loss_mlp": 0.01054276, "balance_loss_clip": 1.05507338, "balance_loss_mlp": 1.03615618, "epoch": 0.22125356981812716, "flos": 16360363347840.0, "grad_norm": 3.092644946410345, "language_loss": 0.8649044, "learning_rate": 3.6284088057413803e-06, "loss": 0.88692355, "num_input_tokens_seen": 79331030, "step": 3680, "time_per_iteration": 2.630829095840454 }, { "auxiliary_loss_clip": 0.0111249, "auxiliary_loss_mlp": 0.01052062, "balance_loss_clip": 1.05374503, "balance_loss_mlp": 1.03395414, "epoch": 0.22131369307079513, "flos": 21651118151040.0, "grad_norm": 1.9427224492838853, "language_loss": 0.81773758, "learning_rate": 3.6281826618577894e-06, "loss": 0.83938313, "num_input_tokens_seen": 79348560, "step": 3681, "time_per_iteration": 2.805880069732666 }, { "auxiliary_loss_clip": 0.01148508, "auxiliary_loss_mlp": 0.00775652, "balance_loss_clip": 1.0530386, "balance_loss_mlp": 1.00146043, "epoch": 0.2213738163234631, "flos": 19609524927360.0, "grad_norm": 2.296553230959153, "language_loss": 0.80099678, "learning_rate": 3.62795645623335e-06, "loss": 0.82023835, "num_input_tokens_seen": 79367175, "step": 3682, "time_per_iteration": 2.624234199523926 }, { "auxiliary_loss_clip": 0.0112405, "auxiliary_loss_mlp": 0.0105126, "balance_loss_clip": 1.0500052, "balance_loss_mlp": 1.03198409, "epoch": 0.22143393957613106, "flos": 23623404082560.0, "grad_norm": 1.6781760642146926, "language_loss": 0.77394038, "learning_rate": 3.627730188876638e-06, "loss": 0.7956934, "num_input_tokens_seen": 79388435, "step": 3683, "time_per_iteration": 2.6746323108673096 }, { "auxiliary_loss_clip": 0.01129753, "auxiliary_loss_mlp": 0.01051291, "balance_loss_clip": 1.05048668, "balance_loss_mlp": 1.03411245, "epoch": 0.22149406282879905, "flos": 26177801823360.0, "grad_norm": 2.1201256685163323, "language_loss": 0.72406399, "learning_rate": 3.627503859796234e-06, "loss": 0.7458744, "num_input_tokens_seen": 79407910, "step": 3684, "time_per_iteration": 2.695958375930786 }, { "auxiliary_loss_clip": 0.01084051, "auxiliary_loss_mlp": 0.01045612, "balance_loss_clip": 1.04670835, "balance_loss_mlp": 1.02571654, "epoch": 0.221554186081467, "flos": 14538758970240.0, "grad_norm": 2.1308896442870893, "language_loss": 0.79817796, "learning_rate": 3.6272774690007207e-06, "loss": 0.81947458, "num_input_tokens_seen": 79424020, "step": 3685, "time_per_iteration": 2.7443795204162598 }, { "auxiliary_loss_clip": 0.01147394, "auxiliary_loss_mlp": 0.01045457, "balance_loss_clip": 1.05201805, "balance_loss_mlp": 1.02867222, "epoch": 0.22161430933413498, "flos": 22238257864320.0, "grad_norm": 1.6870532517893482, "language_loss": 0.87305272, "learning_rate": 3.6270510164986823e-06, "loss": 0.89498115, "num_input_tokens_seen": 79445605, "step": 3686, "time_per_iteration": 4.388494968414307 }, { "auxiliary_loss_clip": 0.01137917, "auxiliary_loss_mlp": 0.0104367, "balance_loss_clip": 1.052562, "balance_loss_mlp": 1.02620554, "epoch": 0.22167443258680294, "flos": 23476529370240.0, "grad_norm": 1.8821221420403713, "language_loss": 0.78069639, "learning_rate": 3.626824502298707e-06, "loss": 0.80251229, "num_input_tokens_seen": 79463850, "step": 3687, "time_per_iteration": 4.123531103134155 }, { "auxiliary_loss_clip": 0.0112545, "auxiliary_loss_mlp": 0.01052599, "balance_loss_clip": 1.0494144, "balance_loss_mlp": 1.0331558, "epoch": 0.2217345558394709, "flos": 23221132692480.0, "grad_norm": 1.8251811803295879, "language_loss": 0.84860861, "learning_rate": 3.626597926409383e-06, "loss": 0.8703891, "num_input_tokens_seen": 79482845, "step": 3688, "time_per_iteration": 4.287938594818115 }, { "auxiliary_loss_clip": 0.01110764, "auxiliary_loss_mlp": 0.01051634, "balance_loss_clip": 1.04967332, "balance_loss_mlp": 1.03254843, "epoch": 0.22179467909213887, "flos": 20011078045440.0, "grad_norm": 1.7785994747216247, "language_loss": 0.81150943, "learning_rate": 3.6263712888393027e-06, "loss": 0.83313334, "num_input_tokens_seen": 79501550, "step": 3689, "time_per_iteration": 2.7521302700042725 }, { "auxiliary_loss_clip": 0.01124628, "auxiliary_loss_mlp": 0.01048971, "balance_loss_clip": 1.05078936, "balance_loss_mlp": 1.03131568, "epoch": 0.22185480234480687, "flos": 19683034110720.0, "grad_norm": 1.7481542974535997, "language_loss": 0.70018351, "learning_rate": 3.626144589597061e-06, "loss": 0.72191954, "num_input_tokens_seen": 79519680, "step": 3690, "time_per_iteration": 2.6664223670959473 }, { "auxiliary_loss_clip": 0.01147193, "auxiliary_loss_mlp": 0.00777365, "balance_loss_clip": 1.0537169, "balance_loss_mlp": 1.00153625, "epoch": 0.22191492559747483, "flos": 21981316901760.0, "grad_norm": 1.8112729447523994, "language_loss": 0.72609359, "learning_rate": 3.6259178286912528e-06, "loss": 0.74533916, "num_input_tokens_seen": 79539000, "step": 3691, "time_per_iteration": 2.6724495887756348 }, { "auxiliary_loss_clip": 0.01144688, "auxiliary_loss_mlp": 0.01046427, "balance_loss_clip": 1.05663919, "balance_loss_mlp": 1.0275923, "epoch": 0.2219750488501428, "flos": 23222066446080.0, "grad_norm": 1.8134603978799304, "language_loss": 0.71503472, "learning_rate": 3.625691006130477e-06, "loss": 0.73694593, "num_input_tokens_seen": 79559695, "step": 3692, "time_per_iteration": 2.6743686199188232 }, { "auxiliary_loss_clip": 0.01147828, "auxiliary_loss_mlp": 0.01048973, "balance_loss_clip": 1.05410266, "balance_loss_mlp": 1.03098464, "epoch": 0.22203517210281076, "flos": 22453685683200.0, "grad_norm": 2.1147705582229577, "language_loss": 0.87551594, "learning_rate": 3.6254641219233362e-06, "loss": 0.89748394, "num_input_tokens_seen": 79579095, "step": 3693, "time_per_iteration": 4.2962939739227295 }, { "auxiliary_loss_clip": 0.01141134, "auxiliary_loss_mlp": 0.01041066, "balance_loss_clip": 1.0537045, "balance_loss_mlp": 1.02479386, "epoch": 0.22209529535547873, "flos": 17564555825280.0, "grad_norm": 1.9865017520636683, "language_loss": 0.85553116, "learning_rate": 3.6252371760784325e-06, "loss": 0.87735319, "num_input_tokens_seen": 79596430, "step": 3694, "time_per_iteration": 2.585657835006714 }, { "auxiliary_loss_clip": 0.01107468, "auxiliary_loss_mlp": 0.01045482, "balance_loss_clip": 1.04370403, "balance_loss_mlp": 1.02640843, "epoch": 0.2221554186081467, "flos": 21469015175040.0, "grad_norm": 2.1752375595399136, "language_loss": 0.68740189, "learning_rate": 3.6250101686043725e-06, "loss": 0.70893133, "num_input_tokens_seen": 79615825, "step": 3695, "time_per_iteration": 2.744264841079712 }, { "auxiliary_loss_clip": 0.01118075, "auxiliary_loss_mlp": 0.01047291, "balance_loss_clip": 1.051736, "balance_loss_mlp": 1.0310905, "epoch": 0.22221554186081466, "flos": 27673445255040.0, "grad_norm": 1.6851408018575031, "language_loss": 0.71540272, "learning_rate": 3.6247830995097637e-06, "loss": 0.73705637, "num_input_tokens_seen": 79637875, "step": 3696, "time_per_iteration": 2.7320780754089355 }, { "auxiliary_loss_clip": 0.01140935, "auxiliary_loss_mlp": 0.0104304, "balance_loss_clip": 1.05123305, "balance_loss_mlp": 1.02455115, "epoch": 0.22227566511348265, "flos": 25958926298880.0, "grad_norm": 1.7186386141421306, "language_loss": 0.87905443, "learning_rate": 3.624555968803217e-06, "loss": 0.90089417, "num_input_tokens_seen": 79656970, "step": 3697, "time_per_iteration": 2.65919828414917 }, { "auxiliary_loss_clip": 0.01118987, "auxiliary_loss_mlp": 0.0104214, "balance_loss_clip": 1.04718316, "balance_loss_mlp": 1.0255338, "epoch": 0.22233578836615062, "flos": 39203678833920.0, "grad_norm": 1.6515031384229777, "language_loss": 0.65900242, "learning_rate": 3.624328776493346e-06, "loss": 0.6806137, "num_input_tokens_seen": 79680275, "step": 3698, "time_per_iteration": 2.7708024978637695 }, { "auxiliary_loss_clip": 0.01142696, "auxiliary_loss_mlp": 0.01049333, "balance_loss_clip": 1.05630088, "balance_loss_mlp": 1.03102303, "epoch": 0.22239591161881858, "flos": 36283782251520.0, "grad_norm": 1.9634592665257078, "language_loss": 0.82520199, "learning_rate": 3.6241015225887637e-06, "loss": 0.84712231, "num_input_tokens_seen": 79701255, "step": 3699, "time_per_iteration": 2.7743008136749268 }, { "auxiliary_loss_clip": 0.01129692, "auxiliary_loss_mlp": 0.01047594, "balance_loss_clip": 1.05154991, "balance_loss_mlp": 1.02939105, "epoch": 0.22245603487148655, "flos": 19719591177600.0, "grad_norm": 1.6711069078421557, "language_loss": 0.79384553, "learning_rate": 3.62387420709809e-06, "loss": 0.8156184, "num_input_tokens_seen": 79721315, "step": 3700, "time_per_iteration": 2.652172327041626 }, { "auxiliary_loss_clip": 0.01111144, "auxiliary_loss_mlp": 0.01045464, "balance_loss_clip": 1.04893112, "balance_loss_mlp": 1.02608061, "epoch": 0.2225161581241545, "flos": 46280450615040.0, "grad_norm": 2.123831341506728, "language_loss": 0.72503817, "learning_rate": 3.623646830029943e-06, "loss": 0.74660432, "num_input_tokens_seen": 79742705, "step": 3701, "time_per_iteration": 2.943124294281006 }, { "auxiliary_loss_clip": 0.01139412, "auxiliary_loss_mlp": 0.0104206, "balance_loss_clip": 1.05053067, "balance_loss_mlp": 1.0246197, "epoch": 0.22257628137682248, "flos": 23696194993920.0, "grad_norm": 1.9127522113256972, "language_loss": 0.79901838, "learning_rate": 3.6234193913929454e-06, "loss": 0.82083315, "num_input_tokens_seen": 79763000, "step": 3702, "time_per_iteration": 2.6978282928466797 }, { "auxiliary_loss_clip": 0.01129024, "auxiliary_loss_mlp": 0.01044082, "balance_loss_clip": 1.04707038, "balance_loss_mlp": 1.02655816, "epoch": 0.22263640462949044, "flos": 19353984595200.0, "grad_norm": 1.8258996761992496, "language_loss": 0.78237271, "learning_rate": 3.623191891195723e-06, "loss": 0.80410373, "num_input_tokens_seen": 79781335, "step": 3703, "time_per_iteration": 2.6528990268707275 }, { "auxiliary_loss_clip": 0.01140219, "auxiliary_loss_mlp": 0.01036919, "balance_loss_clip": 1.0503273, "balance_loss_mlp": 1.0171181, "epoch": 0.22269652788215843, "flos": 20776047016320.0, "grad_norm": 2.1693263198920563, "language_loss": 0.74490714, "learning_rate": 3.6229643294469005e-06, "loss": 0.76667851, "num_input_tokens_seen": 79800150, "step": 3704, "time_per_iteration": 2.679184913635254 }, { "auxiliary_loss_clip": 0.0110341, "auxiliary_loss_mlp": 0.01043861, "balance_loss_clip": 1.046996, "balance_loss_mlp": 1.02684951, "epoch": 0.2227566511348264, "flos": 47958843467520.0, "grad_norm": 1.8279463297536431, "language_loss": 0.644319, "learning_rate": 3.6227367061551074e-06, "loss": 0.66579175, "num_input_tokens_seen": 79822390, "step": 3705, "time_per_iteration": 2.972221612930298 }, { "auxiliary_loss_clip": 0.01037239, "auxiliary_loss_mlp": 0.01023153, "balance_loss_clip": 1.03748369, "balance_loss_mlp": 1.02111423, "epoch": 0.22281677438749437, "flos": 66218953230720.0, "grad_norm": 1.2472387125776994, "language_loss": 0.65169704, "learning_rate": 3.6225090213289766e-06, "loss": 0.67230093, "num_input_tokens_seen": 79873350, "step": 3706, "time_per_iteration": 3.118619203567505 }, { "auxiliary_loss_clip": 0.01116185, "auxiliary_loss_mlp": 0.01040401, "balance_loss_clip": 1.04938805, "balance_loss_mlp": 1.02290082, "epoch": 0.22287689764016233, "flos": 21871609787520.0, "grad_norm": 1.912279921070755, "language_loss": 0.80597419, "learning_rate": 3.622281274977141e-06, "loss": 0.8275401, "num_input_tokens_seen": 79891715, "step": 3707, "time_per_iteration": 2.6555368900299072 }, { "auxiliary_loss_clip": 0.01149897, "auxiliary_loss_mlp": 0.01039316, "balance_loss_clip": 1.05199265, "balance_loss_mlp": 1.02203059, "epoch": 0.2229370208928303, "flos": 27672475587840.0, "grad_norm": 1.9339558574691282, "language_loss": 0.78542316, "learning_rate": 3.6220534671082367e-06, "loss": 0.80731529, "num_input_tokens_seen": 79911175, "step": 3708, "time_per_iteration": 2.7179131507873535 }, { "auxiliary_loss_clip": 0.01128276, "auxiliary_loss_mlp": 0.01042525, "balance_loss_clip": 1.05055118, "balance_loss_mlp": 1.02363038, "epoch": 0.22299714414549826, "flos": 30154657034880.0, "grad_norm": 1.8085596793383067, "language_loss": 0.80606776, "learning_rate": 3.6218255977309024e-06, "loss": 0.82777578, "num_input_tokens_seen": 79931875, "step": 3709, "time_per_iteration": 2.810605764389038 }, { "auxiliary_loss_clip": 0.01135044, "auxiliary_loss_mlp": 0.00777248, "balance_loss_clip": 1.0480969, "balance_loss_mlp": 1.0014261, "epoch": 0.22305726739816625, "flos": 23143134309120.0, "grad_norm": 2.100780376064183, "language_loss": 0.69068789, "learning_rate": 3.6215976668537787e-06, "loss": 0.70981085, "num_input_tokens_seen": 79952445, "step": 3710, "time_per_iteration": 2.7197980880737305 }, { "auxiliary_loss_clip": 0.01111671, "auxiliary_loss_mlp": 0.01050475, "balance_loss_clip": 1.04630041, "balance_loss_mlp": 1.03220057, "epoch": 0.22311739065083422, "flos": 19172061187200.0, "grad_norm": 2.1025491711486763, "language_loss": 0.90782154, "learning_rate": 3.6213696744855096e-06, "loss": 0.92944294, "num_input_tokens_seen": 79971030, "step": 3711, "time_per_iteration": 2.808014154434204 }, { "auxiliary_loss_clip": 0.01117969, "auxiliary_loss_mlp": 0.01059175, "balance_loss_clip": 1.04696095, "balance_loss_mlp": 1.03921938, "epoch": 0.22317751390350218, "flos": 13617757319040.0, "grad_norm": 6.2447945102939615, "language_loss": 0.89070308, "learning_rate": 3.6211416206347395e-06, "loss": 0.91247451, "num_input_tokens_seen": 79982085, "step": 3712, "time_per_iteration": 2.6701955795288086 }, { "auxiliary_loss_clip": 0.01150852, "auxiliary_loss_mlp": 0.01044271, "balance_loss_clip": 1.05445373, "balance_loss_mlp": 1.02627039, "epoch": 0.22323763715617015, "flos": 11029065068160.0, "grad_norm": 5.249819485386642, "language_loss": 0.75858659, "learning_rate": 3.620913505310117e-06, "loss": 0.78053784, "num_input_tokens_seen": 79997460, "step": 3713, "time_per_iteration": 2.5961148738861084 }, { "auxiliary_loss_clip": 0.01106588, "auxiliary_loss_mlp": 0.01043158, "balance_loss_clip": 1.05345535, "balance_loss_mlp": 1.0252645, "epoch": 0.22329776040883811, "flos": 41351531466240.0, "grad_norm": 1.7774284049242903, "language_loss": 0.62422931, "learning_rate": 3.6206853285202917e-06, "loss": 0.6457268, "num_input_tokens_seen": 80022450, "step": 3714, "time_per_iteration": 2.9655838012695312 }, { "auxiliary_loss_clip": 0.0112071, "auxiliary_loss_mlp": 0.01033065, "balance_loss_clip": 1.05258489, "balance_loss_mlp": 1.0163759, "epoch": 0.22335788366150608, "flos": 25119478477440.0, "grad_norm": 5.465931600334143, "language_loss": 0.79076529, "learning_rate": 3.6204570902739164e-06, "loss": 0.81230301, "num_input_tokens_seen": 80042100, "step": 3715, "time_per_iteration": 2.8040106296539307 }, { "auxiliary_loss_clip": 0.01113318, "auxiliary_loss_mlp": 0.01049585, "balance_loss_clip": 1.05601192, "balance_loss_mlp": 1.03176367, "epoch": 0.22341800691417404, "flos": 16983377769600.0, "grad_norm": 2.696607190089822, "language_loss": 0.77416688, "learning_rate": 3.620228790579645e-06, "loss": 0.79579592, "num_input_tokens_seen": 80059690, "step": 3716, "time_per_iteration": 2.721008777618408 }, { "auxiliary_loss_clip": 0.01123787, "auxiliary_loss_mlp": 0.01043954, "balance_loss_clip": 1.04860306, "balance_loss_mlp": 1.02644157, "epoch": 0.22347813016684204, "flos": 14136738975360.0, "grad_norm": 3.4762745813408884, "language_loss": 0.79258984, "learning_rate": 3.6200004294461367e-06, "loss": 0.81426722, "num_input_tokens_seen": 80076060, "step": 3717, "time_per_iteration": 2.724637746810913 }, { "auxiliary_loss_clip": 0.0107853, "auxiliary_loss_mlp": 0.01042478, "balance_loss_clip": 1.04485083, "balance_loss_mlp": 1.02390504, "epoch": 0.22353825341951, "flos": 23583147914880.0, "grad_norm": 1.9798483733973138, "language_loss": 0.67890245, "learning_rate": 3.6197720068820497e-06, "loss": 0.70011252, "num_input_tokens_seen": 80094760, "step": 3718, "time_per_iteration": 2.8178799152374268 }, { "auxiliary_loss_clip": 0.01128946, "auxiliary_loss_mlp": 0.01043035, "balance_loss_clip": 1.04887676, "balance_loss_mlp": 1.02374721, "epoch": 0.22359837667217797, "flos": 29824206888960.0, "grad_norm": 1.6261924310986715, "language_loss": 0.81046188, "learning_rate": 3.619543522896045e-06, "loss": 0.83218175, "num_input_tokens_seen": 80114475, "step": 3719, "time_per_iteration": 2.8068079948425293 }, { "auxiliary_loss_clip": 0.0112823, "auxiliary_loss_mlp": 0.0105526, "balance_loss_clip": 1.05054009, "balance_loss_mlp": 1.03555441, "epoch": 0.22365849992484593, "flos": 17603088140160.0, "grad_norm": 2.128611791985372, "language_loss": 0.86535168, "learning_rate": 3.6193149774967885e-06, "loss": 0.88718653, "num_input_tokens_seen": 80132920, "step": 3720, "time_per_iteration": 2.726252794265747 }, { "auxiliary_loss_clip": 0.01123833, "auxiliary_loss_mlp": 0.01039252, "balance_loss_clip": 1.05347347, "balance_loss_mlp": 1.0207628, "epoch": 0.2237186231775139, "flos": 22710949868160.0, "grad_norm": 1.725668609175168, "language_loss": 0.7471531, "learning_rate": 3.619086370692945e-06, "loss": 0.76878393, "num_input_tokens_seen": 80152845, "step": 3721, "time_per_iteration": 2.77329158782959 }, { "auxiliary_loss_clip": 0.01158005, "auxiliary_loss_mlp": 0.01043442, "balance_loss_clip": 1.05607998, "balance_loss_mlp": 1.02497673, "epoch": 0.22377874643018186, "flos": 13371518609280.0, "grad_norm": 3.166607303525693, "language_loss": 0.7957024, "learning_rate": 3.6188577024931844e-06, "loss": 0.8177169, "num_input_tokens_seen": 80170680, "step": 3722, "time_per_iteration": 2.7204909324645996 }, { "auxiliary_loss_clip": 0.01113056, "auxiliary_loss_mlp": 0.01041868, "balance_loss_clip": 1.0520618, "balance_loss_mlp": 1.02571511, "epoch": 0.22383886968284986, "flos": 17894970057600.0, "grad_norm": 2.0043774256219997, "language_loss": 0.82129884, "learning_rate": 3.618628972906178e-06, "loss": 0.84284806, "num_input_tokens_seen": 80189030, "step": 3723, "time_per_iteration": 2.7908549308776855 }, { "auxiliary_loss_clip": 0.01155309, "auxiliary_loss_mlp": 0.01046826, "balance_loss_clip": 1.05468059, "balance_loss_mlp": 1.02857494, "epoch": 0.22389899293551782, "flos": 23879123982720.0, "grad_norm": 2.0838579777085022, "language_loss": 0.84742224, "learning_rate": 3.6184001819405984e-06, "loss": 0.86944354, "num_input_tokens_seen": 80208365, "step": 3724, "time_per_iteration": 2.691678047180176 }, { "auxiliary_loss_clip": 0.01123425, "auxiliary_loss_mlp": 0.01042537, "balance_loss_clip": 1.0494504, "balance_loss_mlp": 1.02516866, "epoch": 0.2239591161881858, "flos": 27272430840960.0, "grad_norm": 1.76453761267329, "language_loss": 0.79456621, "learning_rate": 3.618171329605121e-06, "loss": 0.81622583, "num_input_tokens_seen": 80228685, "step": 3725, "time_per_iteration": 4.339299917221069 }, { "auxiliary_loss_clip": 0.01091555, "auxiliary_loss_mlp": 0.01043361, "balance_loss_clip": 1.05116296, "balance_loss_mlp": 1.02538443, "epoch": 0.22401923944085375, "flos": 22236857233920.0, "grad_norm": 1.776149940187026, "language_loss": 0.77333415, "learning_rate": 3.6179424159084254e-06, "loss": 0.79468334, "num_input_tokens_seen": 80247635, "step": 3726, "time_per_iteration": 4.320322275161743 }, { "auxiliary_loss_clip": 0.0115151, "auxiliary_loss_mlp": 0.01047267, "balance_loss_clip": 1.05424356, "balance_loss_mlp": 1.02664328, "epoch": 0.22407936269352172, "flos": 12053668521600.0, "grad_norm": 2.83844669603944, "language_loss": 0.72643399, "learning_rate": 3.6177134408591914e-06, "loss": 0.74842173, "num_input_tokens_seen": 80260045, "step": 3727, "time_per_iteration": 4.218656539916992 }, { "auxiliary_loss_clip": 0.01157504, "auxiliary_loss_mlp": 0.01043436, "balance_loss_clip": 1.0541296, "balance_loss_mlp": 1.02321815, "epoch": 0.22413948594618968, "flos": 19353553632000.0, "grad_norm": 2.250671737688348, "language_loss": 0.86600292, "learning_rate": 3.6174844044661013e-06, "loss": 0.88801229, "num_input_tokens_seen": 80277680, "step": 3728, "time_per_iteration": 2.650423765182495 }, { "auxiliary_loss_clip": 0.01122602, "auxiliary_loss_mlp": 0.01053562, "balance_loss_clip": 1.050982, "balance_loss_mlp": 1.03134131, "epoch": 0.22419960919885765, "flos": 24170000319360.0, "grad_norm": 2.1953419048873877, "language_loss": 0.80038953, "learning_rate": 3.6172553067378406e-06, "loss": 0.82215106, "num_input_tokens_seen": 80294795, "step": 3729, "time_per_iteration": 2.7553794384002686 }, { "auxiliary_loss_clip": 0.01126228, "auxiliary_loss_mlp": 0.01046911, "balance_loss_clip": 1.05183935, "balance_loss_mlp": 1.02992368, "epoch": 0.22425973245152564, "flos": 27378977558400.0, "grad_norm": 1.8211738544282683, "language_loss": 0.86968076, "learning_rate": 3.6170261476830964e-06, "loss": 0.89141214, "num_input_tokens_seen": 80315425, "step": 3730, "time_per_iteration": 2.8044395446777344 }, { "auxiliary_loss_clip": 0.01121982, "auxiliary_loss_mlp": 0.00775761, "balance_loss_clip": 1.04924226, "balance_loss_mlp": 1.00148201, "epoch": 0.2243198557041936, "flos": 13735652734080.0, "grad_norm": 2.1817469574553017, "language_loss": 0.73091185, "learning_rate": 3.616796927310559e-06, "loss": 0.74988931, "num_input_tokens_seen": 80333905, "step": 3731, "time_per_iteration": 2.764198064804077 }, { "auxiliary_loss_clip": 0.01127044, "auxiliary_loss_mlp": 0.0104235, "balance_loss_clip": 1.05654919, "balance_loss_mlp": 1.02467108, "epoch": 0.22437997895686157, "flos": 19530700531200.0, "grad_norm": 2.1924274894904787, "language_loss": 0.75427651, "learning_rate": 3.6165676456289195e-06, "loss": 0.77597046, "num_input_tokens_seen": 80352165, "step": 3732, "time_per_iteration": 4.544835090637207 }, { "auxiliary_loss_clip": 0.01155285, "auxiliary_loss_mlp": 0.01053522, "balance_loss_clip": 1.05655456, "balance_loss_mlp": 1.03560436, "epoch": 0.22444010220952954, "flos": 23696230907520.0, "grad_norm": 1.745203479087184, "language_loss": 0.88139856, "learning_rate": 3.616338302646873e-06, "loss": 0.90348667, "num_input_tokens_seen": 80371305, "step": 3733, "time_per_iteration": 2.7097933292388916 }, { "auxiliary_loss_clip": 0.0110922, "auxiliary_loss_mlp": 0.01040674, "balance_loss_clip": 1.05094051, "balance_loss_mlp": 1.02264953, "epoch": 0.2245002254621975, "flos": 22382905933440.0, "grad_norm": 1.6873732683679492, "language_loss": 0.84643197, "learning_rate": 3.6161088983731166e-06, "loss": 0.86793089, "num_input_tokens_seen": 80391020, "step": 3734, "time_per_iteration": 2.7647547721862793 }, { "auxiliary_loss_clip": 0.0113181, "auxiliary_loss_mlp": 0.01049327, "balance_loss_clip": 1.05362856, "balance_loss_mlp": 1.03149319, "epoch": 0.22456034871486547, "flos": 26942303917440.0, "grad_norm": 1.774553175519815, "language_loss": 0.7679311, "learning_rate": 3.6158794328163482e-06, "loss": 0.78974247, "num_input_tokens_seen": 80411365, "step": 3735, "time_per_iteration": 2.7682430744171143 }, { "auxiliary_loss_clip": 0.01138858, "auxiliary_loss_mlp": 0.01045746, "balance_loss_clip": 1.06029248, "balance_loss_mlp": 1.02927136, "epoch": 0.22462047196753343, "flos": 28983538005120.0, "grad_norm": 1.671324371931155, "language_loss": 0.842767, "learning_rate": 3.6156499059852702e-06, "loss": 0.86461306, "num_input_tokens_seen": 80431075, "step": 3736, "time_per_iteration": 3.009368419647217 }, { "auxiliary_loss_clip": 0.0111279, "auxiliary_loss_mlp": 0.01044111, "balance_loss_clip": 1.05240226, "balance_loss_mlp": 1.02677774, "epoch": 0.22468059522020142, "flos": 20011329440640.0, "grad_norm": 1.8971112354532307, "language_loss": 0.86643183, "learning_rate": 3.615420317888586e-06, "loss": 0.88800085, "num_input_tokens_seen": 80449240, "step": 3737, "time_per_iteration": 2.792965888977051 }, { "auxiliary_loss_clip": 0.0115891, "auxiliary_loss_mlp": 0.0104972, "balance_loss_clip": 1.05792093, "balance_loss_mlp": 1.03051496, "epoch": 0.2247407184728694, "flos": 29314239546240.0, "grad_norm": 6.664079021041442, "language_loss": 0.79027152, "learning_rate": 3.6151906685350006e-06, "loss": 0.81235784, "num_input_tokens_seen": 80467900, "step": 3738, "time_per_iteration": 2.716878652572632 }, { "auxiliary_loss_clip": 0.01122737, "auxiliary_loss_mlp": 0.01047993, "balance_loss_clip": 1.0520165, "balance_loss_mlp": 1.0315063, "epoch": 0.22480084172553735, "flos": 22310366417280.0, "grad_norm": 1.837059456311059, "language_loss": 0.76693523, "learning_rate": 3.614960957933224e-06, "loss": 0.78864253, "num_input_tokens_seen": 80487100, "step": 3739, "time_per_iteration": 2.743222713470459 }, { "auxiliary_loss_clip": 0.01116493, "auxiliary_loss_mlp": 0.01049772, "balance_loss_clip": 1.05008686, "balance_loss_mlp": 1.03011417, "epoch": 0.22486096497820532, "flos": 25591272641280.0, "grad_norm": 2.2924613412630133, "language_loss": 0.74577379, "learning_rate": 3.6147311860919655e-06, "loss": 0.7674365, "num_input_tokens_seen": 80508625, "step": 3740, "time_per_iteration": 2.7339253425598145 }, { "auxiliary_loss_clip": 0.01152276, "auxiliary_loss_mlp": 0.01045147, "balance_loss_clip": 1.05556941, "balance_loss_mlp": 1.02728927, "epoch": 0.22492108823087328, "flos": 17639824775040.0, "grad_norm": 1.9086069443180373, "language_loss": 0.75610423, "learning_rate": 3.614501353019939e-06, "loss": 0.77807844, "num_input_tokens_seen": 80527345, "step": 3741, "time_per_iteration": 2.7347571849823 }, { "auxiliary_loss_clip": 0.01133279, "auxiliary_loss_mlp": 0.01039745, "balance_loss_clip": 1.05599904, "balance_loss_mlp": 1.02316284, "epoch": 0.22498121148354125, "flos": 16034653797120.0, "grad_norm": 1.7754272123040742, "language_loss": 0.87332213, "learning_rate": 3.6142714587258592e-06, "loss": 0.89505225, "num_input_tokens_seen": 80545545, "step": 3742, "time_per_iteration": 2.702103614807129 }, { "auxiliary_loss_clip": 0.01095068, "auxiliary_loss_mlp": 0.01053093, "balance_loss_clip": 1.04728913, "balance_loss_mlp": 1.03398395, "epoch": 0.22504133473620924, "flos": 24023772051840.0, "grad_norm": 2.1035678371185256, "language_loss": 0.812823, "learning_rate": 3.614041503218444e-06, "loss": 0.83430457, "num_input_tokens_seen": 80565040, "step": 3743, "time_per_iteration": 2.777566909790039 }, { "auxiliary_loss_clip": 0.01142483, "auxiliary_loss_mlp": 0.01040692, "balance_loss_clip": 1.05282855, "balance_loss_mlp": 1.02319252, "epoch": 0.2251014579888772, "flos": 16763963541120.0, "grad_norm": 2.836562973763206, "language_loss": 0.63821399, "learning_rate": 3.6138114865064134e-06, "loss": 0.66004574, "num_input_tokens_seen": 80582815, "step": 3744, "time_per_iteration": 2.6738698482513428 }, { "auxiliary_loss_clip": 0.01139201, "auxiliary_loss_mlp": 0.01043137, "balance_loss_clip": 1.05523586, "balance_loss_mlp": 1.0255779, "epoch": 0.22516158124154517, "flos": 13991013498240.0, "grad_norm": 4.405698565190268, "language_loss": 0.76340199, "learning_rate": 3.613581408598489e-06, "loss": 0.78522527, "num_input_tokens_seen": 80600865, "step": 3745, "time_per_iteration": 2.8423044681549072 }, { "auxiliary_loss_clip": 0.01116037, "auxiliary_loss_mlp": 0.0104407, "balance_loss_clip": 1.04906797, "balance_loss_mlp": 1.0267489, "epoch": 0.22522170449421314, "flos": 14390016750720.0, "grad_norm": 7.51155110796741, "language_loss": 0.8056733, "learning_rate": 3.6133512695033965e-06, "loss": 0.82727438, "num_input_tokens_seen": 80617455, "step": 3746, "time_per_iteration": 2.743417739868164 }, { "auxiliary_loss_clip": 0.01142091, "auxiliary_loss_mlp": 0.01050597, "balance_loss_clip": 1.05323768, "balance_loss_mlp": 1.0328114, "epoch": 0.2252818277468811, "flos": 23805542972160.0, "grad_norm": 2.6189948571262116, "language_loss": 0.86153656, "learning_rate": 3.613121069229862e-06, "loss": 0.88346344, "num_input_tokens_seen": 80635125, "step": 3747, "time_per_iteration": 2.7622148990631104 }, { "auxiliary_loss_clip": 0.01138021, "auxiliary_loss_mlp": 0.0077598, "balance_loss_clip": 1.05126321, "balance_loss_mlp": 1.00154519, "epoch": 0.22534195099954907, "flos": 24718033100160.0, "grad_norm": 2.3477587169419483, "language_loss": 0.76400602, "learning_rate": 3.6128908077866145e-06, "loss": 0.78314602, "num_input_tokens_seen": 80656370, "step": 3748, "time_per_iteration": 2.7347261905670166 }, { "auxiliary_loss_clip": 0.01156837, "auxiliary_loss_mlp": 0.01043045, "balance_loss_clip": 1.05704546, "balance_loss_mlp": 1.02525926, "epoch": 0.22540207425221703, "flos": 21032341534080.0, "grad_norm": 1.5503962030073002, "language_loss": 0.7984724, "learning_rate": 3.6126604851823864e-06, "loss": 0.82047117, "num_input_tokens_seen": 80676495, "step": 3749, "time_per_iteration": 2.6900558471679688 }, { "auxiliary_loss_clip": 0.01123701, "auxiliary_loss_mlp": 0.01041028, "balance_loss_clip": 1.05050755, "balance_loss_mlp": 1.02436304, "epoch": 0.22546219750488503, "flos": 19390362094080.0, "grad_norm": 3.015206251853355, "language_loss": 0.79585081, "learning_rate": 3.6124301014259108e-06, "loss": 0.81749809, "num_input_tokens_seen": 80694755, "step": 3750, "time_per_iteration": 2.727651596069336 }, { "auxiliary_loss_clip": 0.01097337, "auxiliary_loss_mlp": 0.01055462, "balance_loss_clip": 1.05065274, "balance_loss_mlp": 1.03756917, "epoch": 0.225522320757553, "flos": 25192628524800.0, "grad_norm": 2.662961533862713, "language_loss": 0.82433236, "learning_rate": 3.6121996565259244e-06, "loss": 0.84586036, "num_input_tokens_seen": 80713670, "step": 3751, "time_per_iteration": 2.827995538711548 }, { "auxiliary_loss_clip": 0.01121046, "auxiliary_loss_mlp": 0.01046103, "balance_loss_clip": 1.05429292, "balance_loss_mlp": 1.02828133, "epoch": 0.22558244401022096, "flos": 17163110448000.0, "grad_norm": 2.0142745824369315, "language_loss": 0.83813727, "learning_rate": 3.611969150491165e-06, "loss": 0.8598088, "num_input_tokens_seen": 80731450, "step": 3752, "time_per_iteration": 2.78725266456604 }, { "auxiliary_loss_clip": 0.01152116, "auxiliary_loss_mlp": 0.01037502, "balance_loss_clip": 1.05584741, "balance_loss_mlp": 1.02123034, "epoch": 0.22564256726288892, "flos": 15231008856960.0, "grad_norm": 1.9292267305553392, "language_loss": 0.78254855, "learning_rate": 3.611738583330375e-06, "loss": 0.80444479, "num_input_tokens_seen": 80748415, "step": 3753, "time_per_iteration": 2.7116169929504395 }, { "auxiliary_loss_clip": 0.01126321, "auxiliary_loss_mlp": 0.0104341, "balance_loss_clip": 1.05120027, "balance_loss_mlp": 1.02546871, "epoch": 0.2257026905155569, "flos": 34568652764160.0, "grad_norm": 1.8777790089425805, "language_loss": 0.78391469, "learning_rate": 3.611507955052295e-06, "loss": 0.80561191, "num_input_tokens_seen": 80770835, "step": 3754, "time_per_iteration": 2.91738224029541 }, { "auxiliary_loss_clip": 0.01128102, "auxiliary_loss_mlp": 0.01048192, "balance_loss_clip": 1.05648673, "balance_loss_mlp": 1.03040624, "epoch": 0.22576281376822485, "flos": 19938430788480.0, "grad_norm": 1.9337610105869587, "language_loss": 0.70648986, "learning_rate": 3.6112772656656727e-06, "loss": 0.72825277, "num_input_tokens_seen": 80787840, "step": 3755, "time_per_iteration": 2.7427992820739746 }, { "auxiliary_loss_clip": 0.01126515, "auxiliary_loss_mlp": 0.01053366, "balance_loss_clip": 1.05531752, "balance_loss_mlp": 1.03559232, "epoch": 0.22582293702089282, "flos": 24602005192320.0, "grad_norm": 3.9817469401483216, "language_loss": 0.77865845, "learning_rate": 3.6110465151792547e-06, "loss": 0.80045724, "num_input_tokens_seen": 80806335, "step": 3756, "time_per_iteration": 2.7879996299743652 }, { "auxiliary_loss_clip": 0.01132066, "auxiliary_loss_mlp": 0.01044227, "balance_loss_clip": 1.0559032, "balance_loss_mlp": 1.0261426, "epoch": 0.2258830602735608, "flos": 23035438356480.0, "grad_norm": 1.801741818571408, "language_loss": 0.82615864, "learning_rate": 3.6108157036017916e-06, "loss": 0.84792161, "num_input_tokens_seen": 80825355, "step": 3757, "time_per_iteration": 2.685218095779419 }, { "auxiliary_loss_clip": 0.01140048, "auxiliary_loss_mlp": 0.01047555, "balance_loss_clip": 1.05321026, "balance_loss_mlp": 1.02917302, "epoch": 0.22594318352622877, "flos": 22158427887360.0, "grad_norm": 2.3786564016745495, "language_loss": 0.73007452, "learning_rate": 3.6105848309420358e-06, "loss": 0.7519505, "num_input_tokens_seen": 80842570, "step": 3758, "time_per_iteration": 2.6716878414154053 }, { "auxiliary_loss_clip": 0.01137739, "auxiliary_loss_mlp": 0.01048984, "balance_loss_clip": 1.0577718, "balance_loss_mlp": 1.03019619, "epoch": 0.22600330677889674, "flos": 20594303176320.0, "grad_norm": 2.226232476294752, "language_loss": 0.77150333, "learning_rate": 3.6103538972087412e-06, "loss": 0.79337054, "num_input_tokens_seen": 80858745, "step": 3759, "time_per_iteration": 2.787487030029297 }, { "auxiliary_loss_clip": 0.01104852, "auxiliary_loss_mlp": 0.01043473, "balance_loss_clip": 1.04747176, "balance_loss_mlp": 1.02507949, "epoch": 0.2260634300315647, "flos": 35659798162560.0, "grad_norm": 1.6253921855068183, "language_loss": 0.78189945, "learning_rate": 3.6101229024106655e-06, "loss": 0.80338269, "num_input_tokens_seen": 80880085, "step": 3760, "time_per_iteration": 2.8760766983032227 }, { "auxiliary_loss_clip": 0.01042849, "auxiliary_loss_mlp": 0.01009599, "balance_loss_clip": 1.03235281, "balance_loss_mlp": 1.00633264, "epoch": 0.22612355328423267, "flos": 72090455126400.0, "grad_norm": 0.9481639821873915, "language_loss": 0.60083473, "learning_rate": 3.609891846556569e-06, "loss": 0.62135923, "num_input_tokens_seen": 80937660, "step": 3761, "time_per_iteration": 3.2168753147125244 }, { "auxiliary_loss_clip": 0.01114836, "auxiliary_loss_mlp": 0.01051216, "balance_loss_clip": 1.0493567, "balance_loss_mlp": 1.03295338, "epoch": 0.22618367653690064, "flos": 22783776693120.0, "grad_norm": 2.3328987294287047, "language_loss": 0.76767397, "learning_rate": 3.609660729655211e-06, "loss": 0.78933448, "num_input_tokens_seen": 80956265, "step": 3762, "time_per_iteration": 2.8012428283691406 }, { "auxiliary_loss_clip": 0.01128732, "auxiliary_loss_mlp": 0.01042327, "balance_loss_clip": 1.05266595, "balance_loss_mlp": 1.02190685, "epoch": 0.22624379978956863, "flos": 20448254476800.0, "grad_norm": 2.7297545785195907, "language_loss": 0.79000401, "learning_rate": 3.6094295517153573e-06, "loss": 0.81171465, "num_input_tokens_seen": 80975185, "step": 3763, "time_per_iteration": 2.7217857837677 }, { "auxiliary_loss_clip": 0.01142679, "auxiliary_loss_mlp": 0.01057425, "balance_loss_clip": 1.0557214, "balance_loss_mlp": 1.03835177, "epoch": 0.2263039230422366, "flos": 17494314779520.0, "grad_norm": 31.68022075556768, "language_loss": 0.91241246, "learning_rate": 3.6091983127457743e-06, "loss": 0.93441343, "num_input_tokens_seen": 80992830, "step": 3764, "time_per_iteration": 4.232046842575073 }, { "auxiliary_loss_clip": 0.01131876, "auxiliary_loss_mlp": 0.01055516, "balance_loss_clip": 1.05196834, "balance_loss_mlp": 1.0367409, "epoch": 0.22636404629490456, "flos": 28329748606080.0, "grad_norm": 1.9816130101247444, "language_loss": 0.75202596, "learning_rate": 3.6089670127552293e-06, "loss": 0.77389991, "num_input_tokens_seen": 81013675, "step": 3765, "time_per_iteration": 4.291628122329712 }, { "auxiliary_loss_clip": 0.01140284, "auxiliary_loss_mlp": 0.01047009, "balance_loss_clip": 1.05632913, "balance_loss_mlp": 1.02942574, "epoch": 0.22642416954757252, "flos": 17489143221120.0, "grad_norm": 2.1881182413466176, "language_loss": 0.8966549, "learning_rate": 3.608735651752494e-06, "loss": 0.91852784, "num_input_tokens_seen": 81030345, "step": 3766, "time_per_iteration": 2.6462960243225098 }, { "auxiliary_loss_clip": 0.01126107, "auxiliary_loss_mlp": 0.01047462, "balance_loss_clip": 1.05579042, "balance_loss_mlp": 1.02950931, "epoch": 0.2264842928002405, "flos": 24384530298240.0, "grad_norm": 1.6297384952566736, "language_loss": 0.74816859, "learning_rate": 3.6085042297463417e-06, "loss": 0.76990426, "num_input_tokens_seen": 81051000, "step": 3767, "time_per_iteration": 4.181917667388916 }, { "auxiliary_loss_clip": 0.01139766, "auxiliary_loss_mlp": 0.01048037, "balance_loss_clip": 1.05206823, "balance_loss_mlp": 1.02981031, "epoch": 0.22654441605290845, "flos": 19830519354240.0, "grad_norm": 1.6389844555489992, "language_loss": 0.71764815, "learning_rate": 3.6082727467455477e-06, "loss": 0.73952615, "num_input_tokens_seen": 81071205, "step": 3768, "time_per_iteration": 2.6622893810272217 }, { "auxiliary_loss_clip": 0.01143239, "auxiliary_loss_mlp": 0.01057198, "balance_loss_clip": 1.05766034, "balance_loss_mlp": 1.03895879, "epoch": 0.22660453930557642, "flos": 27454569730560.0, "grad_norm": 1.5883345705718652, "language_loss": 0.78320074, "learning_rate": 3.6080412027588905e-06, "loss": 0.80520505, "num_input_tokens_seen": 81091880, "step": 3769, "time_per_iteration": 2.692366123199463 }, { "auxiliary_loss_clip": 0.01121985, "auxiliary_loss_mlp": 0.01045951, "balance_loss_clip": 1.0452522, "balance_loss_mlp": 1.02712774, "epoch": 0.2266646625582444, "flos": 23988148738560.0, "grad_norm": 1.8427419299971495, "language_loss": 0.6877771, "learning_rate": 3.6078095977951488e-06, "loss": 0.70945644, "num_input_tokens_seen": 81113290, "step": 3770, "time_per_iteration": 2.7605137825012207 }, { "auxiliary_loss_clip": 0.01155061, "auxiliary_loss_mlp": 0.01053072, "balance_loss_clip": 1.0551908, "balance_loss_mlp": 1.03454649, "epoch": 0.22672478581091238, "flos": 26028054023040.0, "grad_norm": 1.6594447480271795, "language_loss": 0.80540276, "learning_rate": 3.6075779318631067e-06, "loss": 0.82748413, "num_input_tokens_seen": 81133535, "step": 3771, "time_per_iteration": 4.265140771865845 }, { "auxiliary_loss_clip": 0.0110854, "auxiliary_loss_mlp": 0.01058177, "balance_loss_clip": 1.04661536, "balance_loss_mlp": 1.04091501, "epoch": 0.22678490906358034, "flos": 23841812730240.0, "grad_norm": 1.6696234119475444, "language_loss": 0.78947794, "learning_rate": 3.6073462049715486e-06, "loss": 0.81114507, "num_input_tokens_seen": 81154650, "step": 3772, "time_per_iteration": 2.7325806617736816 }, { "auxiliary_loss_clip": 0.01036659, "auxiliary_loss_mlp": 0.0100656, "balance_loss_clip": 1.0461247, "balance_loss_mlp": 1.00336492, "epoch": 0.2268450323162483, "flos": 65048088574080.0, "grad_norm": 0.653194629863103, "language_loss": 0.54380804, "learning_rate": 3.607114417129261e-06, "loss": 0.56424022, "num_input_tokens_seen": 81221240, "step": 3773, "time_per_iteration": 3.3729567527770996 }, { "auxiliary_loss_clip": 0.0111914, "auxiliary_loss_mlp": 0.01046238, "balance_loss_clip": 1.05257821, "balance_loss_mlp": 1.02851129, "epoch": 0.22690515556891627, "flos": 22526081544960.0, "grad_norm": 1.81548541557593, "language_loss": 0.70406783, "learning_rate": 3.6068825683450334e-06, "loss": 0.7257216, "num_input_tokens_seen": 81241520, "step": 3774, "time_per_iteration": 2.7159364223480225 }, { "auxiliary_loss_clip": 0.01125586, "auxiliary_loss_mlp": 0.01046805, "balance_loss_clip": 1.05404115, "balance_loss_mlp": 1.02929282, "epoch": 0.22696527882158424, "flos": 18223444955520.0, "grad_norm": 2.2603412716687523, "language_loss": 0.74377871, "learning_rate": 3.606650658627658e-06, "loss": 0.76550257, "num_input_tokens_seen": 81256825, "step": 3775, "time_per_iteration": 2.7857720851898193 }, { "auxiliary_loss_clip": 0.01152024, "auxiliary_loss_mlp": 0.01045868, "balance_loss_clip": 1.05331159, "balance_loss_mlp": 1.02915478, "epoch": 0.22702540207425223, "flos": 17019252478080.0, "grad_norm": 1.8428958927362264, "language_loss": 0.81582248, "learning_rate": 3.606418687985928e-06, "loss": 0.83780146, "num_input_tokens_seen": 81275695, "step": 3776, "time_per_iteration": 2.6054935455322266 }, { "auxiliary_loss_clip": 0.01135081, "auxiliary_loss_mlp": 0.01043769, "balance_loss_clip": 1.05466735, "balance_loss_mlp": 1.02654314, "epoch": 0.2270855253269202, "flos": 21325731822720.0, "grad_norm": 1.7711090356153572, "language_loss": 0.82893199, "learning_rate": 3.606186656428641e-06, "loss": 0.85072052, "num_input_tokens_seen": 81294920, "step": 3777, "time_per_iteration": 2.722621202468872 }, { "auxiliary_loss_clip": 0.01127657, "auxiliary_loss_mlp": 0.01042436, "balance_loss_clip": 1.05438471, "balance_loss_mlp": 1.02435195, "epoch": 0.22714564857958816, "flos": 23550469516800.0, "grad_norm": 2.3905711679994295, "language_loss": 0.72538829, "learning_rate": 3.6059545639645955e-06, "loss": 0.74708927, "num_input_tokens_seen": 81314275, "step": 3778, "time_per_iteration": 2.730919599533081 }, { "auxiliary_loss_clip": 0.01112853, "auxiliary_loss_mlp": 0.01040216, "balance_loss_clip": 1.05304575, "balance_loss_mlp": 1.02241838, "epoch": 0.22720577183225613, "flos": 25989880844160.0, "grad_norm": 2.4150679449588535, "language_loss": 0.64176035, "learning_rate": 3.605722410602591e-06, "loss": 0.66329098, "num_input_tokens_seen": 81333890, "step": 3779, "time_per_iteration": 2.7663822174072266 }, { "auxiliary_loss_clip": 0.01132359, "auxiliary_loss_mlp": 0.01047274, "balance_loss_clip": 1.05292106, "balance_loss_mlp": 1.02928495, "epoch": 0.2272658950849241, "flos": 20814076540800.0, "grad_norm": 1.6627524387617407, "language_loss": 0.70659381, "learning_rate": 3.6054901963514323e-06, "loss": 0.72839016, "num_input_tokens_seen": 81353640, "step": 3780, "time_per_iteration": 2.666081666946411 }, { "auxiliary_loss_clip": 0.0114157, "auxiliary_loss_mlp": 0.01046965, "balance_loss_clip": 1.05450416, "balance_loss_mlp": 1.02880907, "epoch": 0.22732601833759206, "flos": 23909324342400.0, "grad_norm": 1.783300050979337, "language_loss": 0.89418924, "learning_rate": 3.6052579212199246e-06, "loss": 0.91607457, "num_input_tokens_seen": 81371595, "step": 3781, "time_per_iteration": 2.686478614807129 }, { "auxiliary_loss_clip": 0.01152428, "auxiliary_loss_mlp": 0.01041162, "balance_loss_clip": 1.05349672, "balance_loss_mlp": 1.02354264, "epoch": 0.22738614159026002, "flos": 15924407978880.0, "grad_norm": 19.977426185094338, "language_loss": 0.74404943, "learning_rate": 3.6050255852168753e-06, "loss": 0.76598531, "num_input_tokens_seen": 81388435, "step": 3782, "time_per_iteration": 2.5633177757263184 }, { "auxiliary_loss_clip": 0.01129007, "auxiliary_loss_mlp": 0.01045443, "balance_loss_clip": 1.05195391, "balance_loss_mlp": 1.02926588, "epoch": 0.22744626484292801, "flos": 24205515891840.0, "grad_norm": 2.051662638457334, "language_loss": 0.82665169, "learning_rate": 3.604793188351095e-06, "loss": 0.84839618, "num_input_tokens_seen": 81410195, "step": 3783, "time_per_iteration": 2.742572069168091 }, { "auxiliary_loss_clip": 0.01129724, "auxiliary_loss_mlp": 0.01043254, "balance_loss_clip": 1.055516, "balance_loss_mlp": 1.02495527, "epoch": 0.22750638809559598, "flos": 24791614110720.0, "grad_norm": 2.0126417567412256, "language_loss": 0.75996566, "learning_rate": 3.6045607306313964e-06, "loss": 0.78169543, "num_input_tokens_seen": 81430060, "step": 3784, "time_per_iteration": 2.7283668518066406 }, { "auxiliary_loss_clip": 0.01148666, "auxiliary_loss_mlp": 0.01041397, "balance_loss_clip": 1.05224681, "balance_loss_mlp": 1.02382576, "epoch": 0.22756651134826394, "flos": 22236498097920.0, "grad_norm": 1.784429661746796, "language_loss": 0.7105484, "learning_rate": 3.604328212066594e-06, "loss": 0.73244894, "num_input_tokens_seen": 81447375, "step": 3785, "time_per_iteration": 2.627401351928711 }, { "auxiliary_loss_clip": 0.01042691, "auxiliary_loss_mlp": 0.0101642, "balance_loss_clip": 1.03303862, "balance_loss_mlp": 1.01427412, "epoch": 0.2276266346009319, "flos": 62707466626560.0, "grad_norm": 0.8323137639565091, "language_loss": 0.6189881, "learning_rate": 3.6040956326655047e-06, "loss": 0.63957924, "num_input_tokens_seen": 81505235, "step": 3786, "time_per_iteration": 3.321380376815796 }, { "auxiliary_loss_clip": 0.01135149, "auxiliary_loss_mlp": 0.01044526, "balance_loss_clip": 1.0540669, "balance_loss_mlp": 1.02645397, "epoch": 0.22768675785359987, "flos": 18613936684800.0, "grad_norm": 2.677223616893363, "language_loss": 0.86047274, "learning_rate": 3.6038629924369486e-06, "loss": 0.8822695, "num_input_tokens_seen": 81518685, "step": 3787, "time_per_iteration": 2.72554349899292 }, { "auxiliary_loss_clip": 0.01129718, "auxiliary_loss_mlp": 0.01039908, "balance_loss_clip": 1.05296564, "balance_loss_mlp": 1.02323031, "epoch": 0.22774688110626784, "flos": 26870195364480.0, "grad_norm": 1.361320938410825, "language_loss": 0.72755021, "learning_rate": 3.6036302913897474e-06, "loss": 0.74924648, "num_input_tokens_seen": 81538940, "step": 3788, "time_per_iteration": 2.7717456817626953 }, { "auxiliary_loss_clip": 0.01125411, "auxiliary_loss_mlp": 0.01035437, "balance_loss_clip": 1.05099773, "balance_loss_mlp": 1.01800895, "epoch": 0.2278070043589358, "flos": 15553593924480.0, "grad_norm": 2.510042380876752, "language_loss": 0.67785919, "learning_rate": 3.6033975295327243e-06, "loss": 0.69946766, "num_input_tokens_seen": 81555525, "step": 3789, "time_per_iteration": 2.6492021083831787 }, { "auxiliary_loss_clip": 0.01114067, "auxiliary_loss_mlp": 0.01042939, "balance_loss_clip": 1.04577208, "balance_loss_mlp": 1.0244137, "epoch": 0.2278671276116038, "flos": 22416805393920.0, "grad_norm": 2.807016388048184, "language_loss": 0.76026487, "learning_rate": 3.6031647068747065e-06, "loss": 0.7818349, "num_input_tokens_seen": 81576305, "step": 3790, "time_per_iteration": 2.789419412612915 }, { "auxiliary_loss_clip": 0.01094774, "auxiliary_loss_mlp": 0.01043575, "balance_loss_clip": 1.04942632, "balance_loss_mlp": 1.02388144, "epoch": 0.22792725086427176, "flos": 20631363033600.0, "grad_norm": 2.1998519418279843, "language_loss": 0.9070015, "learning_rate": 3.602931823424522e-06, "loss": 0.92838502, "num_input_tokens_seen": 81594115, "step": 3791, "time_per_iteration": 2.74957275390625 }, { "auxiliary_loss_clip": 0.01143903, "auxiliary_loss_mlp": 0.01039768, "balance_loss_clip": 1.05332911, "balance_loss_mlp": 1.02229166, "epoch": 0.22798737411693973, "flos": 31428946903680.0, "grad_norm": 1.6288404079645773, "language_loss": 0.82029706, "learning_rate": 3.6026988791910026e-06, "loss": 0.84213376, "num_input_tokens_seen": 81615355, "step": 3792, "time_per_iteration": 2.7578563690185547 }, { "auxiliary_loss_clip": 0.01074793, "auxiliary_loss_mlp": 0.01002047, "balance_loss_clip": 1.03528738, "balance_loss_mlp": 0.99944824, "epoch": 0.2280474973696077, "flos": 52396685827200.0, "grad_norm": 1.1490057531785423, "language_loss": 0.65688264, "learning_rate": 3.602465874182981e-06, "loss": 0.67765105, "num_input_tokens_seen": 81662075, "step": 3793, "time_per_iteration": 2.892385959625244 }, { "auxiliary_loss_clip": 0.01156846, "auxiliary_loss_mlp": 0.01048751, "balance_loss_clip": 1.05509233, "balance_loss_mlp": 1.03063166, "epoch": 0.22810762062227566, "flos": 26396066816640.0, "grad_norm": 2.315054268007893, "language_loss": 0.77095032, "learning_rate": 3.602232808409293e-06, "loss": 0.79300624, "num_input_tokens_seen": 81681625, "step": 3794, "time_per_iteration": 2.6432933807373047 }, { "auxiliary_loss_clip": 0.01106797, "auxiliary_loss_mlp": 0.0104554, "balance_loss_clip": 1.04641223, "balance_loss_mlp": 1.02560771, "epoch": 0.22816774387494362, "flos": 25630271832960.0, "grad_norm": 2.8263872836139194, "language_loss": 0.80649161, "learning_rate": 3.6019996818787755e-06, "loss": 0.82801497, "num_input_tokens_seen": 81701170, "step": 3795, "time_per_iteration": 2.748461961746216 }, { "auxiliary_loss_clip": 0.01136851, "auxiliary_loss_mlp": 0.01049098, "balance_loss_clip": 1.0527277, "balance_loss_mlp": 1.03194404, "epoch": 0.22822786712761162, "flos": 22451602694400.0, "grad_norm": 1.970346796529307, "language_loss": 0.77348727, "learning_rate": 3.6017664946002704e-06, "loss": 0.79534674, "num_input_tokens_seen": 81721265, "step": 3796, "time_per_iteration": 2.6720409393310547 }, { "auxiliary_loss_clip": 0.01111647, "auxiliary_loss_mlp": 0.0077572, "balance_loss_clip": 1.04920197, "balance_loss_mlp": 1.00161827, "epoch": 0.22828799038027958, "flos": 12202554395520.0, "grad_norm": 3.9384070064251793, "language_loss": 0.95837742, "learning_rate": 3.6015332465826188e-06, "loss": 0.97725105, "num_input_tokens_seen": 81736565, "step": 3797, "time_per_iteration": 2.730684995651245 }, { "auxiliary_loss_clip": 0.01140956, "auxiliary_loss_mlp": 0.00774906, "balance_loss_clip": 1.05310869, "balance_loss_mlp": 1.00178146, "epoch": 0.22834811363294755, "flos": 22085708803200.0, "grad_norm": 2.215225796779507, "language_loss": 0.81875294, "learning_rate": 3.601299937834666e-06, "loss": 0.83791155, "num_input_tokens_seen": 81756240, "step": 3798, "time_per_iteration": 2.7082717418670654 }, { "auxiliary_loss_clip": 0.01113838, "auxiliary_loss_mlp": 0.01041342, "balance_loss_clip": 1.04808974, "balance_loss_mlp": 1.02263761, "epoch": 0.2284082368856155, "flos": 24860634094080.0, "grad_norm": 2.1089113145856344, "language_loss": 0.78796971, "learning_rate": 3.6010665683652596e-06, "loss": 0.8095215, "num_input_tokens_seen": 81775720, "step": 3799, "time_per_iteration": 2.7810587882995605 }, { "auxiliary_loss_clip": 0.01121546, "auxiliary_loss_mlp": 0.01055329, "balance_loss_clip": 1.04926765, "balance_loss_mlp": 1.03627968, "epoch": 0.22846836013828348, "flos": 23292882109440.0, "grad_norm": 1.7973625036918341, "language_loss": 0.75191152, "learning_rate": 3.6008331381832484e-06, "loss": 0.77368033, "num_input_tokens_seen": 81795830, "step": 3800, "time_per_iteration": 2.7185163497924805 }, { "auxiliary_loss_clip": 0.01121477, "auxiliary_loss_mlp": 0.01037963, "balance_loss_clip": 1.04833913, "balance_loss_mlp": 1.02235246, "epoch": 0.22852848339095144, "flos": 27416288810880.0, "grad_norm": 1.7410667809724167, "language_loss": 0.64073247, "learning_rate": 3.600599647297484e-06, "loss": 0.66232693, "num_input_tokens_seen": 81815745, "step": 3801, "time_per_iteration": 2.7509078979492188 }, { "auxiliary_loss_clip": 0.01129432, "auxiliary_loss_mlp": 0.01038736, "balance_loss_clip": 1.05498147, "balance_loss_mlp": 1.02301216, "epoch": 0.2285886066436194, "flos": 26321157002880.0, "grad_norm": 1.6732672610702524, "language_loss": 0.81560862, "learning_rate": 3.60036609571682e-06, "loss": 0.83729029, "num_input_tokens_seen": 81835155, "step": 3802, "time_per_iteration": 2.7188339233398438 }, { "auxiliary_loss_clip": 0.01126952, "auxiliary_loss_mlp": 0.0105215, "balance_loss_clip": 1.05203629, "balance_loss_mlp": 1.0342809, "epoch": 0.2286487298962874, "flos": 29716475022720.0, "grad_norm": 2.0652844737971625, "language_loss": 0.78909743, "learning_rate": 3.600132483450114e-06, "loss": 0.81088841, "num_input_tokens_seen": 81855655, "step": 3803, "time_per_iteration": 2.7760777473449707 }, { "auxiliary_loss_clip": 0.01109356, "auxiliary_loss_mlp": 0.01043096, "balance_loss_clip": 1.04399478, "balance_loss_mlp": 1.02511966, "epoch": 0.22870885314895537, "flos": 21287199507840.0, "grad_norm": 1.7519930287683254, "language_loss": 0.84902716, "learning_rate": 3.5998988105062235e-06, "loss": 0.87055165, "num_input_tokens_seen": 81876385, "step": 3804, "time_per_iteration": 5.891911745071411 }, { "auxiliary_loss_clip": 0.01141965, "auxiliary_loss_mlp": 0.01040951, "balance_loss_clip": 1.05229163, "balance_loss_mlp": 1.02440476, "epoch": 0.22876897640162333, "flos": 14939450161920.0, "grad_norm": 2.045415026345325, "language_loss": 0.76673448, "learning_rate": 3.59966507689401e-06, "loss": 0.78856367, "num_input_tokens_seen": 81893225, "step": 3805, "time_per_iteration": 2.643104076385498 }, { "auxiliary_loss_clip": 0.0112853, "auxiliary_loss_mlp": 0.00775286, "balance_loss_clip": 1.05192351, "balance_loss_mlp": 1.00156116, "epoch": 0.2288290996542913, "flos": 18113917409280.0, "grad_norm": 2.368547935700865, "language_loss": 0.78250653, "learning_rate": 3.5994312826223363e-06, "loss": 0.80154467, "num_input_tokens_seen": 81911350, "step": 3806, "time_per_iteration": 4.312817335128784 }, { "auxiliary_loss_clip": 0.01123441, "auxiliary_loss_mlp": 0.01052484, "balance_loss_clip": 1.05244482, "balance_loss_mlp": 1.03282619, "epoch": 0.22888922290695926, "flos": 39855457071360.0, "grad_norm": 2.0706298183861, "language_loss": 0.700813, "learning_rate": 3.5991974277000684e-06, "loss": 0.72257227, "num_input_tokens_seen": 81935420, "step": 3807, "time_per_iteration": 2.8060836791992188 }, { "auxiliary_loss_clip": 0.01143724, "auxiliary_loss_mlp": 0.01057417, "balance_loss_clip": 1.0545013, "balance_loss_mlp": 1.03891551, "epoch": 0.22894934615962723, "flos": 23403774372480.0, "grad_norm": 4.007429648995762, "language_loss": 0.6543591, "learning_rate": 3.5989635121360733e-06, "loss": 0.6763705, "num_input_tokens_seen": 81953845, "step": 3808, "time_per_iteration": 2.703885078430176 }, { "auxiliary_loss_clip": 0.0109921, "auxiliary_loss_mlp": 0.01061828, "balance_loss_clip": 1.04773676, "balance_loss_mlp": 1.04295671, "epoch": 0.22900946941229522, "flos": 18843011671680.0, "grad_norm": 2.028069656557901, "language_loss": 0.74749511, "learning_rate": 3.598729535939222e-06, "loss": 0.76910543, "num_input_tokens_seen": 81972100, "step": 3809, "time_per_iteration": 2.726862907409668 }, { "auxiliary_loss_clip": 0.01128097, "auxiliary_loss_mlp": 0.01053112, "balance_loss_clip": 1.0527637, "balance_loss_mlp": 1.03666139, "epoch": 0.22906959266496318, "flos": 22929394429440.0, "grad_norm": 1.6287389468918274, "language_loss": 0.81654954, "learning_rate": 3.5984954991183862e-06, "loss": 0.83836162, "num_input_tokens_seen": 81992760, "step": 3810, "time_per_iteration": 2.6750009059906006 }, { "auxiliary_loss_clip": 0.01132496, "auxiliary_loss_mlp": 0.01040979, "balance_loss_clip": 1.05216146, "balance_loss_mlp": 1.0247184, "epoch": 0.22912971591763115, "flos": 19354523299200.0, "grad_norm": 2.375204791625097, "language_loss": 0.78126299, "learning_rate": 3.598261401682441e-06, "loss": 0.80299771, "num_input_tokens_seen": 82009080, "step": 3811, "time_per_iteration": 4.302153587341309 }, { "auxiliary_loss_clip": 0.01130856, "auxiliary_loss_mlp": 0.00775213, "balance_loss_clip": 1.05357778, "balance_loss_mlp": 1.00159776, "epoch": 0.22918983917029911, "flos": 19933546538880.0, "grad_norm": 1.797699433224321, "language_loss": 0.82817954, "learning_rate": 3.5980272436402632e-06, "loss": 0.84724021, "num_input_tokens_seen": 82026705, "step": 3812, "time_per_iteration": 2.635796308517456 }, { "auxiliary_loss_clip": 0.01089198, "auxiliary_loss_mlp": 0.01067747, "balance_loss_clip": 1.04705882, "balance_loss_mlp": 1.0480535, "epoch": 0.22924996242296708, "flos": 16690885320960.0, "grad_norm": 3.3357789636694952, "language_loss": 0.82689399, "learning_rate": 3.5977930250007324e-06, "loss": 0.84846342, "num_input_tokens_seen": 82043245, "step": 3813, "time_per_iteration": 2.7896463871002197 }, { "auxiliary_loss_clip": 0.01135441, "auxiliary_loss_mlp": 0.01044219, "balance_loss_clip": 1.05230987, "balance_loss_mlp": 1.02743411, "epoch": 0.22931008567563504, "flos": 33036164956800.0, "grad_norm": 1.5779710642832598, "language_loss": 0.70018709, "learning_rate": 3.5975587457727298e-06, "loss": 0.72198373, "num_input_tokens_seen": 82066870, "step": 3814, "time_per_iteration": 2.759460687637329 }, { "auxiliary_loss_clip": 0.01141204, "auxiliary_loss_mlp": 0.01046745, "balance_loss_clip": 1.05307984, "balance_loss_mlp": 1.02947164, "epoch": 0.229370208928303, "flos": 23330696152320.0, "grad_norm": 2.3195881009003174, "language_loss": 0.66811371, "learning_rate": 3.597324405965139e-06, "loss": 0.6899932, "num_input_tokens_seen": 82083180, "step": 3815, "time_per_iteration": 2.6878743171691895 }, { "auxiliary_loss_clip": 0.01142177, "auxiliary_loss_mlp": 0.01045942, "balance_loss_clip": 1.05412412, "balance_loss_mlp": 1.02921689, "epoch": 0.229430332180971, "flos": 28617213150720.0, "grad_norm": 2.436037188170917, "language_loss": 0.83555114, "learning_rate": 3.597090005586848e-06, "loss": 0.85743231, "num_input_tokens_seen": 82102950, "step": 3816, "time_per_iteration": 2.702638626098633 }, { "auxiliary_loss_clip": 0.01142001, "auxiliary_loss_mlp": 0.01037145, "balance_loss_clip": 1.05649173, "balance_loss_mlp": 1.01952624, "epoch": 0.22949045543363897, "flos": 17238199829760.0, "grad_norm": 2.261586370580253, "language_loss": 0.8657164, "learning_rate": 3.596855544646742e-06, "loss": 0.88750786, "num_input_tokens_seen": 82119510, "step": 3817, "time_per_iteration": 2.6439061164855957 }, { "auxiliary_loss_clip": 0.01125222, "auxiliary_loss_mlp": 0.01048919, "balance_loss_clip": 1.0493896, "balance_loss_mlp": 1.03166902, "epoch": 0.22955057868630693, "flos": 27489438858240.0, "grad_norm": 3.8274774650765706, "language_loss": 0.74976468, "learning_rate": 3.5966210231537154e-06, "loss": 0.77150607, "num_input_tokens_seen": 82140095, "step": 3818, "time_per_iteration": 2.7610766887664795 }, { "auxiliary_loss_clip": 0.01146421, "auxiliary_loss_mlp": 0.01043004, "balance_loss_clip": 1.05866313, "balance_loss_mlp": 1.02550387, "epoch": 0.2296107019389749, "flos": 23476421629440.0, "grad_norm": 1.7490504114150227, "language_loss": 0.74682397, "learning_rate": 3.596386441116659e-06, "loss": 0.76871818, "num_input_tokens_seen": 82159510, "step": 3819, "time_per_iteration": 2.7125203609466553 }, { "auxiliary_loss_clip": 0.0114108, "auxiliary_loss_mlp": 0.0104377, "balance_loss_clip": 1.05479693, "balance_loss_mlp": 1.02630615, "epoch": 0.22967082519164286, "flos": 31285160760960.0, "grad_norm": 2.0230347194773732, "language_loss": 0.81103987, "learning_rate": 3.5961517985444684e-06, "loss": 0.83288836, "num_input_tokens_seen": 82179580, "step": 3820, "time_per_iteration": 2.7268714904785156 }, { "auxiliary_loss_clip": 0.01129285, "auxiliary_loss_mlp": 0.01044606, "balance_loss_clip": 1.05326903, "balance_loss_mlp": 1.02627158, "epoch": 0.22973094844431083, "flos": 14642935390080.0, "grad_norm": 2.2801321869619153, "language_loss": 0.69099033, "learning_rate": 3.595917095446042e-06, "loss": 0.71272922, "num_input_tokens_seen": 82195585, "step": 3821, "time_per_iteration": 2.659498691558838 }, { "auxiliary_loss_clip": 0.01098739, "auxiliary_loss_mlp": 0.01036962, "balance_loss_clip": 1.05118072, "balance_loss_mlp": 1.01888967, "epoch": 0.2297910716969788, "flos": 22823853292800.0, "grad_norm": 1.473505926288008, "language_loss": 0.82876307, "learning_rate": 3.5956823318302796e-06, "loss": 0.85012007, "num_input_tokens_seen": 82217530, "step": 3822, "time_per_iteration": 2.898287057876587 }, { "auxiliary_loss_clip": 0.01149833, "auxiliary_loss_mlp": 0.01044764, "balance_loss_clip": 1.05239797, "balance_loss_mlp": 1.02617884, "epoch": 0.2298511949496468, "flos": 23039029716480.0, "grad_norm": 2.077495396622281, "language_loss": 0.66552204, "learning_rate": 3.5954475077060833e-06, "loss": 0.68746805, "num_input_tokens_seen": 82237980, "step": 3823, "time_per_iteration": 2.6397016048431396 }, { "auxiliary_loss_clip": 0.01064018, "auxiliary_loss_mlp": 0.01005373, "balance_loss_clip": 1.04052305, "balance_loss_mlp": 1.00196409, "epoch": 0.22991131820231475, "flos": 66890914911360.0, "grad_norm": 0.8015900374762405, "language_loss": 0.56731141, "learning_rate": 3.595212623082357e-06, "loss": 0.5880053, "num_input_tokens_seen": 82301785, "step": 3824, "time_per_iteration": 3.2301526069641113 }, { "auxiliary_loss_clip": 0.01123513, "auxiliary_loss_mlp": 0.01037782, "balance_loss_clip": 1.0506382, "balance_loss_mlp": 1.02098525, "epoch": 0.22997144145498272, "flos": 17887248633600.0, "grad_norm": 2.0770938093466995, "language_loss": 0.7301755, "learning_rate": 3.594977677968009e-06, "loss": 0.7517885, "num_input_tokens_seen": 82317355, "step": 3825, "time_per_iteration": 2.6161818504333496 }, { "auxiliary_loss_clip": 0.01147516, "auxiliary_loss_mlp": 0.01049665, "balance_loss_clip": 1.05828226, "balance_loss_mlp": 1.03119957, "epoch": 0.23003156470765068, "flos": 24676843178880.0, "grad_norm": 1.8689845885894332, "language_loss": 0.87652314, "learning_rate": 3.5947426723719473e-06, "loss": 0.89849496, "num_input_tokens_seen": 82336645, "step": 3826, "time_per_iteration": 2.668858766555786 }, { "auxiliary_loss_clip": 0.01134406, "auxiliary_loss_mlp": 0.01045536, "balance_loss_clip": 1.05722022, "balance_loss_mlp": 1.02697468, "epoch": 0.23009168796031865, "flos": 15814126247040.0, "grad_norm": 2.4660324215504312, "language_loss": 0.81861693, "learning_rate": 3.594507606303083e-06, "loss": 0.84041631, "num_input_tokens_seen": 82354225, "step": 3827, "time_per_iteration": 2.67173171043396 }, { "auxiliary_loss_clip": 0.01083629, "auxiliary_loss_mlp": 0.01046658, "balance_loss_clip": 1.04976189, "balance_loss_mlp": 1.02728689, "epoch": 0.2301518112129866, "flos": 16212842190720.0, "grad_norm": 1.9417227311694012, "language_loss": 0.86676306, "learning_rate": 3.5942724797703314e-06, "loss": 0.88806593, "num_input_tokens_seen": 82370240, "step": 3828, "time_per_iteration": 2.7641990184783936 }, { "auxiliary_loss_clip": 0.01126786, "auxiliary_loss_mlp": 0.01048261, "balance_loss_clip": 1.05381465, "balance_loss_mlp": 1.02981901, "epoch": 0.2302119344656546, "flos": 20595452411520.0, "grad_norm": 2.6386744924703223, "language_loss": 0.7044189, "learning_rate": 3.594037292782607e-06, "loss": 0.72616941, "num_input_tokens_seen": 82389145, "step": 3829, "time_per_iteration": 2.6674952507019043 }, { "auxiliary_loss_clip": 0.01085573, "auxiliary_loss_mlp": 0.01045126, "balance_loss_clip": 1.04650855, "balance_loss_mlp": 1.02835345, "epoch": 0.23027205771832257, "flos": 26796901662720.0, "grad_norm": 1.6431866637768902, "language_loss": 0.84075069, "learning_rate": 3.5938020453488293e-06, "loss": 0.86205769, "num_input_tokens_seen": 82409185, "step": 3830, "time_per_iteration": 2.8631880283355713 }, { "auxiliary_loss_clip": 0.01132962, "auxiliary_loss_mlp": 0.01052116, "balance_loss_clip": 1.0506047, "balance_loss_mlp": 1.03415167, "epoch": 0.23033218097099054, "flos": 43873143068160.0, "grad_norm": 2.3429509345019213, "language_loss": 0.67036134, "learning_rate": 3.5935667374779177e-06, "loss": 0.6922121, "num_input_tokens_seen": 82432070, "step": 3831, "time_per_iteration": 2.91282320022583 }, { "auxiliary_loss_clip": 0.0111204, "auxiliary_loss_mlp": 0.01053367, "balance_loss_clip": 1.05277622, "balance_loss_mlp": 1.03496158, "epoch": 0.2303923042236585, "flos": 26067663745920.0, "grad_norm": 2.3469890931023194, "language_loss": 0.75711727, "learning_rate": 3.5933313691787957e-06, "loss": 0.7787714, "num_input_tokens_seen": 82450625, "step": 3832, "time_per_iteration": 2.759467601776123 }, { "auxiliary_loss_clip": 0.0110298, "auxiliary_loss_mlp": 0.01044565, "balance_loss_clip": 1.05044174, "balance_loss_mlp": 1.02596867, "epoch": 0.23045242747632647, "flos": 18296379521280.0, "grad_norm": 1.7769817461106177, "language_loss": 0.87558299, "learning_rate": 3.593095940460389e-06, "loss": 0.89705843, "num_input_tokens_seen": 82468575, "step": 3833, "time_per_iteration": 2.8548035621643066 }, { "auxiliary_loss_clip": 0.01116173, "auxiliary_loss_mlp": 0.01046082, "balance_loss_clip": 1.05032015, "balance_loss_mlp": 1.02814126, "epoch": 0.23051255072899443, "flos": 25520528805120.0, "grad_norm": 2.030934473686878, "language_loss": 0.74736786, "learning_rate": 3.592860451331624e-06, "loss": 0.7689904, "num_input_tokens_seen": 82488655, "step": 3834, "time_per_iteration": 2.719237804412842 }, { "auxiliary_loss_clip": 0.01104525, "auxiliary_loss_mlp": 0.01064338, "balance_loss_clip": 1.04610491, "balance_loss_mlp": 1.043679, "epoch": 0.2305726739816624, "flos": 21215198695680.0, "grad_norm": 1.9050082770497696, "language_loss": 0.86071098, "learning_rate": 3.592624901801432e-06, "loss": 0.88239956, "num_input_tokens_seen": 82507220, "step": 3835, "time_per_iteration": 2.627782106399536 }, { "auxiliary_loss_clip": 0.01115977, "auxiliary_loss_mlp": 0.01060727, "balance_loss_clip": 1.04934275, "balance_loss_mlp": 1.03979373, "epoch": 0.2306327972343304, "flos": 23331127115520.0, "grad_norm": 2.798777841757382, "language_loss": 0.82434011, "learning_rate": 3.5923892918787432e-06, "loss": 0.84610713, "num_input_tokens_seen": 82527920, "step": 3836, "time_per_iteration": 2.6091606616973877 }, { "auxiliary_loss_clip": 0.01144536, "auxiliary_loss_mlp": 0.0105466, "balance_loss_clip": 1.06090033, "balance_loss_mlp": 1.03683817, "epoch": 0.23069292048699835, "flos": 20666734951680.0, "grad_norm": 1.7189193248017045, "language_loss": 0.79633009, "learning_rate": 3.5921536215724934e-06, "loss": 0.81832206, "num_input_tokens_seen": 82549040, "step": 3837, "time_per_iteration": 2.535435914993286 }, { "auxiliary_loss_clip": 0.01057695, "auxiliary_loss_mlp": 0.01033541, "balance_loss_clip": 1.04840386, "balance_loss_mlp": 1.03003633, "epoch": 0.23075304373966632, "flos": 70454832393600.0, "grad_norm": 0.9031703200773207, "language_loss": 0.65381849, "learning_rate": 3.5919178908916184e-06, "loss": 0.67473078, "num_input_tokens_seen": 82604070, "step": 3838, "time_per_iteration": 3.0868518352508545 }, { "auxiliary_loss_clip": 0.01138177, "auxiliary_loss_mlp": 0.01056497, "balance_loss_clip": 1.05361629, "balance_loss_mlp": 1.0395453, "epoch": 0.23081316699233428, "flos": 16617986668800.0, "grad_norm": 2.5143705705619097, "language_loss": 0.75403488, "learning_rate": 3.591682099845058e-06, "loss": 0.77598161, "num_input_tokens_seen": 82619665, "step": 3839, "time_per_iteration": 2.6391067504882812 }, { "auxiliary_loss_clip": 0.01125705, "auxiliary_loss_mlp": 0.01046933, "balance_loss_clip": 1.05447173, "balance_loss_mlp": 1.02882481, "epoch": 0.23087329024500225, "flos": 13298081253120.0, "grad_norm": 1.8684605740856612, "language_loss": 0.68962026, "learning_rate": 3.591446248441752e-06, "loss": 0.71134663, "num_input_tokens_seen": 82637530, "step": 3840, "time_per_iteration": 2.6295006275177 }, { "auxiliary_loss_clip": 0.01158019, "auxiliary_loss_mlp": 0.01046048, "balance_loss_clip": 1.05840647, "balance_loss_mlp": 1.026057, "epoch": 0.23093341349767021, "flos": 17785729820160.0, "grad_norm": 2.5615469809997697, "language_loss": 0.80033958, "learning_rate": 3.591210336690645e-06, "loss": 0.8223803, "num_input_tokens_seen": 82656130, "step": 3841, "time_per_iteration": 2.6512410640716553 }, { "auxiliary_loss_clip": 0.01145317, "auxiliary_loss_mlp": 0.01047066, "balance_loss_clip": 1.05756617, "balance_loss_mlp": 1.0301621, "epoch": 0.23099353675033818, "flos": 23988076911360.0, "grad_norm": 1.7953422744525294, "language_loss": 0.83389241, "learning_rate": 3.590974364600683e-06, "loss": 0.85581625, "num_input_tokens_seen": 82675295, "step": 3842, "time_per_iteration": 2.7676117420196533 }, { "auxiliary_loss_clip": 0.01144752, "auxiliary_loss_mlp": 0.01044783, "balance_loss_clip": 1.05491304, "balance_loss_mlp": 1.02650845, "epoch": 0.23105366000300617, "flos": 35995168471680.0, "grad_norm": 1.8421697704365976, "language_loss": 0.66661239, "learning_rate": 3.5907383321808135e-06, "loss": 0.68850774, "num_input_tokens_seen": 82703260, "step": 3843, "time_per_iteration": 5.82958722114563 }, { "auxiliary_loss_clip": 0.01142299, "auxiliary_loss_mlp": 0.01047166, "balance_loss_clip": 1.05609, "balance_loss_mlp": 1.02914143, "epoch": 0.23111378325567414, "flos": 31245335556480.0, "grad_norm": 1.8996188882256444, "language_loss": 0.77221334, "learning_rate": 3.590502239439987e-06, "loss": 0.79410803, "num_input_tokens_seen": 82725060, "step": 3844, "time_per_iteration": 2.771226406097412 }, { "auxiliary_loss_clip": 0.01141796, "auxiliary_loss_mlp": 0.01045598, "balance_loss_clip": 1.05503309, "balance_loss_mlp": 1.02607179, "epoch": 0.2311739065083421, "flos": 19208223204480.0, "grad_norm": 1.9651801579729304, "language_loss": 0.78155982, "learning_rate": 3.590266086387156e-06, "loss": 0.80343372, "num_input_tokens_seen": 82742960, "step": 3845, "time_per_iteration": 4.247429370880127 }, { "auxiliary_loss_clip": 0.01117167, "auxiliary_loss_mlp": 0.01039426, "balance_loss_clip": 1.05274439, "balance_loss_mlp": 1.02292788, "epoch": 0.23123402976101007, "flos": 23360178240000.0, "grad_norm": 2.083958857623256, "language_loss": 0.76397669, "learning_rate": 3.590029873031276e-06, "loss": 0.78554261, "num_input_tokens_seen": 82760205, "step": 3846, "time_per_iteration": 2.7805917263031006 }, { "auxiliary_loss_clip": 0.01131462, "auxiliary_loss_mlp": 0.01049247, "balance_loss_clip": 1.05376291, "balance_loss_mlp": 1.03193808, "epoch": 0.23129415301367803, "flos": 13735365425280.0, "grad_norm": 1.8827740097117207, "language_loss": 0.70281041, "learning_rate": 3.589793599381304e-06, "loss": 0.72461748, "num_input_tokens_seen": 82778590, "step": 3847, "time_per_iteration": 2.6848642826080322 }, { "auxiliary_loss_clip": 0.01065475, "auxiliary_loss_mlp": 0.01006045, "balance_loss_clip": 1.04309821, "balance_loss_mlp": 1.00356507, "epoch": 0.231354276266346, "flos": 69737015001600.0, "grad_norm": 0.7955227467680892, "language_loss": 0.61006129, "learning_rate": 3.589557265446198e-06, "loss": 0.63077646, "num_input_tokens_seen": 82833925, "step": 3848, "time_per_iteration": 3.08832049369812 }, { "auxiliary_loss_clip": 0.01142916, "auxiliary_loss_mlp": 0.01044943, "balance_loss_clip": 1.05631924, "balance_loss_mlp": 1.02640557, "epoch": 0.231414399519014, "flos": 18835900778880.0, "grad_norm": 1.9602331138800266, "language_loss": 0.78082883, "learning_rate": 3.589320871234923e-06, "loss": 0.80270743, "num_input_tokens_seen": 82850625, "step": 3849, "time_per_iteration": 2.6830787658691406 }, { "auxiliary_loss_clip": 0.01137959, "auxiliary_loss_mlp": 0.01044864, "balance_loss_clip": 1.05184579, "balance_loss_mlp": 1.02630353, "epoch": 0.23147452277168196, "flos": 36135470995200.0, "grad_norm": 2.354271482082729, "language_loss": 0.71243513, "learning_rate": 3.5890844167564405e-06, "loss": 0.7342633, "num_input_tokens_seen": 82872105, "step": 3850, "time_per_iteration": 4.467762231826782 }, { "auxiliary_loss_clip": 0.01121609, "auxiliary_loss_mlp": 0.00776401, "balance_loss_clip": 1.05099773, "balance_loss_mlp": 1.00153255, "epoch": 0.23153464602434992, "flos": 20812927305600.0, "grad_norm": 4.184777043510671, "language_loss": 0.76577097, "learning_rate": 3.588847902019718e-06, "loss": 0.78475106, "num_input_tokens_seen": 82890595, "step": 3851, "time_per_iteration": 2.7452898025512695 }, { "auxiliary_loss_clip": 0.01152703, "auxiliary_loss_mlp": 0.01038649, "balance_loss_clip": 1.05650854, "balance_loss_mlp": 1.0206244, "epoch": 0.2315947692770179, "flos": 19939256801280.0, "grad_norm": 2.0528428588063914, "language_loss": 0.69642782, "learning_rate": 3.588611327033723e-06, "loss": 0.71834141, "num_input_tokens_seen": 82908910, "step": 3852, "time_per_iteration": 2.613687038421631 }, { "auxiliary_loss_clip": 0.0110964, "auxiliary_loss_mlp": 0.01050002, "balance_loss_clip": 1.05097961, "balance_loss_mlp": 1.0303328, "epoch": 0.23165489252968585, "flos": 12855553695360.0, "grad_norm": 2.8596642791724993, "language_loss": 0.67063856, "learning_rate": 3.588374691807428e-06, "loss": 0.69223493, "num_input_tokens_seen": 82925405, "step": 3853, "time_per_iteration": 2.6974282264709473 }, { "auxiliary_loss_clip": 0.01146149, "auxiliary_loss_mlp": 0.01041525, "balance_loss_clip": 1.05749798, "balance_loss_mlp": 1.02340484, "epoch": 0.23171501578235382, "flos": 30628282792320.0, "grad_norm": 1.7603397459637538, "language_loss": 0.80139267, "learning_rate": 3.5881379963498053e-06, "loss": 0.82326943, "num_input_tokens_seen": 82945615, "step": 3854, "time_per_iteration": 2.712125062942505 }, { "auxiliary_loss_clip": 0.01115767, "auxiliary_loss_mlp": 0.01052387, "balance_loss_clip": 1.04737794, "balance_loss_mlp": 1.03070331, "epoch": 0.23177513903502178, "flos": 23842782397440.0, "grad_norm": 1.9709775740629982, "language_loss": 0.65103847, "learning_rate": 3.587901240669831e-06, "loss": 0.67272007, "num_input_tokens_seen": 82967570, "step": 3855, "time_per_iteration": 2.718756675720215 }, { "auxiliary_loss_clip": 0.01153506, "auxiliary_loss_mlp": 0.01048508, "balance_loss_clip": 1.05417824, "balance_loss_mlp": 1.03050709, "epoch": 0.23183526228768978, "flos": 29570282668800.0, "grad_norm": 1.7803112411977504, "language_loss": 0.70386064, "learning_rate": 3.5876644247764815e-06, "loss": 0.7258808, "num_input_tokens_seen": 82987435, "step": 3856, "time_per_iteration": 2.798675060272217 }, { "auxiliary_loss_clip": 0.01103018, "auxiliary_loss_mlp": 0.01035927, "balance_loss_clip": 1.05080032, "balance_loss_mlp": 1.0200007, "epoch": 0.23189538554035774, "flos": 34458694254720.0, "grad_norm": 1.7837780829213195, "language_loss": 0.77101243, "learning_rate": 3.5874275486787387e-06, "loss": 0.79240191, "num_input_tokens_seen": 83010505, "step": 3857, "time_per_iteration": 2.8545501232147217 }, { "auxiliary_loss_clip": 0.01136868, "auxiliary_loss_mlp": 0.00777317, "balance_loss_clip": 1.0528996, "balance_loss_mlp": 1.00133562, "epoch": 0.2319555087930257, "flos": 18003815245440.0, "grad_norm": 2.445609387195472, "language_loss": 0.91629225, "learning_rate": 3.587190612385584e-06, "loss": 0.9354341, "num_input_tokens_seen": 83026705, "step": 3858, "time_per_iteration": 2.7018845081329346 }, { "auxiliary_loss_clip": 0.01095626, "auxiliary_loss_mlp": 0.01043975, "balance_loss_clip": 1.04882586, "balance_loss_mlp": 1.0263319, "epoch": 0.23201563204569367, "flos": 23143852581120.0, "grad_norm": 1.987074492721614, "language_loss": 0.76833785, "learning_rate": 3.5869536159060026e-06, "loss": 0.78973383, "num_input_tokens_seen": 83046500, "step": 3859, "time_per_iteration": 2.7465155124664307 }, { "auxiliary_loss_clip": 0.01136816, "auxiliary_loss_mlp": 0.01041128, "balance_loss_clip": 1.05060959, "balance_loss_mlp": 1.02316284, "epoch": 0.23207575529836164, "flos": 20667991927680.0, "grad_norm": 1.7166447387893018, "language_loss": 0.84341264, "learning_rate": 3.58671655924898e-06, "loss": 0.86519206, "num_input_tokens_seen": 83065280, "step": 3860, "time_per_iteration": 2.6602063179016113 }, { "auxiliary_loss_clip": 0.01091436, "auxiliary_loss_mlp": 0.01044571, "balance_loss_clip": 1.04641938, "balance_loss_mlp": 1.02640343, "epoch": 0.2321358785510296, "flos": 16472189364480.0, "grad_norm": 2.014536853896284, "language_loss": 0.83431923, "learning_rate": 3.586479442423508e-06, "loss": 0.85567933, "num_input_tokens_seen": 83082310, "step": 3861, "time_per_iteration": 2.728750228881836 }, { "auxiliary_loss_clip": 0.01130655, "auxiliary_loss_mlp": 0.00776368, "balance_loss_clip": 1.05122983, "balance_loss_mlp": 1.00149858, "epoch": 0.2321960018036976, "flos": 21616320850560.0, "grad_norm": 1.8874922149770945, "language_loss": 0.85921204, "learning_rate": 3.586242265438576e-06, "loss": 0.87828225, "num_input_tokens_seen": 83102065, "step": 3862, "time_per_iteration": 2.7289161682128906 }, { "auxiliary_loss_clip": 0.01112788, "auxiliary_loss_mlp": 0.0104236, "balance_loss_clip": 1.04956031, "balance_loss_mlp": 1.02645802, "epoch": 0.23225612505636556, "flos": 22271474966400.0, "grad_norm": 1.4078274786009342, "language_loss": 0.75131166, "learning_rate": 3.5860050283031773e-06, "loss": 0.77286315, "num_input_tokens_seen": 83121445, "step": 3863, "time_per_iteration": 2.7308037281036377 }, { "auxiliary_loss_clip": 0.01109911, "auxiliary_loss_mlp": 0.0104503, "balance_loss_clip": 1.05320251, "balance_loss_mlp": 1.02840066, "epoch": 0.23231624830903352, "flos": 17052325925760.0, "grad_norm": 1.8195520841096788, "language_loss": 0.74952984, "learning_rate": 3.58576773102631e-06, "loss": 0.77107918, "num_input_tokens_seen": 83138175, "step": 3864, "time_per_iteration": 2.669403314590454 }, { "auxiliary_loss_clip": 0.01148697, "auxiliary_loss_mlp": 0.01038596, "balance_loss_clip": 1.05258274, "balance_loss_mlp": 1.02182317, "epoch": 0.2323763715617015, "flos": 34640043045120.0, "grad_norm": 1.757817857347048, "language_loss": 0.70438093, "learning_rate": 3.5855303736169714e-06, "loss": 0.72625393, "num_input_tokens_seen": 83161975, "step": 3865, "time_per_iteration": 2.766399621963501 }, { "auxiliary_loss_clip": 0.01156124, "auxiliary_loss_mlp": 0.01048904, "balance_loss_clip": 1.05352104, "balance_loss_mlp": 1.02978325, "epoch": 0.23243649481436945, "flos": 25551698832000.0, "grad_norm": 1.8965816841290546, "language_loss": 0.94702542, "learning_rate": 3.5852929560841617e-06, "loss": 0.96907574, "num_input_tokens_seen": 83180905, "step": 3866, "time_per_iteration": 2.659867525100708 }, { "auxiliary_loss_clip": 0.01131283, "auxiliary_loss_mlp": 0.01044032, "balance_loss_clip": 1.04904807, "balance_loss_mlp": 1.02683008, "epoch": 0.23249661806703742, "flos": 20483482740480.0, "grad_norm": 4.181849364953483, "language_loss": 0.73026884, "learning_rate": 3.5850554784368846e-06, "loss": 0.75202191, "num_input_tokens_seen": 83196390, "step": 3867, "time_per_iteration": 2.645481586456299 }, { "auxiliary_loss_clip": 0.0112954, "auxiliary_loss_mlp": 0.01046355, "balance_loss_clip": 1.05079126, "balance_loss_mlp": 1.02855754, "epoch": 0.23255674131970538, "flos": 20376612800640.0, "grad_norm": 1.9671041323983256, "language_loss": 0.82770872, "learning_rate": 3.584817940684145e-06, "loss": 0.84946775, "num_input_tokens_seen": 83216165, "step": 3868, "time_per_iteration": 2.7670326232910156 }, { "auxiliary_loss_clip": 0.01125563, "auxiliary_loss_mlp": 0.01043558, "balance_loss_clip": 1.04875207, "balance_loss_mlp": 1.02648687, "epoch": 0.23261686457237338, "flos": 17056096853760.0, "grad_norm": 2.1100994183362967, "language_loss": 0.72952414, "learning_rate": 3.58458034283495e-06, "loss": 0.75121534, "num_input_tokens_seen": 83233845, "step": 3869, "time_per_iteration": 2.6661763191223145 }, { "auxiliary_loss_clip": 0.01132223, "auxiliary_loss_mlp": 0.0105087, "balance_loss_clip": 1.05129242, "balance_loss_mlp": 1.03382349, "epoch": 0.23267698782504134, "flos": 29169878785920.0, "grad_norm": 2.500604422715561, "language_loss": 0.79142725, "learning_rate": 3.5843426848983097e-06, "loss": 0.81325811, "num_input_tokens_seen": 83254930, "step": 3870, "time_per_iteration": 2.707321882247925 }, { "auxiliary_loss_clip": 0.01152434, "auxiliary_loss_mlp": 0.01046711, "balance_loss_clip": 1.05334866, "balance_loss_mlp": 1.02924728, "epoch": 0.2327371110777093, "flos": 21174655219200.0, "grad_norm": 2.176894576680098, "language_loss": 0.70915782, "learning_rate": 3.5841049668832357e-06, "loss": 0.73114932, "num_input_tokens_seen": 83272095, "step": 3871, "time_per_iteration": 2.6389646530151367 }, { "auxiliary_loss_clip": 0.01139847, "auxiliary_loss_mlp": 0.01051541, "balance_loss_clip": 1.05543458, "balance_loss_mlp": 1.03244328, "epoch": 0.23279723433037727, "flos": 24863112132480.0, "grad_norm": 1.8306984701748774, "language_loss": 0.68877381, "learning_rate": 3.5838671887987433e-06, "loss": 0.71068764, "num_input_tokens_seen": 83290980, "step": 3872, "time_per_iteration": 2.662309408187866 }, { "auxiliary_loss_clip": 0.0114472, "auxiliary_loss_mlp": 0.01042459, "balance_loss_clip": 1.05313611, "balance_loss_mlp": 1.02388597, "epoch": 0.23285735758304524, "flos": 38800617344640.0, "grad_norm": 1.5710106481349988, "language_loss": 0.779724, "learning_rate": 3.5836293506538474e-06, "loss": 0.80159569, "num_input_tokens_seen": 83315175, "step": 3873, "time_per_iteration": 2.884542942047119 }, { "auxiliary_loss_clip": 0.01053683, "auxiliary_loss_mlp": 0.01022765, "balance_loss_clip": 1.03691578, "balance_loss_mlp": 1.02038097, "epoch": 0.2329174808357132, "flos": 53944113692160.0, "grad_norm": 0.8561383552409444, "language_loss": 0.6051712, "learning_rate": 3.5833914524575687e-06, "loss": 0.62593567, "num_input_tokens_seen": 83372060, "step": 3874, "time_per_iteration": 3.165809392929077 }, { "auxiliary_loss_clip": 0.0112779, "auxiliary_loss_mlp": 0.01040869, "balance_loss_clip": 1.05157447, "balance_loss_mlp": 1.02328515, "epoch": 0.23297760408838117, "flos": 21216024708480.0, "grad_norm": 2.5039775977564522, "language_loss": 0.80842507, "learning_rate": 3.583153494218927e-06, "loss": 0.83011162, "num_input_tokens_seen": 83389795, "step": 3875, "time_per_iteration": 2.673657178878784 }, { "auxiliary_loss_clip": 0.01147803, "auxiliary_loss_mlp": 0.00774568, "balance_loss_clip": 1.05367982, "balance_loss_mlp": 1.00145388, "epoch": 0.23303772734104916, "flos": 28403006394240.0, "grad_norm": 4.3174446976030465, "language_loss": 0.6123395, "learning_rate": 3.5829154759469464e-06, "loss": 0.63156319, "num_input_tokens_seen": 83410005, "step": 3876, "time_per_iteration": 2.6973021030426025 }, { "auxiliary_loss_clip": 0.01116571, "auxiliary_loss_mlp": 0.01051971, "balance_loss_clip": 1.05002618, "balance_loss_mlp": 1.03345811, "epoch": 0.23309785059371713, "flos": 24314720215680.0, "grad_norm": 2.4263361529850447, "language_loss": 0.70649457, "learning_rate": 3.5826773976506523e-06, "loss": 0.72817999, "num_input_tokens_seen": 83430250, "step": 3877, "time_per_iteration": 2.7506351470947266 }, { "auxiliary_loss_clip": 0.01143537, "auxiliary_loss_mlp": 0.01051311, "balance_loss_clip": 1.05495286, "balance_loss_mlp": 1.03245187, "epoch": 0.2331579738463851, "flos": 15992925171840.0, "grad_norm": 2.202899784913125, "language_loss": 0.80724835, "learning_rate": 3.582439259339073e-06, "loss": 0.82919687, "num_input_tokens_seen": 83447950, "step": 3878, "time_per_iteration": 2.6945395469665527 }, { "auxiliary_loss_clip": 0.0109123, "auxiliary_loss_mlp": 0.01049547, "balance_loss_clip": 1.04632592, "balance_loss_mlp": 1.0298301, "epoch": 0.23321809709905306, "flos": 36426957863040.0, "grad_norm": 1.857420507716431, "language_loss": 0.7521472, "learning_rate": 3.5822010610212374e-06, "loss": 0.77355498, "num_input_tokens_seen": 83467785, "step": 3879, "time_per_iteration": 2.8909342288970947 }, { "auxiliary_loss_clip": 0.01095967, "auxiliary_loss_mlp": 0.01051433, "balance_loss_clip": 1.04621899, "balance_loss_mlp": 1.03238297, "epoch": 0.23327822035172102, "flos": 21324762155520.0, "grad_norm": 2.179587653719585, "language_loss": 0.89532614, "learning_rate": 3.5819628027061795e-06, "loss": 0.91680014, "num_input_tokens_seen": 83485390, "step": 3880, "time_per_iteration": 2.7358896732330322 }, { "auxiliary_loss_clip": 0.01127816, "auxiliary_loss_mlp": 0.01049697, "balance_loss_clip": 1.05119944, "balance_loss_mlp": 1.0319109, "epoch": 0.233338343604389, "flos": 19171881619200.0, "grad_norm": 1.6825190155617658, "language_loss": 0.71915156, "learning_rate": 3.5817244844029334e-06, "loss": 0.74092674, "num_input_tokens_seen": 83504890, "step": 3881, "time_per_iteration": 2.702533721923828 }, { "auxiliary_loss_clip": 0.01148084, "auxiliary_loss_mlp": 0.0104282, "balance_loss_clip": 1.05186546, "balance_loss_mlp": 1.02497458, "epoch": 0.23339846685705698, "flos": 26908368543360.0, "grad_norm": 1.5464986217430505, "language_loss": 0.68210357, "learning_rate": 3.581486106120537e-06, "loss": 0.70401263, "num_input_tokens_seen": 83526475, "step": 3882, "time_per_iteration": 2.6449384689331055 }, { "auxiliary_loss_clip": 0.01106984, "auxiliary_loss_mlp": 0.01053219, "balance_loss_clip": 1.04567862, "balance_loss_mlp": 1.03457499, "epoch": 0.23345859010972494, "flos": 32343160884480.0, "grad_norm": 2.180831821464153, "language_loss": 0.77379489, "learning_rate": 3.5812476678680287e-06, "loss": 0.79539698, "num_input_tokens_seen": 83546620, "step": 3883, "time_per_iteration": 5.806958913803101 }, { "auxiliary_loss_clip": 0.01053192, "auxiliary_loss_mlp": 0.01007679, "balance_loss_clip": 1.03368068, "balance_loss_mlp": 1.0053544, "epoch": 0.2335187133623929, "flos": 58484229050880.0, "grad_norm": 0.7945750769740417, "language_loss": 0.59117424, "learning_rate": 3.58100916965445e-06, "loss": 0.61178291, "num_input_tokens_seen": 83616160, "step": 3884, "time_per_iteration": 3.3524324893951416 }, { "auxiliary_loss_clip": 0.01117007, "auxiliary_loss_mlp": 0.01034005, "balance_loss_clip": 1.04925692, "balance_loss_mlp": 1.01704168, "epoch": 0.23357883661506088, "flos": 24502317972480.0, "grad_norm": 1.6775563031527567, "language_loss": 0.80286831, "learning_rate": 3.5807706114888455e-06, "loss": 0.82437843, "num_input_tokens_seen": 83636795, "step": 3885, "time_per_iteration": 4.295818328857422 }, { "auxiliary_loss_clip": 0.01136024, "auxiliary_loss_mlp": 0.01040639, "balance_loss_clip": 1.05494285, "balance_loss_mlp": 1.02274597, "epoch": 0.23363895986772884, "flos": 18948516894720.0, "grad_norm": 2.2066793657203116, "language_loss": 0.88230193, "learning_rate": 3.580531993380261e-06, "loss": 0.90406859, "num_input_tokens_seen": 83654050, "step": 3886, "time_per_iteration": 2.6672091484069824 }, { "auxiliary_loss_clip": 0.01150675, "auxiliary_loss_mlp": 0.01042457, "balance_loss_clip": 1.05293703, "balance_loss_mlp": 1.02512443, "epoch": 0.2336990831203968, "flos": 31686821619840.0, "grad_norm": 4.0082984179074055, "language_loss": 0.73170543, "learning_rate": 3.5802933153377445e-06, "loss": 0.75363672, "num_input_tokens_seen": 83673720, "step": 3887, "time_per_iteration": 2.7338294982910156 }, { "auxiliary_loss_clip": 0.01140271, "auxiliary_loss_mlp": 0.0104923, "balance_loss_clip": 1.05201173, "balance_loss_mlp": 1.03183722, "epoch": 0.23375920637306477, "flos": 27709750926720.0, "grad_norm": 2.677865426107907, "language_loss": 0.84125429, "learning_rate": 3.5800545773703475e-06, "loss": 0.86314929, "num_input_tokens_seen": 83693470, "step": 3888, "time_per_iteration": 2.7020208835601807 }, { "auxiliary_loss_clip": 0.01121847, "auxiliary_loss_mlp": 0.010605, "balance_loss_clip": 1.04974008, "balance_loss_mlp": 1.04121208, "epoch": 0.23381932962573276, "flos": 17675627656320.0, "grad_norm": 3.2074942430893976, "language_loss": 0.87298381, "learning_rate": 3.5798157794871225e-06, "loss": 0.89480728, "num_input_tokens_seen": 83711620, "step": 3889, "time_per_iteration": 4.319674491882324 }, { "auxiliary_loss_clip": 0.01141703, "auxiliary_loss_mlp": 0.01046248, "balance_loss_clip": 1.05330396, "balance_loss_mlp": 1.02877164, "epoch": 0.23387945287840073, "flos": 14390842763520.0, "grad_norm": 3.8719217250511164, "language_loss": 0.76830876, "learning_rate": 3.579576921697125e-06, "loss": 0.79018819, "num_input_tokens_seen": 83727890, "step": 3890, "time_per_iteration": 2.6133198738098145 }, { "auxiliary_loss_clip": 0.01107139, "auxiliary_loss_mlp": 0.00775386, "balance_loss_clip": 1.04837406, "balance_loss_mlp": 1.00124502, "epoch": 0.2339395761310687, "flos": 46097988503040.0, "grad_norm": 1.8304579433009527, "language_loss": 0.73385048, "learning_rate": 3.579338004009412e-06, "loss": 0.75267571, "num_input_tokens_seen": 83749370, "step": 3891, "time_per_iteration": 3.008927583694458 }, { "auxiliary_loss_clip": 0.01145053, "auxiliary_loss_mlp": 0.01047702, "balance_loss_clip": 1.05121398, "balance_loss_mlp": 1.03035665, "epoch": 0.23399969938373666, "flos": 22382044007040.0, "grad_norm": 1.8316289897122906, "language_loss": 0.82725632, "learning_rate": 3.5790990264330433e-06, "loss": 0.84918392, "num_input_tokens_seen": 83769560, "step": 3892, "time_per_iteration": 2.6455893516540527 }, { "auxiliary_loss_clip": 0.01100914, "auxiliary_loss_mlp": 0.01055558, "balance_loss_clip": 1.04450488, "balance_loss_mlp": 1.03491104, "epoch": 0.23405982263640462, "flos": 43508542066560.0, "grad_norm": 2.707564715226966, "language_loss": 0.64982933, "learning_rate": 3.578859988977082e-06, "loss": 0.67139405, "num_input_tokens_seen": 83795635, "step": 3893, "time_per_iteration": 2.9392964839935303 }, { "auxiliary_loss_clip": 0.01106007, "auxiliary_loss_mlp": 0.01045218, "balance_loss_clip": 1.04782617, "balance_loss_mlp": 1.02701449, "epoch": 0.2341199458890726, "flos": 22564685687040.0, "grad_norm": 2.5782091790717105, "language_loss": 0.79415286, "learning_rate": 3.5786208916505916e-06, "loss": 0.81566513, "num_input_tokens_seen": 83814090, "step": 3894, "time_per_iteration": 2.839935541152954 }, { "auxiliary_loss_clip": 0.01134295, "auxiliary_loss_mlp": 0.01049748, "balance_loss_clip": 1.04747164, "balance_loss_mlp": 1.03253388, "epoch": 0.23418006914174055, "flos": 25633970933760.0, "grad_norm": 1.551347830991082, "language_loss": 0.81978422, "learning_rate": 3.5783817344626383e-06, "loss": 0.84162462, "num_input_tokens_seen": 83836870, "step": 3895, "time_per_iteration": 2.739955425262451 }, { "auxiliary_loss_clip": 0.01134592, "auxiliary_loss_mlp": 0.01052429, "balance_loss_clip": 1.04999852, "balance_loss_mlp": 1.03514385, "epoch": 0.23424019239440855, "flos": 13545936074880.0, "grad_norm": 1.8690411936118732, "language_loss": 0.80239451, "learning_rate": 3.578142517422292e-06, "loss": 0.82426476, "num_input_tokens_seen": 83853275, "step": 3896, "time_per_iteration": 2.681114435195923 }, { "auxiliary_loss_clip": 0.01125586, "auxiliary_loss_mlp": 0.01045792, "balance_loss_clip": 1.04685259, "balance_loss_mlp": 1.02779162, "epoch": 0.2343003156470765, "flos": 22419498913920.0, "grad_norm": 2.2492510100498087, "language_loss": 0.83249009, "learning_rate": 3.577903240538623e-06, "loss": 0.85420382, "num_input_tokens_seen": 83872340, "step": 3897, "time_per_iteration": 2.728916645050049 }, { "auxiliary_loss_clip": 0.01134669, "auxiliary_loss_mlp": 0.01058403, "balance_loss_clip": 1.04949594, "balance_loss_mlp": 1.04016376, "epoch": 0.23436043889974448, "flos": 14790815683200.0, "grad_norm": 1.5875861860902294, "language_loss": 0.78903484, "learning_rate": 3.577663903820705e-06, "loss": 0.81096554, "num_input_tokens_seen": 83888795, "step": 3898, "time_per_iteration": 2.6597952842712402 }, { "auxiliary_loss_clip": 0.01109182, "auxiliary_loss_mlp": 0.01055226, "balance_loss_clip": 1.04657888, "balance_loss_mlp": 1.03785777, "epoch": 0.23442056215241244, "flos": 22965700101120.0, "grad_norm": 1.9975380770167093, "language_loss": 0.73769581, "learning_rate": 3.577424507277614e-06, "loss": 0.75933987, "num_input_tokens_seen": 83906820, "step": 3899, "time_per_iteration": 2.7511518001556396 }, { "auxiliary_loss_clip": 0.01110646, "auxiliary_loss_mlp": 0.01053556, "balance_loss_clip": 1.04662895, "balance_loss_mlp": 1.03530502, "epoch": 0.2344806854050804, "flos": 23071887682560.0, "grad_norm": 2.822835219305806, "language_loss": 0.75323856, "learning_rate": 3.5771850509184277e-06, "loss": 0.77488053, "num_input_tokens_seen": 83926370, "step": 3900, "time_per_iteration": 2.7366316318511963 }, { "auxiliary_loss_clip": 0.01097598, "auxiliary_loss_mlp": 0.01047935, "balance_loss_clip": 1.04771769, "balance_loss_mlp": 1.03019702, "epoch": 0.23454080865774837, "flos": 16327074418560.0, "grad_norm": 1.7042292639984586, "language_loss": 0.67123592, "learning_rate": 3.5769455347522256e-06, "loss": 0.69269133, "num_input_tokens_seen": 83944600, "step": 3901, "time_per_iteration": 2.857386589050293 }, { "auxiliary_loss_clip": 0.01029196, "auxiliary_loss_mlp": 0.01060621, "balance_loss_clip": 1.02959871, "balance_loss_mlp": 1.0584631, "epoch": 0.23460093191041637, "flos": 67760958142080.0, "grad_norm": 0.7708596717968548, "language_loss": 0.58189189, "learning_rate": 3.576705958788091e-06, "loss": 0.60279006, "num_input_tokens_seen": 84005100, "step": 3902, "time_per_iteration": 3.2769579887390137 }, { "auxiliary_loss_clip": 0.01126982, "auxiliary_loss_mlp": 0.01045748, "balance_loss_clip": 1.05044544, "balance_loss_mlp": 1.02691305, "epoch": 0.23466105516308433, "flos": 20077619990400.0, "grad_norm": 2.0309755154884708, "language_loss": 0.80396789, "learning_rate": 3.576466323035108e-06, "loss": 0.82569516, "num_input_tokens_seen": 84023775, "step": 3903, "time_per_iteration": 2.683908462524414 }, { "auxiliary_loss_clip": 0.01092072, "auxiliary_loss_mlp": 0.01044121, "balance_loss_clip": 1.04248238, "balance_loss_mlp": 1.02614391, "epoch": 0.2347211784157523, "flos": 24535714642560.0, "grad_norm": 1.970422818337997, "language_loss": 0.82400727, "learning_rate": 3.5762266275023645e-06, "loss": 0.84536922, "num_input_tokens_seen": 84042605, "step": 3904, "time_per_iteration": 2.8023037910461426 }, { "auxiliary_loss_clip": 0.01147463, "auxiliary_loss_mlp": 0.01043559, "balance_loss_clip": 1.05247784, "balance_loss_mlp": 1.02620173, "epoch": 0.23478130166842026, "flos": 23805040181760.0, "grad_norm": 1.9105311329606578, "language_loss": 0.71330345, "learning_rate": 3.57598687219895e-06, "loss": 0.73521364, "num_input_tokens_seen": 84061520, "step": 3905, "time_per_iteration": 2.650956869125366 }, { "auxiliary_loss_clip": 0.01143661, "auxiliary_loss_mlp": 0.01035514, "balance_loss_clip": 1.05086017, "balance_loss_mlp": 1.01877677, "epoch": 0.23484142492108823, "flos": 24093618048000.0, "grad_norm": 2.334164983860831, "language_loss": 0.71415532, "learning_rate": 3.5757470571339543e-06, "loss": 0.73594707, "num_input_tokens_seen": 84081800, "step": 3906, "time_per_iteration": 2.6635055541992188 }, { "auxiliary_loss_clip": 0.01138147, "auxiliary_loss_mlp": 0.01042098, "balance_loss_clip": 1.04703832, "balance_loss_mlp": 1.02246392, "epoch": 0.2349015481737562, "flos": 29095830898560.0, "grad_norm": 2.5527171953873693, "language_loss": 0.74024308, "learning_rate": 3.575507182316473e-06, "loss": 0.7620455, "num_input_tokens_seen": 84102340, "step": 3907, "time_per_iteration": 2.751154661178589 }, { "auxiliary_loss_clip": 0.01135101, "auxiliary_loss_mlp": 0.01047433, "balance_loss_clip": 1.04911268, "balance_loss_mlp": 1.02950394, "epoch": 0.23496167142642416, "flos": 18916305373440.0, "grad_norm": 1.9847054585906883, "language_loss": 0.72428519, "learning_rate": 3.575267247755601e-06, "loss": 0.74611056, "num_input_tokens_seen": 84120370, "step": 3908, "time_per_iteration": 2.631162166595459 }, { "auxiliary_loss_clip": 0.01053013, "auxiliary_loss_mlp": 0.01020478, "balance_loss_clip": 1.03362584, "balance_loss_mlp": 1.01765239, "epoch": 0.23502179467909215, "flos": 55868062896000.0, "grad_norm": 1.0307072678924762, "language_loss": 0.73359185, "learning_rate": 3.5750272534604367e-06, "loss": 0.75432676, "num_input_tokens_seen": 84165515, "step": 3909, "time_per_iteration": 2.974531650543213 }, { "auxiliary_loss_clip": 0.01136436, "auxiliary_loss_mlp": 0.01046445, "balance_loss_clip": 1.05006361, "balance_loss_mlp": 1.02797985, "epoch": 0.23508191793176011, "flos": 23401763210880.0, "grad_norm": 1.6771333047394956, "language_loss": 0.88288009, "learning_rate": 3.5747871994400822e-06, "loss": 0.90470886, "num_input_tokens_seen": 84184540, "step": 3910, "time_per_iteration": 2.6615123748779297 }, { "auxiliary_loss_clip": 0.01134757, "auxiliary_loss_mlp": 0.01038734, "balance_loss_clip": 1.04980493, "balance_loss_mlp": 1.02188933, "epoch": 0.23514204118442808, "flos": 20047671025920.0, "grad_norm": 1.9388895528834493, "language_loss": 0.76067305, "learning_rate": 3.5745470857036386e-06, "loss": 0.78240794, "num_input_tokens_seen": 84202025, "step": 3911, "time_per_iteration": 2.6846752166748047 }, { "auxiliary_loss_clip": 0.01130294, "auxiliary_loss_mlp": 0.01041364, "balance_loss_clip": 1.04968345, "balance_loss_mlp": 1.02546179, "epoch": 0.23520216443709605, "flos": 21580589796480.0, "grad_norm": 1.5851255377793763, "language_loss": 0.81651384, "learning_rate": 3.5743069122602122e-06, "loss": 0.83823043, "num_input_tokens_seen": 84221895, "step": 3912, "time_per_iteration": 2.6340627670288086 }, { "auxiliary_loss_clip": 0.01123815, "auxiliary_loss_mlp": 0.01046223, "balance_loss_clip": 1.05082059, "balance_loss_mlp": 1.02836537, "epoch": 0.235262287689764, "flos": 23185796688000.0, "grad_norm": 3.1390338867327165, "language_loss": 0.71748006, "learning_rate": 3.574066679118909e-06, "loss": 0.73918045, "num_input_tokens_seen": 84240455, "step": 3913, "time_per_iteration": 2.6716067790985107 }, { "auxiliary_loss_clip": 0.01141007, "auxiliary_loss_mlp": 0.00776535, "balance_loss_clip": 1.05018401, "balance_loss_mlp": 1.00136077, "epoch": 0.23532241094243198, "flos": 23185222070400.0, "grad_norm": 1.7080087282408476, "language_loss": 0.76152158, "learning_rate": 3.57382638628884e-06, "loss": 0.78069693, "num_input_tokens_seen": 84261605, "step": 3914, "time_per_iteration": 2.706982135772705 }, { "auxiliary_loss_clip": 0.01088532, "auxiliary_loss_mlp": 0.01039819, "balance_loss_clip": 1.0485754, "balance_loss_mlp": 1.02153206, "epoch": 0.23538253419509997, "flos": 17019324305280.0, "grad_norm": 2.2148128973951877, "language_loss": 0.89692557, "learning_rate": 3.5735860337791174e-06, "loss": 0.91820902, "num_input_tokens_seen": 84278675, "step": 3915, "time_per_iteration": 2.8005998134613037 }, { "auxiliary_loss_clip": 0.01045613, "auxiliary_loss_mlp": 0.0100868, "balance_loss_clip": 1.02860212, "balance_loss_mlp": 1.00596201, "epoch": 0.23544265744776793, "flos": 63448588967040.0, "grad_norm": 0.8066012642326402, "language_loss": 0.59382623, "learning_rate": 3.573345621598854e-06, "loss": 0.61436915, "num_input_tokens_seen": 84329765, "step": 3916, "time_per_iteration": 3.168708086013794 }, { "auxiliary_loss_clip": 0.01027738, "auxiliary_loss_mlp": 0.01005192, "balance_loss_clip": 1.03619492, "balance_loss_mlp": 1.00231957, "epoch": 0.2355027807004359, "flos": 70515343831680.0, "grad_norm": 0.7680467252570666, "language_loss": 0.49518228, "learning_rate": 3.5731051497571675e-06, "loss": 0.51551157, "num_input_tokens_seen": 84393680, "step": 3917, "time_per_iteration": 3.3240060806274414 }, { "auxiliary_loss_clip": 0.01112941, "auxiliary_loss_mlp": 0.01048231, "balance_loss_clip": 1.04929173, "balance_loss_mlp": 1.03133857, "epoch": 0.23556290395310386, "flos": 21434289701760.0, "grad_norm": 1.9721662885337694, "language_loss": 0.76349282, "learning_rate": 3.5728646182631756e-06, "loss": 0.78510457, "num_input_tokens_seen": 84412640, "step": 3918, "time_per_iteration": 2.739431619644165 }, { "auxiliary_loss_clip": 0.0109904, "auxiliary_loss_mlp": 0.01052049, "balance_loss_clip": 1.04440236, "balance_loss_mlp": 1.03514528, "epoch": 0.23562302720577183, "flos": 18186421011840.0, "grad_norm": 2.001330675769641, "language_loss": 0.69002521, "learning_rate": 3.5726240271259995e-06, "loss": 0.71153617, "num_input_tokens_seen": 84431605, "step": 3919, "time_per_iteration": 2.8809926509857178 }, { "auxiliary_loss_clip": 0.01106851, "auxiliary_loss_mlp": 0.01039357, "balance_loss_clip": 1.04772878, "balance_loss_mlp": 1.02221501, "epoch": 0.2356831504584398, "flos": 33730497832320.0, "grad_norm": 1.6908780146896767, "language_loss": 0.70500779, "learning_rate": 3.5723833763547634e-06, "loss": 0.72646987, "num_input_tokens_seen": 84454210, "step": 3920, "time_per_iteration": 2.7984554767608643 }, { "auxiliary_loss_clip": 0.01124832, "auxiliary_loss_mlp": 0.01054073, "balance_loss_clip": 1.05141807, "balance_loss_mlp": 1.03756285, "epoch": 0.23574327371110776, "flos": 24932778560640.0, "grad_norm": 1.7460619151295316, "language_loss": 0.77363533, "learning_rate": 3.5721426659585916e-06, "loss": 0.7954244, "num_input_tokens_seen": 84475540, "step": 3921, "time_per_iteration": 2.8038690090179443 }, { "auxiliary_loss_clip": 0.01113499, "auxiliary_loss_mlp": 0.01043793, "balance_loss_clip": 1.05042887, "balance_loss_mlp": 1.02692485, "epoch": 0.23580339696377575, "flos": 17822107319040.0, "grad_norm": 2.2761735813493775, "language_loss": 0.74768102, "learning_rate": 3.571901895946612e-06, "loss": 0.76925397, "num_input_tokens_seen": 84494580, "step": 3922, "time_per_iteration": 5.741380929946899 }, { "auxiliary_loss_clip": 0.01116057, "auxiliary_loss_mlp": 0.01041318, "balance_loss_clip": 1.04831624, "balance_loss_mlp": 1.02577269, "epoch": 0.23586352021644372, "flos": 26286611097600.0, "grad_norm": 3.3386441952016868, "language_loss": 0.79846609, "learning_rate": 3.571661066327956e-06, "loss": 0.82003981, "num_input_tokens_seen": 84513850, "step": 3923, "time_per_iteration": 2.7889180183410645 }, { "auxiliary_loss_clip": 0.01089456, "auxiliary_loss_mlp": 0.0105728, "balance_loss_clip": 1.04471469, "balance_loss_mlp": 1.03935063, "epoch": 0.23592364346911168, "flos": 14246697484800.0, "grad_norm": 4.698975622885271, "language_loss": 0.74874711, "learning_rate": 3.571420177111754e-06, "loss": 0.77021456, "num_input_tokens_seen": 84532315, "step": 3924, "time_per_iteration": 4.272740125656128 }, { "auxiliary_loss_clip": 0.01145554, "auxiliary_loss_mlp": 0.01046876, "balance_loss_clip": 1.05115998, "balance_loss_mlp": 1.030568, "epoch": 0.23598376672177965, "flos": 18587938216320.0, "grad_norm": 2.8676741031402977, "language_loss": 0.82357788, "learning_rate": 3.5711792283071416e-06, "loss": 0.8455022, "num_input_tokens_seen": 84550970, "step": 3925, "time_per_iteration": 2.6825013160705566 }, { "auxiliary_loss_clip": 0.0112035, "auxiliary_loss_mlp": 0.01048071, "balance_loss_clip": 1.04567564, "balance_loss_mlp": 1.0315721, "epoch": 0.2360438899744476, "flos": 22675542036480.0, "grad_norm": 1.5755651433289561, "language_loss": 0.59533024, "learning_rate": 3.5709382199232564e-06, "loss": 0.61701441, "num_input_tokens_seen": 84571655, "step": 3926, "time_per_iteration": 2.6960842609405518 }, { "auxiliary_loss_clip": 0.01125496, "auxiliary_loss_mlp": 0.01046163, "balance_loss_clip": 1.04914129, "balance_loss_mlp": 1.0302484, "epoch": 0.23610401322711558, "flos": 29570139014400.0, "grad_norm": 2.4179456581838212, "language_loss": 0.7155292, "learning_rate": 3.570697151969235e-06, "loss": 0.7372458, "num_input_tokens_seen": 84593130, "step": 3927, "time_per_iteration": 2.786576986312866 }, { "auxiliary_loss_clip": 0.01120941, "auxiliary_loss_mlp": 0.01047009, "balance_loss_clip": 1.04764938, "balance_loss_mlp": 1.03125572, "epoch": 0.23616413647978354, "flos": 17858520731520.0, "grad_norm": 1.9380358164668718, "language_loss": 0.74792278, "learning_rate": 3.570456024454221e-06, "loss": 0.76960224, "num_input_tokens_seen": 84612410, "step": 3928, "time_per_iteration": 4.450765609741211 }, { "auxiliary_loss_clip": 0.01118656, "auxiliary_loss_mlp": 0.01047112, "balance_loss_clip": 1.04935324, "balance_loss_mlp": 1.02949333, "epoch": 0.23622425973245154, "flos": 11034847157760.0, "grad_norm": 4.3448767989564745, "language_loss": 0.81905198, "learning_rate": 3.5702148373873576e-06, "loss": 0.84070963, "num_input_tokens_seen": 84627610, "step": 3929, "time_per_iteration": 2.654085874557495 }, { "auxiliary_loss_clip": 0.01151721, "auxiliary_loss_mlp": 0.0105167, "balance_loss_clip": 1.05143714, "balance_loss_mlp": 1.03314447, "epoch": 0.2362843829851195, "flos": 23404061681280.0, "grad_norm": 3.048788180104446, "language_loss": 0.72323942, "learning_rate": 3.569973590777789e-06, "loss": 0.74527335, "num_input_tokens_seen": 84648415, "step": 3930, "time_per_iteration": 2.67429780960083 }, { "auxiliary_loss_clip": 0.01143652, "auxiliary_loss_mlp": 0.01036151, "balance_loss_clip": 1.04880345, "balance_loss_mlp": 1.01985574, "epoch": 0.23634450623778747, "flos": 39529855261440.0, "grad_norm": 2.7450987997323333, "language_loss": 0.74105632, "learning_rate": 3.569732284634665e-06, "loss": 0.76285434, "num_input_tokens_seen": 84670080, "step": 3931, "time_per_iteration": 2.8017847537994385 }, { "auxiliary_loss_clip": 0.01137617, "auxiliary_loss_mlp": 0.01046002, "balance_loss_clip": 1.05250037, "balance_loss_mlp": 1.02853799, "epoch": 0.23640462949045543, "flos": 24207167917440.0, "grad_norm": 2.2419024865888852, "language_loss": 0.8018778, "learning_rate": 3.569490918967136e-06, "loss": 0.82371396, "num_input_tokens_seen": 84686465, "step": 3932, "time_per_iteration": 2.6295793056488037 }, { "auxiliary_loss_clip": 0.01108498, "auxiliary_loss_mlp": 0.0104053, "balance_loss_clip": 1.04981244, "balance_loss_mlp": 1.02614117, "epoch": 0.2364647527431234, "flos": 26177622255360.0, "grad_norm": 2.247824561482015, "language_loss": 0.85683465, "learning_rate": 3.5692494937843537e-06, "loss": 0.87832487, "num_input_tokens_seen": 84708825, "step": 3933, "time_per_iteration": 2.7401201725006104 }, { "auxiliary_loss_clip": 0.01101933, "auxiliary_loss_mlp": 0.010512, "balance_loss_clip": 1.04680276, "balance_loss_mlp": 1.03112483, "epoch": 0.23652487599579136, "flos": 22637009721600.0, "grad_norm": 2.0287283132247547, "language_loss": 0.83179402, "learning_rate": 3.5690080090954727e-06, "loss": 0.85332537, "num_input_tokens_seen": 84726165, "step": 3934, "time_per_iteration": 2.8152921199798584 }, { "auxiliary_loss_clip": 0.01148508, "auxiliary_loss_mlp": 0.01042164, "balance_loss_clip": 1.05208373, "balance_loss_mlp": 1.02556968, "epoch": 0.23658499924845935, "flos": 21762261809280.0, "grad_norm": 1.8368151879100059, "language_loss": 0.78513408, "learning_rate": 3.5687664649096515e-06, "loss": 0.80704081, "num_input_tokens_seen": 84745815, "step": 3935, "time_per_iteration": 2.6769750118255615 }, { "auxiliary_loss_clip": 0.01134595, "auxiliary_loss_mlp": 0.01034926, "balance_loss_clip": 1.05270088, "balance_loss_mlp": 1.01891589, "epoch": 0.23664512250112732, "flos": 21798998444160.0, "grad_norm": 1.5615220666884744, "language_loss": 0.79614085, "learning_rate": 3.5685248612360487e-06, "loss": 0.81783605, "num_input_tokens_seen": 84765415, "step": 3936, "time_per_iteration": 2.7037193775177 }, { "auxiliary_loss_clip": 0.01126163, "auxiliary_loss_mlp": 0.01034739, "balance_loss_clip": 1.04967618, "balance_loss_mlp": 1.01779902, "epoch": 0.23670524575379528, "flos": 22637871648000.0, "grad_norm": 1.671201383656535, "language_loss": 0.7915628, "learning_rate": 3.568283198083826e-06, "loss": 0.81317174, "num_input_tokens_seen": 84787080, "step": 3937, "time_per_iteration": 2.7639834880828857 }, { "auxiliary_loss_clip": 0.01134519, "auxiliary_loss_mlp": 0.01038533, "balance_loss_clip": 1.05320358, "balance_loss_mlp": 1.02313685, "epoch": 0.23676536900646325, "flos": 16725000263040.0, "grad_norm": 1.8758026172480324, "language_loss": 0.85389286, "learning_rate": 3.568041475462147e-06, "loss": 0.8756234, "num_input_tokens_seen": 84805395, "step": 3938, "time_per_iteration": 2.6919057369232178 }, { "auxiliary_loss_clip": 0.01145522, "auxiliary_loss_mlp": 0.01047488, "balance_loss_clip": 1.05159402, "balance_loss_mlp": 1.03076303, "epoch": 0.23682549225913122, "flos": 11135611785600.0, "grad_norm": 4.660879571039018, "language_loss": 0.9365679, "learning_rate": 3.5677996933801785e-06, "loss": 0.958498, "num_input_tokens_seen": 84818090, "step": 3939, "time_per_iteration": 2.7249948978424072 }, { "auxiliary_loss_clip": 0.01149288, "auxiliary_loss_mlp": 0.01041833, "balance_loss_clip": 1.0512023, "balance_loss_mlp": 1.02463138, "epoch": 0.23688561551179918, "flos": 22559226819840.0, "grad_norm": 1.884439522765895, "language_loss": 0.82347792, "learning_rate": 3.567557851847088e-06, "loss": 0.84538913, "num_input_tokens_seen": 84837695, "step": 3940, "time_per_iteration": 2.666647434234619 }, { "auxiliary_loss_clip": 0.01128412, "auxiliary_loss_mlp": 0.00775407, "balance_loss_clip": 1.05063081, "balance_loss_mlp": 1.00109661, "epoch": 0.23694573876446715, "flos": 18514895909760.0, "grad_norm": 2.7155330970608214, "language_loss": 0.88959104, "learning_rate": 3.5673159508720464e-06, "loss": 0.90862918, "num_input_tokens_seen": 84854630, "step": 3941, "time_per_iteration": 2.6898627281188965 }, { "auxiliary_loss_clip": 0.01147095, "auxiliary_loss_mlp": 0.01040548, "balance_loss_clip": 1.04976177, "balance_loss_mlp": 1.0227741, "epoch": 0.23700586201713514, "flos": 15335723980800.0, "grad_norm": 2.436898535695529, "language_loss": 0.8484506, "learning_rate": 3.5670739904642274e-06, "loss": 0.870327, "num_input_tokens_seen": 84871805, "step": 3942, "time_per_iteration": 2.560166835784912 }, { "auxiliary_loss_clip": 0.01109105, "auxiliary_loss_mlp": 0.01042997, "balance_loss_clip": 1.04736543, "balance_loss_mlp": 1.02447248, "epoch": 0.2370659852698031, "flos": 23947605262080.0, "grad_norm": 1.9848651824816348, "language_loss": 0.81126499, "learning_rate": 3.5668319706328065e-06, "loss": 0.83278596, "num_input_tokens_seen": 84889815, "step": 3943, "time_per_iteration": 2.7389075756073 }, { "auxiliary_loss_clip": 0.01114013, "auxiliary_loss_mlp": 0.01044642, "balance_loss_clip": 1.0464983, "balance_loss_mlp": 1.02618814, "epoch": 0.23712610852247107, "flos": 15332527670400.0, "grad_norm": 2.1611381488400143, "language_loss": 0.67060351, "learning_rate": 3.566589891386959e-06, "loss": 0.69219005, "num_input_tokens_seen": 84904380, "step": 3944, "time_per_iteration": 2.6382999420166016 }, { "auxiliary_loss_clip": 0.01117531, "auxiliary_loss_mlp": 0.01038157, "balance_loss_clip": 1.04629564, "balance_loss_mlp": 1.02003753, "epoch": 0.23718623177513903, "flos": 19682567233920.0, "grad_norm": 1.9578725621632602, "language_loss": 0.75573617, "learning_rate": 3.566347752735866e-06, "loss": 0.77729309, "num_input_tokens_seen": 84922935, "step": 3945, "time_per_iteration": 2.678377628326416 }, { "auxiliary_loss_clip": 0.01128604, "auxiliary_loss_mlp": 0.01039043, "balance_loss_clip": 1.0493716, "balance_loss_mlp": 1.02255654, "epoch": 0.237246355027807, "flos": 24973322037120.0, "grad_norm": 1.4378865328543082, "language_loss": 0.63750178, "learning_rate": 3.5661055546887094e-06, "loss": 0.65917826, "num_input_tokens_seen": 84943685, "step": 3946, "time_per_iteration": 2.77178955078125 }, { "auxiliary_loss_clip": 0.01130702, "auxiliary_loss_mlp": 0.01036796, "balance_loss_clip": 1.0460459, "balance_loss_mlp": 1.0186162, "epoch": 0.23730647828047496, "flos": 15377416692480.0, "grad_norm": 2.53957699605931, "language_loss": 0.77666485, "learning_rate": 3.5658632972546734e-06, "loss": 0.79833984, "num_input_tokens_seen": 84959505, "step": 3947, "time_per_iteration": 2.65461802482605 }, { "auxiliary_loss_clip": 0.01145835, "auxiliary_loss_mlp": 0.01040502, "balance_loss_clip": 1.0566994, "balance_loss_mlp": 1.02299047, "epoch": 0.23736660153314296, "flos": 28150662372480.0, "grad_norm": 2.0053805098120123, "language_loss": 0.80706096, "learning_rate": 3.565620980442944e-06, "loss": 0.82892442, "num_input_tokens_seen": 84982130, "step": 3948, "time_per_iteration": 2.756716012954712 }, { "auxiliary_loss_clip": 0.01129664, "auxiliary_loss_mlp": 0.01044051, "balance_loss_clip": 1.05104828, "balance_loss_mlp": 1.02643192, "epoch": 0.23742672478581092, "flos": 22086570729600.0, "grad_norm": 2.5980612684471374, "language_loss": 0.80257607, "learning_rate": 3.5653786042627107e-06, "loss": 0.82431316, "num_input_tokens_seen": 85000640, "step": 3949, "time_per_iteration": 2.74457049369812 }, { "auxiliary_loss_clip": 0.0112363, "auxiliary_loss_mlp": 0.01038665, "balance_loss_clip": 1.04977036, "balance_loss_mlp": 1.02109337, "epoch": 0.2374868480384789, "flos": 19537093152000.0, "grad_norm": 2.0592081961125093, "language_loss": 0.73239946, "learning_rate": 3.565136168723163e-06, "loss": 0.75402236, "num_input_tokens_seen": 85018970, "step": 3950, "time_per_iteration": 2.650508165359497 }, { "auxiliary_loss_clip": 0.01145426, "auxiliary_loss_mlp": 0.01037947, "balance_loss_clip": 1.05055118, "balance_loss_mlp": 1.02204442, "epoch": 0.23754697129114685, "flos": 19422501788160.0, "grad_norm": 1.9969465766046124, "language_loss": 0.72794384, "learning_rate": 3.564893673833495e-06, "loss": 0.74977756, "num_input_tokens_seen": 85035905, "step": 3951, "time_per_iteration": 2.652399778366089 }, { "auxiliary_loss_clip": 0.01122477, "auxiliary_loss_mlp": 0.01039445, "balance_loss_clip": 1.05080223, "balance_loss_mlp": 1.0216229, "epoch": 0.23760709454381482, "flos": 19501002961920.0, "grad_norm": 3.398248459712791, "language_loss": 0.73703241, "learning_rate": 3.564651119602903e-06, "loss": 0.75865161, "num_input_tokens_seen": 85054560, "step": 3952, "time_per_iteration": 2.7522144317626953 }, { "auxiliary_loss_clip": 0.01100804, "auxiliary_loss_mlp": 0.01042567, "balance_loss_clip": 1.04366636, "balance_loss_mlp": 1.02566266, "epoch": 0.23766721779648278, "flos": 27636600879360.0, "grad_norm": 1.7524267936836437, "language_loss": 0.71314329, "learning_rate": 3.564408506040583e-06, "loss": 0.73457694, "num_input_tokens_seen": 85074425, "step": 3953, "time_per_iteration": 2.7846672534942627 }, { "auxiliary_loss_clip": 0.01151909, "auxiliary_loss_mlp": 0.01047443, "balance_loss_clip": 1.05282676, "balance_loss_mlp": 1.02854872, "epoch": 0.23772734104915075, "flos": 23404348990080.0, "grad_norm": 1.9722222736847754, "language_loss": 0.81792426, "learning_rate": 3.5641658331557356e-06, "loss": 0.83991784, "num_input_tokens_seen": 85092865, "step": 3954, "time_per_iteration": 2.6262643337249756 }, { "auxiliary_loss_clip": 0.01127802, "auxiliary_loss_mlp": 0.01044439, "balance_loss_clip": 1.05239391, "balance_loss_mlp": 1.02616453, "epoch": 0.23778746430181874, "flos": 15705496540800.0, "grad_norm": 2.2607510345904824, "language_loss": 0.66270143, "learning_rate": 3.5639231009575634e-06, "loss": 0.68442386, "num_input_tokens_seen": 85110175, "step": 3955, "time_per_iteration": 2.672151803970337 }, { "auxiliary_loss_clip": 0.01149182, "auxiliary_loss_mlp": 0.0104812, "balance_loss_clip": 1.05219805, "balance_loss_mlp": 1.03104961, "epoch": 0.2378475875544867, "flos": 19426452284160.0, "grad_norm": 1.4117933502593074, "language_loss": 0.83963013, "learning_rate": 3.5636803094552704e-06, "loss": 0.86160314, "num_input_tokens_seen": 85129925, "step": 3956, "time_per_iteration": 2.6483681201934814 }, { "auxiliary_loss_clip": 0.01103304, "auxiliary_loss_mlp": 0.01042938, "balance_loss_clip": 1.04726648, "balance_loss_mlp": 1.02556944, "epoch": 0.23790771080715467, "flos": 22268565964800.0, "grad_norm": 2.308539718278817, "language_loss": 0.8482393, "learning_rate": 3.5634374586580635e-06, "loss": 0.86970174, "num_input_tokens_seen": 85147755, "step": 3957, "time_per_iteration": 2.718961715698242 }, { "auxiliary_loss_clip": 0.01087747, "auxiliary_loss_mlp": 0.01039974, "balance_loss_clip": 1.04701853, "balance_loss_mlp": 1.02428651, "epoch": 0.23796783405982264, "flos": 20047311889920.0, "grad_norm": 2.068360920278316, "language_loss": 0.70373344, "learning_rate": 3.563194548575151e-06, "loss": 0.72501063, "num_input_tokens_seen": 85165270, "step": 3958, "time_per_iteration": 2.818115472793579 }, { "auxiliary_loss_clip": 0.01102632, "auxiliary_loss_mlp": 0.01042002, "balance_loss_clip": 1.04540312, "balance_loss_mlp": 1.02276158, "epoch": 0.2380279573124906, "flos": 14245943299200.0, "grad_norm": 2.474231994209954, "language_loss": 0.66273189, "learning_rate": 3.562951579215745e-06, "loss": 0.68417823, "num_input_tokens_seen": 85181555, "step": 3959, "time_per_iteration": 2.71085786819458 }, { "auxiliary_loss_clip": 0.01103257, "auxiliary_loss_mlp": 0.01044748, "balance_loss_clip": 1.04910731, "balance_loss_mlp": 1.02760553, "epoch": 0.23808808056515857, "flos": 21179180332800.0, "grad_norm": 1.922923950627842, "language_loss": 0.72140026, "learning_rate": 3.5627085505890586e-06, "loss": 0.74288028, "num_input_tokens_seen": 85199455, "step": 3960, "time_per_iteration": 2.724398612976074 }, { "auxiliary_loss_clip": 0.01065725, "auxiliary_loss_mlp": 0.01041352, "balance_loss_clip": 1.04778433, "balance_loss_mlp": 1.02385175, "epoch": 0.23814820381782653, "flos": 22528308188160.0, "grad_norm": 1.836282299199184, "language_loss": 0.74303818, "learning_rate": 3.562465462704307e-06, "loss": 0.76410902, "num_input_tokens_seen": 85219170, "step": 3961, "time_per_iteration": 4.592544794082642 }, { "auxiliary_loss_clip": 0.01149701, "auxiliary_loss_mlp": 0.010511, "balance_loss_clip": 1.05083704, "balance_loss_mlp": 1.0321815, "epoch": 0.23820832707049452, "flos": 22304332932480.0, "grad_norm": 1.6798300631958207, "language_loss": 0.6562922, "learning_rate": 3.5622223155707085e-06, "loss": 0.67830026, "num_input_tokens_seen": 85238480, "step": 3962, "time_per_iteration": 4.40812087059021 }, { "auxiliary_loss_clip": 0.01121684, "auxiliary_loss_mlp": 0.01042601, "balance_loss_clip": 1.04743505, "balance_loss_mlp": 1.02511263, "epoch": 0.2382684503231625, "flos": 24864225454080.0, "grad_norm": 1.838705722688445, "language_loss": 0.74284148, "learning_rate": 3.561979109197483e-06, "loss": 0.76448429, "num_input_tokens_seen": 85259180, "step": 3963, "time_per_iteration": 2.7173969745635986 }, { "auxiliary_loss_clip": 0.01120014, "auxiliary_loss_mlp": 0.01045721, "balance_loss_clip": 1.0530858, "balance_loss_mlp": 1.02756512, "epoch": 0.23832857357583045, "flos": 21871609787520.0, "grad_norm": 2.045875790034744, "language_loss": 0.77264321, "learning_rate": 3.5617358435938538e-06, "loss": 0.79430056, "num_input_tokens_seen": 85278550, "step": 3964, "time_per_iteration": 4.25124716758728 }, { "auxiliary_loss_clip": 0.01108604, "auxiliary_loss_mlp": 0.01048343, "balance_loss_clip": 1.04783297, "balance_loss_mlp": 1.03124809, "epoch": 0.23838869682849842, "flos": 21288061434240.0, "grad_norm": 2.3097885565999894, "language_loss": 0.71521109, "learning_rate": 3.561492518769045e-06, "loss": 0.73678052, "num_input_tokens_seen": 85297345, "step": 3965, "time_per_iteration": 2.757647752761841 }, { "auxiliary_loss_clip": 0.01115176, "auxiliary_loss_mlp": 0.01043319, "balance_loss_clip": 1.04632521, "balance_loss_mlp": 1.02647483, "epoch": 0.23844882008116638, "flos": 16180594755840.0, "grad_norm": 2.673966650516871, "language_loss": 0.78003007, "learning_rate": 3.561249134732282e-06, "loss": 0.801615, "num_input_tokens_seen": 85315105, "step": 3966, "time_per_iteration": 2.71159291267395 }, { "auxiliary_loss_clip": 0.01124693, "auxiliary_loss_mlp": 0.01045448, "balance_loss_clip": 1.05071902, "balance_loss_mlp": 1.02899134, "epoch": 0.23850894333383435, "flos": 21069724613760.0, "grad_norm": 2.116401462724705, "language_loss": 0.68767631, "learning_rate": 3.561005691492797e-06, "loss": 0.70937771, "num_input_tokens_seen": 85334735, "step": 3967, "time_per_iteration": 2.7072744369506836 }, { "auxiliary_loss_clip": 0.01116174, "auxiliary_loss_mlp": 0.01055757, "balance_loss_clip": 1.04883289, "balance_loss_mlp": 1.03803015, "epoch": 0.23856906658650234, "flos": 17201606849280.0, "grad_norm": 3.581336577718575, "language_loss": 0.68005061, "learning_rate": 3.5607621890598185e-06, "loss": 0.70176995, "num_input_tokens_seen": 85352875, "step": 3968, "time_per_iteration": 4.378219842910767 }, { "auxiliary_loss_clip": 0.01097883, "auxiliary_loss_mlp": 0.01044394, "balance_loss_clip": 1.05052614, "balance_loss_mlp": 1.0274837, "epoch": 0.2386291898391703, "flos": 29494223619840.0, "grad_norm": 2.210255088762028, "language_loss": 0.77106255, "learning_rate": 3.5605186274425823e-06, "loss": 0.79248536, "num_input_tokens_seen": 85372205, "step": 3969, "time_per_iteration": 2.847663164138794 }, { "auxiliary_loss_clip": 0.01121681, "auxiliary_loss_mlp": 0.01039809, "balance_loss_clip": 1.0498476, "balance_loss_mlp": 1.02334595, "epoch": 0.23868931309183827, "flos": 21142443697920.0, "grad_norm": 2.1326335149840583, "language_loss": 0.7617563, "learning_rate": 3.5602750066503225e-06, "loss": 0.78337121, "num_input_tokens_seen": 85389705, "step": 3970, "time_per_iteration": 2.766862392425537 }, { "auxiliary_loss_clip": 0.01106309, "auxiliary_loss_mlp": 0.01049131, "balance_loss_clip": 1.04287159, "balance_loss_mlp": 1.03111875, "epoch": 0.23874943634450624, "flos": 25659394784640.0, "grad_norm": 2.3319107764636415, "language_loss": 0.85474384, "learning_rate": 3.5600313266922793e-06, "loss": 0.87629819, "num_input_tokens_seen": 85407855, "step": 3971, "time_per_iteration": 2.7597670555114746 }, { "auxiliary_loss_clip": 0.01062507, "auxiliary_loss_mlp": 0.01039144, "balance_loss_clip": 1.03465796, "balance_loss_mlp": 1.03661716, "epoch": 0.2388095595971742, "flos": 58986618624000.0, "grad_norm": 0.7451796217314707, "language_loss": 0.62797832, "learning_rate": 3.5597875875776915e-06, "loss": 0.6489948, "num_input_tokens_seen": 85470885, "step": 3972, "time_per_iteration": 3.2572779655456543 }, { "auxiliary_loss_clip": 0.0112174, "auxiliary_loss_mlp": 0.01037931, "balance_loss_clip": 1.0492239, "balance_loss_mlp": 1.02109838, "epoch": 0.23886968284984217, "flos": 16800341040000.0, "grad_norm": 1.9449657433446057, "language_loss": 0.82093811, "learning_rate": 3.5595437893158013e-06, "loss": 0.84253484, "num_input_tokens_seen": 85488460, "step": 3973, "time_per_iteration": 2.6394145488739014 }, { "auxiliary_loss_clip": 0.01115852, "auxiliary_loss_mlp": 0.01050239, "balance_loss_clip": 1.04884124, "balance_loss_mlp": 1.03272736, "epoch": 0.23892980610251013, "flos": 22382654538240.0, "grad_norm": 1.5639820592628684, "language_loss": 0.79418832, "learning_rate": 3.5592999319158546e-06, "loss": 0.81584924, "num_input_tokens_seen": 85508590, "step": 3974, "time_per_iteration": 2.6926944255828857 }, { "auxiliary_loss_clip": 0.01134012, "auxiliary_loss_mlp": 0.01042703, "balance_loss_clip": 1.05169725, "balance_loss_mlp": 1.02475047, "epoch": 0.23898992935517813, "flos": 12823198519680.0, "grad_norm": 1.8382350241534648, "language_loss": 0.8420803, "learning_rate": 3.5590560153870984e-06, "loss": 0.86384743, "num_input_tokens_seen": 85525970, "step": 3975, "time_per_iteration": 2.6402463912963867 }, { "auxiliary_loss_clip": 0.01126962, "auxiliary_loss_mlp": 0.01042445, "balance_loss_clip": 1.04938245, "balance_loss_mlp": 1.02545786, "epoch": 0.2390500526078461, "flos": 22345666508160.0, "grad_norm": 2.129124681208868, "language_loss": 0.84249294, "learning_rate": 3.5588120397387816e-06, "loss": 0.864187, "num_input_tokens_seen": 85543700, "step": 3976, "time_per_iteration": 2.624758720397949 }, { "auxiliary_loss_clip": 0.01075224, "auxiliary_loss_mlp": 0.01036827, "balance_loss_clip": 1.0434798, "balance_loss_mlp": 1.02103186, "epoch": 0.23911017586051406, "flos": 22635142214400.0, "grad_norm": 1.8888081312271703, "language_loss": 0.74451673, "learning_rate": 3.5585680049801566e-06, "loss": 0.76563722, "num_input_tokens_seen": 85562765, "step": 3977, "time_per_iteration": 2.848529815673828 }, { "auxiliary_loss_clip": 0.01151335, "auxiliary_loss_mlp": 0.01045957, "balance_loss_clip": 1.05476987, "balance_loss_mlp": 1.02829063, "epoch": 0.23917029911318202, "flos": 23653281219840.0, "grad_norm": 1.6816446874821869, "language_loss": 0.72515011, "learning_rate": 3.5583239111204764e-06, "loss": 0.74712306, "num_input_tokens_seen": 85581755, "step": 3978, "time_per_iteration": 2.6967527866363525 }, { "auxiliary_loss_clip": 0.01123321, "auxiliary_loss_mlp": 0.01045192, "balance_loss_clip": 1.04713726, "balance_loss_mlp": 1.02802634, "epoch": 0.23923042236585, "flos": 22783597125120.0, "grad_norm": 2.5130493367739413, "language_loss": 0.78474021, "learning_rate": 3.558079758168997e-06, "loss": 0.80642533, "num_input_tokens_seen": 85599455, "step": 3979, "time_per_iteration": 2.6679623126983643 }, { "auxiliary_loss_clip": 0.01123187, "auxiliary_loss_mlp": 0.01052255, "balance_loss_clip": 1.04774463, "balance_loss_mlp": 1.03390861, "epoch": 0.23929054561851795, "flos": 28147717457280.0, "grad_norm": 1.8353092232149775, "language_loss": 0.81943917, "learning_rate": 3.557835546134977e-06, "loss": 0.84119362, "num_input_tokens_seen": 85619970, "step": 3980, "time_per_iteration": 2.7941136360168457 }, { "auxiliary_loss_clip": 0.01094849, "auxiliary_loss_mlp": 0.01037854, "balance_loss_clip": 1.04719615, "balance_loss_mlp": 1.02036595, "epoch": 0.23935066887118592, "flos": 21686525982720.0, "grad_norm": 1.7388406045293963, "language_loss": 0.83562148, "learning_rate": 3.5575912750276775e-06, "loss": 0.85694849, "num_input_tokens_seen": 85638850, "step": 3981, "time_per_iteration": 2.773372173309326 }, { "auxiliary_loss_clip": 0.01126579, "auxiliary_loss_mlp": 0.01045152, "balance_loss_clip": 1.05084574, "balance_loss_mlp": 1.0267818, "epoch": 0.2394107921238539, "flos": 32122274198400.0, "grad_norm": 2.0270942419393676, "language_loss": 0.76690662, "learning_rate": 3.5573469448563607e-06, "loss": 0.78862393, "num_input_tokens_seen": 85656285, "step": 3982, "time_per_iteration": 2.770089864730835 }, { "auxiliary_loss_clip": 0.01107786, "auxiliary_loss_mlp": 0.01043737, "balance_loss_clip": 1.04928303, "balance_loss_mlp": 1.02757215, "epoch": 0.23947091537652188, "flos": 17019180650880.0, "grad_norm": 2.333665248317953, "language_loss": 0.78243405, "learning_rate": 3.5571025556302915e-06, "loss": 0.80394924, "num_input_tokens_seen": 85673020, "step": 3983, "time_per_iteration": 2.8361902236938477 }, { "auxiliary_loss_clip": 0.01136012, "auxiliary_loss_mlp": 0.00775416, "balance_loss_clip": 1.0530262, "balance_loss_mlp": 1.00106907, "epoch": 0.23953103862918984, "flos": 20593584904320.0, "grad_norm": 1.8468424363822287, "language_loss": 0.73274761, "learning_rate": 3.556858107358737e-06, "loss": 0.75186193, "num_input_tokens_seen": 85692565, "step": 3984, "time_per_iteration": 2.720289468765259 }, { "auxiliary_loss_clip": 0.01102619, "auxiliary_loss_mlp": 0.01051209, "balance_loss_clip": 1.04748976, "balance_loss_mlp": 1.0330658, "epoch": 0.2395911618818578, "flos": 20704405340160.0, "grad_norm": 1.906378165207968, "language_loss": 0.79090226, "learning_rate": 3.5566136000509674e-06, "loss": 0.81244051, "num_input_tokens_seen": 85709730, "step": 3985, "time_per_iteration": 2.8464138507843018 }, { "auxiliary_loss_clip": 0.01102898, "auxiliary_loss_mlp": 0.01047238, "balance_loss_clip": 1.04676175, "balance_loss_mlp": 1.02930927, "epoch": 0.23965128513452577, "flos": 27053519402880.0, "grad_norm": 1.780185130038595, "language_loss": 0.73194253, "learning_rate": 3.556369033716254e-06, "loss": 0.7534439, "num_input_tokens_seen": 85730045, "step": 3986, "time_per_iteration": 2.873837471008301 }, { "auxiliary_loss_clip": 0.01143561, "auxiliary_loss_mlp": 0.01052533, "balance_loss_clip": 1.05392861, "balance_loss_mlp": 1.03523529, "epoch": 0.23971140838719374, "flos": 23144319457920.0, "grad_norm": 1.9275946084378768, "language_loss": 0.88014174, "learning_rate": 3.556124408363871e-06, "loss": 0.90210271, "num_input_tokens_seen": 85747590, "step": 3987, "time_per_iteration": 2.778970718383789 }, { "auxiliary_loss_clip": 0.01131181, "auxiliary_loss_mlp": 0.01037226, "balance_loss_clip": 1.05180991, "balance_loss_mlp": 1.02253985, "epoch": 0.23977153163986173, "flos": 18034554309120.0, "grad_norm": 8.94948058332038, "language_loss": 0.82985806, "learning_rate": 3.5558797240030945e-06, "loss": 0.85154212, "num_input_tokens_seen": 85763460, "step": 3988, "time_per_iteration": 2.6707162857055664 }, { "auxiliary_loss_clip": 0.01132219, "auxiliary_loss_mlp": 0.01039377, "balance_loss_clip": 1.04952908, "balance_loss_mlp": 1.02213907, "epoch": 0.2398316548925297, "flos": 18113378705280.0, "grad_norm": 1.6085860818119202, "language_loss": 0.85336304, "learning_rate": 3.5556349806432035e-06, "loss": 0.87507904, "num_input_tokens_seen": 85782050, "step": 3989, "time_per_iteration": 2.644075632095337 }, { "auxiliary_loss_clip": 0.01144734, "auxiliary_loss_mlp": 0.01039049, "balance_loss_clip": 1.05094743, "balance_loss_mlp": 1.02263403, "epoch": 0.23989177814519766, "flos": 12567730014720.0, "grad_norm": 1.981474679784042, "language_loss": 0.84109843, "learning_rate": 3.555390178293477e-06, "loss": 0.86293626, "num_input_tokens_seen": 85797400, "step": 3990, "time_per_iteration": 2.5778160095214844 }, { "auxiliary_loss_clip": 0.01131361, "auxiliary_loss_mlp": 0.01042102, "balance_loss_clip": 1.04863191, "balance_loss_mlp": 1.02565074, "epoch": 0.23995190139786562, "flos": 25264593423360.0, "grad_norm": 1.5352138463261382, "language_loss": 0.75853264, "learning_rate": 3.5551453169631994e-06, "loss": 0.78026724, "num_input_tokens_seen": 85818995, "step": 3991, "time_per_iteration": 2.7569639682769775 }, { "auxiliary_loss_clip": 0.01040828, "auxiliary_loss_mlp": 0.0100398, "balance_loss_clip": 1.02825403, "balance_loss_mlp": 1.00114298, "epoch": 0.2400120246505336, "flos": 61960379650560.0, "grad_norm": 0.8795356934357302, "language_loss": 0.63683558, "learning_rate": 3.554900396661656e-06, "loss": 0.65728366, "num_input_tokens_seen": 85876695, "step": 3992, "time_per_iteration": 3.2559213638305664 }, { "auxiliary_loss_clip": 0.01055123, "auxiliary_loss_mlp": 0.01005737, "balance_loss_clip": 1.02834392, "balance_loss_mlp": 1.00292385, "epoch": 0.24007214790320155, "flos": 66708560540160.0, "grad_norm": 0.7639831296699208, "language_loss": 0.6297875, "learning_rate": 3.5546554173981334e-06, "loss": 0.65039611, "num_input_tokens_seen": 85940990, "step": 3993, "time_per_iteration": 3.2946221828460693 }, { "auxiliary_loss_clip": 0.0110983, "auxiliary_loss_mlp": 0.01048609, "balance_loss_clip": 1.05077267, "balance_loss_mlp": 1.03078759, "epoch": 0.24013227115586952, "flos": 25809070757760.0, "grad_norm": 1.7227387633537015, "language_loss": 0.7656548, "learning_rate": 3.5544103791819218e-06, "loss": 0.78723919, "num_input_tokens_seen": 85961165, "step": 3994, "time_per_iteration": 2.7735466957092285 }, { "auxiliary_loss_clip": 0.01120115, "auxiliary_loss_mlp": 0.01051235, "balance_loss_clip": 1.04648936, "balance_loss_mlp": 1.0323168, "epoch": 0.2401923944085375, "flos": 25557480921600.0, "grad_norm": 1.7819538389347498, "language_loss": 0.78550023, "learning_rate": 3.5541652820223124e-06, "loss": 0.80721372, "num_input_tokens_seen": 85982710, "step": 3995, "time_per_iteration": 2.8184118270874023 }, { "auxiliary_loss_clip": 0.01034, "auxiliary_loss_mlp": 0.01026353, "balance_loss_clip": 1.02876425, "balance_loss_mlp": 1.0237658, "epoch": 0.24025251766120548, "flos": 54941138478720.0, "grad_norm": 0.9088717203971356, "language_loss": 0.6345036, "learning_rate": 3.5539201259286006e-06, "loss": 0.65510708, "num_input_tokens_seen": 86046935, "step": 3996, "time_per_iteration": 3.304704189300537 }, { "auxiliary_loss_clip": 0.01122635, "auxiliary_loss_mlp": 0.01046678, "balance_loss_clip": 1.04812241, "balance_loss_mlp": 1.02960706, "epoch": 0.24031264091387344, "flos": 20631075724800.0, "grad_norm": 2.5673853359086403, "language_loss": 0.69455099, "learning_rate": 3.5536749109100808e-06, "loss": 0.7162441, "num_input_tokens_seen": 86064355, "step": 3997, "time_per_iteration": 2.6638269424438477 }, { "auxiliary_loss_clip": 0.01136246, "auxiliary_loss_mlp": 0.01041204, "balance_loss_clip": 1.0500989, "balance_loss_mlp": 1.02390659, "epoch": 0.2403727641665414, "flos": 20886256920960.0, "grad_norm": 1.9944619018673675, "language_loss": 0.87352818, "learning_rate": 3.5534296369760535e-06, "loss": 0.89530265, "num_input_tokens_seen": 86081340, "step": 3998, "time_per_iteration": 2.6837756633758545 }, { "auxiliary_loss_clip": 0.01126262, "auxiliary_loss_mlp": 0.01038814, "balance_loss_clip": 1.04337883, "balance_loss_mlp": 1.02173114, "epoch": 0.24043288741920937, "flos": 22820046451200.0, "grad_norm": 1.5798261831400109, "language_loss": 0.75723118, "learning_rate": 3.5531843041358183e-06, "loss": 0.77888191, "num_input_tokens_seen": 86102260, "step": 3999, "time_per_iteration": 2.659717321395874 }, { "auxiliary_loss_clip": 0.01116532, "auxiliary_loss_mlp": 0.01049627, "balance_loss_clip": 1.04679537, "balance_loss_mlp": 1.03259242, "epoch": 0.24049301067187734, "flos": 27959652823680.0, "grad_norm": 2.380373207595884, "language_loss": 0.72602308, "learning_rate": 3.552938912398679e-06, "loss": 0.74768472, "num_input_tokens_seen": 86123400, "step": 4000, "time_per_iteration": 4.285717487335205 }, { "auxiliary_loss_clip": 0.01138397, "auxiliary_loss_mlp": 0.01040819, "balance_loss_clip": 1.05207551, "balance_loss_mlp": 1.02389169, "epoch": 0.24055313392454533, "flos": 27451409333760.0, "grad_norm": 2.3105318706157862, "language_loss": 0.67128104, "learning_rate": 3.5526934617739397e-06, "loss": 0.69307321, "num_input_tokens_seen": 86144060, "step": 4001, "time_per_iteration": 4.2180609703063965 }, { "auxiliary_loss_clip": 0.01144863, "auxiliary_loss_mlp": 0.01043304, "balance_loss_clip": 1.04859209, "balance_loss_mlp": 1.02525568, "epoch": 0.2406132571772133, "flos": 25556618995200.0, "grad_norm": 2.360624564793828, "language_loss": 0.82895994, "learning_rate": 3.5524479522709095e-06, "loss": 0.85084158, "num_input_tokens_seen": 86163005, "step": 4002, "time_per_iteration": 2.6369640827178955 }, { "auxiliary_loss_clip": 0.01106477, "auxiliary_loss_mlp": 0.01045072, "balance_loss_clip": 1.0493201, "balance_loss_mlp": 1.0283823, "epoch": 0.24067338042988126, "flos": 24791398629120.0, "grad_norm": 2.016027139567785, "language_loss": 0.83058953, "learning_rate": 3.552202383898897e-06, "loss": 0.85210502, "num_input_tokens_seen": 86182580, "step": 4003, "time_per_iteration": 4.312098979949951 }, { "auxiliary_loss_clip": 0.01114745, "auxiliary_loss_mlp": 0.01042117, "balance_loss_clip": 1.0474503, "balance_loss_mlp": 1.02458131, "epoch": 0.24073350368254923, "flos": 21177923356800.0, "grad_norm": 1.971328156333658, "language_loss": 0.8672772, "learning_rate": 3.551956756667215e-06, "loss": 0.8888458, "num_input_tokens_seen": 86200665, "step": 4004, "time_per_iteration": 2.646578311920166 }, { "auxiliary_loss_clip": 0.01115631, "auxiliary_loss_mlp": 0.01054344, "balance_loss_clip": 1.04529011, "balance_loss_mlp": 1.03736866, "epoch": 0.2407936269352172, "flos": 22494300986880.0, "grad_norm": 1.9965130860947515, "language_loss": 0.78239757, "learning_rate": 3.551711070585177e-06, "loss": 0.80409735, "num_input_tokens_seen": 86221640, "step": 4005, "time_per_iteration": 2.7220566272735596 }, { "auxiliary_loss_clip": 0.01090518, "auxiliary_loss_mlp": 0.01039515, "balance_loss_clip": 1.04414058, "balance_loss_mlp": 1.02164578, "epoch": 0.24085375018788516, "flos": 18551129754240.0, "grad_norm": 1.6390993289809686, "language_loss": 0.79391652, "learning_rate": 3.5514653256620995e-06, "loss": 0.8152169, "num_input_tokens_seen": 86240795, "step": 4006, "time_per_iteration": 2.7188642024993896 }, { "auxiliary_loss_clip": 0.01130191, "auxiliary_loss_mlp": 0.00777161, "balance_loss_clip": 1.0482645, "balance_loss_mlp": 1.00115335, "epoch": 0.24091387344055312, "flos": 24170539023360.0, "grad_norm": 1.6765272633695874, "language_loss": 0.71939242, "learning_rate": 3.551219521907302e-06, "loss": 0.73846585, "num_input_tokens_seen": 86262000, "step": 4007, "time_per_iteration": 4.3504638671875 }, { "auxiliary_loss_clip": 0.01101925, "auxiliary_loss_mlp": 0.01047677, "balance_loss_clip": 1.04589975, "balance_loss_mlp": 1.03132153, "epoch": 0.24097399669322112, "flos": 11036319615360.0, "grad_norm": 1.6891966370612705, "language_loss": 0.76460171, "learning_rate": 3.5509736593301042e-06, "loss": 0.78609765, "num_input_tokens_seen": 86279680, "step": 4008, "time_per_iteration": 2.700744152069092 }, { "auxiliary_loss_clip": 0.01136495, "auxiliary_loss_mlp": 0.01038852, "balance_loss_clip": 1.05069256, "balance_loss_mlp": 1.02192402, "epoch": 0.24103411994588908, "flos": 17165085696000.0, "grad_norm": 2.427830882471808, "language_loss": 0.74601823, "learning_rate": 3.5507277379398295e-06, "loss": 0.76777172, "num_input_tokens_seen": 86297180, "step": 4009, "time_per_iteration": 2.6175808906555176 }, { "auxiliary_loss_clip": 0.01134079, "auxiliary_loss_mlp": 0.01041957, "balance_loss_clip": 1.05032861, "balance_loss_mlp": 1.02532756, "epoch": 0.24109424319855705, "flos": 20667956014080.0, "grad_norm": 1.6643292794637636, "language_loss": 0.80064976, "learning_rate": 3.550481757745804e-06, "loss": 0.82241005, "num_input_tokens_seen": 86317660, "step": 4010, "time_per_iteration": 2.680511236190796 }, { "auxiliary_loss_clip": 0.01118599, "auxiliary_loss_mlp": 0.01047241, "balance_loss_clip": 1.04658401, "balance_loss_mlp": 1.02779818, "epoch": 0.241154366451225, "flos": 28181796485760.0, "grad_norm": 3.8737422865874245, "language_loss": 0.70889425, "learning_rate": 3.5502357187573555e-06, "loss": 0.73055267, "num_input_tokens_seen": 86338325, "step": 4011, "time_per_iteration": 2.716404676437378 }, { "auxiliary_loss_clip": 0.01065208, "auxiliary_loss_mlp": 0.01047099, "balance_loss_clip": 1.0414176, "balance_loss_mlp": 1.02802527, "epoch": 0.24121448970389298, "flos": 21689722293120.0, "grad_norm": 1.675052333388822, "language_loss": 0.69279736, "learning_rate": 3.5499896209838118e-06, "loss": 0.71392041, "num_input_tokens_seen": 86357615, "step": 4012, "time_per_iteration": 2.804694890975952 }, { "auxiliary_loss_clip": 0.01138123, "auxiliary_loss_mlp": 0.0104149, "balance_loss_clip": 1.05126536, "balance_loss_mlp": 1.02213097, "epoch": 0.24127461295656094, "flos": 39676191269760.0, "grad_norm": 1.5084253296098848, "language_loss": 0.732813, "learning_rate": 3.5497434644345073e-06, "loss": 0.75460911, "num_input_tokens_seen": 86380355, "step": 4013, "time_per_iteration": 2.8192849159240723 }, { "auxiliary_loss_clip": 0.01148497, "auxiliary_loss_mlp": 0.01037798, "balance_loss_clip": 1.05201018, "balance_loss_mlp": 1.02044141, "epoch": 0.2413347362092289, "flos": 19135863256320.0, "grad_norm": 1.8372553923739565, "language_loss": 0.88272971, "learning_rate": 3.5494972491187753e-06, "loss": 0.90459263, "num_input_tokens_seen": 86399125, "step": 4014, "time_per_iteration": 2.6029160022735596 }, { "auxiliary_loss_clip": 0.0111397, "auxiliary_loss_mlp": 0.01046282, "balance_loss_clip": 1.04315281, "balance_loss_mlp": 1.0278163, "epoch": 0.2413948594618969, "flos": 26939430829440.0, "grad_norm": 1.9589493379590102, "language_loss": 0.94862974, "learning_rate": 3.549250975045952e-06, "loss": 0.97023225, "num_input_tokens_seen": 86418625, "step": 4015, "time_per_iteration": 2.6958773136138916 }, { "auxiliary_loss_clip": 0.01120117, "auxiliary_loss_mlp": 0.01041079, "balance_loss_clip": 1.04570341, "balance_loss_mlp": 1.02331638, "epoch": 0.24145498271456486, "flos": 25228108183680.0, "grad_norm": 1.5486712647521637, "language_loss": 0.8271699, "learning_rate": 3.5490046422253768e-06, "loss": 0.84878188, "num_input_tokens_seen": 86438375, "step": 4016, "time_per_iteration": 2.7045071125030518 }, { "auxiliary_loss_clip": 0.01098573, "auxiliary_loss_mlp": 0.01045564, "balance_loss_clip": 1.04334974, "balance_loss_mlp": 1.02838039, "epoch": 0.24151510596723283, "flos": 40661759617920.0, "grad_norm": 1.8022012115417119, "language_loss": 0.69207114, "learning_rate": 3.54875825066639e-06, "loss": 0.71351254, "num_input_tokens_seen": 86463230, "step": 4017, "time_per_iteration": 2.8596649169921875 }, { "auxiliary_loss_clip": 0.01141299, "auxiliary_loss_mlp": 0.01051243, "balance_loss_clip": 1.05106175, "balance_loss_mlp": 1.03278995, "epoch": 0.2415752292199008, "flos": 18146667634560.0, "grad_norm": 1.6419835865444041, "language_loss": 0.84953403, "learning_rate": 3.5485118003783353e-06, "loss": 0.87145936, "num_input_tokens_seen": 86481230, "step": 4018, "time_per_iteration": 2.627629518508911 }, { "auxiliary_loss_clip": 0.01046489, "auxiliary_loss_mlp": 0.01014362, "balance_loss_clip": 1.02139664, "balance_loss_mlp": 1.01140559, "epoch": 0.24163535247256876, "flos": 67288409792640.0, "grad_norm": 0.8221446343976555, "language_loss": 0.60642469, "learning_rate": 3.548265291370558e-06, "loss": 0.62703323, "num_input_tokens_seen": 86541260, "step": 4019, "time_per_iteration": 3.269498586654663 }, { "auxiliary_loss_clip": 0.01114983, "auxiliary_loss_mlp": 0.01049089, "balance_loss_clip": 1.04582107, "balance_loss_mlp": 1.0312674, "epoch": 0.24169547572523672, "flos": 24929941386240.0, "grad_norm": 1.8826005215725077, "language_loss": 0.73324752, "learning_rate": 3.5480187236524055e-06, "loss": 0.75488818, "num_input_tokens_seen": 86559580, "step": 4020, "time_per_iteration": 2.7341055870056152 }, { "auxiliary_loss_clip": 0.01111064, "auxiliary_loss_mlp": 0.01040515, "balance_loss_clip": 1.04833841, "balance_loss_mlp": 1.02315772, "epoch": 0.24175559897790472, "flos": 18728312567040.0, "grad_norm": 1.7964731743776612, "language_loss": 0.81617332, "learning_rate": 3.5477720972332285e-06, "loss": 0.83768916, "num_input_tokens_seen": 86577560, "step": 4021, "time_per_iteration": 2.7154345512390137 }, { "auxiliary_loss_clip": 0.01149117, "auxiliary_loss_mlp": 0.01050015, "balance_loss_clip": 1.04972911, "balance_loss_mlp": 1.03070307, "epoch": 0.24181572223057268, "flos": 23039281111680.0, "grad_norm": 2.078765142897874, "language_loss": 0.76601863, "learning_rate": 3.547525412122378e-06, "loss": 0.78800994, "num_input_tokens_seen": 86595350, "step": 4022, "time_per_iteration": 2.622262716293335 }, { "auxiliary_loss_clip": 0.01102927, "auxiliary_loss_mlp": 0.01053151, "balance_loss_clip": 1.042714, "balance_loss_mlp": 1.03271914, "epoch": 0.24187584548324065, "flos": 20376145923840.0, "grad_norm": 1.7360501926549048, "language_loss": 0.75283015, "learning_rate": 3.5472786683292083e-06, "loss": 0.774391, "num_input_tokens_seen": 86614805, "step": 4023, "time_per_iteration": 2.7339353561401367 }, { "auxiliary_loss_clip": 0.01121416, "auxiliary_loss_mlp": 0.01047921, "balance_loss_clip": 1.04916334, "balance_loss_mlp": 1.0309217, "epoch": 0.2419359687359086, "flos": 21397517153280.0, "grad_norm": 2.4319797200103466, "language_loss": 0.82542646, "learning_rate": 3.5470318658630766e-06, "loss": 0.84711981, "num_input_tokens_seen": 86633700, "step": 4024, "time_per_iteration": 2.6887242794036865 }, { "auxiliary_loss_clip": 0.01133297, "auxiliary_loss_mlp": 0.01047865, "balance_loss_clip": 1.05029452, "balance_loss_mlp": 1.03038907, "epoch": 0.24199609198857658, "flos": 18369385914240.0, "grad_norm": 1.7776330743080708, "language_loss": 0.85974258, "learning_rate": 3.5467850047333424e-06, "loss": 0.88155425, "num_input_tokens_seen": 86650905, "step": 4025, "time_per_iteration": 2.7049782276153564 }, { "auxiliary_loss_clip": 0.01092706, "auxiliary_loss_mlp": 0.01064486, "balance_loss_clip": 1.04161918, "balance_loss_mlp": 1.04456651, "epoch": 0.24205621524124454, "flos": 19463871277440.0, "grad_norm": 1.8800874250001207, "language_loss": 0.71681315, "learning_rate": 3.546538084949365e-06, "loss": 0.73838508, "num_input_tokens_seen": 86669185, "step": 4026, "time_per_iteration": 2.7773284912109375 }, { "auxiliary_loss_clip": 0.01135992, "auxiliary_loss_mlp": 0.01046992, "balance_loss_clip": 1.05109096, "balance_loss_mlp": 1.03088713, "epoch": 0.2421163384939125, "flos": 14976330451200.0, "grad_norm": 1.967847260356932, "language_loss": 0.64436764, "learning_rate": 3.546291106520509e-06, "loss": 0.66619748, "num_input_tokens_seen": 86686805, "step": 4027, "time_per_iteration": 2.6143524646759033 }, { "auxiliary_loss_clip": 0.01136637, "auxiliary_loss_mlp": 0.00775283, "balance_loss_clip": 1.05106425, "balance_loss_mlp": 1.00103092, "epoch": 0.2421764617465805, "flos": 18662057930880.0, "grad_norm": 3.6118562291520813, "language_loss": 0.70909715, "learning_rate": 3.5460440694561388e-06, "loss": 0.72821641, "num_input_tokens_seen": 86705520, "step": 4028, "time_per_iteration": 2.656334400177002 }, { "auxiliary_loss_clip": 0.01053475, "auxiliary_loss_mlp": 0.01050053, "balance_loss_clip": 1.02715707, "balance_loss_mlp": 1.04756165, "epoch": 0.24223658499924847, "flos": 64347327164160.0, "grad_norm": 0.865443083354021, "language_loss": 0.55302447, "learning_rate": 3.545796973765623e-06, "loss": 0.57405978, "num_input_tokens_seen": 86767320, "step": 4029, "time_per_iteration": 3.1736607551574707 }, { "auxiliary_loss_clip": 0.0113268, "auxiliary_loss_mlp": 0.01051074, "balance_loss_clip": 1.04679179, "balance_loss_mlp": 1.03252554, "epoch": 0.24229670825191643, "flos": 25775243124480.0, "grad_norm": 1.6290009052774777, "language_loss": 0.74065894, "learning_rate": 3.54554981945833e-06, "loss": 0.76249647, "num_input_tokens_seen": 86788110, "step": 4030, "time_per_iteration": 2.644153118133545 }, { "auxiliary_loss_clip": 0.01146282, "auxiliary_loss_mlp": 0.01053008, "balance_loss_clip": 1.04945433, "balance_loss_mlp": 1.03495932, "epoch": 0.2423568315045844, "flos": 20667094087680.0, "grad_norm": 2.044571760348203, "language_loss": 0.76492965, "learning_rate": 3.5453026065436343e-06, "loss": 0.78692257, "num_input_tokens_seen": 86807640, "step": 4031, "time_per_iteration": 2.608718156814575 }, { "auxiliary_loss_clip": 0.01130345, "auxiliary_loss_mlp": 0.00776083, "balance_loss_clip": 1.04857934, "balance_loss_mlp": 1.00130129, "epoch": 0.24241695475725236, "flos": 22416805393920.0, "grad_norm": 2.367928778009572, "language_loss": 0.65578043, "learning_rate": 3.5450553350309083e-06, "loss": 0.67484468, "num_input_tokens_seen": 86826795, "step": 4032, "time_per_iteration": 2.713796377182007 }, { "auxiliary_loss_clip": 0.01128183, "auxiliary_loss_mlp": 0.0104339, "balance_loss_clip": 1.04551542, "balance_loss_mlp": 1.02591443, "epoch": 0.24247707800992033, "flos": 17128995505920.0, "grad_norm": 2.055558599382263, "language_loss": 0.81589901, "learning_rate": 3.5448080049295286e-06, "loss": 0.83761466, "num_input_tokens_seen": 86843175, "step": 4033, "time_per_iteration": 2.6381332874298096 }, { "auxiliary_loss_clip": 0.01101134, "auxiliary_loss_mlp": 0.01042507, "balance_loss_clip": 1.04264998, "balance_loss_mlp": 1.02450657, "epoch": 0.2425372012625883, "flos": 31613743399680.0, "grad_norm": 2.655330103252085, "language_loss": 0.68830204, "learning_rate": 3.5445606162488754e-06, "loss": 0.70973849, "num_input_tokens_seen": 86863185, "step": 4034, "time_per_iteration": 2.8269567489624023 }, { "auxiliary_loss_clip": 0.01129717, "auxiliary_loss_mlp": 0.01036472, "balance_loss_clip": 1.05142426, "balance_loss_mlp": 1.01839972, "epoch": 0.24259732451525629, "flos": 16326032924160.0, "grad_norm": 2.305872962411053, "language_loss": 0.96432853, "learning_rate": 3.5443131689983283e-06, "loss": 0.98599035, "num_input_tokens_seen": 86880040, "step": 4035, "time_per_iteration": 2.687131643295288 }, { "auxiliary_loss_clip": 0.01116249, "auxiliary_loss_mlp": 0.01051012, "balance_loss_clip": 1.0467937, "balance_loss_mlp": 1.03419125, "epoch": 0.24265744776792425, "flos": 22856639431680.0, "grad_norm": 1.5931877581057647, "language_loss": 0.7820307, "learning_rate": 3.5440656631872715e-06, "loss": 0.80370331, "num_input_tokens_seen": 86900610, "step": 4036, "time_per_iteration": 2.7576112747192383 }, { "auxiliary_loss_clip": 0.01137826, "auxiliary_loss_mlp": 0.01049747, "balance_loss_clip": 1.05010104, "balance_loss_mlp": 1.03141224, "epoch": 0.24271757102059222, "flos": 21871573873920.0, "grad_norm": 1.6332934168141529, "language_loss": 0.74266672, "learning_rate": 3.5438180988250898e-06, "loss": 0.76454246, "num_input_tokens_seen": 86919385, "step": 4037, "time_per_iteration": 2.7860629558563232 }, { "auxiliary_loss_clip": 0.01100993, "auxiliary_loss_mlp": 0.01042879, "balance_loss_clip": 1.04173183, "balance_loss_mlp": 1.02453303, "epoch": 0.24277769427326018, "flos": 19208582340480.0, "grad_norm": 8.14050816007968, "language_loss": 0.76632005, "learning_rate": 3.543570475921171e-06, "loss": 0.78775871, "num_input_tokens_seen": 86938885, "step": 4038, "time_per_iteration": 2.691695213317871 }, { "auxiliary_loss_clip": 0.01129874, "auxiliary_loss_mlp": 0.01043604, "balance_loss_clip": 1.04768467, "balance_loss_mlp": 1.0249598, "epoch": 0.24283781752592815, "flos": 19499889640320.0, "grad_norm": 3.2334161052349817, "language_loss": 0.71992457, "learning_rate": 3.543322794484905e-06, "loss": 0.7416594, "num_input_tokens_seen": 86957705, "step": 4039, "time_per_iteration": 4.128135442733765 }, { "auxiliary_loss_clip": 0.0112766, "auxiliary_loss_mlp": 0.01048109, "balance_loss_clip": 1.04597354, "balance_loss_mlp": 1.02921474, "epoch": 0.2428979407785961, "flos": 19902196944000.0, "grad_norm": 1.6158763194283545, "language_loss": 0.78655136, "learning_rate": 3.5430750545256843e-06, "loss": 0.80830908, "num_input_tokens_seen": 86975845, "step": 4040, "time_per_iteration": 4.174723863601685 }, { "auxiliary_loss_clip": 0.01090567, "auxiliary_loss_mlp": 0.01038965, "balance_loss_clip": 1.04526615, "balance_loss_mlp": 1.02268124, "epoch": 0.2429580640312641, "flos": 24715878284160.0, "grad_norm": 2.432557236688664, "language_loss": 0.80599713, "learning_rate": 3.5428272560529027e-06, "loss": 0.8272925, "num_input_tokens_seen": 86994800, "step": 4041, "time_per_iteration": 2.7933273315429688 }, { "auxiliary_loss_clip": 0.01108653, "auxiliary_loss_mlp": 0.01044101, "balance_loss_clip": 1.04587245, "balance_loss_mlp": 1.02733982, "epoch": 0.24301818728393207, "flos": 25630343660160.0, "grad_norm": 1.9967913274059828, "language_loss": 0.76708287, "learning_rate": 3.542579399075957e-06, "loss": 0.78861034, "num_input_tokens_seen": 87016845, "step": 4042, "time_per_iteration": 4.336673021316528 }, { "auxiliary_loss_clip": 0.01056541, "auxiliary_loss_mlp": 0.01035377, "balance_loss_clip": 1.04354727, "balance_loss_mlp": 1.01928389, "epoch": 0.24307831053660003, "flos": 26141388410880.0, "grad_norm": 1.8431659047813937, "language_loss": 0.81232125, "learning_rate": 3.542331483604246e-06, "loss": 0.83324039, "num_input_tokens_seen": 87036270, "step": 4043, "time_per_iteration": 2.9156856536865234 }, { "auxiliary_loss_clip": 0.01126576, "auxiliary_loss_mlp": 0.01038857, "balance_loss_clip": 1.04610896, "balance_loss_mlp": 1.02012897, "epoch": 0.243138433789268, "flos": 14972415868800.0, "grad_norm": 2.052349433785912, "language_loss": 0.73095596, "learning_rate": 3.5420835096471706e-06, "loss": 0.75261033, "num_input_tokens_seen": 87049920, "step": 4044, "time_per_iteration": 2.6324286460876465 }, { "auxiliary_loss_clip": 0.0113453, "auxiliary_loss_mlp": 0.01042417, "balance_loss_clip": 1.04967666, "balance_loss_mlp": 1.02445269, "epoch": 0.24319855704193596, "flos": 25191694771200.0, "grad_norm": 1.8848950918191658, "language_loss": 0.83676481, "learning_rate": 3.5418354772141337e-06, "loss": 0.85853434, "num_input_tokens_seen": 87068230, "step": 4045, "time_per_iteration": 2.68994402885437 }, { "auxiliary_loss_clip": 0.010753, "auxiliary_loss_mlp": 0.01047988, "balance_loss_clip": 1.04608011, "balance_loss_mlp": 1.03117943, "epoch": 0.24325868029460393, "flos": 22127221946880.0, "grad_norm": 1.9701839557075844, "language_loss": 0.86895847, "learning_rate": 3.541587386314541e-06, "loss": 0.89019132, "num_input_tokens_seen": 87086435, "step": 4046, "time_per_iteration": 2.908737897872925 }, { "auxiliary_loss_clip": 0.01120714, "auxiliary_loss_mlp": 0.01038682, "balance_loss_clip": 1.04705977, "balance_loss_mlp": 1.02070522, "epoch": 0.2433188035472719, "flos": 23582106420480.0, "grad_norm": 1.8855160425980928, "language_loss": 0.72759771, "learning_rate": 3.5413392369578e-06, "loss": 0.74919164, "num_input_tokens_seen": 87105340, "step": 4047, "time_per_iteration": 4.310218095779419 }, { "auxiliary_loss_clip": 0.01124014, "auxiliary_loss_mlp": 0.01045256, "balance_loss_clip": 1.04447186, "balance_loss_mlp": 1.02637279, "epoch": 0.2433789267999399, "flos": 24462815990400.0, "grad_norm": 2.592486480291502, "language_loss": 0.73029542, "learning_rate": 3.5410910291533213e-06, "loss": 0.75198811, "num_input_tokens_seen": 87125780, "step": 4048, "time_per_iteration": 2.699544668197632 }, { "auxiliary_loss_clip": 0.01112707, "auxiliary_loss_mlp": 0.01045312, "balance_loss_clip": 1.04923105, "balance_loss_mlp": 1.02869391, "epoch": 0.24343905005260785, "flos": 16727909264640.0, "grad_norm": 1.921127999919884, "language_loss": 0.73616529, "learning_rate": 3.5408427629105155e-06, "loss": 0.7577455, "num_input_tokens_seen": 87144470, "step": 4049, "time_per_iteration": 2.6988370418548584 }, { "auxiliary_loss_clip": 0.01093349, "auxiliary_loss_mlp": 0.01041657, "balance_loss_clip": 1.04289758, "balance_loss_mlp": 1.02583802, "epoch": 0.24349917330527582, "flos": 20043756443520.0, "grad_norm": 2.073976648883723, "language_loss": 0.7377705, "learning_rate": 3.5405944382387985e-06, "loss": 0.75912058, "num_input_tokens_seen": 87162830, "step": 4050, "time_per_iteration": 2.718212604522705 }, { "auxiliary_loss_clip": 0.01116995, "auxiliary_loss_mlp": 0.01043968, "balance_loss_clip": 1.04518783, "balance_loss_mlp": 1.02800608, "epoch": 0.24355929655794378, "flos": 17420554200960.0, "grad_norm": 2.361179977901575, "language_loss": 0.75518602, "learning_rate": 3.5403460551475854e-06, "loss": 0.77679563, "num_input_tokens_seen": 87180905, "step": 4051, "time_per_iteration": 2.6522655487060547 }, { "auxiliary_loss_clip": 0.01092567, "auxiliary_loss_mlp": 0.01042511, "balance_loss_clip": 1.04197812, "balance_loss_mlp": 1.02507067, "epoch": 0.24361941981061175, "flos": 25410929431680.0, "grad_norm": 2.2644912923037985, "language_loss": 0.70717591, "learning_rate": 3.540097613646296e-06, "loss": 0.72852671, "num_input_tokens_seen": 87202290, "step": 4052, "time_per_iteration": 2.794059991836548 }, { "auxiliary_loss_clip": 0.0111622, "auxiliary_loss_mlp": 0.01045494, "balance_loss_clip": 1.04823005, "balance_loss_mlp": 1.02833986, "epoch": 0.2436795430632797, "flos": 22820800636800.0, "grad_norm": 1.7022998331113812, "language_loss": 0.80989587, "learning_rate": 3.539849113744351e-06, "loss": 0.83151299, "num_input_tokens_seen": 87221650, "step": 4053, "time_per_iteration": 2.682805299758911 }, { "auxiliary_loss_clip": 0.01148244, "auxiliary_loss_mlp": 0.01038109, "balance_loss_clip": 1.05124915, "balance_loss_mlp": 1.0210743, "epoch": 0.2437396663159477, "flos": 15157786982400.0, "grad_norm": 1.5338885161808513, "language_loss": 0.77628779, "learning_rate": 3.539600555451172e-06, "loss": 0.79815125, "num_input_tokens_seen": 87238515, "step": 4054, "time_per_iteration": 2.635181427001953 }, { "auxiliary_loss_clip": 0.01095192, "auxiliary_loss_mlp": 0.01055244, "balance_loss_clip": 1.04067969, "balance_loss_mlp": 1.03783989, "epoch": 0.24379978956861567, "flos": 22091131756800.0, "grad_norm": 1.8808929031646056, "language_loss": 0.84398115, "learning_rate": 3.5393519387761866e-06, "loss": 0.86548549, "num_input_tokens_seen": 87256290, "step": 4055, "time_per_iteration": 2.757601261138916 }, { "auxiliary_loss_clip": 0.01110063, "auxiliary_loss_mlp": 0.01045315, "balance_loss_clip": 1.04298997, "balance_loss_mlp": 1.02767169, "epoch": 0.24385991282128364, "flos": 31467766527360.0, "grad_norm": 2.5636936013515776, "language_loss": 0.55038011, "learning_rate": 3.5391032637288217e-06, "loss": 0.57193393, "num_input_tokens_seen": 87277085, "step": 4056, "time_per_iteration": 2.7788894176483154 }, { "auxiliary_loss_clip": 0.0113756, "auxiliary_loss_mlp": 0.01046233, "balance_loss_clip": 1.04897046, "balance_loss_mlp": 1.02876842, "epoch": 0.2439200360739516, "flos": 23838795987840.0, "grad_norm": 2.64902132986976, "language_loss": 0.80583262, "learning_rate": 3.538854530318506e-06, "loss": 0.82767057, "num_input_tokens_seen": 87293020, "step": 4057, "time_per_iteration": 2.78110671043396 }, { "auxiliary_loss_clip": 0.01132987, "auxiliary_loss_mlp": 0.01048497, "balance_loss_clip": 1.04877245, "balance_loss_mlp": 1.03145027, "epoch": 0.24398015932661957, "flos": 19169978198400.0, "grad_norm": 1.8133503864036424, "language_loss": 0.79202968, "learning_rate": 3.538605738554673e-06, "loss": 0.81384456, "num_input_tokens_seen": 87311445, "step": 4058, "time_per_iteration": 2.6609115600585938 }, { "auxiliary_loss_clip": 0.01147749, "auxiliary_loss_mlp": 0.01045059, "balance_loss_clip": 1.04827118, "balance_loss_mlp": 1.02920449, "epoch": 0.24404028257928753, "flos": 25262474520960.0, "grad_norm": 3.3482411666646086, "language_loss": 0.85503888, "learning_rate": 3.538356888446756e-06, "loss": 0.87696695, "num_input_tokens_seen": 87332055, "step": 4059, "time_per_iteration": 2.724241256713867 }, { "auxiliary_loss_clip": 0.01126127, "auxiliary_loss_mlp": 0.01038967, "balance_loss_clip": 1.04837418, "balance_loss_mlp": 1.02296889, "epoch": 0.2441004058319555, "flos": 26467600752000.0, "grad_norm": 2.2060888459440617, "language_loss": 0.7483452, "learning_rate": 3.5381079800041913e-06, "loss": 0.76999605, "num_input_tokens_seen": 87351295, "step": 4060, "time_per_iteration": 2.6769304275512695 }, { "auxiliary_loss_clip": 0.01111679, "auxiliary_loss_mlp": 0.01051445, "balance_loss_clip": 1.04629493, "balance_loss_mlp": 1.03247917, "epoch": 0.2441605290846235, "flos": 26760524163840.0, "grad_norm": 2.624850134940939, "language_loss": 0.73482168, "learning_rate": 3.5378590132364182e-06, "loss": 0.75645292, "num_input_tokens_seen": 87370650, "step": 4061, "time_per_iteration": 2.7570559978485107 }, { "auxiliary_loss_clip": 0.01144554, "auxiliary_loss_mlp": 0.01039707, "balance_loss_clip": 1.05180097, "balance_loss_mlp": 1.02394772, "epoch": 0.24422065233729146, "flos": 21105850717440.0, "grad_norm": 4.11905418985837, "language_loss": 0.76135921, "learning_rate": 3.5376099881528768e-06, "loss": 0.78320187, "num_input_tokens_seen": 87389020, "step": 4062, "time_per_iteration": 2.6387689113616943 }, { "auxiliary_loss_clip": 0.01104974, "auxiliary_loss_mlp": 0.01041222, "balance_loss_clip": 1.04618907, "balance_loss_mlp": 1.02458024, "epoch": 0.24428077558995942, "flos": 25263156879360.0, "grad_norm": 2.5628995075758954, "language_loss": 0.85376853, "learning_rate": 3.537360904763011e-06, "loss": 0.87523055, "num_input_tokens_seen": 87409695, "step": 4063, "time_per_iteration": 2.7785301208496094 }, { "auxiliary_loss_clip": 0.01119987, "auxiliary_loss_mlp": 0.01047158, "balance_loss_clip": 1.04776239, "balance_loss_mlp": 1.02789354, "epoch": 0.24434089884262739, "flos": 20485278420480.0, "grad_norm": 2.760332484942286, "language_loss": 0.6845879, "learning_rate": 3.5371117630762656e-06, "loss": 0.70625937, "num_input_tokens_seen": 87428250, "step": 4064, "time_per_iteration": 2.6691763401031494 }, { "auxiliary_loss_clip": 0.01138225, "auxiliary_loss_mlp": 0.01046639, "balance_loss_clip": 1.04773867, "balance_loss_mlp": 1.02892423, "epoch": 0.24440102209529535, "flos": 23621895711360.0, "grad_norm": 1.603702751214229, "language_loss": 0.70247531, "learning_rate": 3.536862563102088e-06, "loss": 0.72432399, "num_input_tokens_seen": 87449380, "step": 4065, "time_per_iteration": 2.6677680015563965 }, { "auxiliary_loss_clip": 0.01150465, "auxiliary_loss_mlp": 0.0104697, "balance_loss_clip": 1.05127215, "balance_loss_mlp": 1.02803993, "epoch": 0.24446114534796332, "flos": 20554729367040.0, "grad_norm": 1.788543447431289, "language_loss": 0.84282506, "learning_rate": 3.5366133048499282e-06, "loss": 0.86479944, "num_input_tokens_seen": 87465365, "step": 4066, "time_per_iteration": 2.5993456840515137 }, { "auxiliary_loss_clip": 0.01067736, "auxiliary_loss_mlp": 0.01002523, "balance_loss_clip": 1.03198457, "balance_loss_mlp": 1.00028193, "epoch": 0.24452126860063128, "flos": 60389575009920.0, "grad_norm": 0.7359455307187547, "language_loss": 0.52283657, "learning_rate": 3.5363639883292374e-06, "loss": 0.54353911, "num_input_tokens_seen": 87522525, "step": 4067, "time_per_iteration": 3.056666374206543 }, { "auxiliary_loss_clip": 0.01123042, "auxiliary_loss_mlp": 0.01045731, "balance_loss_clip": 1.04955244, "balance_loss_mlp": 1.0279212, "epoch": 0.24458139185329927, "flos": 15121660878720.0, "grad_norm": 2.6392300526537493, "language_loss": 0.7185899, "learning_rate": 3.5361146135494706e-06, "loss": 0.74027765, "num_input_tokens_seen": 87539170, "step": 4068, "time_per_iteration": 2.700847864151001 }, { "auxiliary_loss_clip": 0.01086004, "auxiliary_loss_mlp": 0.01047493, "balance_loss_clip": 1.04378593, "balance_loss_mlp": 1.02920675, "epoch": 0.24464151510596724, "flos": 27998723842560.0, "grad_norm": 2.4202919064349744, "language_loss": 0.78083313, "learning_rate": 3.5358651805200835e-06, "loss": 0.80216813, "num_input_tokens_seen": 87558875, "step": 4069, "time_per_iteration": 2.9363162517547607 }, { "auxiliary_loss_clip": 0.01119666, "auxiliary_loss_mlp": 0.0105204, "balance_loss_clip": 1.05164659, "balance_loss_mlp": 1.03445613, "epoch": 0.2447016383586352, "flos": 19792884879360.0, "grad_norm": 4.167143793475273, "language_loss": 0.80607939, "learning_rate": 3.5356156892505347e-06, "loss": 0.82779646, "num_input_tokens_seen": 87576485, "step": 4070, "time_per_iteration": 2.658191204071045 }, { "auxiliary_loss_clip": 0.01127014, "auxiliary_loss_mlp": 0.01049283, "balance_loss_clip": 1.04832387, "balance_loss_mlp": 1.03218853, "epoch": 0.24476176161130317, "flos": 26067340523520.0, "grad_norm": 1.5316441932107319, "language_loss": 0.84351504, "learning_rate": 3.5353661397502854e-06, "loss": 0.86527801, "num_input_tokens_seen": 87598620, "step": 4071, "time_per_iteration": 2.7118849754333496 }, { "auxiliary_loss_clip": 0.01120333, "auxiliary_loss_mlp": 0.01057334, "balance_loss_clip": 1.04778695, "balance_loss_mlp": 1.03601933, "epoch": 0.24482188486397113, "flos": 18843550375680.0, "grad_norm": 1.8860726044388547, "language_loss": 0.80115497, "learning_rate": 3.535116532028798e-06, "loss": 0.82293165, "num_input_tokens_seen": 87616595, "step": 4072, "time_per_iteration": 2.6662774085998535 }, { "auxiliary_loss_clip": 0.01134806, "auxiliary_loss_mlp": 0.0104215, "balance_loss_clip": 1.05156791, "balance_loss_mlp": 1.02614021, "epoch": 0.2448820081166391, "flos": 21251791676160.0, "grad_norm": 3.990887653020168, "language_loss": 0.70466423, "learning_rate": 3.5348668660955382e-06, "loss": 0.72643375, "num_input_tokens_seen": 87635755, "step": 4073, "time_per_iteration": 2.7366209030151367 }, { "auxiliary_loss_clip": 0.01110472, "auxiliary_loss_mlp": 0.01047265, "balance_loss_clip": 1.04666865, "balance_loss_mlp": 1.03090906, "epoch": 0.2449421313693071, "flos": 23950586090880.0, "grad_norm": 2.943884117668681, "language_loss": 0.67292917, "learning_rate": 3.5346171419599728e-06, "loss": 0.69450659, "num_input_tokens_seen": 87652885, "step": 4074, "time_per_iteration": 2.7158730030059814 }, { "auxiliary_loss_clip": 0.01062567, "auxiliary_loss_mlp": 0.01002121, "balance_loss_clip": 1.02741885, "balance_loss_mlp": 0.99986744, "epoch": 0.24500225462197506, "flos": 60687669980160.0, "grad_norm": 0.8927046346070237, "language_loss": 0.68608266, "learning_rate": 3.5343673596315718e-06, "loss": 0.70672953, "num_input_tokens_seen": 87713220, "step": 4075, "time_per_iteration": 3.2283740043640137 }, { "auxiliary_loss_clip": 0.01146172, "auxiliary_loss_mlp": 0.01042507, "balance_loss_clip": 1.05287361, "balance_loss_mlp": 1.02612722, "epoch": 0.24506237787464302, "flos": 26284204886400.0, "grad_norm": 2.3370219869490563, "language_loss": 0.79263043, "learning_rate": 3.5341175191198063e-06, "loss": 0.81451714, "num_input_tokens_seen": 87732680, "step": 4076, "time_per_iteration": 2.6744346618652344 }, { "auxiliary_loss_clip": 0.01128421, "auxiliary_loss_mlp": 0.00775989, "balance_loss_clip": 1.04903293, "balance_loss_mlp": 1.001266, "epoch": 0.245122501127311, "flos": 20552287242240.0, "grad_norm": 1.816414447330212, "language_loss": 0.81986046, "learning_rate": 3.533867620434151e-06, "loss": 0.83890456, "num_input_tokens_seen": 87751880, "step": 4077, "time_per_iteration": 2.729391098022461 }, { "auxiliary_loss_clip": 0.01148302, "auxiliary_loss_mlp": 0.01047154, "balance_loss_clip": 1.05185413, "balance_loss_mlp": 1.0288794, "epoch": 0.24518262437997895, "flos": 29132603447040.0, "grad_norm": 2.0328430965985045, "language_loss": 0.62790757, "learning_rate": 3.533617663584082e-06, "loss": 0.64986217, "num_input_tokens_seen": 87771795, "step": 4078, "time_per_iteration": 2.694767713546753 }, { "auxiliary_loss_clip": 0.01114498, "auxiliary_loss_mlp": 0.01039203, "balance_loss_clip": 1.04953861, "balance_loss_mlp": 1.02270436, "epoch": 0.24524274763264692, "flos": 23476924419840.0, "grad_norm": 1.5687748074794818, "language_loss": 0.75811553, "learning_rate": 3.5333676485790765e-06, "loss": 0.7796526, "num_input_tokens_seen": 87793640, "step": 4079, "time_per_iteration": 4.288895130157471 }, { "auxiliary_loss_clip": 0.01142871, "auxiliary_loss_mlp": 0.01047138, "balance_loss_clip": 1.04899406, "balance_loss_mlp": 1.02955461, "epoch": 0.24530287088531488, "flos": 17201175886080.0, "grad_norm": 1.8811380892336844, "language_loss": 0.74537313, "learning_rate": 3.5331175754286173e-06, "loss": 0.76727325, "num_input_tokens_seen": 87812390, "step": 4080, "time_per_iteration": 2.683969736099243 }, { "auxiliary_loss_clip": 0.01115604, "auxiliary_loss_mlp": 0.01041593, "balance_loss_clip": 1.04717278, "balance_loss_mlp": 1.02558291, "epoch": 0.24536299413798288, "flos": 14867449349760.0, "grad_norm": 2.2859558621761997, "language_loss": 0.83389306, "learning_rate": 3.532867444142186e-06, "loss": 0.85546505, "num_input_tokens_seen": 87830640, "step": 4081, "time_per_iteration": 2.772573947906494 }, { "auxiliary_loss_clip": 0.01114607, "auxiliary_loss_mlp": 0.01040674, "balance_loss_clip": 1.04734826, "balance_loss_mlp": 1.02473605, "epoch": 0.24542311739065084, "flos": 35262051886080.0, "grad_norm": 1.8658741711896472, "language_loss": 0.73223484, "learning_rate": 3.532617254729267e-06, "loss": 0.7537877, "num_input_tokens_seen": 87850450, "step": 4082, "time_per_iteration": 4.3304970264434814 }, { "auxiliary_loss_clip": 0.01104397, "auxiliary_loss_mlp": 0.01047151, "balance_loss_clip": 1.04542649, "balance_loss_mlp": 1.03163004, "epoch": 0.2454832406433188, "flos": 21503130117120.0, "grad_norm": 1.7143564189307843, "language_loss": 0.72032338, "learning_rate": 3.5323670071993485e-06, "loss": 0.74183893, "num_input_tokens_seen": 87868810, "step": 4083, "time_per_iteration": 2.7463390827178955 }, { "auxiliary_loss_clip": 0.01115479, "auxiliary_loss_mlp": 0.01048832, "balance_loss_clip": 1.04441845, "balance_loss_mlp": 1.02979386, "epoch": 0.24554336389598677, "flos": 14756664827520.0, "grad_norm": 2.556114612666859, "language_loss": 0.74363655, "learning_rate": 3.532116701561919e-06, "loss": 0.76527965, "num_input_tokens_seen": 87885685, "step": 4084, "time_per_iteration": 2.6828086376190186 }, { "auxiliary_loss_clip": 0.01126215, "auxiliary_loss_mlp": 0.01040078, "balance_loss_clip": 1.04541206, "balance_loss_mlp": 1.02269721, "epoch": 0.24560348714865474, "flos": 14976402278400.0, "grad_norm": 2.030442784512354, "language_loss": 0.85540497, "learning_rate": 3.531866337826471e-06, "loss": 0.87706792, "num_input_tokens_seen": 87903715, "step": 4085, "time_per_iteration": 4.236302852630615 }, { "auxiliary_loss_clip": 0.01110493, "auxiliary_loss_mlp": 0.01046501, "balance_loss_clip": 1.04634261, "balance_loss_mlp": 1.02932286, "epoch": 0.2456636104013227, "flos": 22675326554880.0, "grad_norm": 2.028282258660301, "language_loss": 0.78985649, "learning_rate": 3.5316159160024982e-06, "loss": 0.8114264, "num_input_tokens_seen": 87923375, "step": 4086, "time_per_iteration": 2.6638717651367188 }, { "auxiliary_loss_clip": 0.01087456, "auxiliary_loss_mlp": 0.0104508, "balance_loss_clip": 1.04792905, "balance_loss_mlp": 1.02847362, "epoch": 0.2457237336539907, "flos": 27417869009280.0, "grad_norm": 5.7080500305845865, "language_loss": 0.75053227, "learning_rate": 3.531365436099496e-06, "loss": 0.77185762, "num_input_tokens_seen": 87943115, "step": 4087, "time_per_iteration": 2.8027901649475098 }, { "auxiliary_loss_clip": 0.01090549, "auxiliary_loss_mlp": 0.01045493, "balance_loss_clip": 1.04807436, "balance_loss_mlp": 1.02680135, "epoch": 0.24578385690665866, "flos": 20412379768320.0, "grad_norm": 2.066557704160291, "language_loss": 0.79291761, "learning_rate": 3.5311148981269635e-06, "loss": 0.81427807, "num_input_tokens_seen": 87959505, "step": 4088, "time_per_iteration": 2.78812575340271 }, { "auxiliary_loss_clip": 0.0110062, "auxiliary_loss_mlp": 0.01035541, "balance_loss_clip": 1.04435658, "balance_loss_mlp": 1.01949525, "epoch": 0.24584398015932662, "flos": 23915393740800.0, "grad_norm": 1.4918864539426413, "language_loss": 0.77053773, "learning_rate": 3.5308643020944e-06, "loss": 0.79189926, "num_input_tokens_seen": 87979725, "step": 4089, "time_per_iteration": 2.75034761428833 }, { "auxiliary_loss_clip": 0.01125156, "auxiliary_loss_mlp": 0.0104201, "balance_loss_clip": 1.04609382, "balance_loss_mlp": 1.02470064, "epoch": 0.2459041034119946, "flos": 41496359103360.0, "grad_norm": 2.3383647352821737, "language_loss": 0.81814516, "learning_rate": 3.530613648011309e-06, "loss": 0.83981681, "num_input_tokens_seen": 87998270, "step": 4090, "time_per_iteration": 2.891878604888916 }, { "auxiliary_loss_clip": 0.01121872, "auxiliary_loss_mlp": 0.01050145, "balance_loss_clip": 1.04687834, "balance_loss_mlp": 1.03163147, "epoch": 0.24596422666466256, "flos": 19936814676480.0, "grad_norm": 1.8221600402702927, "language_loss": 0.73833978, "learning_rate": 3.5303629358871946e-06, "loss": 0.76005995, "num_input_tokens_seen": 88016760, "step": 4091, "time_per_iteration": 2.6410961151123047 }, { "auxiliary_loss_clip": 0.01114038, "auxiliary_loss_mlp": 0.01045509, "balance_loss_clip": 1.05517268, "balance_loss_mlp": 1.0279969, "epoch": 0.24602434991733052, "flos": 21544391865600.0, "grad_norm": 1.8983812190731213, "language_loss": 0.7706998, "learning_rate": 3.5301121657315653e-06, "loss": 0.79229522, "num_input_tokens_seen": 88036465, "step": 4092, "time_per_iteration": 2.7038323879241943 }, { "auxiliary_loss_clip": 0.01115501, "auxiliary_loss_mlp": 0.01040797, "balance_loss_clip": 1.04371238, "balance_loss_mlp": 1.02255797, "epoch": 0.24608447316999849, "flos": 23185078416000.0, "grad_norm": 3.1365051823944627, "language_loss": 0.81200075, "learning_rate": 3.5298613375539287e-06, "loss": 0.83356375, "num_input_tokens_seen": 88053270, "step": 4093, "time_per_iteration": 2.680634021759033 }, { "auxiliary_loss_clip": 0.01135527, "auxiliary_loss_mlp": 0.01043826, "balance_loss_clip": 1.04879606, "balance_loss_mlp": 1.02613521, "epoch": 0.24614459642266648, "flos": 19641951930240.0, "grad_norm": 1.9167765067224862, "language_loss": 0.86932534, "learning_rate": 3.529610451363797e-06, "loss": 0.89111882, "num_input_tokens_seen": 88072305, "step": 4094, "time_per_iteration": 2.6558003425598145 }, { "auxiliary_loss_clip": 0.01007267, "auxiliary_loss_mlp": 0.01019789, "balance_loss_clip": 1.03124738, "balance_loss_mlp": 1.01697576, "epoch": 0.24620471967533444, "flos": 61739816186880.0, "grad_norm": 0.7554163750993251, "language_loss": 0.57503664, "learning_rate": 3.5293595071706833e-06, "loss": 0.59530711, "num_input_tokens_seen": 88137995, "step": 4095, "time_per_iteration": 3.3576478958129883 }, { "auxiliary_loss_clip": 0.01051219, "auxiliary_loss_mlp": 0.0102022, "balance_loss_clip": 1.03409493, "balance_loss_mlp": 1.01790738, "epoch": 0.2462648429280024, "flos": 69154436315520.0, "grad_norm": 0.655284075812517, "language_loss": 0.56260574, "learning_rate": 3.5291085049841042e-06, "loss": 0.58332014, "num_input_tokens_seen": 88208490, "step": 4096, "time_per_iteration": 3.376516580581665 }, { "auxiliary_loss_clip": 0.0112712, "auxiliary_loss_mlp": 0.01040362, "balance_loss_clip": 1.05330801, "balance_loss_mlp": 1.0236733, "epoch": 0.24632496618067037, "flos": 29459605887360.0, "grad_norm": 1.7306008966026363, "language_loss": 0.77629399, "learning_rate": 3.5288574448135773e-06, "loss": 0.79796875, "num_input_tokens_seen": 88228050, "step": 4097, "time_per_iteration": 2.6973912715911865 }, { "auxiliary_loss_clip": 0.01114293, "auxiliary_loss_mlp": 0.01047339, "balance_loss_clip": 1.04898906, "balance_loss_mlp": 1.02842093, "epoch": 0.24638508943333834, "flos": 24316444068480.0, "grad_norm": 2.4079595240953613, "language_loss": 0.75890571, "learning_rate": 3.5286063266686235e-06, "loss": 0.78052205, "num_input_tokens_seen": 88248090, "step": 4098, "time_per_iteration": 2.739947557449341 }, { "auxiliary_loss_clip": 0.0112794, "auxiliary_loss_mlp": 0.01046194, "balance_loss_clip": 1.05179596, "balance_loss_mlp": 1.03002954, "epoch": 0.2464452126860063, "flos": 26613254401920.0, "grad_norm": 2.5671853201902737, "language_loss": 0.68179071, "learning_rate": 3.528355150558764e-06, "loss": 0.7035321, "num_input_tokens_seen": 88267545, "step": 4099, "time_per_iteration": 2.7144618034362793 }, { "auxiliary_loss_clip": 0.01133513, "auxiliary_loss_mlp": 0.01045673, "balance_loss_clip": 1.05187321, "balance_loss_mlp": 1.02897191, "epoch": 0.24650533593867427, "flos": 31212405763200.0, "grad_norm": 2.0343787496625656, "language_loss": 0.65915, "learning_rate": 3.5281039164935237e-06, "loss": 0.68094188, "num_input_tokens_seen": 88289785, "step": 4100, "time_per_iteration": 2.724008560180664 }, { "auxiliary_loss_clip": 0.01054067, "auxiliary_loss_mlp": 0.01041004, "balance_loss_clip": 1.03763318, "balance_loss_mlp": 1.03830957, "epoch": 0.24656545919134226, "flos": 68494002900480.0, "grad_norm": 0.7229502883874133, "language_loss": 0.61514676, "learning_rate": 3.5278526244824304e-06, "loss": 0.63609749, "num_input_tokens_seen": 88357320, "step": 4101, "time_per_iteration": 3.3748011589050293 }, { "auxiliary_loss_clip": 0.01144305, "auxiliary_loss_mlp": 0.01041937, "balance_loss_clip": 1.05133915, "balance_loss_mlp": 1.02455676, "epoch": 0.24662558244401023, "flos": 20084192179200.0, "grad_norm": 2.2333045722985028, "language_loss": 0.73272061, "learning_rate": 3.527601274535012e-06, "loss": 0.754583, "num_input_tokens_seen": 88377040, "step": 4102, "time_per_iteration": 2.7457518577575684 }, { "auxiliary_loss_clip": 0.01124231, "auxiliary_loss_mlp": 0.01043636, "balance_loss_clip": 1.04909408, "balance_loss_mlp": 1.02699423, "epoch": 0.2466857056966782, "flos": 30701361012480.0, "grad_norm": 2.9311552217427774, "language_loss": 0.76528364, "learning_rate": 3.5273498666608004e-06, "loss": 0.78696227, "num_input_tokens_seen": 88395085, "step": 4103, "time_per_iteration": 2.732285499572754 }, { "auxiliary_loss_clip": 0.01128751, "auxiliary_loss_mlp": 0.01051695, "balance_loss_clip": 1.04730439, "balance_loss_mlp": 1.03313375, "epoch": 0.24674582894934616, "flos": 22528523669760.0, "grad_norm": 2.3173933836652902, "language_loss": 0.78658336, "learning_rate": 3.5270984008693288e-06, "loss": 0.80838788, "num_input_tokens_seen": 88413205, "step": 4104, "time_per_iteration": 2.7234179973602295 }, { "auxiliary_loss_clip": 0.01134641, "auxiliary_loss_mlp": 0.01045411, "balance_loss_clip": 1.05110276, "balance_loss_mlp": 1.02601588, "epoch": 0.24680595220201412, "flos": 20704297599360.0, "grad_norm": 1.883953093480743, "language_loss": 0.8375451, "learning_rate": 3.526846877170133e-06, "loss": 0.85934561, "num_input_tokens_seen": 88431525, "step": 4105, "time_per_iteration": 2.7051403522491455 }, { "auxiliary_loss_clip": 0.01149885, "auxiliary_loss_mlp": 0.01051204, "balance_loss_clip": 1.05490828, "balance_loss_mlp": 1.03340602, "epoch": 0.2468660754546821, "flos": 21831174051840.0, "grad_norm": 1.9903096770852142, "language_loss": 0.76503521, "learning_rate": 3.52659529557275e-06, "loss": 0.78704607, "num_input_tokens_seen": 88451210, "step": 4106, "time_per_iteration": 2.6324243545532227 }, { "auxiliary_loss_clip": 0.01107346, "auxiliary_loss_mlp": 0.01058334, "balance_loss_clip": 1.0438261, "balance_loss_mlp": 1.03743649, "epoch": 0.24692619870735008, "flos": 15267709578240.0, "grad_norm": 2.3469304270549487, "language_loss": 0.72399199, "learning_rate": 3.5263436560867205e-06, "loss": 0.74564874, "num_input_tokens_seen": 88467790, "step": 4107, "time_per_iteration": 2.6767516136169434 }, { "auxiliary_loss_clip": 0.01149014, "auxiliary_loss_mlp": 0.01055902, "balance_loss_clip": 1.05365527, "balance_loss_mlp": 1.03840184, "epoch": 0.24698632196001805, "flos": 29680097523840.0, "grad_norm": 2.655550859638868, "language_loss": 0.65495557, "learning_rate": 3.526091958721587e-06, "loss": 0.67700469, "num_input_tokens_seen": 88490330, "step": 4108, "time_per_iteration": 2.666501760482788 }, { "auxiliary_loss_clip": 0.01095567, "auxiliary_loss_mlp": 0.01053352, "balance_loss_clip": 1.04577923, "balance_loss_mlp": 1.0351851, "epoch": 0.247046445212686, "flos": 39165469741440.0, "grad_norm": 1.631565192024798, "language_loss": 0.72685403, "learning_rate": 3.5258402034868936e-06, "loss": 0.74834323, "num_input_tokens_seen": 88512435, "step": 4109, "time_per_iteration": 2.8588712215423584 }, { "auxiliary_loss_clip": 0.01110552, "auxiliary_loss_mlp": 0.01048877, "balance_loss_clip": 1.04754984, "balance_loss_mlp": 1.03132939, "epoch": 0.24710656846535398, "flos": 22998845376000.0, "grad_norm": 1.9000447272053396, "language_loss": 0.79328829, "learning_rate": 3.5255883903921866e-06, "loss": 0.81488264, "num_input_tokens_seen": 88529780, "step": 4110, "time_per_iteration": 2.7403078079223633 }, { "auxiliary_loss_clip": 0.01114435, "auxiliary_loss_mlp": 0.0104359, "balance_loss_clip": 1.04750848, "balance_loss_mlp": 1.02536333, "epoch": 0.24716669171802194, "flos": 26432803451520.0, "grad_norm": 1.9757162932013852, "language_loss": 0.80630267, "learning_rate": 3.5253365194470144e-06, "loss": 0.82788301, "num_input_tokens_seen": 88547200, "step": 4111, "time_per_iteration": 2.6893255710601807 }, { "auxiliary_loss_clip": 0.01143907, "auxiliary_loss_mlp": 0.0104799, "balance_loss_clip": 1.0493356, "balance_loss_mlp": 1.03203976, "epoch": 0.2472268149706899, "flos": 23329870139520.0, "grad_norm": 1.928179444788623, "language_loss": 0.75401616, "learning_rate": 3.5250845906609294e-06, "loss": 0.77593511, "num_input_tokens_seen": 88566415, "step": 4112, "time_per_iteration": 2.641103506088257 }, { "auxiliary_loss_clip": 0.01112249, "auxiliary_loss_mlp": 0.00775958, "balance_loss_clip": 1.04847336, "balance_loss_mlp": 1.00114262, "epoch": 0.24728693822335787, "flos": 23768734510080.0, "grad_norm": 2.1227710866712908, "language_loss": 0.8244158, "learning_rate": 3.5248326040434835e-06, "loss": 0.84329784, "num_input_tokens_seen": 88585225, "step": 4113, "time_per_iteration": 2.831209182739258 }, { "auxiliary_loss_clip": 0.01143893, "auxiliary_loss_mlp": 0.01043423, "balance_loss_clip": 1.04927897, "balance_loss_mlp": 1.02574396, "epoch": 0.24734706147602586, "flos": 19317499355520.0, "grad_norm": 2.5263325514304813, "language_loss": 0.8704375, "learning_rate": 3.5245805596042322e-06, "loss": 0.89231074, "num_input_tokens_seen": 88603280, "step": 4114, "time_per_iteration": 2.7264626026153564 }, { "auxiliary_loss_clip": 0.01096969, "auxiliary_loss_mlp": 0.01047533, "balance_loss_clip": 1.04748011, "balance_loss_mlp": 1.03005731, "epoch": 0.24740718472869383, "flos": 28036932935040.0, "grad_norm": 1.6498261942323098, "language_loss": 0.75283766, "learning_rate": 3.524328457352734e-06, "loss": 0.77428269, "num_input_tokens_seen": 88624925, "step": 4115, "time_per_iteration": 2.755342483520508 }, { "auxiliary_loss_clip": 0.01018711, "auxiliary_loss_mlp": 0.01070163, "balance_loss_clip": 1.03186083, "balance_loss_mlp": 1.06756425, "epoch": 0.2474673079813618, "flos": 68107569408000.0, "grad_norm": 0.6879904854197085, "language_loss": 0.58123159, "learning_rate": 3.5240762972985475e-06, "loss": 0.60212028, "num_input_tokens_seen": 88691475, "step": 4116, "time_per_iteration": 3.4015462398529053 }, { "auxiliary_loss_clip": 0.01122111, "auxiliary_loss_mlp": 0.01038886, "balance_loss_clip": 1.04813063, "balance_loss_mlp": 1.02213693, "epoch": 0.24752743123402976, "flos": 29462119839360.0, "grad_norm": 19.427234883564427, "language_loss": 0.83599627, "learning_rate": 3.523824079451235e-06, "loss": 0.85760617, "num_input_tokens_seen": 88713425, "step": 4117, "time_per_iteration": 2.7881336212158203 }, { "auxiliary_loss_clip": 0.01041379, "auxiliary_loss_mlp": 0.00755386, "balance_loss_clip": 1.02616835, "balance_loss_mlp": 1.0023396, "epoch": 0.24758755448669773, "flos": 58350459824640.0, "grad_norm": 0.909523411860611, "language_loss": 0.63518536, "learning_rate": 3.5235718038203602e-06, "loss": 0.65315294, "num_input_tokens_seen": 88769995, "step": 4118, "time_per_iteration": 3.1125216484069824 }, { "auxiliary_loss_clip": 0.01126335, "auxiliary_loss_mlp": 0.01048787, "balance_loss_clip": 1.04487431, "balance_loss_mlp": 1.03127515, "epoch": 0.2476476777393657, "flos": 20484416494080.0, "grad_norm": 2.1708029437062546, "language_loss": 0.79272264, "learning_rate": 3.523319470415491e-06, "loss": 0.81447387, "num_input_tokens_seen": 88789970, "step": 4119, "time_per_iteration": 6.294121503829956 }, { "auxiliary_loss_clip": 0.01133521, "auxiliary_loss_mlp": 0.01044138, "balance_loss_clip": 1.05223441, "balance_loss_mlp": 1.02707899, "epoch": 0.24770780099203366, "flos": 20485853038080.0, "grad_norm": 1.7395275513138477, "language_loss": 0.74590164, "learning_rate": 3.5230670792461943e-06, "loss": 0.76767826, "num_input_tokens_seen": 88810000, "step": 4120, "time_per_iteration": 2.6947290897369385 }, { "auxiliary_loss_clip": 0.01135162, "auxiliary_loss_mlp": 0.01051636, "balance_loss_clip": 1.04963648, "balance_loss_mlp": 1.03435111, "epoch": 0.24776792424470165, "flos": 15153405523200.0, "grad_norm": 3.32651820696464, "language_loss": 0.88006538, "learning_rate": 3.522814630322041e-06, "loss": 0.90193337, "num_input_tokens_seen": 88827515, "step": 4121, "time_per_iteration": 4.181556224822998 }, { "auxiliary_loss_clip": 0.01147178, "auxiliary_loss_mlp": 0.01042601, "balance_loss_clip": 1.05039763, "balance_loss_mlp": 1.02431381, "epoch": 0.2478280474973696, "flos": 21725453347200.0, "grad_norm": 2.0457274986343204, "language_loss": 0.69676709, "learning_rate": 3.5225621236526045e-06, "loss": 0.71866482, "num_input_tokens_seen": 88845025, "step": 4122, "time_per_iteration": 2.7041239738464355 }, { "auxiliary_loss_clip": 0.01147132, "auxiliary_loss_mlp": 0.01045532, "balance_loss_clip": 1.05045271, "balance_loss_mlp": 1.02655339, "epoch": 0.24788817075003758, "flos": 20412200200320.0, "grad_norm": 2.4058135017179976, "language_loss": 0.8026911, "learning_rate": 3.5223095592474596e-06, "loss": 0.82461774, "num_input_tokens_seen": 88861740, "step": 4123, "time_per_iteration": 2.6154532432556152 }, { "auxiliary_loss_clip": 0.01085408, "auxiliary_loss_mlp": 0.0105298, "balance_loss_clip": 1.04720712, "balance_loss_mlp": 1.0354923, "epoch": 0.24794829400270554, "flos": 22594455083520.0, "grad_norm": 2.2195758993023578, "language_loss": 0.74967635, "learning_rate": 3.5220569371161846e-06, "loss": 0.77106017, "num_input_tokens_seen": 88879740, "step": 4124, "time_per_iteration": 2.787986993789673 }, { "auxiliary_loss_clip": 0.01131947, "auxiliary_loss_mlp": 0.01044392, "balance_loss_clip": 1.04892588, "balance_loss_mlp": 1.02809608, "epoch": 0.2480084172553735, "flos": 39676047615360.0, "grad_norm": 1.4128536066198873, "language_loss": 0.73432529, "learning_rate": 3.521804257268357e-06, "loss": 0.75608873, "num_input_tokens_seen": 88904095, "step": 4125, "time_per_iteration": 4.472416162490845 }, { "auxiliary_loss_clip": 0.01109646, "auxiliary_loss_mlp": 0.00776697, "balance_loss_clip": 1.04420686, "balance_loss_mlp": 1.00122678, "epoch": 0.24806854050804147, "flos": 22053712763520.0, "grad_norm": 1.9607758383710057, "language_loss": 0.69630861, "learning_rate": 3.5215515197135595e-06, "loss": 0.71517205, "num_input_tokens_seen": 88920740, "step": 4126, "time_per_iteration": 2.7412056922912598 }, { "auxiliary_loss_clip": 0.01133758, "auxiliary_loss_mlp": 0.01051914, "balance_loss_clip": 1.047984, "balance_loss_mlp": 1.03331721, "epoch": 0.24812866376070947, "flos": 15486764670720.0, "grad_norm": 2.275786464609162, "language_loss": 0.81219494, "learning_rate": 3.5212987244613764e-06, "loss": 0.83405173, "num_input_tokens_seen": 88938510, "step": 4127, "time_per_iteration": 2.620143413543701 }, { "auxiliary_loss_clip": 0.01136685, "auxiliary_loss_mlp": 0.00775421, "balance_loss_clip": 1.04974318, "balance_loss_mlp": 1.00120401, "epoch": 0.24818878701337743, "flos": 14757419013120.0, "grad_norm": 6.503475382998669, "language_loss": 0.8435086, "learning_rate": 3.5210458715213927e-06, "loss": 0.86262965, "num_input_tokens_seen": 88955235, "step": 4128, "time_per_iteration": 2.6764745712280273 }, { "auxiliary_loss_clip": 0.01117625, "auxiliary_loss_mlp": 0.01057179, "balance_loss_clip": 1.04831362, "balance_loss_mlp": 1.03814149, "epoch": 0.2482489102660454, "flos": 27089501852160.0, "grad_norm": 7.318299516736359, "language_loss": 0.6572547, "learning_rate": 3.5207929609031973e-06, "loss": 0.67900276, "num_input_tokens_seen": 88975210, "step": 4129, "time_per_iteration": 2.7178256511688232 }, { "auxiliary_loss_clip": 0.01098796, "auxiliary_loss_mlp": 0.01044421, "balance_loss_clip": 1.04595077, "balance_loss_mlp": 1.02570498, "epoch": 0.24830903351871336, "flos": 26467528924800.0, "grad_norm": 1.8507928533331595, "language_loss": 0.7496134, "learning_rate": 3.5205399926163806e-06, "loss": 0.77104557, "num_input_tokens_seen": 88996120, "step": 4130, "time_per_iteration": 2.82098126411438 }, { "auxiliary_loss_clip": 0.01078173, "auxiliary_loss_mlp": 0.01050295, "balance_loss_clip": 1.04238284, "balance_loss_mlp": 1.03163934, "epoch": 0.24836915677138133, "flos": 10228436870400.0, "grad_norm": 2.098795320061471, "language_loss": 0.7680133, "learning_rate": 3.520286966670535e-06, "loss": 0.78929794, "num_input_tokens_seen": 89008685, "step": 4131, "time_per_iteration": 2.7543740272521973 }, { "auxiliary_loss_clip": 0.0113176, "auxiliary_loss_mlp": 0.0104424, "balance_loss_clip": 1.04992545, "balance_loss_mlp": 1.02781272, "epoch": 0.2484292800240493, "flos": 30080429579520.0, "grad_norm": 2.181098565661814, "language_loss": 0.83579504, "learning_rate": 3.520033883075255e-06, "loss": 0.85755503, "num_input_tokens_seen": 89031160, "step": 4132, "time_per_iteration": 2.681339979171753 }, { "auxiliary_loss_clip": 0.01120332, "auxiliary_loss_mlp": 0.01043901, "balance_loss_clip": 1.04574823, "balance_loss_mlp": 1.02506626, "epoch": 0.24848940327671726, "flos": 13442944803840.0, "grad_norm": 1.8557605687682572, "language_loss": 0.71320271, "learning_rate": 3.5197807418401386e-06, "loss": 0.73484504, "num_input_tokens_seen": 89047235, "step": 4133, "time_per_iteration": 2.6573541164398193 }, { "auxiliary_loss_clip": 0.01150987, "auxiliary_loss_mlp": 0.0104789, "balance_loss_clip": 1.05105197, "balance_loss_mlp": 1.02624202, "epoch": 0.24854952652938525, "flos": 19970247260160.0, "grad_norm": 3.222598228665933, "language_loss": 0.61894202, "learning_rate": 3.5195275429747834e-06, "loss": 0.64093071, "num_input_tokens_seen": 89064790, "step": 4134, "time_per_iteration": 2.5639493465423584 }, { "auxiliary_loss_clip": 0.01135356, "auxiliary_loss_mlp": 0.01045434, "balance_loss_clip": 1.04877877, "balance_loss_mlp": 1.02764797, "epoch": 0.24860964978205322, "flos": 18150187167360.0, "grad_norm": 1.882175713893398, "language_loss": 0.78382719, "learning_rate": 3.5192742864887914e-06, "loss": 0.80563509, "num_input_tokens_seen": 89083250, "step": 4135, "time_per_iteration": 2.6075639724731445 }, { "auxiliary_loss_clip": 0.01123928, "auxiliary_loss_mlp": 0.01035702, "balance_loss_clip": 1.05297661, "balance_loss_mlp": 1.01917946, "epoch": 0.24866977303472118, "flos": 11728641329280.0, "grad_norm": 2.4269193192884186, "language_loss": 0.83582413, "learning_rate": 3.5190209723917662e-06, "loss": 0.85742044, "num_input_tokens_seen": 89100905, "step": 4136, "time_per_iteration": 2.623377799987793 }, { "auxiliary_loss_clip": 0.01119838, "auxiliary_loss_mlp": 0.01045223, "balance_loss_clip": 1.05071807, "balance_loss_mlp": 1.02713883, "epoch": 0.24872989628738915, "flos": 34823582565120.0, "grad_norm": 2.1322549527950665, "language_loss": 0.7057327, "learning_rate": 3.518767600693314e-06, "loss": 0.72738326, "num_input_tokens_seen": 89122630, "step": 4137, "time_per_iteration": 2.814115524291992 }, { "auxiliary_loss_clip": 0.01133507, "auxiliary_loss_mlp": 0.00775347, "balance_loss_clip": 1.0449059, "balance_loss_mlp": 1.00107706, "epoch": 0.2487900195400571, "flos": 13699347062400.0, "grad_norm": 2.085766315480858, "language_loss": 0.66914427, "learning_rate": 3.518514171403042e-06, "loss": 0.68823284, "num_input_tokens_seen": 89141050, "step": 4138, "time_per_iteration": 2.646043539047241 }, { "auxiliary_loss_clip": 0.01103579, "auxiliary_loss_mlp": 0.01036477, "balance_loss_clip": 1.04612446, "balance_loss_mlp": 1.02000237, "epoch": 0.24885014279272508, "flos": 25337815297920.0, "grad_norm": 1.983116672544965, "language_loss": 0.83913636, "learning_rate": 3.51826068453056e-06, "loss": 0.86053687, "num_input_tokens_seen": 89160810, "step": 4139, "time_per_iteration": 2.741090774536133 }, { "auxiliary_loss_clip": 0.01111549, "auxiliary_loss_mlp": 0.01040434, "balance_loss_clip": 1.04586422, "balance_loss_mlp": 1.02192068, "epoch": 0.24891026604539307, "flos": 20631434860800.0, "grad_norm": 1.4951428686450043, "language_loss": 0.78923917, "learning_rate": 3.518007140085481e-06, "loss": 0.81075907, "num_input_tokens_seen": 89180610, "step": 4140, "time_per_iteration": 2.712780237197876 }, { "auxiliary_loss_clip": 0.01048621, "auxiliary_loss_mlp": 0.01096526, "balance_loss_clip": 1.02931261, "balance_loss_mlp": 1.09464228, "epoch": 0.24897038929806103, "flos": 66960294030720.0, "grad_norm": 0.8293539951671052, "language_loss": 0.61007011, "learning_rate": 3.51775353807742e-06, "loss": 0.63152146, "num_input_tokens_seen": 89241880, "step": 4141, "time_per_iteration": 3.240020513534546 }, { "auxiliary_loss_clip": 0.01147379, "auxiliary_loss_mlp": 0.01049841, "balance_loss_clip": 1.05116534, "balance_loss_mlp": 1.03240097, "epoch": 0.249030512550729, "flos": 36392555612160.0, "grad_norm": 2.1246942361961025, "language_loss": 0.72794569, "learning_rate": 3.5174998785159913e-06, "loss": 0.74991786, "num_input_tokens_seen": 89263340, "step": 4142, "time_per_iteration": 2.7316160202026367 }, { "auxiliary_loss_clip": 0.01133287, "auxiliary_loss_mlp": 0.01044374, "balance_loss_clip": 1.04780602, "balance_loss_mlp": 1.02705276, "epoch": 0.24909063580339696, "flos": 20154576879360.0, "grad_norm": 1.7635050074541005, "language_loss": 0.80630821, "learning_rate": 3.5172461614108157e-06, "loss": 0.82808483, "num_input_tokens_seen": 89282870, "step": 4143, "time_per_iteration": 2.6763389110565186 }, { "auxiliary_loss_clip": 0.01117552, "auxiliary_loss_mlp": 0.01036613, "balance_loss_clip": 1.04615402, "balance_loss_mlp": 1.02026916, "epoch": 0.24915075905606493, "flos": 26396569607040.0, "grad_norm": 2.7235452599944145, "language_loss": 0.59766376, "learning_rate": 3.5169923867715137e-06, "loss": 0.61920542, "num_input_tokens_seen": 89303830, "step": 4144, "time_per_iteration": 2.789417266845703 }, { "auxiliary_loss_clip": 0.01128344, "auxiliary_loss_mlp": 0.01045393, "balance_loss_clip": 1.04464769, "balance_loss_mlp": 1.02850127, "epoch": 0.2492108823087329, "flos": 27527216987520.0, "grad_norm": 2.1754585056135047, "language_loss": 0.78476733, "learning_rate": 3.516738554607708e-06, "loss": 0.80650467, "num_input_tokens_seen": 89324350, "step": 4145, "time_per_iteration": 2.8416056632995605 }, { "auxiliary_loss_clip": 0.01140077, "auxiliary_loss_mlp": 0.00778414, "balance_loss_clip": 1.04980016, "balance_loss_mlp": 1.00122261, "epoch": 0.24927100556140086, "flos": 16691388111360.0, "grad_norm": 2.035933799021365, "language_loss": 0.64925039, "learning_rate": 3.5164846649290253e-06, "loss": 0.66843534, "num_input_tokens_seen": 89342875, "step": 4146, "time_per_iteration": 2.818240165710449 }, { "auxiliary_loss_clip": 0.01036642, "auxiliary_loss_mlp": 0.0100618, "balance_loss_clip": 1.02582741, "balance_loss_mlp": 1.00403452, "epoch": 0.24933112881406885, "flos": 62772464286720.0, "grad_norm": 0.9560925601012792, "language_loss": 0.67304933, "learning_rate": 3.5162307177450915e-06, "loss": 0.69347757, "num_input_tokens_seen": 89404925, "step": 4147, "time_per_iteration": 3.339989185333252 }, { "auxiliary_loss_clip": 0.01123141, "auxiliary_loss_mlp": 0.0104863, "balance_loss_clip": 1.04991198, "balance_loss_mlp": 1.03078485, "epoch": 0.24939125206673682, "flos": 26651894457600.0, "grad_norm": 2.4221411280533554, "language_loss": 0.89285177, "learning_rate": 3.5159767130655366e-06, "loss": 0.9145695, "num_input_tokens_seen": 89425090, "step": 4148, "time_per_iteration": 2.7497105598449707 }, { "auxiliary_loss_clip": 0.01098234, "auxiliary_loss_mlp": 0.01049718, "balance_loss_clip": 1.04725289, "balance_loss_mlp": 1.02874899, "epoch": 0.24945137531940478, "flos": 20704333512960.0, "grad_norm": 1.90098046882646, "language_loss": 0.68272161, "learning_rate": 3.5157226508999935e-06, "loss": 0.70420116, "num_input_tokens_seen": 89442615, "step": 4149, "time_per_iteration": 2.7739884853363037 }, { "auxiliary_loss_clip": 0.01134907, "auxiliary_loss_mlp": 0.01044357, "balance_loss_clip": 1.0508213, "balance_loss_mlp": 1.02747655, "epoch": 0.24951149857207275, "flos": 23768662682880.0, "grad_norm": 1.67166255010053, "language_loss": 0.71424097, "learning_rate": 3.515468531258095e-06, "loss": 0.73603356, "num_input_tokens_seen": 89463025, "step": 4150, "time_per_iteration": 2.6801233291625977 }, { "auxiliary_loss_clip": 0.01098898, "auxiliary_loss_mlp": 0.0104939, "balance_loss_clip": 1.04628861, "balance_loss_mlp": 1.03149676, "epoch": 0.2495716218247407, "flos": 15664881237120.0, "grad_norm": 4.371450104119659, "language_loss": 0.72732216, "learning_rate": 3.515214354149478e-06, "loss": 0.74880505, "num_input_tokens_seen": 89480225, "step": 4151, "time_per_iteration": 2.7118351459503174 }, { "auxiliary_loss_clip": 0.01142805, "auxiliary_loss_mlp": 0.01054095, "balance_loss_clip": 1.05117846, "balance_loss_mlp": 1.0357486, "epoch": 0.24963174507740868, "flos": 24052499953920.0, "grad_norm": 3.4200711789217397, "language_loss": 0.63707078, "learning_rate": 3.514960119583781e-06, "loss": 0.65903974, "num_input_tokens_seen": 89496985, "step": 4152, "time_per_iteration": 2.6352219581604004 }, { "auxiliary_loss_clip": 0.01128057, "auxiliary_loss_mlp": 0.01043812, "balance_loss_clip": 1.05110407, "balance_loss_mlp": 1.02628791, "epoch": 0.24969186833007664, "flos": 21799501234560.0, "grad_norm": 3.664579624689737, "language_loss": 0.77259195, "learning_rate": 3.514705827570645e-06, "loss": 0.79431069, "num_input_tokens_seen": 89514420, "step": 4153, "time_per_iteration": 2.6120872497558594 }, { "auxiliary_loss_clip": 0.01135035, "auxiliary_loss_mlp": 0.01042909, "balance_loss_clip": 1.05221617, "balance_loss_mlp": 1.02620757, "epoch": 0.24975199158274464, "flos": 19938143479680.0, "grad_norm": 2.5781435797973833, "language_loss": 0.7677725, "learning_rate": 3.514451478119711e-06, "loss": 0.78955191, "num_input_tokens_seen": 89532925, "step": 4154, "time_per_iteration": 2.7488853931427 }, { "auxiliary_loss_clip": 0.0113655, "auxiliary_loss_mlp": 0.01051969, "balance_loss_clip": 1.05146766, "balance_loss_mlp": 1.03251421, "epoch": 0.2498121148354126, "flos": 25338389915520.0, "grad_norm": 1.9052782276095375, "language_loss": 0.70335877, "learning_rate": 3.5141970712406258e-06, "loss": 0.72524405, "num_input_tokens_seen": 89552855, "step": 4155, "time_per_iteration": 2.6622395515441895 }, { "auxiliary_loss_clip": 0.01127695, "auxiliary_loss_mlp": 0.01047805, "balance_loss_clip": 1.05243564, "balance_loss_mlp": 1.03074658, "epoch": 0.24987223808808057, "flos": 20558787603840.0, "grad_norm": 1.6974192026095432, "language_loss": 0.74953228, "learning_rate": 3.513942606943036e-06, "loss": 0.77128726, "num_input_tokens_seen": 89572830, "step": 4156, "time_per_iteration": 2.7599329948425293 }, { "auxiliary_loss_clip": 0.01127061, "auxiliary_loss_mlp": 0.01040498, "balance_loss_clip": 1.04922485, "balance_loss_mlp": 1.02404737, "epoch": 0.24993236134074853, "flos": 19749037351680.0, "grad_norm": 2.6541448192787858, "language_loss": 0.76703429, "learning_rate": 3.513688085236591e-06, "loss": 0.78870988, "num_input_tokens_seen": 89590345, "step": 4157, "time_per_iteration": 4.172720432281494 }, { "auxiliary_loss_clip": 0.01087279, "auxiliary_loss_mlp": 0.01050682, "balance_loss_clip": 1.04686046, "balance_loss_mlp": 1.03302717, "epoch": 0.2499924845934165, "flos": 18770292587520.0, "grad_norm": 6.508490360255271, "language_loss": 0.81656492, "learning_rate": 3.513433506130942e-06, "loss": 0.83794451, "num_input_tokens_seen": 89610295, "step": 4158, "time_per_iteration": 4.373260736465454 }, { "auxiliary_loss_clip": 0.01115824, "auxiliary_loss_mlp": 0.01039502, "balance_loss_clip": 1.04740119, "balance_loss_mlp": 1.02166879, "epoch": 0.25005260784608446, "flos": 16872198197760.0, "grad_norm": 2.799032697181286, "language_loss": 0.76568067, "learning_rate": 3.5131788696357427e-06, "loss": 0.78723395, "num_input_tokens_seen": 89627795, "step": 4159, "time_per_iteration": 2.6529338359832764 }, { "auxiliary_loss_clip": 0.01139337, "auxiliary_loss_mlp": 0.01038581, "balance_loss_clip": 1.05149508, "balance_loss_mlp": 1.02013946, "epoch": 0.2501127310987524, "flos": 22124923476480.0, "grad_norm": 2.4918403268433122, "language_loss": 0.71557873, "learning_rate": 3.512924175760649e-06, "loss": 0.73735791, "num_input_tokens_seen": 89648090, "step": 4160, "time_per_iteration": 4.178418874740601 }, { "auxiliary_loss_clip": 0.01062459, "auxiliary_loss_mlp": 0.01001923, "balance_loss_clip": 1.02823949, "balance_loss_mlp": 0.99992067, "epoch": 0.2501728543514204, "flos": 69458061980160.0, "grad_norm": 0.7611682305123987, "language_loss": 0.56783372, "learning_rate": 3.5126694245153186e-06, "loss": 0.58847755, "num_input_tokens_seen": 89710345, "step": 4161, "time_per_iteration": 3.1690969467163086 }, { "auxiliary_loss_clip": 0.0114076, "auxiliary_loss_mlp": 0.01048659, "balance_loss_clip": 1.05206347, "balance_loss_mlp": 1.0308131, "epoch": 0.25023297760408836, "flos": 16289978647680.0, "grad_norm": 4.523737291621751, "language_loss": 0.80654883, "learning_rate": 3.5124146159094125e-06, "loss": 0.82844305, "num_input_tokens_seen": 89729390, "step": 4162, "time_per_iteration": 2.630491018295288 }, { "auxiliary_loss_clip": 0.01127145, "auxiliary_loss_mlp": 0.00776859, "balance_loss_clip": 1.04807281, "balance_loss_mlp": 1.00124371, "epoch": 0.2502931008567563, "flos": 12237998140800.0, "grad_norm": 3.0029202967601107, "language_loss": 0.87312925, "learning_rate": 3.5121597499525927e-06, "loss": 0.89216936, "num_input_tokens_seen": 89742805, "step": 4163, "time_per_iteration": 2.660985231399536 }, { "auxiliary_loss_clip": 0.01133331, "auxiliary_loss_mlp": 0.01039671, "balance_loss_clip": 1.0538981, "balance_loss_mlp": 1.02234972, "epoch": 0.25035322410942434, "flos": 23181882105600.0, "grad_norm": 1.700690076898522, "language_loss": 0.83170879, "learning_rate": 3.5119048266545232e-06, "loss": 0.85343885, "num_input_tokens_seen": 89761145, "step": 4164, "time_per_iteration": 4.217406988143921 }, { "auxiliary_loss_clip": 0.01131608, "auxiliary_loss_mlp": 0.01047174, "balance_loss_clip": 1.05681539, "balance_loss_mlp": 1.0309732, "epoch": 0.2504133473620923, "flos": 20917534688640.0, "grad_norm": 1.61687510361108, "language_loss": 0.73889691, "learning_rate": 3.5116498460248716e-06, "loss": 0.76068473, "num_input_tokens_seen": 89780905, "step": 4165, "time_per_iteration": 2.7395150661468506 }, { "auxiliary_loss_clip": 0.01112927, "auxiliary_loss_mlp": 0.01043589, "balance_loss_clip": 1.04912043, "balance_loss_mlp": 1.02611279, "epoch": 0.2504734706147603, "flos": 20776549806720.0, "grad_norm": 1.856982928728685, "language_loss": 0.74739552, "learning_rate": 3.5113948080733062e-06, "loss": 0.7689606, "num_input_tokens_seen": 89799230, "step": 4166, "time_per_iteration": 2.7567081451416016 }, { "auxiliary_loss_clip": 0.01110594, "auxiliary_loss_mlp": 0.01042647, "balance_loss_clip": 1.04968488, "balance_loss_mlp": 1.02651834, "epoch": 0.25053359386742824, "flos": 24349373861760.0, "grad_norm": 2.0013578528528724, "language_loss": 0.82254446, "learning_rate": 3.5111397128094973e-06, "loss": 0.84407687, "num_input_tokens_seen": 89818240, "step": 4167, "time_per_iteration": 2.692664384841919 }, { "auxiliary_loss_clip": 0.01130059, "auxiliary_loss_mlp": 0.01043694, "balance_loss_clip": 1.05185139, "balance_loss_mlp": 1.02695727, "epoch": 0.2505937171200962, "flos": 21214336769280.0, "grad_norm": 2.4392619558537407, "language_loss": 0.79381847, "learning_rate": 3.51088456024312e-06, "loss": 0.81555605, "num_input_tokens_seen": 89834485, "step": 4168, "time_per_iteration": 2.6286962032318115 }, { "auxiliary_loss_clip": 0.01138966, "auxiliary_loss_mlp": 0.01046302, "balance_loss_clip": 1.05118442, "balance_loss_mlp": 1.02704966, "epoch": 0.25065384037276417, "flos": 41427231379200.0, "grad_norm": 2.2753262043393243, "language_loss": 0.69603884, "learning_rate": 3.510629350383849e-06, "loss": 0.71789157, "num_input_tokens_seen": 89855645, "step": 4169, "time_per_iteration": 2.7935590744018555 }, { "auxiliary_loss_clip": 0.01110761, "auxiliary_loss_mlp": 0.01049625, "balance_loss_clip": 1.04870963, "balance_loss_mlp": 1.03274524, "epoch": 0.25071396362543213, "flos": 26102389219200.0, "grad_norm": 1.8250030020409629, "language_loss": 0.78045398, "learning_rate": 3.510374083241361e-06, "loss": 0.80205786, "num_input_tokens_seen": 89874895, "step": 4170, "time_per_iteration": 2.7728679180145264 }, { "auxiliary_loss_clip": 0.01128286, "auxiliary_loss_mlp": 0.01043437, "balance_loss_clip": 1.05320668, "balance_loss_mlp": 1.02662849, "epoch": 0.2507740868781001, "flos": 19098982967040.0, "grad_norm": 2.5073993684848004, "language_loss": 0.76440209, "learning_rate": 3.5101187588253368e-06, "loss": 0.78611928, "num_input_tokens_seen": 89891700, "step": 4171, "time_per_iteration": 2.7825160026550293 }, { "auxiliary_loss_clip": 0.01061117, "auxiliary_loss_mlp": 0.01002396, "balance_loss_clip": 1.027282, "balance_loss_mlp": 1.00034571, "epoch": 0.25083421013076806, "flos": 64341868296960.0, "grad_norm": 0.8424544393272001, "language_loss": 0.6006161, "learning_rate": 3.509863377145458e-06, "loss": 0.62125123, "num_input_tokens_seen": 89955775, "step": 4172, "time_per_iteration": 3.1981940269470215 }, { "auxiliary_loss_clip": 0.01125517, "auxiliary_loss_mlp": 0.01046213, "balance_loss_clip": 1.05005789, "balance_loss_mlp": 1.02821243, "epoch": 0.25089433338343603, "flos": 24279599692800.0, "grad_norm": 1.4368714421460043, "language_loss": 0.79106563, "learning_rate": 3.509607938211409e-06, "loss": 0.81278288, "num_input_tokens_seen": 89977150, "step": 4173, "time_per_iteration": 2.8311028480529785 }, { "auxiliary_loss_clip": 0.01152553, "auxiliary_loss_mlp": 0.0104675, "balance_loss_clip": 1.05725241, "balance_loss_mlp": 1.02986968, "epoch": 0.250954456636104, "flos": 14721472477440.0, "grad_norm": 2.103663042812158, "language_loss": 0.83371937, "learning_rate": 3.509352442032875e-06, "loss": 0.85571229, "num_input_tokens_seen": 89994925, "step": 4174, "time_per_iteration": 2.696199893951416 }, { "auxiliary_loss_clip": 0.01095749, "auxiliary_loss_mlp": 0.01049206, "balance_loss_clip": 1.04728913, "balance_loss_mlp": 1.03095484, "epoch": 0.25101457988877196, "flos": 22273593868800.0, "grad_norm": 43.022796554959484, "language_loss": 0.71023381, "learning_rate": 3.509096888619545e-06, "loss": 0.73168337, "num_input_tokens_seen": 90013235, "step": 4175, "time_per_iteration": 2.8337926864624023 }, { "auxiliary_loss_clip": 0.01119154, "auxiliary_loss_mlp": 0.01038924, "balance_loss_clip": 1.05135846, "balance_loss_mlp": 1.02145982, "epoch": 0.2510747031414399, "flos": 25188929424000.0, "grad_norm": 2.017414900854033, "language_loss": 0.80957019, "learning_rate": 3.50884127798111e-06, "loss": 0.83115101, "num_input_tokens_seen": 90032150, "step": 4176, "time_per_iteration": 2.936908483505249 }, { "auxiliary_loss_clip": 0.01127542, "auxiliary_loss_mlp": 0.0104611, "balance_loss_clip": 1.0535233, "balance_loss_mlp": 1.02753711, "epoch": 0.25113482639410795, "flos": 20704189858560.0, "grad_norm": 2.475574978330162, "language_loss": 0.82294285, "learning_rate": 3.5085856101272623e-06, "loss": 0.84467936, "num_input_tokens_seen": 90049085, "step": 4177, "time_per_iteration": 2.7630460262298584 }, { "auxiliary_loss_clip": 0.01110202, "auxiliary_loss_mlp": 0.01051495, "balance_loss_clip": 1.05168724, "balance_loss_mlp": 1.03386414, "epoch": 0.2511949496467759, "flos": 21506936958720.0, "grad_norm": 2.1761277698635593, "language_loss": 0.82517993, "learning_rate": 3.508329885067698e-06, "loss": 0.84679693, "num_input_tokens_seen": 90067695, "step": 4178, "time_per_iteration": 2.7356274127960205 }, { "auxiliary_loss_clip": 0.01145101, "auxiliary_loss_mlp": 0.00775573, "balance_loss_clip": 1.05324888, "balance_loss_mlp": 1.00148535, "epoch": 0.2512550728994439, "flos": 20701999128960.0, "grad_norm": 2.1475299559000947, "language_loss": 0.75229692, "learning_rate": 3.508074102812112e-06, "loss": 0.77150369, "num_input_tokens_seen": 90083890, "step": 4179, "time_per_iteration": 2.631096363067627 }, { "auxiliary_loss_clip": 0.01109293, "auxiliary_loss_mlp": 0.01056583, "balance_loss_clip": 1.04920673, "balance_loss_mlp": 1.03833175, "epoch": 0.25131519615211184, "flos": 18478626151680.0, "grad_norm": 1.9599833122138943, "language_loss": 0.69976825, "learning_rate": 3.507818263370206e-06, "loss": 0.72142696, "num_input_tokens_seen": 90100995, "step": 4180, "time_per_iteration": 2.708122730255127 }, { "auxiliary_loss_clip": 0.01147992, "auxiliary_loss_mlp": 0.01045783, "balance_loss_clip": 1.05343485, "balance_loss_mlp": 1.02909422, "epoch": 0.2513753194047798, "flos": 20484955198080.0, "grad_norm": 1.8622914556591927, "language_loss": 0.85940182, "learning_rate": 3.5075623667516796e-06, "loss": 0.88133955, "num_input_tokens_seen": 90120365, "step": 4181, "time_per_iteration": 2.633091449737549 }, { "auxiliary_loss_clip": 0.01148017, "auxiliary_loss_mlp": 0.01049707, "balance_loss_clip": 1.05351245, "balance_loss_mlp": 1.03270781, "epoch": 0.25143544265744777, "flos": 37670077704960.0, "grad_norm": 2.0695978407502467, "language_loss": 0.6856631, "learning_rate": 3.507306412966238e-06, "loss": 0.70764029, "num_input_tokens_seen": 90142610, "step": 4182, "time_per_iteration": 2.8169894218444824 }, { "auxiliary_loss_clip": 0.01041202, "auxiliary_loss_mlp": 0.010083, "balance_loss_clip": 1.02456141, "balance_loss_mlp": 1.00577307, "epoch": 0.25149556591011574, "flos": 69367457923200.0, "grad_norm": 0.8403189096432666, "language_loss": 0.70032597, "learning_rate": 3.5070504020235853e-06, "loss": 0.72082102, "num_input_tokens_seen": 90200555, "step": 4183, "time_per_iteration": 3.2070610523223877 }, { "auxiliary_loss_clip": 0.01130203, "auxiliary_loss_mlp": 0.01042834, "balance_loss_clip": 1.05145216, "balance_loss_mlp": 1.02441609, "epoch": 0.2515556891627837, "flos": 13990402967040.0, "grad_norm": 1.8802113118438855, "language_loss": 0.73834902, "learning_rate": 3.506794333933431e-06, "loss": 0.76007938, "num_input_tokens_seen": 90218120, "step": 4184, "time_per_iteration": 2.691950559616089 }, { "auxiliary_loss_clip": 0.01136971, "auxiliary_loss_mlp": 0.01047362, "balance_loss_clip": 1.05233765, "balance_loss_mlp": 1.0297792, "epoch": 0.25161581241545167, "flos": 22163527618560.0, "grad_norm": 1.8676646084141537, "language_loss": 0.8334859, "learning_rate": 3.506538208705484e-06, "loss": 0.85532916, "num_input_tokens_seen": 90236790, "step": 4185, "time_per_iteration": 2.6931228637695312 }, { "auxiliary_loss_clip": 0.01022217, "auxiliary_loss_mlp": 0.01010846, "balance_loss_clip": 1.03471541, "balance_loss_mlp": 1.00902176, "epoch": 0.25167593566811963, "flos": 69358407696000.0, "grad_norm": 0.7883550117667959, "language_loss": 0.61448294, "learning_rate": 3.5062820263494574e-06, "loss": 0.63481361, "num_input_tokens_seen": 90297070, "step": 4186, "time_per_iteration": 3.175295829772949 }, { "auxiliary_loss_clip": 0.01107804, "auxiliary_loss_mlp": 0.01041844, "balance_loss_clip": 1.04873872, "balance_loss_mlp": 1.02405787, "epoch": 0.2517360589207876, "flos": 13261452359040.0, "grad_norm": 1.8553357788385085, "language_loss": 0.79070914, "learning_rate": 3.5060257868750656e-06, "loss": 0.81220555, "num_input_tokens_seen": 90315255, "step": 4187, "time_per_iteration": 2.887378215789795 }, { "auxiliary_loss_clip": 0.01091434, "auxiliary_loss_mlp": 0.01049489, "balance_loss_clip": 1.0482558, "balance_loss_mlp": 1.03138089, "epoch": 0.25179618217345556, "flos": 20376828282240.0, "grad_norm": 3.7749228259968586, "language_loss": 0.79629189, "learning_rate": 3.5057694902920244e-06, "loss": 0.8177011, "num_input_tokens_seen": 90334990, "step": 4188, "time_per_iteration": 2.8985629081726074 }, { "auxiliary_loss_clip": 0.01133381, "auxiliary_loss_mlp": 0.01046993, "balance_loss_clip": 1.05168021, "balance_loss_mlp": 1.03012538, "epoch": 0.25185630542612353, "flos": 27664718250240.0, "grad_norm": 1.7363151422402578, "language_loss": 0.74419165, "learning_rate": 3.5055131366100534e-06, "loss": 0.76599538, "num_input_tokens_seen": 90351825, "step": 4189, "time_per_iteration": 2.697097063064575 }, { "auxiliary_loss_clip": 0.01118534, "auxiliary_loss_mlp": 0.01044827, "balance_loss_clip": 1.04871011, "balance_loss_mlp": 1.02862656, "epoch": 0.25191642867879155, "flos": 20996430912000.0, "grad_norm": 2.0536634388060078, "language_loss": 0.84721291, "learning_rate": 3.5052567258388745e-06, "loss": 0.86884648, "num_input_tokens_seen": 90369860, "step": 4190, "time_per_iteration": 2.731227397918701 }, { "auxiliary_loss_clip": 0.01118209, "auxiliary_loss_mlp": 0.01044895, "balance_loss_clip": 1.04597688, "balance_loss_mlp": 1.02633369, "epoch": 0.2519765519314595, "flos": 21105671149440.0, "grad_norm": 2.0130913170662783, "language_loss": 0.75695485, "learning_rate": 3.5050002579882082e-06, "loss": 0.77858591, "num_input_tokens_seen": 90389245, "step": 4191, "time_per_iteration": 2.7403173446655273 }, { "auxiliary_loss_clip": 0.01048031, "auxiliary_loss_mlp": 0.01014765, "balance_loss_clip": 1.02375531, "balance_loss_mlp": 1.0122261, "epoch": 0.2520366751841275, "flos": 62744993360640.0, "grad_norm": 0.7280864395517058, "language_loss": 0.57129633, "learning_rate": 3.5047437330677823e-06, "loss": 0.59192419, "num_input_tokens_seen": 90456735, "step": 4192, "time_per_iteration": 3.237478017807007 }, { "auxiliary_loss_clip": 0.01121978, "auxiliary_loss_mlp": 0.01041578, "balance_loss_clip": 1.05535698, "balance_loss_mlp": 1.02374434, "epoch": 0.25209679843679544, "flos": 22230716008320.0, "grad_norm": 1.8423117439969312, "language_loss": 0.76066267, "learning_rate": 3.504487151087323e-06, "loss": 0.78229821, "num_input_tokens_seen": 90474165, "step": 4193, "time_per_iteration": 2.699486255645752 }, { "auxiliary_loss_clip": 0.01137884, "auxiliary_loss_mlp": 0.01046125, "balance_loss_clip": 1.05232048, "balance_loss_mlp": 1.02869618, "epoch": 0.2521569216894634, "flos": 12166643773440.0, "grad_norm": 3.5003037089711437, "language_loss": 0.84335077, "learning_rate": 3.5042305120565598e-06, "loss": 0.86519086, "num_input_tokens_seen": 90491660, "step": 4194, "time_per_iteration": 2.6561896800994873 }, { "auxiliary_loss_clip": 0.01149932, "auxiliary_loss_mlp": 0.01050793, "balance_loss_clip": 1.05253458, "balance_loss_mlp": 1.03461599, "epoch": 0.2522170449421314, "flos": 23699786353920.0, "grad_norm": 1.3753304678825264, "language_loss": 0.88249695, "learning_rate": 3.5039738159852253e-06, "loss": 0.90450418, "num_input_tokens_seen": 90514025, "step": 4195, "time_per_iteration": 2.67887806892395 }, { "auxiliary_loss_clip": 0.01150202, "auxiliary_loss_mlp": 0.01041959, "balance_loss_clip": 1.05412734, "balance_loss_mlp": 1.02199149, "epoch": 0.25227716819479934, "flos": 20955456472320.0, "grad_norm": 2.4146072325129087, "language_loss": 0.85488242, "learning_rate": 3.503717062883053e-06, "loss": 0.87680399, "num_input_tokens_seen": 90533530, "step": 4196, "time_per_iteration": 2.6358916759490967 }, { "auxiliary_loss_clip": 0.01137804, "auxiliary_loss_mlp": 0.01049246, "balance_loss_clip": 1.05213511, "balance_loss_mlp": 1.03193665, "epoch": 0.2523372914474673, "flos": 23331342597120.0, "grad_norm": 1.9329643035636839, "language_loss": 0.8319478, "learning_rate": 3.5034602527597786e-06, "loss": 0.8538183, "num_input_tokens_seen": 90554025, "step": 4197, "time_per_iteration": 5.738839387893677 }, { "auxiliary_loss_clip": 0.01140063, "auxiliary_loss_mlp": 0.01051416, "balance_loss_clip": 1.05392218, "balance_loss_mlp": 1.03224671, "epoch": 0.25239741470013527, "flos": 36970321875840.0, "grad_norm": 2.1358917159416104, "language_loss": 0.72820318, "learning_rate": 3.5032033856251405e-06, "loss": 0.75011802, "num_input_tokens_seen": 90576930, "step": 4198, "time_per_iteration": 2.8819963932037354 }, { "auxiliary_loss_clip": 0.01152924, "auxiliary_loss_mlp": 0.01048555, "balance_loss_clip": 1.05455935, "balance_loss_mlp": 1.03045893, "epoch": 0.25245753795280323, "flos": 18515757836160.0, "grad_norm": 6.722547943004915, "language_loss": 0.76560014, "learning_rate": 3.50294646148888e-06, "loss": 0.78761488, "num_input_tokens_seen": 90595710, "step": 4199, "time_per_iteration": 2.636993169784546 }, { "auxiliary_loss_clip": 0.01125413, "auxiliary_loss_mlp": 0.00776026, "balance_loss_clip": 1.05274642, "balance_loss_mlp": 1.00117147, "epoch": 0.2525176612054712, "flos": 32344884737280.0, "grad_norm": 1.814097809936595, "language_loss": 0.73571241, "learning_rate": 3.502689480360739e-06, "loss": 0.75472683, "num_input_tokens_seen": 90617945, "step": 4200, "time_per_iteration": 4.297755002975464 }, { "auxiliary_loss_clip": 0.01137136, "auxiliary_loss_mlp": 0.01047957, "balance_loss_clip": 1.05050063, "balance_loss_mlp": 1.03187585, "epoch": 0.25257778445813917, "flos": 45258217459200.0, "grad_norm": 1.6490086858694837, "language_loss": 0.8223114, "learning_rate": 3.5024324422504616e-06, "loss": 0.84416234, "num_input_tokens_seen": 90640855, "step": 4201, "time_per_iteration": 2.859703302383423 }, { "auxiliary_loss_clip": 0.01098423, "auxiliary_loss_mlp": 0.01048, "balance_loss_clip": 1.05422068, "balance_loss_mlp": 1.03126347, "epoch": 0.25263790771080713, "flos": 23367791923200.0, "grad_norm": 1.9307441853812024, "language_loss": 0.74854887, "learning_rate": 3.5021753471677965e-06, "loss": 0.77001321, "num_input_tokens_seen": 90661350, "step": 4202, "time_per_iteration": 2.7475366592407227 }, { "auxiliary_loss_clip": 0.01134371, "auxiliary_loss_mlp": 0.01040637, "balance_loss_clip": 1.05362439, "balance_loss_mlp": 1.02392364, "epoch": 0.25269803096347515, "flos": 18515039564160.0, "grad_norm": 1.882597455778369, "language_loss": 0.7323755, "learning_rate": 3.501918195122491e-06, "loss": 0.75412554, "num_input_tokens_seen": 90680540, "step": 4203, "time_per_iteration": 2.6547653675079346 }, { "auxiliary_loss_clip": 0.01128208, "auxiliary_loss_mlp": 0.01039636, "balance_loss_clip": 1.05176711, "balance_loss_mlp": 1.02239835, "epoch": 0.2527581542161431, "flos": 24610552629120.0, "grad_norm": 1.4386036639708744, "language_loss": 0.77731073, "learning_rate": 3.501660986124297e-06, "loss": 0.79898918, "num_input_tokens_seen": 90703460, "step": 4204, "time_per_iteration": 4.4116432666778564 }, { "auxiliary_loss_clip": 0.01115267, "auxiliary_loss_mlp": 0.01052396, "balance_loss_clip": 1.05262613, "balance_loss_mlp": 1.03453815, "epoch": 0.2528182774688111, "flos": 12641275111680.0, "grad_norm": 1.9357035590368088, "language_loss": 0.72175288, "learning_rate": 3.5014037201829684e-06, "loss": 0.74342954, "num_input_tokens_seen": 90718815, "step": 4205, "time_per_iteration": 2.6750712394714355 }, { "auxiliary_loss_clip": 0.01124756, "auxiliary_loss_mlp": 0.01044172, "balance_loss_clip": 1.05032194, "balance_loss_mlp": 1.02801895, "epoch": 0.25287840072147905, "flos": 46936789879680.0, "grad_norm": 1.4680577763339375, "language_loss": 0.75594378, "learning_rate": 3.50114639730826e-06, "loss": 0.77763301, "num_input_tokens_seen": 90742125, "step": 4206, "time_per_iteration": 2.876408815383911 }, { "auxiliary_loss_clip": 0.01107683, "auxiliary_loss_mlp": 0.01044618, "balance_loss_clip": 1.04771221, "balance_loss_mlp": 1.02780974, "epoch": 0.252938523974147, "flos": 18879712392960.0, "grad_norm": 1.5378963492414741, "language_loss": 0.78807724, "learning_rate": 3.5008890175099296e-06, "loss": 0.80960023, "num_input_tokens_seen": 90760785, "step": 4207, "time_per_iteration": 2.7176475524902344 }, { "auxiliary_loss_clip": 0.01133715, "auxiliary_loss_mlp": 0.01055631, "balance_loss_clip": 1.0547328, "balance_loss_mlp": 1.03984797, "epoch": 0.252998647226815, "flos": 21434720664960.0, "grad_norm": 1.5723877129370716, "language_loss": 0.76399815, "learning_rate": 3.5006315807977375e-06, "loss": 0.78589159, "num_input_tokens_seen": 90780045, "step": 4208, "time_per_iteration": 2.797658920288086 }, { "auxiliary_loss_clip": 0.01131059, "auxiliary_loss_mlp": 0.01040866, "balance_loss_clip": 1.05162513, "balance_loss_mlp": 1.02465391, "epoch": 0.25305877047948294, "flos": 25442171285760.0, "grad_norm": 3.9595354320915166, "language_loss": 0.69848049, "learning_rate": 3.5003740871814456e-06, "loss": 0.72019976, "num_input_tokens_seen": 90797980, "step": 4209, "time_per_iteration": 2.738159418106079 }, { "auxiliary_loss_clip": 0.01046521, "auxiliary_loss_mlp": 0.0100386, "balance_loss_clip": 1.02250004, "balance_loss_mlp": 1.0015471, "epoch": 0.2531188937321509, "flos": 60185603629440.0, "grad_norm": 0.7787603502724176, "language_loss": 0.55091059, "learning_rate": 3.5001165366708175e-06, "loss": 0.57141441, "num_input_tokens_seen": 90864865, "step": 4210, "time_per_iteration": 3.196953535079956 }, { "auxiliary_loss_clip": 0.01113643, "auxiliary_loss_mlp": 0.01038759, "balance_loss_clip": 1.05103207, "balance_loss_mlp": 1.02215338, "epoch": 0.25317901698481887, "flos": 19682387665920.0, "grad_norm": 1.8504444580052586, "language_loss": 0.8006835, "learning_rate": 3.4998589292756204e-06, "loss": 0.82220757, "num_input_tokens_seen": 90882885, "step": 4211, "time_per_iteration": 2.7241647243499756 }, { "auxiliary_loss_clip": 0.01095085, "auxiliary_loss_mlp": 0.01044368, "balance_loss_clip": 1.04594803, "balance_loss_mlp": 1.02844775, "epoch": 0.25323914023748684, "flos": 24424355502720.0, "grad_norm": 1.531596575729193, "language_loss": 0.78362429, "learning_rate": 3.499601265005622e-06, "loss": 0.80501878, "num_input_tokens_seen": 90902985, "step": 4212, "time_per_iteration": 2.788607358932495 }, { "auxiliary_loss_clip": 0.01133893, "auxiliary_loss_mlp": 0.01041038, "balance_loss_clip": 1.04857254, "balance_loss_mlp": 1.02401471, "epoch": 0.2532992634901548, "flos": 25447450584960.0, "grad_norm": 2.123277134845907, "language_loss": 0.53516036, "learning_rate": 3.4993435438705938e-06, "loss": 0.55690968, "num_input_tokens_seen": 90923550, "step": 4213, "time_per_iteration": 2.6675784587860107 }, { "auxiliary_loss_clip": 0.01120924, "auxiliary_loss_mlp": 0.01044765, "balance_loss_clip": 1.05005503, "balance_loss_mlp": 1.0273726, "epoch": 0.25335938674282277, "flos": 18880538405760.0, "grad_norm": 2.4965805840577002, "language_loss": 0.65416414, "learning_rate": 3.499085765880308e-06, "loss": 0.67582107, "num_input_tokens_seen": 90943260, "step": 4214, "time_per_iteration": 2.691359281539917 }, { "auxiliary_loss_clip": 0.01046401, "auxiliary_loss_mlp": 0.01002761, "balance_loss_clip": 1.02238619, "balance_loss_mlp": 1.00056791, "epoch": 0.25341950999549073, "flos": 53062649936640.0, "grad_norm": 0.8515065776804692, "language_loss": 0.58004916, "learning_rate": 3.4988279310445396e-06, "loss": 0.60054076, "num_input_tokens_seen": 90996295, "step": 4215, "time_per_iteration": 2.981840133666992 }, { "auxiliary_loss_clip": 0.01124794, "auxiliary_loss_mlp": 0.01043531, "balance_loss_clip": 1.05316496, "balance_loss_mlp": 1.02655554, "epoch": 0.2534796332481587, "flos": 39020247054720.0, "grad_norm": 1.7497766885830588, "language_loss": 0.83251095, "learning_rate": 3.498570039373066e-06, "loss": 0.85419416, "num_input_tokens_seen": 91017545, "step": 4216, "time_per_iteration": 2.912137508392334 }, { "auxiliary_loss_clip": 0.0112972, "auxiliary_loss_mlp": 0.01040052, "balance_loss_clip": 1.05088937, "balance_loss_mlp": 1.02338624, "epoch": 0.2535397565008267, "flos": 23586990670080.0, "grad_norm": 3.3733415491927996, "language_loss": 0.80008072, "learning_rate": 3.498312090875666e-06, "loss": 0.82177842, "num_input_tokens_seen": 91037715, "step": 4217, "time_per_iteration": 2.6532363891601562 }, { "auxiliary_loss_clip": 0.01116019, "auxiliary_loss_mlp": 0.01038346, "balance_loss_clip": 1.04436612, "balance_loss_mlp": 1.02234793, "epoch": 0.2535998797534947, "flos": 19281373251840.0, "grad_norm": 2.333881972650505, "language_loss": 0.75585902, "learning_rate": 3.4980540855621218e-06, "loss": 0.77740264, "num_input_tokens_seen": 91055295, "step": 4218, "time_per_iteration": 2.650867223739624 }, { "auxiliary_loss_clip": 0.0113544, "auxiliary_loss_mlp": 0.0104021, "balance_loss_clip": 1.04940748, "balance_loss_mlp": 1.0229727, "epoch": 0.25366000300616265, "flos": 24024382583040.0, "grad_norm": 2.040148074486094, "language_loss": 0.74188256, "learning_rate": 3.4977960234422167e-06, "loss": 0.76363909, "num_input_tokens_seen": 91075485, "step": 4219, "time_per_iteration": 2.727161169052124 }, { "auxiliary_loss_clip": 0.01138406, "auxiliary_loss_mlp": 0.01048455, "balance_loss_clip": 1.05222011, "balance_loss_mlp": 1.03138447, "epoch": 0.2537201262588306, "flos": 16289368116480.0, "grad_norm": 4.990704095966988, "language_loss": 0.81355274, "learning_rate": 3.497537904525736e-06, "loss": 0.83542132, "num_input_tokens_seen": 91093620, "step": 4220, "time_per_iteration": 2.6146652698516846 }, { "auxiliary_loss_clip": 0.01100698, "auxiliary_loss_mlp": 0.01049127, "balance_loss_clip": 1.04988587, "balance_loss_mlp": 1.03041148, "epoch": 0.2537802495114986, "flos": 23294677789440.0, "grad_norm": 2.3092995740689197, "language_loss": 0.70819569, "learning_rate": 3.497279728822468e-06, "loss": 0.72969389, "num_input_tokens_seen": 91114110, "step": 4221, "time_per_iteration": 2.851747751235962 }, { "auxiliary_loss_clip": 0.0114682, "auxiliary_loss_mlp": 0.01039444, "balance_loss_clip": 1.05224657, "balance_loss_mlp": 1.02257586, "epoch": 0.25384037276416654, "flos": 17639142416640.0, "grad_norm": 2.4229893622177188, "language_loss": 0.61689377, "learning_rate": 3.497021496342202e-06, "loss": 0.63875645, "num_input_tokens_seen": 91133135, "step": 4222, "time_per_iteration": 2.6394412517547607 }, { "auxiliary_loss_clip": 0.01138378, "auxiliary_loss_mlp": 0.01051871, "balance_loss_clip": 1.05371165, "balance_loss_mlp": 1.03528929, "epoch": 0.2539004960168345, "flos": 21507044699520.0, "grad_norm": 1.6839261376783914, "language_loss": 0.74744058, "learning_rate": 3.496763207094731e-06, "loss": 0.76934308, "num_input_tokens_seen": 91151805, "step": 4223, "time_per_iteration": 2.648322105407715 }, { "auxiliary_loss_clip": 0.01092255, "auxiliary_loss_mlp": 0.01039082, "balance_loss_clip": 1.04767203, "balance_loss_mlp": 1.02325082, "epoch": 0.2539606192695025, "flos": 23950909313280.0, "grad_norm": 1.7092524284111348, "language_loss": 0.80226004, "learning_rate": 3.49650486108985e-06, "loss": 0.82357341, "num_input_tokens_seen": 91172270, "step": 4224, "time_per_iteration": 2.7572662830352783 }, { "auxiliary_loss_clip": 0.01130506, "auxiliary_loss_mlp": 0.00774076, "balance_loss_clip": 1.05102324, "balance_loss_mlp": 1.00112057, "epoch": 0.25402074252217044, "flos": 24169784837760.0, "grad_norm": 1.4497407173280796, "language_loss": 0.77330017, "learning_rate": 3.496246458337354e-06, "loss": 0.792346, "num_input_tokens_seen": 91192080, "step": 4225, "time_per_iteration": 2.7661190032958984 }, { "auxiliary_loss_clip": 0.01130647, "auxiliary_loss_mlp": 0.01049954, "balance_loss_clip": 1.04919255, "balance_loss_mlp": 1.03271639, "epoch": 0.2540808657748384, "flos": 22303758314880.0, "grad_norm": 2.0615353379683055, "language_loss": 0.84638137, "learning_rate": 3.4959879988470426e-06, "loss": 0.86818743, "num_input_tokens_seen": 91211450, "step": 4226, "time_per_iteration": 2.690683126449585 }, { "auxiliary_loss_clip": 0.01143268, "auxiliary_loss_mlp": 0.01043336, "balance_loss_clip": 1.05067408, "balance_loss_mlp": 1.02613425, "epoch": 0.25414098902750637, "flos": 27599541022080.0, "grad_norm": 1.5600656222031943, "language_loss": 0.70886129, "learning_rate": 3.4957294826287164e-06, "loss": 0.73072731, "num_input_tokens_seen": 91231835, "step": 4227, "time_per_iteration": 2.6647307872772217 }, { "auxiliary_loss_clip": 0.01055229, "auxiliary_loss_mlp": 0.01001956, "balance_loss_clip": 1.02168798, "balance_loss_mlp": 0.9995476, "epoch": 0.25420111228017434, "flos": 58170834887040.0, "grad_norm": 0.9869295588353136, "language_loss": 0.61927998, "learning_rate": 3.4954709096921785e-06, "loss": 0.63985181, "num_input_tokens_seen": 91288755, "step": 4228, "time_per_iteration": 2.986067533493042 }, { "auxiliary_loss_clip": 0.01124878, "auxiliary_loss_mlp": 0.01040149, "balance_loss_clip": 1.0464859, "balance_loss_mlp": 1.02212501, "epoch": 0.2542612355328423, "flos": 11464409905920.0, "grad_norm": 2.314170874410929, "language_loss": 0.86946094, "learning_rate": 3.4952122800472336e-06, "loss": 0.8911112, "num_input_tokens_seen": 91302485, "step": 4229, "time_per_iteration": 2.629518985748291 }, { "auxiliary_loss_clip": 0.01102882, "auxiliary_loss_mlp": 0.01042519, "balance_loss_clip": 1.04811144, "balance_loss_mlp": 1.0241369, "epoch": 0.2543213587855103, "flos": 22965879669120.0, "grad_norm": 1.7811216632522446, "language_loss": 0.77265114, "learning_rate": 3.4949535937036892e-06, "loss": 0.79410517, "num_input_tokens_seen": 91321120, "step": 4230, "time_per_iteration": 2.715655565261841 }, { "auxiliary_loss_clip": 0.01133364, "auxiliary_loss_mlp": 0.01047482, "balance_loss_clip": 1.0504818, "balance_loss_mlp": 1.03074503, "epoch": 0.2543814820381783, "flos": 18253178438400.0, "grad_norm": 1.8956341732473607, "language_loss": 0.7550717, "learning_rate": 3.4946948506713544e-06, "loss": 0.77688015, "num_input_tokens_seen": 91338575, "step": 4231, "time_per_iteration": 2.6945316791534424 }, { "auxiliary_loss_clip": 0.0113214, "auxiliary_loss_mlp": 0.01038979, "balance_loss_clip": 1.04939127, "balance_loss_mlp": 1.0230999, "epoch": 0.25444160529084625, "flos": 15632705629440.0, "grad_norm": 1.6179274617095247, "language_loss": 0.73618764, "learning_rate": 3.4944360509600416e-06, "loss": 0.75789881, "num_input_tokens_seen": 91357355, "step": 4232, "time_per_iteration": 2.6219112873077393 }, { "auxiliary_loss_clip": 0.01149145, "auxiliary_loss_mlp": 0.01043104, "balance_loss_clip": 1.05579972, "balance_loss_mlp": 1.02589035, "epoch": 0.2545017285435142, "flos": 24601610142720.0, "grad_norm": 2.2856831174377388, "language_loss": 0.86333203, "learning_rate": 3.4941771945795637e-06, "loss": 0.88525456, "num_input_tokens_seen": 91376515, "step": 4233, "time_per_iteration": 2.675877809524536 }, { "auxiliary_loss_clip": 0.01080108, "auxiliary_loss_mlp": 0.01040124, "balance_loss_clip": 1.04641938, "balance_loss_mlp": 1.02457917, "epoch": 0.2545618517961822, "flos": 24679069822080.0, "grad_norm": 1.5382450997432586, "language_loss": 0.75319451, "learning_rate": 3.493918281539737e-06, "loss": 0.77439684, "num_input_tokens_seen": 91397595, "step": 4234, "time_per_iteration": 2.9050087928771973 }, { "auxiliary_loss_clip": 0.01117427, "auxiliary_loss_mlp": 0.01044439, "balance_loss_clip": 1.05171227, "balance_loss_mlp": 1.02897787, "epoch": 0.25462197504885015, "flos": 23915106432000.0, "grad_norm": 2.6382014960101765, "language_loss": 0.74923635, "learning_rate": 3.493659311850379e-06, "loss": 0.77085495, "num_input_tokens_seen": 91417775, "step": 4235, "time_per_iteration": 2.788041353225708 }, { "auxiliary_loss_clip": 0.01124445, "auxiliary_loss_mlp": 0.00776537, "balance_loss_clip": 1.05315781, "balance_loss_mlp": 1.00115323, "epoch": 0.2546820983015181, "flos": 24789387467520.0, "grad_norm": 1.9882672691222136, "language_loss": 0.64451182, "learning_rate": 3.4934002855213106e-06, "loss": 0.66352159, "num_input_tokens_seen": 91437665, "step": 4236, "time_per_iteration": 2.8649141788482666 }, { "auxiliary_loss_clip": 0.01144465, "auxiliary_loss_mlp": 0.01036249, "balance_loss_clip": 1.05185175, "balance_loss_mlp": 1.02122915, "epoch": 0.2547422215541861, "flos": 18734130570240.0, "grad_norm": 1.6410229940010734, "language_loss": 0.6714325, "learning_rate": 3.493141202562354e-06, "loss": 0.69323969, "num_input_tokens_seen": 91456705, "step": 4237, "time_per_iteration": 4.262012958526611 }, { "auxiliary_loss_clip": 0.01147064, "auxiliary_loss_mlp": 0.01049012, "balance_loss_clip": 1.05240059, "balance_loss_mlp": 1.03203678, "epoch": 0.25480234480685404, "flos": 21032449274880.0, "grad_norm": 2.0013967295828237, "language_loss": 0.75415373, "learning_rate": 3.492882062983333e-06, "loss": 0.77611452, "num_input_tokens_seen": 91475535, "step": 4238, "time_per_iteration": 2.6378636360168457 }, { "auxiliary_loss_clip": 0.01137265, "auxiliary_loss_mlp": 0.01046047, "balance_loss_clip": 1.05366278, "balance_loss_mlp": 1.02843964, "epoch": 0.254862468059522, "flos": 25082167224960.0, "grad_norm": 3.4417299363308613, "language_loss": 0.80712521, "learning_rate": 3.492622866794074e-06, "loss": 0.82895833, "num_input_tokens_seen": 91499140, "step": 4239, "time_per_iteration": 4.348390579223633 }, { "auxiliary_loss_clip": 0.01128023, "auxiliary_loss_mlp": 0.01045872, "balance_loss_clip": 1.0522213, "balance_loss_mlp": 1.02870631, "epoch": 0.25492259131219, "flos": 20558392554240.0, "grad_norm": 1.7312526359597522, "language_loss": 0.77521586, "learning_rate": 3.492363614004407e-06, "loss": 0.79695487, "num_input_tokens_seen": 91518335, "step": 4240, "time_per_iteration": 2.7501273155212402 }, { "auxiliary_loss_clip": 0.01151347, "auxiliary_loss_mlp": 0.01040734, "balance_loss_clip": 1.05296493, "balance_loss_mlp": 1.0226146, "epoch": 0.25498271456485794, "flos": 25042485674880.0, "grad_norm": 3.3593092651087595, "language_loss": 0.83430749, "learning_rate": 3.492104304624162e-06, "loss": 0.85622829, "num_input_tokens_seen": 91537655, "step": 4241, "time_per_iteration": 2.7480928897857666 }, { "auxiliary_loss_clip": 0.01137407, "auxiliary_loss_mlp": 0.01045384, "balance_loss_clip": 1.05306387, "balance_loss_mlp": 1.02887392, "epoch": 0.2550428378175259, "flos": 26178412354560.0, "grad_norm": 1.6379574895871623, "language_loss": 0.73322648, "learning_rate": 3.4918449386631725e-06, "loss": 0.75505441, "num_input_tokens_seen": 91557545, "step": 4242, "time_per_iteration": 2.713635206222534 }, { "auxiliary_loss_clip": 0.0114709, "auxiliary_loss_mlp": 0.00774169, "balance_loss_clip": 1.05182981, "balance_loss_mlp": 1.00115824, "epoch": 0.2551029610701939, "flos": 15267170874240.0, "grad_norm": 3.2486673035230993, "language_loss": 0.72336024, "learning_rate": 3.491585516131273e-06, "loss": 0.7425729, "num_input_tokens_seen": 91574405, "step": 4243, "time_per_iteration": 4.298815727233887 }, { "auxiliary_loss_clip": 0.0113532, "auxiliary_loss_mlp": 0.01045095, "balance_loss_clip": 1.05183125, "balance_loss_mlp": 1.02797616, "epoch": 0.2551630843228619, "flos": 18112193556480.0, "grad_norm": 1.8323151946393021, "language_loss": 0.82076979, "learning_rate": 3.491326037038301e-06, "loss": 0.842574, "num_input_tokens_seen": 91593755, "step": 4244, "time_per_iteration": 2.6497015953063965 }, { "auxiliary_loss_clip": 0.01054616, "auxiliary_loss_mlp": 0.01017916, "balance_loss_clip": 1.03294289, "balance_loss_mlp": 1.01572227, "epoch": 0.25522320757552985, "flos": 70520192167680.0, "grad_norm": 0.6914168393706984, "language_loss": 0.57701397, "learning_rate": 3.4910665013940967e-06, "loss": 0.59773928, "num_input_tokens_seen": 91660335, "step": 4245, "time_per_iteration": 3.2938833236694336 }, { "auxiliary_loss_clip": 0.01146552, "auxiliary_loss_mlp": 0.01052395, "balance_loss_clip": 1.0508852, "balance_loss_mlp": 1.03577745, "epoch": 0.2552833308281978, "flos": 22893088757760.0, "grad_norm": 2.1326330958670567, "language_loss": 0.65120399, "learning_rate": 3.4908069092085015e-06, "loss": 0.6731934, "num_input_tokens_seen": 91678500, "step": 4246, "time_per_iteration": 2.5949065685272217 }, { "auxiliary_loss_clip": 0.01127579, "auxiliary_loss_mlp": 0.01044633, "balance_loss_clip": 1.04806828, "balance_loss_mlp": 1.02944601, "epoch": 0.2553434540808658, "flos": 22053605022720.0, "grad_norm": 1.7151532201527704, "language_loss": 0.81580049, "learning_rate": 3.4905472604913585e-06, "loss": 0.83752257, "num_input_tokens_seen": 91696430, "step": 4247, "time_per_iteration": 2.673624277114868 }, { "auxiliary_loss_clip": 0.01140059, "auxiliary_loss_mlp": 0.01044068, "balance_loss_clip": 1.05152941, "balance_loss_mlp": 1.02543616, "epoch": 0.25540357733353375, "flos": 16544190176640.0, "grad_norm": 2.241724474505105, "language_loss": 0.83335149, "learning_rate": 3.490287555252514e-06, "loss": 0.85519278, "num_input_tokens_seen": 91713270, "step": 4248, "time_per_iteration": 2.617570400238037 }, { "auxiliary_loss_clip": 0.01112618, "auxiliary_loss_mlp": 0.01042154, "balance_loss_clip": 1.04433584, "balance_loss_mlp": 1.02458215, "epoch": 0.2554637005862017, "flos": 17565022702080.0, "grad_norm": 2.084670538042193, "language_loss": 0.84011936, "learning_rate": 3.4900277935018166e-06, "loss": 0.8616671, "num_input_tokens_seen": 91728865, "step": 4249, "time_per_iteration": 2.6617467403411865 }, { "auxiliary_loss_clip": 0.01001275, "auxiliary_loss_mlp": 0.01002657, "balance_loss_clip": 1.0228157, "balance_loss_mlp": 0.9996174, "epoch": 0.2555238238388697, "flos": 72244763953920.0, "grad_norm": 0.765792812565725, "language_loss": 0.56274796, "learning_rate": 3.489767975249115e-06, "loss": 0.58278728, "num_input_tokens_seen": 91787470, "step": 4250, "time_per_iteration": 3.24300479888916 }, { "auxiliary_loss_clip": 0.01117816, "auxiliary_loss_mlp": 0.01036136, "balance_loss_clip": 1.04929769, "balance_loss_mlp": 1.01839769, "epoch": 0.25558394709153764, "flos": 24389414547840.0, "grad_norm": 2.294460262471245, "language_loss": 0.80566651, "learning_rate": 3.4895081005042632e-06, "loss": 0.82720602, "num_input_tokens_seen": 91805640, "step": 4251, "time_per_iteration": 2.732752561569214 }, { "auxiliary_loss_clip": 0.01030367, "auxiliary_loss_mlp": 0.01001193, "balance_loss_clip": 1.02468216, "balance_loss_mlp": 0.99888068, "epoch": 0.2556440703442056, "flos": 69231213636480.0, "grad_norm": 0.7932625116211053, "language_loss": 0.6608988, "learning_rate": 3.4892481692771146e-06, "loss": 0.68121445, "num_input_tokens_seen": 91869695, "step": 4252, "time_per_iteration": 3.304985523223877 }, { "auxiliary_loss_clip": 0.01130428, "auxiliary_loss_mlp": 0.01036056, "balance_loss_clip": 1.0499115, "balance_loss_mlp": 1.02097619, "epoch": 0.2557041935968736, "flos": 24863902231680.0, "grad_norm": 2.60951363435401, "language_loss": 0.73882902, "learning_rate": 3.4889881815775267e-06, "loss": 0.76049387, "num_input_tokens_seen": 91889920, "step": 4253, "time_per_iteration": 2.706052303314209 }, { "auxiliary_loss_clip": 0.01097964, "auxiliary_loss_mlp": 0.01044298, "balance_loss_clip": 1.04340124, "balance_loss_mlp": 1.02782309, "epoch": 0.25576431684954154, "flos": 22492110257280.0, "grad_norm": 2.978807414856607, "language_loss": 0.72565317, "learning_rate": 3.488728137415357e-06, "loss": 0.7470758, "num_input_tokens_seen": 91908665, "step": 4254, "time_per_iteration": 2.7579715251922607 }, { "auxiliary_loss_clip": 0.01098791, "auxiliary_loss_mlp": 0.00774228, "balance_loss_clip": 1.04665136, "balance_loss_mlp": 1.001104, "epoch": 0.2558244401022095, "flos": 19826748426240.0, "grad_norm": 1.7240740787107458, "language_loss": 0.80729312, "learning_rate": 3.4884680368004675e-06, "loss": 0.82602334, "num_input_tokens_seen": 91927855, "step": 4255, "time_per_iteration": 2.788978099822998 }, { "auxiliary_loss_clip": 0.01124525, "auxiliary_loss_mlp": 0.01040748, "balance_loss_clip": 1.05111384, "balance_loss_mlp": 1.02414227, "epoch": 0.2558845633548775, "flos": 23220486247680.0, "grad_norm": 1.5275751549355678, "language_loss": 0.85734111, "learning_rate": 3.488207879742721e-06, "loss": 0.87899381, "num_input_tokens_seen": 91948500, "step": 4256, "time_per_iteration": 2.7916831970214844 }, { "auxiliary_loss_clip": 0.01102599, "auxiliary_loss_mlp": 0.01049743, "balance_loss_clip": 1.04525566, "balance_loss_mlp": 1.03164732, "epoch": 0.2559446866075455, "flos": 16837867774080.0, "grad_norm": 1.8301502951270987, "language_loss": 0.74872649, "learning_rate": 3.4879476662519826e-06, "loss": 0.77024996, "num_input_tokens_seen": 91968375, "step": 4257, "time_per_iteration": 2.7754952907562256 }, { "auxiliary_loss_clip": 0.0102418, "auxiliary_loss_mlp": 0.01011535, "balance_loss_clip": 1.03534186, "balance_loss_mlp": 1.00959146, "epoch": 0.25600480986021346, "flos": 57593786895360.0, "grad_norm": 0.8003890262370261, "language_loss": 0.65255105, "learning_rate": 3.4876873963381196e-06, "loss": 0.67290819, "num_input_tokens_seen": 92028490, "step": 4258, "time_per_iteration": 3.269063949584961 }, { "auxiliary_loss_clip": 0.01091736, "auxiliary_loss_mlp": 0.00773347, "balance_loss_clip": 1.04549718, "balance_loss_mlp": 1.00111449, "epoch": 0.2560649331128814, "flos": 27819529868160.0, "grad_norm": 1.5266978755669562, "language_loss": 0.76443565, "learning_rate": 3.4874270700110013e-06, "loss": 0.78308654, "num_input_tokens_seen": 92048060, "step": 4259, "time_per_iteration": 2.805574893951416 }, { "auxiliary_loss_clip": 0.01026212, "auxiliary_loss_mlp": 0.01016368, "balance_loss_clip": 1.02208054, "balance_loss_mlp": 1.01372147, "epoch": 0.2561250563655494, "flos": 70950509101440.0, "grad_norm": 0.7927643603688844, "language_loss": 0.58455491, "learning_rate": 3.4871666872804994e-06, "loss": 0.60498071, "num_input_tokens_seen": 92118180, "step": 4260, "time_per_iteration": 3.3904550075531006 }, { "auxiliary_loss_clip": 0.01133193, "auxiliary_loss_mlp": 0.01048996, "balance_loss_clip": 1.04874313, "balance_loss_mlp": 1.03204465, "epoch": 0.25618517961821735, "flos": 27012329481600.0, "grad_norm": 3.3188145253338543, "language_loss": 0.77064955, "learning_rate": 3.4869062481564875e-06, "loss": 0.79247141, "num_input_tokens_seen": 92137570, "step": 4261, "time_per_iteration": 2.769864082336426 }, { "auxiliary_loss_clip": 0.01144035, "auxiliary_loss_mlp": 0.01040091, "balance_loss_clip": 1.05178332, "balance_loss_mlp": 1.02465355, "epoch": 0.2562453028708853, "flos": 23068296322560.0, "grad_norm": 1.5699122250769224, "language_loss": 0.83367205, "learning_rate": 3.486645752648842e-06, "loss": 0.85551333, "num_input_tokens_seen": 92157625, "step": 4262, "time_per_iteration": 2.682828426361084 }, { "auxiliary_loss_clip": 0.01134556, "auxiliary_loss_mlp": 0.01041299, "balance_loss_clip": 1.05219626, "balance_loss_mlp": 1.02344143, "epoch": 0.2563054261235533, "flos": 15120942606720.0, "grad_norm": 2.340862226505914, "language_loss": 0.73892939, "learning_rate": 3.4863852007674405e-06, "loss": 0.76068795, "num_input_tokens_seen": 92175350, "step": 4263, "time_per_iteration": 2.70947003364563 }, { "auxiliary_loss_clip": 0.0111297, "auxiliary_loss_mlp": 0.00773371, "balance_loss_clip": 1.05221081, "balance_loss_mlp": 1.00093555, "epoch": 0.25636554937622125, "flos": 27854865872640.0, "grad_norm": 1.8143922917988324, "language_loss": 0.82766259, "learning_rate": 3.486124592522163e-06, "loss": 0.84652603, "num_input_tokens_seen": 92196070, "step": 4264, "time_per_iteration": 2.7249553203582764 }, { "auxiliary_loss_clip": 0.01133012, "auxiliary_loss_mlp": 0.01041877, "balance_loss_clip": 1.05265546, "balance_loss_mlp": 1.02468669, "epoch": 0.2564256726288892, "flos": 28906509288960.0, "grad_norm": 2.8986425954305206, "language_loss": 0.74346334, "learning_rate": 3.4858639279228924e-06, "loss": 0.76521224, "num_input_tokens_seen": 92216310, "step": 4265, "time_per_iteration": 2.7149150371551514 }, { "auxiliary_loss_clip": 0.01110152, "auxiliary_loss_mlp": 0.01036531, "balance_loss_clip": 1.04754925, "balance_loss_mlp": 1.02034247, "epoch": 0.2564857958815572, "flos": 18514931823360.0, "grad_norm": 15.50909821859273, "language_loss": 0.81623137, "learning_rate": 3.485603206979513e-06, "loss": 0.83769822, "num_input_tokens_seen": 92234510, "step": 4266, "time_per_iteration": 2.6890153884887695 }, { "auxiliary_loss_clip": 0.01083702, "auxiliary_loss_mlp": 0.01050109, "balance_loss_clip": 1.0468955, "balance_loss_mlp": 1.0318346, "epoch": 0.25654591913422514, "flos": 25808280658560.0, "grad_norm": 2.4522850064786037, "language_loss": 0.79120672, "learning_rate": 3.4853424297019103e-06, "loss": 0.81254482, "num_input_tokens_seen": 92254070, "step": 4267, "time_per_iteration": 2.8390700817108154 }, { "auxiliary_loss_clip": 0.01094597, "auxiliary_loss_mlp": 0.01044608, "balance_loss_clip": 1.04643822, "balance_loss_mlp": 1.0276804, "epoch": 0.2566060423868931, "flos": 19099665325440.0, "grad_norm": 1.6765306902124857, "language_loss": 0.79241312, "learning_rate": 3.4850815960999736e-06, "loss": 0.81380516, "num_input_tokens_seen": 92275060, "step": 4268, "time_per_iteration": 2.7324178218841553 }, { "auxiliary_loss_clip": 0.01106667, "auxiliary_loss_mlp": 0.00778662, "balance_loss_clip": 1.04940808, "balance_loss_mlp": 1.00098729, "epoch": 0.25666616563956113, "flos": 23842674656640.0, "grad_norm": 1.8248642507450341, "language_loss": 0.67737979, "learning_rate": 3.484820706183595e-06, "loss": 0.69623303, "num_input_tokens_seen": 92293610, "step": 4269, "time_per_iteration": 2.7897677421569824 }, { "auxiliary_loss_clip": 0.01123993, "auxiliary_loss_mlp": 0.01043408, "balance_loss_clip": 1.05155373, "balance_loss_mlp": 1.02596736, "epoch": 0.2567262888922291, "flos": 14604259420800.0, "grad_norm": 3.069203267679029, "language_loss": 0.79117787, "learning_rate": 3.484559759962666e-06, "loss": 0.81285185, "num_input_tokens_seen": 92308305, "step": 4270, "time_per_iteration": 2.8076114654541016 }, { "auxiliary_loss_clip": 0.01094814, "auxiliary_loss_mlp": 0.010436, "balance_loss_clip": 1.04357839, "balance_loss_mlp": 1.02393079, "epoch": 0.25678641214489706, "flos": 32923117877760.0, "grad_norm": 2.413207422396751, "language_loss": 0.68088073, "learning_rate": 3.4842987574470816e-06, "loss": 0.7022649, "num_input_tokens_seen": 92329875, "step": 4271, "time_per_iteration": 2.8195667266845703 }, { "auxiliary_loss_clip": 0.01136281, "auxiliary_loss_mlp": 0.00774788, "balance_loss_clip": 1.05146289, "balance_loss_mlp": 1.00110972, "epoch": 0.256846535397565, "flos": 24098933260800.0, "grad_norm": 3.3671515903121216, "language_loss": 0.87362605, "learning_rate": 3.4840376986467403e-06, "loss": 0.89273679, "num_input_tokens_seen": 92348780, "step": 4272, "time_per_iteration": 2.6910364627838135 }, { "auxiliary_loss_clip": 0.01122968, "auxiliary_loss_mlp": 0.01046328, "balance_loss_clip": 1.05348301, "balance_loss_mlp": 1.02854192, "epoch": 0.256906658650233, "flos": 19718441942400.0, "grad_norm": 1.6813472119330561, "language_loss": 0.81420678, "learning_rate": 3.483776583571541e-06, "loss": 0.83589977, "num_input_tokens_seen": 92368175, "step": 4273, "time_per_iteration": 2.6883673667907715 }, { "auxiliary_loss_clip": 0.01097944, "auxiliary_loss_mlp": 0.01041741, "balance_loss_clip": 1.043715, "balance_loss_mlp": 1.02459884, "epoch": 0.25696678190290095, "flos": 22926018551040.0, "grad_norm": 3.3251008044769947, "language_loss": 0.76944637, "learning_rate": 3.4835154122313846e-06, "loss": 0.79084325, "num_input_tokens_seen": 92387755, "step": 4274, "time_per_iteration": 2.7613401412963867 }, { "auxiliary_loss_clip": 0.01112797, "auxiliary_loss_mlp": 0.01039272, "balance_loss_clip": 1.04380774, "balance_loss_mlp": 1.02220166, "epoch": 0.2570269051555689, "flos": 27307838672640.0, "grad_norm": 2.1172072427968933, "language_loss": 0.83780324, "learning_rate": 3.4832541846361743e-06, "loss": 0.85932392, "num_input_tokens_seen": 92409850, "step": 4275, "time_per_iteration": 2.7835779190063477 }, { "auxiliary_loss_clip": 0.01120289, "auxiliary_loss_mlp": 0.01039714, "balance_loss_clip": 1.05141211, "balance_loss_mlp": 1.02223814, "epoch": 0.2570870284082369, "flos": 27563414918400.0, "grad_norm": 2.725989678545036, "language_loss": 0.7874397, "learning_rate": 3.4829929007958175e-06, "loss": 0.80903983, "num_input_tokens_seen": 92431250, "step": 4276, "time_per_iteration": 5.679298400878906 }, { "auxiliary_loss_clip": 0.01136261, "auxiliary_loss_mlp": 0.01046327, "balance_loss_clip": 1.05269814, "balance_loss_mlp": 1.02982879, "epoch": 0.25714715166090485, "flos": 28730834847360.0, "grad_norm": 4.7083902318823885, "language_loss": 0.79273927, "learning_rate": 3.4827315607202214e-06, "loss": 0.81456512, "num_input_tokens_seen": 92452065, "step": 4277, "time_per_iteration": 2.691035270690918 }, { "auxiliary_loss_clip": 0.01146238, "auxiliary_loss_mlp": 0.01040113, "balance_loss_clip": 1.05214763, "balance_loss_mlp": 1.02367437, "epoch": 0.2572072749135728, "flos": 20116152305280.0, "grad_norm": 2.017980063834791, "language_loss": 0.78986102, "learning_rate": 3.482470164419295e-06, "loss": 0.81172454, "num_input_tokens_seen": 92470025, "step": 4278, "time_per_iteration": 4.2404680252075195 }, { "auxiliary_loss_clip": 0.01126121, "auxiliary_loss_mlp": 0.01037901, "balance_loss_clip": 1.05176449, "balance_loss_mlp": 1.02102113, "epoch": 0.2572673981662408, "flos": 26030855283840.0, "grad_norm": 2.8070462448385904, "language_loss": 0.74898899, "learning_rate": 3.482208711902952e-06, "loss": 0.77062923, "num_input_tokens_seen": 92489825, "step": 4279, "time_per_iteration": 2.65977144241333 }, { "auxiliary_loss_clip": 0.01134686, "auxiliary_loss_mlp": 0.01051687, "balance_loss_clip": 1.04973292, "balance_loss_mlp": 1.03423464, "epoch": 0.25732752141890874, "flos": 16106618695680.0, "grad_norm": 2.4256697448035687, "language_loss": 0.85603923, "learning_rate": 3.4819472031811065e-06, "loss": 0.87790298, "num_input_tokens_seen": 92507270, "step": 4280, "time_per_iteration": 2.6072864532470703 }, { "auxiliary_loss_clip": 0.01136623, "auxiliary_loss_mlp": 0.01039056, "balance_loss_clip": 1.05183434, "balance_loss_mlp": 1.02147269, "epoch": 0.2573876446715767, "flos": 22524429519360.0, "grad_norm": 3.9579835716917695, "language_loss": 0.79381943, "learning_rate": 3.4816856382636744e-06, "loss": 0.8155762, "num_input_tokens_seen": 92526300, "step": 4281, "time_per_iteration": 2.613163471221924 }, { "auxiliary_loss_clip": 0.01110196, "auxiliary_loss_mlp": 0.01038018, "balance_loss_clip": 1.04847932, "balance_loss_mlp": 1.02099478, "epoch": 0.2574477679242447, "flos": 23950837486080.0, "grad_norm": 2.240063499401578, "language_loss": 0.87314785, "learning_rate": 3.4814240171605737e-06, "loss": 0.89462996, "num_input_tokens_seen": 92546465, "step": 4282, "time_per_iteration": 4.489396333694458 }, { "auxiliary_loss_clip": 0.01148783, "auxiliary_loss_mlp": 0.01046594, "balance_loss_clip": 1.0526619, "balance_loss_mlp": 1.02959502, "epoch": 0.2575078911769127, "flos": 21981711951360.0, "grad_norm": 1.5167715532309152, "language_loss": 0.70110047, "learning_rate": 3.4811623398817267e-06, "loss": 0.72305429, "num_input_tokens_seen": 92567260, "step": 4283, "time_per_iteration": 2.619131565093994 }, { "auxiliary_loss_clip": 0.01144466, "auxiliary_loss_mlp": 0.00774605, "balance_loss_clip": 1.05443883, "balance_loss_mlp": 1.0010494, "epoch": 0.25756801442958066, "flos": 21945406279680.0, "grad_norm": 1.950947388276708, "language_loss": 0.80411774, "learning_rate": 3.4809006064370553e-06, "loss": 0.82330847, "num_input_tokens_seen": 92585425, "step": 4284, "time_per_iteration": 2.656998634338379 }, { "auxiliary_loss_clip": 0.01105473, "auxiliary_loss_mlp": 0.01039993, "balance_loss_clip": 1.05797076, "balance_loss_mlp": 1.02488899, "epoch": 0.2576281376822486, "flos": 35261980058880.0, "grad_norm": 2.2559612506718434, "language_loss": 0.70473522, "learning_rate": 3.4806388168364835e-06, "loss": 0.72618985, "num_input_tokens_seen": 92604770, "step": 4285, "time_per_iteration": 2.880835771560669 }, { "auxiliary_loss_clip": 0.01127807, "auxiliary_loss_mlp": 0.0104515, "balance_loss_clip": 1.05229783, "balance_loss_mlp": 1.02971268, "epoch": 0.2576882609349166, "flos": 14132285688960.0, "grad_norm": 1.8739093647405893, "language_loss": 0.58494061, "learning_rate": 3.4803769710899402e-06, "loss": 0.6066702, "num_input_tokens_seen": 92622635, "step": 4286, "time_per_iteration": 2.63923978805542 }, { "auxiliary_loss_clip": 0.01138174, "auxiliary_loss_mlp": 0.01046794, "balance_loss_clip": 1.05271184, "balance_loss_mlp": 1.03020048, "epoch": 0.25774838418758456, "flos": 23258336204160.0, "grad_norm": 1.4732857929087761, "language_loss": 0.63687879, "learning_rate": 3.480115069207354e-06, "loss": 0.65872842, "num_input_tokens_seen": 92642960, "step": 4287, "time_per_iteration": 2.67764949798584 }, { "auxiliary_loss_clip": 0.01127889, "auxiliary_loss_mlp": 0.01045385, "balance_loss_clip": 1.05252934, "balance_loss_mlp": 1.02769411, "epoch": 0.2578085074402525, "flos": 22601745544320.0, "grad_norm": 2.134546441867425, "language_loss": 0.71780413, "learning_rate": 3.4798531111986557e-06, "loss": 0.73953688, "num_input_tokens_seen": 92662455, "step": 4288, "time_per_iteration": 2.7174036502838135 }, { "auxiliary_loss_clip": 0.0110996, "auxiliary_loss_mlp": 0.01042748, "balance_loss_clip": 1.04934072, "balance_loss_mlp": 1.02691674, "epoch": 0.2578686306929205, "flos": 24571840746240.0, "grad_norm": 1.4449800602700236, "language_loss": 0.77059102, "learning_rate": 3.4795910970737786e-06, "loss": 0.79211813, "num_input_tokens_seen": 92683520, "step": 4289, "time_per_iteration": 2.748249053955078 }, { "auxiliary_loss_clip": 0.01146276, "auxiliary_loss_mlp": 0.00775089, "balance_loss_clip": 1.05252326, "balance_loss_mlp": 1.001122, "epoch": 0.25792875394558845, "flos": 18113953322880.0, "grad_norm": 2.0235699584636295, "language_loss": 0.85416883, "learning_rate": 3.4793290268426592e-06, "loss": 0.87338245, "num_input_tokens_seen": 92701450, "step": 4290, "time_per_iteration": 2.593461751937866 }, { "auxiliary_loss_clip": 0.01114221, "auxiliary_loss_mlp": 0.01056837, "balance_loss_clip": 1.05081999, "balance_loss_mlp": 1.03660691, "epoch": 0.2579888771982564, "flos": 17712902995200.0, "grad_norm": 2.4272093439618847, "language_loss": 0.72360331, "learning_rate": 3.4790669005152354e-06, "loss": 0.74531388, "num_input_tokens_seen": 92720355, "step": 4291, "time_per_iteration": 2.6838138103485107 }, { "auxiliary_loss_clip": 0.01150945, "auxiliary_loss_mlp": 0.0104494, "balance_loss_clip": 1.05378067, "balance_loss_mlp": 1.02758288, "epoch": 0.2580490004509244, "flos": 16434878112000.0, "grad_norm": 2.78045823134535, "language_loss": 0.80846477, "learning_rate": 3.4788047181014458e-06, "loss": 0.83042365, "num_input_tokens_seen": 92736755, "step": 4292, "time_per_iteration": 2.595710277557373 }, { "auxiliary_loss_clip": 0.0115367, "auxiliary_loss_mlp": 0.01044878, "balance_loss_clip": 1.05773902, "balance_loss_mlp": 1.02702022, "epoch": 0.25810912370359235, "flos": 33835141128960.0, "grad_norm": 2.057533015633898, "language_loss": 0.67592025, "learning_rate": 3.4785424796112337e-06, "loss": 0.69790578, "num_input_tokens_seen": 92757655, "step": 4293, "time_per_iteration": 2.699570894241333 }, { "auxiliary_loss_clip": 0.0110485, "auxiliary_loss_mlp": 0.01048043, "balance_loss_clip": 1.04971898, "balance_loss_mlp": 1.03190207, "epoch": 0.2581692469562603, "flos": 25192197561600.0, "grad_norm": 2.0097854631835217, "language_loss": 0.75671911, "learning_rate": 3.478280185054542e-06, "loss": 0.77824801, "num_input_tokens_seen": 92776100, "step": 4294, "time_per_iteration": 2.7217960357666016 }, { "auxiliary_loss_clip": 0.01098332, "auxiliary_loss_mlp": 0.01053556, "balance_loss_clip": 1.0444684, "balance_loss_mlp": 1.03404188, "epoch": 0.2582293702089283, "flos": 34932212271360.0, "grad_norm": 1.7798433628760433, "language_loss": 0.8047998, "learning_rate": 3.478017834441318e-06, "loss": 0.82631868, "num_input_tokens_seen": 92798880, "step": 4295, "time_per_iteration": 2.871460437774658 }, { "auxiliary_loss_clip": 0.01055358, "auxiliary_loss_mlp": 0.01044188, "balance_loss_clip": 1.04843688, "balance_loss_mlp": 1.0256989, "epoch": 0.2582894934615963, "flos": 26833746038400.0, "grad_norm": 2.1012913939780753, "language_loss": 0.72843397, "learning_rate": 3.4777554277815096e-06, "loss": 0.74942946, "num_input_tokens_seen": 92817750, "step": 4296, "time_per_iteration": 3.173367738723755 }, { "auxiliary_loss_clip": 0.01091622, "auxiliary_loss_mlp": 0.01038465, "balance_loss_clip": 1.05392241, "balance_loss_mlp": 1.02106011, "epoch": 0.25834961671426426, "flos": 23515241253120.0, "grad_norm": 1.5772062283828172, "language_loss": 0.86928564, "learning_rate": 3.477492965085067e-06, "loss": 0.8905865, "num_input_tokens_seen": 92837995, "step": 4297, "time_per_iteration": 3.1598868370056152 }, { "auxiliary_loss_clip": 0.01149748, "auxiliary_loss_mlp": 0.01047412, "balance_loss_clip": 1.05517435, "balance_loss_mlp": 1.03090191, "epoch": 0.25840973996693223, "flos": 22451028076800.0, "grad_norm": 1.8030727150796175, "language_loss": 0.84720427, "learning_rate": 3.477230446361943e-06, "loss": 0.86917591, "num_input_tokens_seen": 92857245, "step": 4298, "time_per_iteration": 2.632448196411133 }, { "auxiliary_loss_clip": 0.01135108, "auxiliary_loss_mlp": 0.00775458, "balance_loss_clip": 1.05262494, "balance_loss_mlp": 1.00111055, "epoch": 0.2584698632196002, "flos": 11290854366720.0, "grad_norm": 2.0124667048247686, "language_loss": 0.83514953, "learning_rate": 3.4769678716220927e-06, "loss": 0.8542552, "num_input_tokens_seen": 92873265, "step": 4299, "time_per_iteration": 2.631248950958252 }, { "auxiliary_loss_clip": 0.01117485, "auxiliary_loss_mlp": 0.0103505, "balance_loss_clip": 1.05216849, "balance_loss_mlp": 1.01868308, "epoch": 0.25852998647226816, "flos": 17929982839680.0, "grad_norm": 2.419754138344463, "language_loss": 0.82422709, "learning_rate": 3.4767052408754726e-06, "loss": 0.84575242, "num_input_tokens_seen": 92890880, "step": 4300, "time_per_iteration": 2.650834083557129 }, { "auxiliary_loss_clip": 0.0113846, "auxiliary_loss_mlp": 0.01041208, "balance_loss_clip": 1.0535903, "balance_loss_mlp": 1.02343392, "epoch": 0.2585901097249361, "flos": 33256117889280.0, "grad_norm": 2.971673559214411, "language_loss": 0.66949177, "learning_rate": 3.4764425541320417e-06, "loss": 0.69128841, "num_input_tokens_seen": 92910770, "step": 4301, "time_per_iteration": 2.729519844055176 }, { "auxiliary_loss_clip": 0.01139778, "auxiliary_loss_mlp": 0.01040158, "balance_loss_clip": 1.05335701, "balance_loss_mlp": 1.02245533, "epoch": 0.2586502329776041, "flos": 18441278985600.0, "grad_norm": 2.29820997177689, "language_loss": 0.81177735, "learning_rate": 3.4761798114017617e-06, "loss": 0.83357668, "num_input_tokens_seen": 92929520, "step": 4302, "time_per_iteration": 2.5496692657470703 }, { "auxiliary_loss_clip": 0.01105433, "auxiliary_loss_mlp": 0.01042423, "balance_loss_clip": 1.05242491, "balance_loss_mlp": 1.02542388, "epoch": 0.25871035623027205, "flos": 17968120104960.0, "grad_norm": 1.8036447001063776, "language_loss": 0.92147923, "learning_rate": 3.475917012694595e-06, "loss": 0.94295776, "num_input_tokens_seen": 92947890, "step": 4303, "time_per_iteration": 2.686222791671753 }, { "auxiliary_loss_clip": 0.01141887, "auxiliary_loss_mlp": 0.01040139, "balance_loss_clip": 1.05643094, "balance_loss_mlp": 1.02322304, "epoch": 0.25877047948294, "flos": 27777729415680.0, "grad_norm": 2.7085759571044368, "language_loss": 0.67138135, "learning_rate": 3.475654158020507e-06, "loss": 0.69320166, "num_input_tokens_seen": 92967690, "step": 4304, "time_per_iteration": 2.665797472000122 }, { "auxiliary_loss_clip": 0.01113882, "auxiliary_loss_mlp": 0.01041979, "balance_loss_clip": 1.0509342, "balance_loss_mlp": 1.02498007, "epoch": 0.258830602735608, "flos": 27125843437440.0, "grad_norm": 2.126938769919949, "language_loss": 0.72085559, "learning_rate": 3.4753912473894657e-06, "loss": 0.74241412, "num_input_tokens_seen": 92986830, "step": 4305, "time_per_iteration": 2.7514076232910156 }, { "auxiliary_loss_clip": 0.01103045, "auxiliary_loss_mlp": 0.00775987, "balance_loss_clip": 1.04804707, "balance_loss_mlp": 1.00122118, "epoch": 0.25889072598827595, "flos": 17891486438400.0, "grad_norm": 6.414506312387852, "language_loss": 0.76175749, "learning_rate": 3.4751282808114403e-06, "loss": 0.78054774, "num_input_tokens_seen": 93002740, "step": 4306, "time_per_iteration": 2.7326161861419678 }, { "auxiliary_loss_clip": 0.01049461, "auxiliary_loss_mlp": 0.0102188, "balance_loss_clip": 1.03476799, "balance_loss_mlp": 1.01943636, "epoch": 0.2589508492409439, "flos": 53934955724160.0, "grad_norm": 0.8427062291747792, "language_loss": 0.57128024, "learning_rate": 3.474865258296403e-06, "loss": 0.59199357, "num_input_tokens_seen": 93058645, "step": 4307, "time_per_iteration": 3.1499595642089844 }, { "auxiliary_loss_clip": 0.01123356, "auxiliary_loss_mlp": 0.01045032, "balance_loss_clip": 1.0514828, "balance_loss_mlp": 1.02858078, "epoch": 0.2590109724936119, "flos": 22125785402880.0, "grad_norm": 1.5299746109283647, "language_loss": 0.71727359, "learning_rate": 3.474602179854327e-06, "loss": 0.73895752, "num_input_tokens_seen": 93077140, "step": 4308, "time_per_iteration": 2.6824283599853516 }, { "auxiliary_loss_clip": 0.01152705, "auxiliary_loss_mlp": 0.01046843, "balance_loss_clip": 1.05659723, "balance_loss_mlp": 1.02976048, "epoch": 0.2590710957462799, "flos": 13474294398720.0, "grad_norm": 1.8339599204524273, "language_loss": 0.83940542, "learning_rate": 3.4743390454951886e-06, "loss": 0.86140084, "num_input_tokens_seen": 93093580, "step": 4309, "time_per_iteration": 2.560194253921509 }, { "auxiliary_loss_clip": 0.01137306, "auxiliary_loss_mlp": 0.01044025, "balance_loss_clip": 1.05587196, "balance_loss_mlp": 1.02815771, "epoch": 0.25913121899894787, "flos": 22307098279680.0, "grad_norm": 1.5397823214091813, "language_loss": 0.84657532, "learning_rate": 3.474075855228966e-06, "loss": 0.86838865, "num_input_tokens_seen": 93112345, "step": 4310, "time_per_iteration": 2.627716064453125 }, { "auxiliary_loss_clip": 0.01143598, "auxiliary_loss_mlp": 0.01047667, "balance_loss_clip": 1.05802059, "balance_loss_mlp": 1.03141904, "epoch": 0.25919134225161583, "flos": 25811728364160.0, "grad_norm": 2.0190220849922094, "language_loss": 0.77145267, "learning_rate": 3.473812609065639e-06, "loss": 0.79336536, "num_input_tokens_seen": 93131545, "step": 4311, "time_per_iteration": 2.694856643676758 }, { "auxiliary_loss_clip": 0.01110239, "auxiliary_loss_mlp": 0.01052381, "balance_loss_clip": 1.04629123, "balance_loss_mlp": 1.03498793, "epoch": 0.2592514655042838, "flos": 31212262108800.0, "grad_norm": 1.9233367952735905, "language_loss": 0.72848439, "learning_rate": 3.4735493070151904e-06, "loss": 0.75011057, "num_input_tokens_seen": 93150730, "step": 4312, "time_per_iteration": 2.7577714920043945 }, { "auxiliary_loss_clip": 0.01150768, "auxiliary_loss_mlp": 0.01044439, "balance_loss_clip": 1.05618715, "balance_loss_mlp": 1.02845287, "epoch": 0.25931158875695176, "flos": 18474998878080.0, "grad_norm": 1.8485738044524733, "language_loss": 0.70193493, "learning_rate": 3.4732859490876044e-06, "loss": 0.72388697, "num_input_tokens_seen": 93167895, "step": 4313, "time_per_iteration": 2.6447813510894775 }, { "auxiliary_loss_clip": 0.01150117, "auxiliary_loss_mlp": 0.01054192, "balance_loss_clip": 1.05624926, "balance_loss_mlp": 1.03845656, "epoch": 0.2593717120096197, "flos": 19207935895680.0, "grad_norm": 1.8538125013537565, "language_loss": 0.80462205, "learning_rate": 3.473022535292867e-06, "loss": 0.82666522, "num_input_tokens_seen": 93187650, "step": 4314, "time_per_iteration": 2.6073296070098877 }, { "auxiliary_loss_clip": 0.01110006, "auxiliary_loss_mlp": 0.01049511, "balance_loss_clip": 1.04867387, "balance_loss_mlp": 1.03253555, "epoch": 0.2594318352622877, "flos": 31248100903680.0, "grad_norm": 2.061113629574459, "language_loss": 0.670748, "learning_rate": 3.472759065640968e-06, "loss": 0.69234318, "num_input_tokens_seen": 93207370, "step": 4315, "time_per_iteration": 6.427948236465454 }, { "auxiliary_loss_clip": 0.01096074, "auxiliary_loss_mlp": 0.01056601, "balance_loss_clip": 1.04853845, "balance_loss_mlp": 1.0407939, "epoch": 0.25949195851495566, "flos": 22237144542720.0, "grad_norm": 2.0096953575355125, "language_loss": 0.79649067, "learning_rate": 3.4724955401418976e-06, "loss": 0.81801736, "num_input_tokens_seen": 93227925, "step": 4316, "time_per_iteration": 2.7463796138763428 }, { "auxiliary_loss_clip": 0.01096584, "auxiliary_loss_mlp": 0.01048328, "balance_loss_clip": 1.0487628, "balance_loss_mlp": 1.03112638, "epoch": 0.2595520817676236, "flos": 28075716645120.0, "grad_norm": 3.2727308584132584, "language_loss": 0.77498394, "learning_rate": 3.4722319588056487e-06, "loss": 0.79643309, "num_input_tokens_seen": 93250020, "step": 4317, "time_per_iteration": 4.658867359161377 }, { "auxiliary_loss_clip": 0.01155612, "auxiliary_loss_mlp": 0.01054128, "balance_loss_clip": 1.05959845, "balance_loss_mlp": 1.03734958, "epoch": 0.2596122050202916, "flos": 20190954378240.0, "grad_norm": 2.117435309152476, "language_loss": 0.77656054, "learning_rate": 3.4719683216422163e-06, "loss": 0.79865795, "num_input_tokens_seen": 93269070, "step": 4318, "time_per_iteration": 2.5934906005859375 }, { "auxiliary_loss_clip": 0.01146449, "auxiliary_loss_mlp": 0.01045441, "balance_loss_clip": 1.0530901, "balance_loss_mlp": 1.02733302, "epoch": 0.25967232827295955, "flos": 22527949052160.0, "grad_norm": 1.6144223240331488, "language_loss": 0.76362926, "learning_rate": 3.471704628661598e-06, "loss": 0.78554815, "num_input_tokens_seen": 93290250, "step": 4319, "time_per_iteration": 2.607649564743042 }, { "auxiliary_loss_clip": 0.01125042, "auxiliary_loss_mlp": 0.01041624, "balance_loss_clip": 1.05419481, "balance_loss_mlp": 1.02587628, "epoch": 0.2597324515256275, "flos": 21068252156160.0, "grad_norm": 1.6090277746740278, "language_loss": 0.76549125, "learning_rate": 3.4714408798737925e-06, "loss": 0.78715789, "num_input_tokens_seen": 93310090, "step": 4320, "time_per_iteration": 2.722574472427368 }, { "auxiliary_loss_clip": 0.01116281, "auxiliary_loss_mlp": 0.01042709, "balance_loss_clip": 1.05157554, "balance_loss_mlp": 1.02546, "epoch": 0.2597925747782955, "flos": 22050013662720.0, "grad_norm": 1.6564648175426406, "language_loss": 0.71067965, "learning_rate": 3.471177075288801e-06, "loss": 0.73226953, "num_input_tokens_seen": 93329570, "step": 4321, "time_per_iteration": 4.276093244552612 }, { "auxiliary_loss_clip": 0.01125031, "auxiliary_loss_mlp": 0.01055033, "balance_loss_clip": 1.05191207, "balance_loss_mlp": 1.03549457, "epoch": 0.2598526980309635, "flos": 19536949497600.0, "grad_norm": 1.9031382952841078, "language_loss": 0.74805915, "learning_rate": 3.4709132149166277e-06, "loss": 0.76985979, "num_input_tokens_seen": 93347920, "step": 4322, "time_per_iteration": 2.6573097705841064 }, { "auxiliary_loss_clip": 0.0111558, "auxiliary_loss_mlp": 0.0104757, "balance_loss_clip": 1.05213332, "balance_loss_mlp": 1.03004622, "epoch": 0.25991282128363147, "flos": 24495207079680.0, "grad_norm": 1.8978708709823064, "language_loss": 0.73837054, "learning_rate": 3.470649298767278e-06, "loss": 0.76000202, "num_input_tokens_seen": 93367145, "step": 4323, "time_per_iteration": 2.75765061378479 }, { "auxiliary_loss_clip": 0.01139686, "auxiliary_loss_mlp": 0.00775622, "balance_loss_clip": 1.0509938, "balance_loss_mlp": 1.00099182, "epoch": 0.25997294453629943, "flos": 24201457655040.0, "grad_norm": 2.107506603705316, "language_loss": 0.67186093, "learning_rate": 3.4703853268507597e-06, "loss": 0.69101399, "num_input_tokens_seen": 93386555, "step": 4324, "time_per_iteration": 2.752307891845703 }, { "auxiliary_loss_clip": 0.0109649, "auxiliary_loss_mlp": 0.01045367, "balance_loss_clip": 1.05030632, "balance_loss_mlp": 1.03026319, "epoch": 0.2600330677889674, "flos": 31431460855680.0, "grad_norm": 2.121769328280442, "language_loss": 0.71064055, "learning_rate": 3.470121299177082e-06, "loss": 0.732059, "num_input_tokens_seen": 93405590, "step": 4325, "time_per_iteration": 2.824281692504883 }, { "auxiliary_loss_clip": 0.01134613, "auxiliary_loss_mlp": 0.01035571, "balance_loss_clip": 1.04941416, "balance_loss_mlp": 1.01839304, "epoch": 0.26009319104163536, "flos": 32266527217920.0, "grad_norm": 1.8496839878379767, "language_loss": 0.73106551, "learning_rate": 3.469857215756257e-06, "loss": 0.75276732, "num_input_tokens_seen": 93424750, "step": 4326, "time_per_iteration": 2.7235658168792725 }, { "auxiliary_loss_clip": 0.01118123, "auxiliary_loss_mlp": 0.00776184, "balance_loss_clip": 1.05001175, "balance_loss_mlp": 1.00100303, "epoch": 0.26015331429430333, "flos": 26286754752000.0, "grad_norm": 1.7229255626307804, "language_loss": 0.86908734, "learning_rate": 3.4695930765982997e-06, "loss": 0.88803041, "num_input_tokens_seen": 93443465, "step": 4327, "time_per_iteration": 2.7072155475616455 }, { "auxiliary_loss_clip": 0.01153995, "auxiliary_loss_mlp": 0.00775932, "balance_loss_clip": 1.05640841, "balance_loss_mlp": 1.0008533, "epoch": 0.2602134375469713, "flos": 21142335957120.0, "grad_norm": 1.4664721830580452, "language_loss": 0.80265766, "learning_rate": 3.4693288817132255e-06, "loss": 0.82195687, "num_input_tokens_seen": 93462580, "step": 4328, "time_per_iteration": 2.6463024616241455 }, { "auxiliary_loss_clip": 0.0111992, "auxiliary_loss_mlp": 0.00774533, "balance_loss_clip": 1.04837036, "balance_loss_mlp": 1.00092077, "epoch": 0.26027356079963926, "flos": 25921327737600.0, "grad_norm": 1.6317826670237516, "language_loss": 0.88094193, "learning_rate": 3.4690646311110525e-06, "loss": 0.89988649, "num_input_tokens_seen": 93482790, "step": 4329, "time_per_iteration": 2.7130861282348633 }, { "auxiliary_loss_clip": 0.011478, "auxiliary_loss_mlp": 0.01040633, "balance_loss_clip": 1.05545115, "balance_loss_mlp": 1.02431321, "epoch": 0.2603336840523072, "flos": 26359222440960.0, "grad_norm": 1.8335620949826397, "language_loss": 0.77834195, "learning_rate": 3.468800324801802e-06, "loss": 0.80022621, "num_input_tokens_seen": 93498795, "step": 4330, "time_per_iteration": 2.6223180294036865 }, { "auxiliary_loss_clip": 0.01148961, "auxiliary_loss_mlp": 0.01047898, "balance_loss_clip": 1.0536809, "balance_loss_mlp": 1.03081572, "epoch": 0.2603938073049752, "flos": 23513661054720.0, "grad_norm": 1.5875829464999673, "language_loss": 0.75683081, "learning_rate": 3.4685359627954958e-06, "loss": 0.77879941, "num_input_tokens_seen": 93518335, "step": 4331, "time_per_iteration": 2.6383559703826904 }, { "auxiliary_loss_clip": 0.01130325, "auxiliary_loss_mlp": 0.01042577, "balance_loss_clip": 1.05964541, "balance_loss_mlp": 1.0261023, "epoch": 0.26045393055764315, "flos": 25374300537600.0, "grad_norm": 1.3798785286413686, "language_loss": 0.69174874, "learning_rate": 3.4682715451021584e-06, "loss": 0.71347773, "num_input_tokens_seen": 93539170, "step": 4332, "time_per_iteration": 2.675203800201416 }, { "auxiliary_loss_clip": 0.01117119, "auxiliary_loss_mlp": 0.01048864, "balance_loss_clip": 1.04849494, "balance_loss_mlp": 1.03203201, "epoch": 0.2605140538103111, "flos": 27635272076160.0, "grad_norm": 6.1371153370044915, "language_loss": 0.79897749, "learning_rate": 3.4680070717318174e-06, "loss": 0.82063735, "num_input_tokens_seen": 93558480, "step": 4333, "time_per_iteration": 2.7595479488372803 }, { "auxiliary_loss_clip": 0.01144159, "auxiliary_loss_mlp": 0.01039411, "balance_loss_clip": 1.05260658, "balance_loss_mlp": 1.02317452, "epoch": 0.2605741770629791, "flos": 13769839503360.0, "grad_norm": 1.9478362516602954, "language_loss": 0.80919975, "learning_rate": 3.467742542694501e-06, "loss": 0.83103544, "num_input_tokens_seen": 93575220, "step": 4334, "time_per_iteration": 2.585676670074463 }, { "auxiliary_loss_clip": 0.01121127, "auxiliary_loss_mlp": 0.0103772, "balance_loss_clip": 1.04868293, "balance_loss_mlp": 1.02051783, "epoch": 0.26063430031564705, "flos": 26031681296640.0, "grad_norm": 1.8490049893982383, "language_loss": 0.8027274, "learning_rate": 3.46747795800024e-06, "loss": 0.82431591, "num_input_tokens_seen": 93597015, "step": 4335, "time_per_iteration": 2.730853796005249 }, { "auxiliary_loss_clip": 0.01060862, "auxiliary_loss_mlp": 0.01054521, "balance_loss_clip": 1.03598261, "balance_loss_mlp": 1.05267298, "epoch": 0.26069442356831507, "flos": 62443809820800.0, "grad_norm": 1.1166557113782816, "language_loss": 0.60850358, "learning_rate": 3.467213317659068e-06, "loss": 0.62965739, "num_input_tokens_seen": 93657775, "step": 4336, "time_per_iteration": 3.1322128772735596 }, { "auxiliary_loss_clip": 0.01111016, "auxiliary_loss_mlp": 0.01046835, "balance_loss_clip": 1.05039525, "balance_loss_mlp": 1.02976441, "epoch": 0.26075454682098304, "flos": 13626376583040.0, "grad_norm": 2.784557437613843, "language_loss": 0.7679469, "learning_rate": 3.46694862168102e-06, "loss": 0.78952539, "num_input_tokens_seen": 93676145, "step": 4337, "time_per_iteration": 2.704305410385132 }, { "auxiliary_loss_clip": 0.0112146, "auxiliary_loss_mlp": 0.01045064, "balance_loss_clip": 1.04997659, "balance_loss_mlp": 1.02728987, "epoch": 0.260814670073651, "flos": 12126531260160.0, "grad_norm": 2.7677016823816976, "language_loss": 0.74653983, "learning_rate": 3.4666838700761334e-06, "loss": 0.76820505, "num_input_tokens_seen": 93692480, "step": 4338, "time_per_iteration": 2.652679204940796 }, { "auxiliary_loss_clip": 0.01140171, "auxiliary_loss_mlp": 0.01040507, "balance_loss_clip": 1.05246329, "balance_loss_mlp": 1.02314997, "epoch": 0.26087479332631897, "flos": 15122522805120.0, "grad_norm": 2.378816803290104, "language_loss": 0.81061137, "learning_rate": 3.466419062854447e-06, "loss": 0.8324182, "num_input_tokens_seen": 93710165, "step": 4339, "time_per_iteration": 2.7237682342529297 }, { "auxiliary_loss_clip": 0.01090328, "auxiliary_loss_mlp": 0.01040213, "balance_loss_clip": 1.04649866, "balance_loss_mlp": 1.02436984, "epoch": 0.26093491657898693, "flos": 24680937329280.0, "grad_norm": 1.6860698424881835, "language_loss": 0.76643449, "learning_rate": 3.4661542000260033e-06, "loss": 0.78773987, "num_input_tokens_seen": 93730185, "step": 4340, "time_per_iteration": 2.817647695541382 }, { "auxiliary_loss_clip": 0.01082903, "auxiliary_loss_mlp": 0.01040837, "balance_loss_clip": 1.04781985, "balance_loss_mlp": 1.02381396, "epoch": 0.2609950398316549, "flos": 25116138512640.0, "grad_norm": 1.954971477972507, "language_loss": 0.82689369, "learning_rate": 3.465889281600845e-06, "loss": 0.84813106, "num_input_tokens_seen": 93747690, "step": 4341, "time_per_iteration": 2.822387218475342 }, { "auxiliary_loss_clip": 0.01148407, "auxiliary_loss_mlp": 0.0104134, "balance_loss_clip": 1.0550344, "balance_loss_mlp": 1.02387536, "epoch": 0.26105516308432286, "flos": 28548588216960.0, "grad_norm": 2.3225619433460083, "language_loss": 0.76828772, "learning_rate": 3.4656243075890183e-06, "loss": 0.79018521, "num_input_tokens_seen": 93767405, "step": 4342, "time_per_iteration": 2.7091987133026123 }, { "auxiliary_loss_clip": 0.01137117, "auxiliary_loss_mlp": 0.01036127, "balance_loss_clip": 1.05262113, "balance_loss_mlp": 1.01837635, "epoch": 0.2611152863369908, "flos": 39530609447040.0, "grad_norm": 1.8380809165191976, "language_loss": 0.66072762, "learning_rate": 3.4653592780005707e-06, "loss": 0.68246007, "num_input_tokens_seen": 93789950, "step": 4343, "time_per_iteration": 2.7885191440582275 }, { "auxiliary_loss_clip": 0.01076135, "auxiliary_loss_mlp": 0.01045298, "balance_loss_clip": 1.04419374, "balance_loss_mlp": 1.02715397, "epoch": 0.2611754095896588, "flos": 13735329511680.0, "grad_norm": 1.9033089414913282, "language_loss": 0.73626471, "learning_rate": 3.465094192845553e-06, "loss": 0.75747907, "num_input_tokens_seen": 93807835, "step": 4344, "time_per_iteration": 2.7622575759887695 }, { "auxiliary_loss_clip": 0.01150726, "auxiliary_loss_mlp": 0.01042349, "balance_loss_clip": 1.05625904, "balance_loss_mlp": 1.02560019, "epoch": 0.26123553284232676, "flos": 21506649649920.0, "grad_norm": 2.7815673216786045, "language_loss": 0.86820161, "learning_rate": 3.4648290521340165e-06, "loss": 0.89013231, "num_input_tokens_seen": 93825670, "step": 4345, "time_per_iteration": 2.615021228790283 }, { "auxiliary_loss_clip": 0.01121997, "auxiliary_loss_mlp": 0.01036853, "balance_loss_clip": 1.05178094, "balance_loss_mlp": 1.02056956, "epoch": 0.2612956560949947, "flos": 21139786091520.0, "grad_norm": 1.9109970692142244, "language_loss": 0.76235008, "learning_rate": 3.464563855876015e-06, "loss": 0.78393853, "num_input_tokens_seen": 93844045, "step": 4346, "time_per_iteration": 2.660766363143921 }, { "auxiliary_loss_clip": 0.01140284, "auxiliary_loss_mlp": 0.01045855, "balance_loss_clip": 1.05571795, "balance_loss_mlp": 1.02870095, "epoch": 0.2613557793476627, "flos": 25119011600640.0, "grad_norm": 1.6628741865434964, "language_loss": 0.75995654, "learning_rate": 3.464298604081606e-06, "loss": 0.78181791, "num_input_tokens_seen": 93864380, "step": 4347, "time_per_iteration": 2.6985979080200195 }, { "auxiliary_loss_clip": 0.0110699, "auxiliary_loss_mlp": 0.01041742, "balance_loss_clip": 1.05063343, "balance_loss_mlp": 1.02501726, "epoch": 0.26141590260033065, "flos": 26067699659520.0, "grad_norm": 1.7474860409603998, "language_loss": 0.73196864, "learning_rate": 3.4640332967608476e-06, "loss": 0.75345594, "num_input_tokens_seen": 93885475, "step": 4348, "time_per_iteration": 2.7511887550354004 }, { "auxiliary_loss_clip": 0.01110529, "auxiliary_loss_mlp": 0.01045849, "balance_loss_clip": 1.05199265, "balance_loss_mlp": 1.0290519, "epoch": 0.2614760258529987, "flos": 25701518459520.0, "grad_norm": 2.6377025292028944, "language_loss": 0.91262084, "learning_rate": 3.463767933923799e-06, "loss": 0.93418467, "num_input_tokens_seen": 93905545, "step": 4349, "time_per_iteration": 2.720240354537964 }, { "auxiliary_loss_clip": 0.0113714, "auxiliary_loss_mlp": 0.01048228, "balance_loss_clip": 1.05569661, "balance_loss_mlp": 1.03184831, "epoch": 0.26153614910566664, "flos": 17457147181440.0, "grad_norm": 1.7232851278977876, "language_loss": 0.80046499, "learning_rate": 3.463502515580524e-06, "loss": 0.82231867, "num_input_tokens_seen": 93924185, "step": 4350, "time_per_iteration": 2.652054786682129 }, { "auxiliary_loss_clip": 0.0113538, "auxiliary_loss_mlp": 0.01049567, "balance_loss_clip": 1.05652642, "balance_loss_mlp": 1.03299654, "epoch": 0.2615962723583346, "flos": 17712831168000.0, "grad_norm": 10.816271600027287, "language_loss": 0.62736505, "learning_rate": 3.4632370417410866e-06, "loss": 0.64921451, "num_input_tokens_seen": 93942825, "step": 4351, "time_per_iteration": 2.6674954891204834 }, { "auxiliary_loss_clip": 0.01138265, "auxiliary_loss_mlp": 0.01048518, "balance_loss_clip": 1.05201697, "balance_loss_mlp": 1.03168559, "epoch": 0.26165639561100257, "flos": 23257725672960.0, "grad_norm": 1.9014393183165526, "language_loss": 0.84131002, "learning_rate": 3.462971512415555e-06, "loss": 0.86317784, "num_input_tokens_seen": 93962045, "step": 4352, "time_per_iteration": 2.8033063411712646 }, { "auxiliary_loss_clip": 0.01065372, "auxiliary_loss_mlp": 0.0102292, "balance_loss_clip": 1.04145527, "balance_loss_mlp": 1.02078664, "epoch": 0.26171651886367053, "flos": 66737970800640.0, "grad_norm": 0.8050815788583346, "language_loss": 0.70591724, "learning_rate": 3.462705927613996e-06, "loss": 0.7268002, "num_input_tokens_seen": 94021175, "step": 4353, "time_per_iteration": 3.101954936981201 }, { "auxiliary_loss_clip": 0.01115948, "auxiliary_loss_mlp": 0.01069336, "balance_loss_clip": 1.04858005, "balance_loss_mlp": 1.05013168, "epoch": 0.2617766421163385, "flos": 22349581090560.0, "grad_norm": 1.6494861832481549, "language_loss": 0.77562749, "learning_rate": 3.4624402873464816e-06, "loss": 0.79748034, "num_input_tokens_seen": 94043370, "step": 4354, "time_per_iteration": 2.772723436355591 }, { "auxiliary_loss_clip": 0.01089887, "auxiliary_loss_mlp": 0.01058882, "balance_loss_clip": 1.04805279, "balance_loss_mlp": 1.04082203, "epoch": 0.26183676536900646, "flos": 26067125041920.0, "grad_norm": 1.8339738923409379, "language_loss": 0.68351537, "learning_rate": 3.462174591623085e-06, "loss": 0.70500308, "num_input_tokens_seen": 94063510, "step": 4355, "time_per_iteration": 5.908639430999756 }, { "auxiliary_loss_clip": 0.01094509, "auxiliary_loss_mlp": 0.01039879, "balance_loss_clip": 1.0486095, "balance_loss_mlp": 1.02164054, "epoch": 0.26189688862167443, "flos": 20996466825600.0, "grad_norm": 1.9440617828376934, "language_loss": 0.67573452, "learning_rate": 3.4619088404538815e-06, "loss": 0.69707847, "num_input_tokens_seen": 94083865, "step": 4356, "time_per_iteration": 4.351539611816406 }, { "auxiliary_loss_clip": 0.01057297, "auxiliary_loss_mlp": 0.0100707, "balance_loss_clip": 1.03335488, "balance_loss_mlp": 1.00484037, "epoch": 0.2619570118743424, "flos": 65798261141760.0, "grad_norm": 0.6809064288126679, "language_loss": 0.53124392, "learning_rate": 3.4616430338489487e-06, "loss": 0.55188763, "num_input_tokens_seen": 94144095, "step": 4357, "time_per_iteration": 3.0896964073181152 }, { "auxiliary_loss_clip": 0.01139918, "auxiliary_loss_mlp": 0.0104768, "balance_loss_clip": 1.05365348, "balance_loss_mlp": 1.03106248, "epoch": 0.26201713512701036, "flos": 28766817296640.0, "grad_norm": 1.8814759411194193, "language_loss": 0.84233022, "learning_rate": 3.4613771718183654e-06, "loss": 0.86420614, "num_input_tokens_seen": 94163035, "step": 4358, "time_per_iteration": 2.723057746887207 }, { "auxiliary_loss_clip": 0.01127273, "auxiliary_loss_mlp": 0.01043309, "balance_loss_clip": 1.04886353, "balance_loss_mlp": 1.02411628, "epoch": 0.2620772583796783, "flos": 26432516142720.0, "grad_norm": 2.354545555797757, "language_loss": 0.67324048, "learning_rate": 3.4611112543722127e-06, "loss": 0.69494629, "num_input_tokens_seen": 94182520, "step": 4359, "time_per_iteration": 2.7128403186798096 }, { "auxiliary_loss_clip": 0.01118602, "auxiliary_loss_mlp": 0.01045018, "balance_loss_clip": 1.04637527, "balance_loss_mlp": 1.02880526, "epoch": 0.2621373816323463, "flos": 20156552127360.0, "grad_norm": 1.8862311303010293, "language_loss": 0.78726596, "learning_rate": 3.4608452815205757e-06, "loss": 0.80890214, "num_input_tokens_seen": 94201795, "step": 4360, "time_per_iteration": 4.41027569770813 }, { "auxiliary_loss_clip": 0.01119481, "auxiliary_loss_mlp": 0.01042435, "balance_loss_clip": 1.04831719, "balance_loss_mlp": 1.02640164, "epoch": 0.26219750488501425, "flos": 28621235473920.0, "grad_norm": 1.8399079957082187, "language_loss": 0.67980468, "learning_rate": 3.4605792532735387e-06, "loss": 0.70142382, "num_input_tokens_seen": 94222390, "step": 4361, "time_per_iteration": 2.7642054557800293 }, { "auxiliary_loss_clip": 0.01139509, "auxiliary_loss_mlp": 0.01055985, "balance_loss_clip": 1.05313993, "balance_loss_mlp": 1.03842545, "epoch": 0.2622576281376823, "flos": 15042549173760.0, "grad_norm": 2.1489496912575166, "language_loss": 0.84068632, "learning_rate": 3.46031316964119e-06, "loss": 0.86264122, "num_input_tokens_seen": 94239980, "step": 4362, "time_per_iteration": 2.6152050495147705 }, { "auxiliary_loss_clip": 0.01105407, "auxiliary_loss_mlp": 0.01046107, "balance_loss_clip": 1.04752779, "balance_loss_mlp": 1.02867842, "epoch": 0.26231775139035024, "flos": 26396174557440.0, "grad_norm": 2.0545933935481835, "language_loss": 0.65068752, "learning_rate": 3.4600470306336197e-06, "loss": 0.67220271, "num_input_tokens_seen": 94260715, "step": 4363, "time_per_iteration": 2.7297046184539795 }, { "auxiliary_loss_clip": 0.01040739, "auxiliary_loss_mlp": 0.01017272, "balance_loss_clip": 1.02776587, "balance_loss_mlp": 1.01506662, "epoch": 0.2623778746430182, "flos": 65408918647680.0, "grad_norm": 0.9195643121956573, "language_loss": 0.61104208, "learning_rate": 3.4597808362609194e-06, "loss": 0.6316222, "num_input_tokens_seen": 94321285, "step": 4364, "time_per_iteration": 3.3122286796569824 }, { "auxiliary_loss_clip": 0.01151556, "auxiliary_loss_mlp": 0.01050336, "balance_loss_clip": 1.0550462, "balance_loss_mlp": 1.03201365, "epoch": 0.26243799789568617, "flos": 12604215254400.0, "grad_norm": 2.6922753747731387, "language_loss": 0.7223357, "learning_rate": 3.459514586533184e-06, "loss": 0.74435461, "num_input_tokens_seen": 94335420, "step": 4365, "time_per_iteration": 2.588611364364624 }, { "auxiliary_loss_clip": 0.01123747, "auxiliary_loss_mlp": 0.00776591, "balance_loss_clip": 1.05296087, "balance_loss_mlp": 1.00093484, "epoch": 0.26249812114835414, "flos": 28623821253120.0, "grad_norm": 1.9684942716361389, "language_loss": 0.77178609, "learning_rate": 3.459248281460509e-06, "loss": 0.79078948, "num_input_tokens_seen": 94357440, "step": 4366, "time_per_iteration": 2.7489407062530518 }, { "auxiliary_loss_clip": 0.01149499, "auxiliary_loss_mlp": 0.0104305, "balance_loss_clip": 1.05433846, "balance_loss_mlp": 1.02652764, "epoch": 0.2625582444010221, "flos": 14465393441280.0, "grad_norm": 1.9587652436204308, "language_loss": 0.76205176, "learning_rate": 3.4589819210529927e-06, "loss": 0.78397727, "num_input_tokens_seen": 94375690, "step": 4367, "time_per_iteration": 2.63778018951416 }, { "auxiliary_loss_clip": 0.01136158, "auxiliary_loss_mlp": 0.01045138, "balance_loss_clip": 1.0523572, "balance_loss_mlp": 1.02903318, "epoch": 0.26261836765369007, "flos": 16613174246400.0, "grad_norm": 2.055472748506688, "language_loss": 0.69400585, "learning_rate": 3.458715505320736e-06, "loss": 0.71581888, "num_input_tokens_seen": 94393190, "step": 4368, "time_per_iteration": 2.6515018939971924 }, { "auxiliary_loss_clip": 0.01123905, "auxiliary_loss_mlp": 0.01045619, "balance_loss_clip": 1.05272579, "balance_loss_mlp": 1.02791643, "epoch": 0.26267849090635803, "flos": 20519932066560.0, "grad_norm": 1.8794244148025279, "language_loss": 0.79255176, "learning_rate": 3.458449034273841e-06, "loss": 0.81424701, "num_input_tokens_seen": 94410975, "step": 4369, "time_per_iteration": 2.717142343521118 }, { "auxiliary_loss_clip": 0.01119662, "auxiliary_loss_mlp": 0.01040752, "balance_loss_clip": 1.05190969, "balance_loss_mlp": 1.02344334, "epoch": 0.262738614159026, "flos": 21323936142720.0, "grad_norm": 4.796099217910503, "language_loss": 0.83591807, "learning_rate": 3.4581825079224133e-06, "loss": 0.85752219, "num_input_tokens_seen": 94429985, "step": 4370, "time_per_iteration": 2.742966890335083 }, { "auxiliary_loss_clip": 0.01137822, "auxiliary_loss_mlp": 0.01053822, "balance_loss_clip": 1.05178714, "balance_loss_mlp": 1.0345341, "epoch": 0.26279873741169396, "flos": 17603590930560.0, "grad_norm": 1.7275848609842401, "language_loss": 0.71854705, "learning_rate": 3.4579159262765575e-06, "loss": 0.7404635, "num_input_tokens_seen": 94448660, "step": 4371, "time_per_iteration": 2.691899538040161 }, { "auxiliary_loss_clip": 0.01062293, "auxiliary_loss_mlp": 0.01003561, "balance_loss_clip": 1.02797341, "balance_loss_mlp": 1.00147498, "epoch": 0.2628588606643619, "flos": 60949746587520.0, "grad_norm": 0.6802377941963699, "language_loss": 0.56387627, "learning_rate": 3.457649289346384e-06, "loss": 0.58453482, "num_input_tokens_seen": 94515630, "step": 4372, "time_per_iteration": 3.279158115386963 }, { "auxiliary_loss_clip": 0.01124406, "auxiliary_loss_mlp": 0.01038838, "balance_loss_clip": 1.05295706, "balance_loss_mlp": 1.02169585, "epoch": 0.2629189839170299, "flos": 27016315891200.0, "grad_norm": 1.9842369613103452, "language_loss": 0.77777553, "learning_rate": 3.4573825971420042e-06, "loss": 0.79940796, "num_input_tokens_seen": 94535385, "step": 4373, "time_per_iteration": 2.8367159366607666 }, { "auxiliary_loss_clip": 0.01104424, "auxiliary_loss_mlp": 0.01039426, "balance_loss_clip": 1.05070519, "balance_loss_mlp": 1.02314186, "epoch": 0.26297910716969786, "flos": 17019863009280.0, "grad_norm": 7.588420148526772, "language_loss": 0.71397603, "learning_rate": 3.4571158496735294e-06, "loss": 0.73541456, "num_input_tokens_seen": 94552650, "step": 4374, "time_per_iteration": 2.722332239151001 }, { "auxiliary_loss_clip": 0.0112606, "auxiliary_loss_mlp": 0.01045748, "balance_loss_clip": 1.05836225, "balance_loss_mlp": 1.02748489, "epoch": 0.2630392304223659, "flos": 24897370728960.0, "grad_norm": 1.8414201938467747, "language_loss": 0.81212163, "learning_rate": 3.4568490469510756e-06, "loss": 0.83383965, "num_input_tokens_seen": 94574075, "step": 4375, "time_per_iteration": 2.7654781341552734 }, { "auxiliary_loss_clip": 0.01118996, "auxiliary_loss_mlp": 0.01045139, "balance_loss_clip": 1.04959798, "balance_loss_mlp": 1.02901626, "epoch": 0.26309935367503384, "flos": 32854026067200.0, "grad_norm": 1.6461571134793078, "language_loss": 0.6613251, "learning_rate": 3.4565821889847603e-06, "loss": 0.68296647, "num_input_tokens_seen": 94594255, "step": 4376, "time_per_iteration": 2.778731107711792 }, { "auxiliary_loss_clip": 0.01096695, "auxiliary_loss_mlp": 0.0106417, "balance_loss_clip": 1.04752398, "balance_loss_mlp": 1.04587138, "epoch": 0.2631594769277018, "flos": 15887958652800.0, "grad_norm": 1.7628322447974545, "language_loss": 0.69351411, "learning_rate": 3.4563152757847026e-06, "loss": 0.71512282, "num_input_tokens_seen": 94611410, "step": 4377, "time_per_iteration": 2.7606706619262695 }, { "auxiliary_loss_clip": 0.01141095, "auxiliary_loss_mlp": 0.01043033, "balance_loss_clip": 1.0561285, "balance_loss_mlp": 1.02606952, "epoch": 0.2632196001803698, "flos": 50804943557760.0, "grad_norm": 2.1982489321824352, "language_loss": 0.79961169, "learning_rate": 3.4560483073610233e-06, "loss": 0.82145292, "num_input_tokens_seen": 94636575, "step": 4378, "time_per_iteration": 2.9000468254089355 }, { "auxiliary_loss_clip": 0.01127331, "auxiliary_loss_mlp": 0.01045659, "balance_loss_clip": 1.05713558, "balance_loss_mlp": 1.03063893, "epoch": 0.26327972343303774, "flos": 13733031041280.0, "grad_norm": 1.912468890890116, "language_loss": 0.76285684, "learning_rate": 3.455781283723846e-06, "loss": 0.78458679, "num_input_tokens_seen": 94654345, "step": 4379, "time_per_iteration": 2.6757192611694336 }, { "auxiliary_loss_clip": 0.01114814, "auxiliary_loss_mlp": 0.01043, "balance_loss_clip": 1.05360019, "balance_loss_mlp": 1.02465415, "epoch": 0.2633398466857057, "flos": 23769057732480.0, "grad_norm": 1.982346793660648, "language_loss": 0.77895945, "learning_rate": 3.4555142048832975e-06, "loss": 0.80053759, "num_input_tokens_seen": 94673985, "step": 4380, "time_per_iteration": 2.745392084121704 }, { "auxiliary_loss_clip": 0.01125918, "auxiliary_loss_mlp": 0.01040915, "balance_loss_clip": 1.04945278, "balance_loss_mlp": 1.02351093, "epoch": 0.26339996993837367, "flos": 27600223380480.0, "grad_norm": 2.2040025999375215, "language_loss": 0.64148676, "learning_rate": 3.4552470708495036e-06, "loss": 0.66315508, "num_input_tokens_seen": 94693145, "step": 4381, "time_per_iteration": 2.8020689487457275 }, { "auxiliary_loss_clip": 0.01136752, "auxiliary_loss_mlp": 0.01038794, "balance_loss_clip": 1.05113709, "balance_loss_mlp": 1.02225995, "epoch": 0.26346009319104163, "flos": 16946317912320.0, "grad_norm": 1.9675616702193486, "language_loss": 0.82470775, "learning_rate": 3.454979881632595e-06, "loss": 0.8464632, "num_input_tokens_seen": 94710185, "step": 4382, "time_per_iteration": 2.66001558303833 }, { "auxiliary_loss_clip": 0.01106019, "auxiliary_loss_mlp": 0.01045742, "balance_loss_clip": 1.04899645, "balance_loss_mlp": 1.02726483, "epoch": 0.2635202164437096, "flos": 37232218915200.0, "grad_norm": 4.511875880791621, "language_loss": 0.70333207, "learning_rate": 3.4547126372427035e-06, "loss": 0.7248497, "num_input_tokens_seen": 94730280, "step": 4383, "time_per_iteration": 2.851227045059204 }, { "auxiliary_loss_clip": 0.01136676, "auxiliary_loss_mlp": 0.01039697, "balance_loss_clip": 1.05237031, "balance_loss_mlp": 1.0239253, "epoch": 0.26358033969637756, "flos": 20996359084800.0, "grad_norm": 3.019496854013466, "language_loss": 0.69455528, "learning_rate": 3.4544453376899638e-06, "loss": 0.71631902, "num_input_tokens_seen": 94748560, "step": 4384, "time_per_iteration": 2.670023202896118 }, { "auxiliary_loss_clip": 0.01135763, "auxiliary_loss_mlp": 0.01039573, "balance_loss_clip": 1.05114567, "balance_loss_mlp": 1.02275276, "epoch": 0.26364046294904553, "flos": 27746092512000.0, "grad_norm": 2.2712502599605036, "language_loss": 0.70067525, "learning_rate": 3.45417798298451e-06, "loss": 0.72242868, "num_input_tokens_seen": 94767570, "step": 4385, "time_per_iteration": 2.7232449054718018 }, { "auxiliary_loss_clip": 0.01112529, "auxiliary_loss_mlp": 0.0104946, "balance_loss_clip": 1.04893148, "balance_loss_mlp": 1.03190076, "epoch": 0.2637005862017135, "flos": 22893088757760.0, "grad_norm": 1.8128608655109948, "language_loss": 0.85684925, "learning_rate": 3.453910573136482e-06, "loss": 0.87846911, "num_input_tokens_seen": 94784985, "step": 4386, "time_per_iteration": 2.727924108505249 }, { "auxiliary_loss_clip": 0.01126521, "auxiliary_loss_mlp": 0.01046433, "balance_loss_clip": 1.0510478, "balance_loss_mlp": 1.02955282, "epoch": 0.26376070945438146, "flos": 15048834053760.0, "grad_norm": 2.174412940978395, "language_loss": 0.7796396, "learning_rate": 3.4536431081560196e-06, "loss": 0.80136907, "num_input_tokens_seen": 94802545, "step": 4387, "time_per_iteration": 2.666287660598755 }, { "auxiliary_loss_clip": 0.01134058, "auxiliary_loss_mlp": 0.01041407, "balance_loss_clip": 1.05609179, "balance_loss_mlp": 1.02537298, "epoch": 0.2638208327070494, "flos": 21141833166720.0, "grad_norm": 2.003302761742054, "language_loss": 0.76126039, "learning_rate": 3.453375588053264e-06, "loss": 0.78301507, "num_input_tokens_seen": 94820730, "step": 4388, "time_per_iteration": 2.6321358680725098 }, { "auxiliary_loss_clip": 0.01148944, "auxiliary_loss_mlp": 0.01036978, "balance_loss_clip": 1.05455542, "balance_loss_mlp": 1.02002645, "epoch": 0.26388095595971744, "flos": 21725597001600.0, "grad_norm": 2.534815675842734, "language_loss": 0.86675179, "learning_rate": 3.4531080128383617e-06, "loss": 0.88861108, "num_input_tokens_seen": 94839175, "step": 4389, "time_per_iteration": 2.6122422218322754 }, { "auxiliary_loss_clip": 0.01048602, "auxiliary_loss_mlp": 0.01002085, "balance_loss_clip": 1.03000987, "balance_loss_mlp": 0.99961758, "epoch": 0.2639410792123854, "flos": 65515537192320.0, "grad_norm": 0.8388510572165676, "language_loss": 0.60285747, "learning_rate": 3.452840382521457e-06, "loss": 0.62336433, "num_input_tokens_seen": 94898865, "step": 4390, "time_per_iteration": 3.1867401599884033 }, { "auxiliary_loss_clip": 0.01128567, "auxiliary_loss_mlp": 0.01040305, "balance_loss_clip": 1.05022383, "balance_loss_mlp": 1.02319825, "epoch": 0.2640012024650534, "flos": 23948574929280.0, "grad_norm": 1.6144448841655068, "language_loss": 0.77730125, "learning_rate": 3.4525726971127e-06, "loss": 0.79899001, "num_input_tokens_seen": 94917490, "step": 4391, "time_per_iteration": 2.707310676574707 }, { "auxiliary_loss_clip": 0.01031384, "auxiliary_loss_mlp": 0.00755302, "balance_loss_clip": 1.02553821, "balance_loss_mlp": 1.00244236, "epoch": 0.26406132571772134, "flos": 56441163369600.0, "grad_norm": 0.8840896383522404, "language_loss": 0.58758044, "learning_rate": 3.45230495662224e-06, "loss": 0.60544735, "num_input_tokens_seen": 94969065, "step": 4392, "time_per_iteration": 3.211859941482544 }, { "auxiliary_loss_clip": 0.01136937, "auxiliary_loss_mlp": 0.0105019, "balance_loss_clip": 1.05295539, "balance_loss_mlp": 1.03322649, "epoch": 0.2641214489703893, "flos": 22090557139200.0, "grad_norm": 1.9286153229889427, "language_loss": 0.68954027, "learning_rate": 3.4520371610602306e-06, "loss": 0.71141154, "num_input_tokens_seen": 94988540, "step": 4393, "time_per_iteration": 2.6483278274536133 }, { "auxiliary_loss_clip": 0.01140079, "auxiliary_loss_mlp": 0.01041521, "balance_loss_clip": 1.05395103, "balance_loss_mlp": 1.02398562, "epoch": 0.26418157222305727, "flos": 16544764794240.0, "grad_norm": 2.0454829511435193, "language_loss": 0.84071863, "learning_rate": 3.4517693104368267e-06, "loss": 0.86253464, "num_input_tokens_seen": 95004810, "step": 4394, "time_per_iteration": 4.3396079540252686 }, { "auxiliary_loss_clip": 0.01124083, "auxiliary_loss_mlp": 0.01045374, "balance_loss_clip": 1.04999089, "balance_loss_mlp": 1.02661061, "epoch": 0.26424169547572524, "flos": 18002486442240.0, "grad_norm": 2.096391063208514, "language_loss": 0.70044839, "learning_rate": 3.4515014047621856e-06, "loss": 0.72214299, "num_input_tokens_seen": 95024085, "step": 4395, "time_per_iteration": 2.8730056285858154 }, { "auxiliary_loss_clip": 0.01110387, "auxiliary_loss_mlp": 0.01037389, "balance_loss_clip": 1.04736662, "balance_loss_mlp": 1.02071214, "epoch": 0.2643018187283932, "flos": 16983162288000.0, "grad_norm": 2.1761517020490606, "language_loss": 0.86876452, "learning_rate": 3.4512334440464655e-06, "loss": 0.89024228, "num_input_tokens_seen": 95042515, "step": 4396, "time_per_iteration": 4.384250640869141 }, { "auxiliary_loss_clip": 0.01010716, "auxiliary_loss_mlp": 0.01021406, "balance_loss_clip": 1.02197146, "balance_loss_mlp": 1.01856887, "epoch": 0.26436194198106117, "flos": 59664359416320.0, "grad_norm": 0.7957760850485174, "language_loss": 0.55022657, "learning_rate": 3.4509654282998277e-06, "loss": 0.57054776, "num_input_tokens_seen": 95094835, "step": 4397, "time_per_iteration": 3.0656893253326416 }, { "auxiliary_loss_clip": 0.01132938, "auxiliary_loss_mlp": 0.01050463, "balance_loss_clip": 1.0485754, "balance_loss_mlp": 1.03357744, "epoch": 0.26442206523372913, "flos": 32921322197760.0, "grad_norm": 1.9110208887501443, "language_loss": 0.77881467, "learning_rate": 3.450697357532435e-06, "loss": 0.80064869, "num_input_tokens_seen": 95113480, "step": 4398, "time_per_iteration": 2.740917444229126 }, { "auxiliary_loss_clip": 0.01139914, "auxiliary_loss_mlp": 0.01040709, "balance_loss_clip": 1.05469537, "balance_loss_mlp": 1.02347112, "epoch": 0.2644821884863971, "flos": 21031300039680.0, "grad_norm": 1.7657486248278176, "language_loss": 0.67534482, "learning_rate": 3.4504292317544534e-06, "loss": 0.69715106, "num_input_tokens_seen": 95132580, "step": 4399, "time_per_iteration": 4.305487871170044 }, { "auxiliary_loss_clip": 0.01097219, "auxiliary_loss_mlp": 0.01042048, "balance_loss_clip": 1.04840231, "balance_loss_mlp": 1.02503681, "epoch": 0.26454231173906506, "flos": 20776801201920.0, "grad_norm": 1.6309197312133479, "language_loss": 0.86614597, "learning_rate": 3.4501610509760504e-06, "loss": 0.88753855, "num_input_tokens_seen": 95152375, "step": 4400, "time_per_iteration": 2.695883274078369 }, { "auxiliary_loss_clip": 0.01119339, "auxiliary_loss_mlp": 0.01039987, "balance_loss_clip": 1.0483284, "balance_loss_mlp": 1.0226419, "epoch": 0.264602434991733, "flos": 16618669027200.0, "grad_norm": 3.1942141071602546, "language_loss": 0.76518428, "learning_rate": 3.4498928152073944e-06, "loss": 0.78677756, "num_input_tokens_seen": 95170265, "step": 4401, "time_per_iteration": 2.69415545463562 }, { "auxiliary_loss_clip": 0.01100665, "auxiliary_loss_mlp": 0.01046326, "balance_loss_clip": 1.04473615, "balance_loss_mlp": 1.02758598, "epoch": 0.26466255824440105, "flos": 19062677295360.0, "grad_norm": 2.336049134907364, "language_loss": 0.88363832, "learning_rate": 3.4496245244586577e-06, "loss": 0.90510821, "num_input_tokens_seen": 95188655, "step": 4402, "time_per_iteration": 2.7073450088500977 }, { "auxiliary_loss_clip": 0.01105803, "auxiliary_loss_mlp": 0.01040704, "balance_loss_clip": 1.04894042, "balance_loss_mlp": 1.02327585, "epoch": 0.264722681497069, "flos": 22638554006400.0, "grad_norm": 1.7301089969072252, "language_loss": 0.7811445, "learning_rate": 3.4493561787400137e-06, "loss": 0.80260956, "num_input_tokens_seen": 95209615, "step": 4403, "time_per_iteration": 2.7213027477264404 }, { "auxiliary_loss_clip": 0.01128649, "auxiliary_loss_mlp": 0.01038032, "balance_loss_clip": 1.04674816, "balance_loss_mlp": 1.02050877, "epoch": 0.264782804749737, "flos": 22492253911680.0, "grad_norm": 2.1369132533571604, "language_loss": 0.88594282, "learning_rate": 3.4490877780616387e-06, "loss": 0.90760964, "num_input_tokens_seen": 95227810, "step": 4404, "time_per_iteration": 2.6888909339904785 }, { "auxiliary_loss_clip": 0.01123789, "auxiliary_loss_mlp": 0.01040593, "balance_loss_clip": 1.04607344, "balance_loss_mlp": 1.02416539, "epoch": 0.26484292800240494, "flos": 16800269212800.0, "grad_norm": 1.7519644069859235, "language_loss": 0.76134694, "learning_rate": 3.448819322433709e-06, "loss": 0.78299075, "num_input_tokens_seen": 95245890, "step": 4405, "time_per_iteration": 2.7172482013702393 }, { "auxiliary_loss_clip": 0.01148976, "auxiliary_loss_mlp": 0.01040198, "balance_loss_clip": 1.05348206, "balance_loss_mlp": 1.02266204, "epoch": 0.2649030512550729, "flos": 20449583280000.0, "grad_norm": 1.711457274305917, "language_loss": 0.69873697, "learning_rate": 3.4485508118664066e-06, "loss": 0.72062874, "num_input_tokens_seen": 95264955, "step": 4406, "time_per_iteration": 2.584300994873047 }, { "auxiliary_loss_clip": 0.01121151, "auxiliary_loss_mlp": 0.01050453, "balance_loss_clip": 1.05182838, "balance_loss_mlp": 1.03432453, "epoch": 0.2649631745077409, "flos": 22416123035520.0, "grad_norm": 1.7200250795424956, "language_loss": 0.83956587, "learning_rate": 3.448282246369912e-06, "loss": 0.86128193, "num_input_tokens_seen": 95284245, "step": 4407, "time_per_iteration": 2.731316328048706 }, { "auxiliary_loss_clip": 0.01108599, "auxiliary_loss_mlp": 0.01031757, "balance_loss_clip": 1.04695201, "balance_loss_mlp": 1.01501989, "epoch": 0.26502329776040884, "flos": 35116110927360.0, "grad_norm": 1.8896460113896294, "language_loss": 0.7597363, "learning_rate": 3.4480136259544084e-06, "loss": 0.78113985, "num_input_tokens_seen": 95307125, "step": 4408, "time_per_iteration": 2.8600730895996094 }, { "auxiliary_loss_clip": 0.01091919, "auxiliary_loss_mlp": 0.01044721, "balance_loss_clip": 1.04267502, "balance_loss_mlp": 1.02679181, "epoch": 0.2650834210130768, "flos": 38687498438400.0, "grad_norm": 1.7769050714437231, "language_loss": 0.70612216, "learning_rate": 3.447744950630084e-06, "loss": 0.72748852, "num_input_tokens_seen": 95329150, "step": 4409, "time_per_iteration": 2.936380386352539 }, { "auxiliary_loss_clip": 0.01131548, "auxiliary_loss_mlp": 0.01040186, "balance_loss_clip": 1.04774857, "balance_loss_mlp": 1.02218497, "epoch": 0.26514354426574477, "flos": 24716847951360.0, "grad_norm": 1.7357795205395667, "language_loss": 0.7337513, "learning_rate": 3.4474762204071253e-06, "loss": 0.75546867, "num_input_tokens_seen": 95349880, "step": 4410, "time_per_iteration": 2.7315077781677246 }, { "auxiliary_loss_clip": 0.01141374, "auxiliary_loss_mlp": 0.0104966, "balance_loss_clip": 1.05183268, "balance_loss_mlp": 1.03216028, "epoch": 0.26520366751841273, "flos": 20340055733760.0, "grad_norm": 1.8886288474708937, "language_loss": 0.73828322, "learning_rate": 3.4472074352957244e-06, "loss": 0.76019359, "num_input_tokens_seen": 95368570, "step": 4411, "time_per_iteration": 2.641920566558838 }, { "auxiliary_loss_clip": 0.01099594, "auxiliary_loss_mlp": 0.01041576, "balance_loss_clip": 1.04986739, "balance_loss_mlp": 1.02431464, "epoch": 0.2652637907710807, "flos": 22343870828160.0, "grad_norm": 1.9943391034693418, "language_loss": 0.82447588, "learning_rate": 3.446938595306071e-06, "loss": 0.84588754, "num_input_tokens_seen": 95387065, "step": 4412, "time_per_iteration": 2.8344247341156006 }, { "auxiliary_loss_clip": 0.01135402, "auxiliary_loss_mlp": 0.01052016, "balance_loss_clip": 1.05143464, "balance_loss_mlp": 1.03544593, "epoch": 0.26532391402374866, "flos": 19354235990400.0, "grad_norm": 1.775443234311944, "language_loss": 0.7446382, "learning_rate": 3.4466697004483622e-06, "loss": 0.76651239, "num_input_tokens_seen": 95406345, "step": 4413, "time_per_iteration": 2.657975196838379 }, { "auxiliary_loss_clip": 0.01056582, "auxiliary_loss_mlp": 0.01008584, "balance_loss_clip": 1.03258443, "balance_loss_mlp": 1.00659275, "epoch": 0.26538403727641663, "flos": 44787611422080.0, "grad_norm": 0.873557285042922, "language_loss": 0.56965125, "learning_rate": 3.446400750732793e-06, "loss": 0.59030288, "num_input_tokens_seen": 95463595, "step": 4414, "time_per_iteration": 3.1158244609832764 }, { "auxiliary_loss_clip": 0.01107803, "auxiliary_loss_mlp": 0.01046612, "balance_loss_clip": 1.04481411, "balance_loss_mlp": 1.03048313, "epoch": 0.26544416052908465, "flos": 28182119708160.0, "grad_norm": 1.5786807831647507, "language_loss": 0.74238014, "learning_rate": 3.4461317461695625e-06, "loss": 0.76392424, "num_input_tokens_seen": 95484115, "step": 4415, "time_per_iteration": 2.7223031520843506 }, { "auxiliary_loss_clip": 0.01095743, "auxiliary_loss_mlp": 0.01044325, "balance_loss_clip": 1.04215193, "balance_loss_mlp": 1.02402353, "epoch": 0.2655042837817526, "flos": 17565274097280.0, "grad_norm": 2.5102345694159016, "language_loss": 0.86855936, "learning_rate": 3.4458626867688707e-06, "loss": 0.88996005, "num_input_tokens_seen": 95501435, "step": 4416, "time_per_iteration": 2.7001683712005615 }, { "auxiliary_loss_clip": 0.01141467, "auxiliary_loss_mlp": 0.01046153, "balance_loss_clip": 1.05359149, "balance_loss_mlp": 1.02761602, "epoch": 0.2655644070344206, "flos": 23404636298880.0, "grad_norm": 1.6343137061510633, "language_loss": 0.76870787, "learning_rate": 3.4455935725409217e-06, "loss": 0.79058409, "num_input_tokens_seen": 95520135, "step": 4417, "time_per_iteration": 2.662196397781372 }, { "auxiliary_loss_clip": 0.01119441, "auxiliary_loss_mlp": 0.01041503, "balance_loss_clip": 1.04989183, "balance_loss_mlp": 1.02242982, "epoch": 0.26562453028708854, "flos": 26468462678400.0, "grad_norm": 1.6334113226277946, "language_loss": 0.80320108, "learning_rate": 3.4453244034959196e-06, "loss": 0.82481045, "num_input_tokens_seen": 95541705, "step": 4418, "time_per_iteration": 2.7742624282836914 }, { "auxiliary_loss_clip": 0.0113892, "auxiliary_loss_mlp": 0.01045476, "balance_loss_clip": 1.05182683, "balance_loss_mlp": 1.02721274, "epoch": 0.2656846535397565, "flos": 19207576759680.0, "grad_norm": 2.164903581235647, "language_loss": 0.67788607, "learning_rate": 3.445055179644071e-06, "loss": 0.69972998, "num_input_tokens_seen": 95560300, "step": 4419, "time_per_iteration": 2.6437718868255615 }, { "auxiliary_loss_clip": 0.01149692, "auxiliary_loss_mlp": 0.01046258, "balance_loss_clip": 1.05360699, "balance_loss_mlp": 1.02711296, "epoch": 0.2657447767924245, "flos": 30551325903360.0, "grad_norm": 1.9366129468869788, "language_loss": 0.79625547, "learning_rate": 3.444785900995585e-06, "loss": 0.81821501, "num_input_tokens_seen": 95580150, "step": 4420, "time_per_iteration": 2.6594905853271484 }, { "auxiliary_loss_clip": 0.01126984, "auxiliary_loss_mlp": 0.01053725, "balance_loss_clip": 1.05294895, "balance_loss_mlp": 1.03368592, "epoch": 0.26580490004509244, "flos": 20922742160640.0, "grad_norm": 1.9122536358412747, "language_loss": 0.81690109, "learning_rate": 3.444516567560673e-06, "loss": 0.83870822, "num_input_tokens_seen": 95597570, "step": 4421, "time_per_iteration": 2.681410551071167 }, { "auxiliary_loss_clip": 0.0113176, "auxiliary_loss_mlp": 0.01046737, "balance_loss_clip": 1.05015123, "balance_loss_mlp": 1.02904677, "epoch": 0.2658650232977604, "flos": 43945682584320.0, "grad_norm": 1.6112293393448585, "language_loss": 0.65704989, "learning_rate": 3.444247179349548e-06, "loss": 0.6788348, "num_input_tokens_seen": 95619415, "step": 4422, "time_per_iteration": 2.8766117095947266 }, { "auxiliary_loss_clip": 0.01130944, "auxiliary_loss_mlp": 0.01047224, "balance_loss_clip": 1.04903376, "balance_loss_mlp": 1.03039181, "epoch": 0.26592514655042837, "flos": 29716439109120.0, "grad_norm": 2.1017056533749896, "language_loss": 0.74229872, "learning_rate": 3.4439777363724252e-06, "loss": 0.76408041, "num_input_tokens_seen": 95639155, "step": 4423, "time_per_iteration": 2.6983659267425537 }, { "auxiliary_loss_clip": 0.01130559, "auxiliary_loss_mlp": 0.01057709, "balance_loss_clip": 1.04790974, "balance_loss_mlp": 1.03822982, "epoch": 0.26598526980309634, "flos": 46677730014720.0, "grad_norm": 1.6865310965149165, "language_loss": 0.77855694, "learning_rate": 3.443708238639522e-06, "loss": 0.80043966, "num_input_tokens_seen": 95663320, "step": 4424, "time_per_iteration": 2.900214433670044 }, { "auxiliary_loss_clip": 0.01132339, "auxiliary_loss_mlp": 0.01049395, "balance_loss_clip": 1.04963291, "balance_loss_mlp": 1.03181148, "epoch": 0.2660453930557643, "flos": 11509442582400.0, "grad_norm": 2.0755220631041684, "language_loss": 0.78940654, "learning_rate": 3.4434386861610573e-06, "loss": 0.81122386, "num_input_tokens_seen": 95680260, "step": 4425, "time_per_iteration": 2.6266820430755615 }, { "auxiliary_loss_clip": 0.01123867, "auxiliary_loss_mlp": 0.01043959, "balance_loss_clip": 1.05143404, "balance_loss_mlp": 1.02767467, "epoch": 0.26610551630843227, "flos": 24791578197120.0, "grad_norm": 1.5673316066045293, "language_loss": 0.80135047, "learning_rate": 3.4431690789472532e-06, "loss": 0.82302874, "num_input_tokens_seen": 95701140, "step": 4426, "time_per_iteration": 2.7015280723571777 }, { "auxiliary_loss_clip": 0.01150747, "auxiliary_loss_mlp": 0.0104448, "balance_loss_clip": 1.0554285, "balance_loss_mlp": 1.02678883, "epoch": 0.26616563956110023, "flos": 27636385397760.0, "grad_norm": 1.617839398314704, "language_loss": 0.77174348, "learning_rate": 3.442899417008333e-06, "loss": 0.79369569, "num_input_tokens_seen": 95722060, "step": 4427, "time_per_iteration": 2.6438984870910645 }, { "auxiliary_loss_clip": 0.01112968, "auxiliary_loss_mlp": 0.01037518, "balance_loss_clip": 1.05125654, "balance_loss_mlp": 1.02069747, "epoch": 0.26622576281376825, "flos": 28362893880960.0, "grad_norm": 1.5634759975385293, "language_loss": 0.76754683, "learning_rate": 3.4426297003545227e-06, "loss": 0.78905165, "num_input_tokens_seen": 95742495, "step": 4428, "time_per_iteration": 2.7695741653442383 }, { "auxiliary_loss_clip": 0.01114899, "auxiliary_loss_mlp": 0.00775922, "balance_loss_clip": 1.04922283, "balance_loss_mlp": 1.0008111, "epoch": 0.2662858860664362, "flos": 18041341979520.0, "grad_norm": 1.815928660217762, "language_loss": 0.82900071, "learning_rate": 3.4423599289960495e-06, "loss": 0.84790885, "num_input_tokens_seen": 95761510, "step": 4429, "time_per_iteration": 2.764183282852173 }, { "auxiliary_loss_clip": 0.01106492, "auxiliary_loss_mlp": 0.01039033, "balance_loss_clip": 1.05041027, "balance_loss_mlp": 1.02201009, "epoch": 0.2663460093191042, "flos": 22745818995840.0, "grad_norm": 1.6463341595476202, "language_loss": 0.71996218, "learning_rate": 3.442090102943143e-06, "loss": 0.74141741, "num_input_tokens_seen": 95782385, "step": 4430, "time_per_iteration": 2.7244491577148438 }, { "auxiliary_loss_clip": 0.01148257, "auxiliary_loss_mlp": 0.01049268, "balance_loss_clip": 1.05231071, "balance_loss_mlp": 1.03068352, "epoch": 0.26640613257177215, "flos": 16508782344960.0, "grad_norm": 1.9574919733512919, "language_loss": 0.82021642, "learning_rate": 3.441820222206035e-06, "loss": 0.84219164, "num_input_tokens_seen": 95800595, "step": 4431, "time_per_iteration": 2.5910067558288574 }, { "auxiliary_loss_clip": 0.01143334, "auxiliary_loss_mlp": 0.01050031, "balance_loss_clip": 1.0540812, "balance_loss_mlp": 1.03141046, "epoch": 0.2664662558244401, "flos": 23075945919360.0, "grad_norm": 2.074794485495937, "language_loss": 0.76745522, "learning_rate": 3.44155028679496e-06, "loss": 0.7893889, "num_input_tokens_seen": 95818480, "step": 4432, "time_per_iteration": 2.6548166275024414 }, { "auxiliary_loss_clip": 0.01089372, "auxiliary_loss_mlp": 0.01052807, "balance_loss_clip": 1.04526138, "balance_loss_mlp": 1.03232694, "epoch": 0.2665263790771081, "flos": 23769273214080.0, "grad_norm": 1.872584196626497, "language_loss": 0.82903433, "learning_rate": 3.441280296720154e-06, "loss": 0.85045612, "num_input_tokens_seen": 95837205, "step": 4433, "time_per_iteration": 4.2740867137908936 }, { "auxiliary_loss_clip": 0.01142798, "auxiliary_loss_mlp": 0.01045231, "balance_loss_clip": 1.05565643, "balance_loss_mlp": 1.02671802, "epoch": 0.26658650232977604, "flos": 28001273708160.0, "grad_norm": 2.548777168378285, "language_loss": 0.76308644, "learning_rate": 3.441010251991854e-06, "loss": 0.78496677, "num_input_tokens_seen": 95858395, "step": 4434, "time_per_iteration": 4.203384160995483 }, { "auxiliary_loss_clip": 0.0114611, "auxiliary_loss_mlp": 0.01044925, "balance_loss_clip": 1.05197668, "balance_loss_mlp": 1.02772319, "epoch": 0.266646625582444, "flos": 22163635359360.0, "grad_norm": 2.3452347637055393, "language_loss": 0.82496321, "learning_rate": 3.440740152620301e-06, "loss": 0.84687358, "num_input_tokens_seen": 95877875, "step": 4435, "time_per_iteration": 4.102782964706421 }, { "auxiliary_loss_clip": 0.01104916, "auxiliary_loss_mlp": 0.01062101, "balance_loss_clip": 1.04567468, "balance_loss_mlp": 1.04245555, "epoch": 0.266706748835112, "flos": 27853537069440.0, "grad_norm": 1.994258420562806, "language_loss": 0.87634504, "learning_rate": 3.4404699986157376e-06, "loss": 0.89801526, "num_input_tokens_seen": 95895820, "step": 4436, "time_per_iteration": 2.8048155307769775 }, { "auxiliary_loss_clip": 0.01121439, "auxiliary_loss_mlp": 0.01047617, "balance_loss_clip": 1.04637265, "balance_loss_mlp": 1.03054643, "epoch": 0.26676687208777994, "flos": 25812123413760.0, "grad_norm": 1.4763923958478316, "language_loss": 0.787242, "learning_rate": 3.440199789988407e-06, "loss": 0.80893254, "num_input_tokens_seen": 95918025, "step": 4437, "time_per_iteration": 2.7382607460021973 }, { "auxiliary_loss_clip": 0.01093686, "auxiliary_loss_mlp": 0.01048829, "balance_loss_clip": 1.05000877, "balance_loss_mlp": 1.03117394, "epoch": 0.2668269953404479, "flos": 36064583504640.0, "grad_norm": 4.5178491997969115, "language_loss": 0.63910848, "learning_rate": 3.439929526748556e-06, "loss": 0.66053367, "num_input_tokens_seen": 95937725, "step": 4438, "time_per_iteration": 2.956014633178711 }, { "auxiliary_loss_clip": 0.01080658, "auxiliary_loss_mlp": 0.01047394, "balance_loss_clip": 1.0432179, "balance_loss_mlp": 1.02994168, "epoch": 0.26688711859311587, "flos": 26570987072640.0, "grad_norm": 1.84569516037299, "language_loss": 0.75897747, "learning_rate": 3.4396592089064334e-06, "loss": 0.78025794, "num_input_tokens_seen": 95956335, "step": 4439, "time_per_iteration": 4.428173065185547 }, { "auxiliary_loss_clip": 0.01089075, "auxiliary_loss_mlp": 0.01041089, "balance_loss_clip": 1.04845262, "balance_loss_mlp": 1.02181315, "epoch": 0.26694724184578383, "flos": 26761565658240.0, "grad_norm": 2.10654378697334, "language_loss": 0.7172367, "learning_rate": 3.4393888364722897e-06, "loss": 0.73853838, "num_input_tokens_seen": 95977135, "step": 4440, "time_per_iteration": 2.9196605682373047 }, { "auxiliary_loss_clip": 0.01124038, "auxiliary_loss_mlp": 0.01049644, "balance_loss_clip": 1.04784775, "balance_loss_mlp": 1.02931881, "epoch": 0.2670073650984518, "flos": 20959586536320.0, "grad_norm": 1.869180757677473, "language_loss": 0.66229129, "learning_rate": 3.439118409456376e-06, "loss": 0.68402815, "num_input_tokens_seen": 95995435, "step": 4441, "time_per_iteration": 2.666428804397583 }, { "auxiliary_loss_clip": 0.01137041, "auxiliary_loss_mlp": 0.01049045, "balance_loss_clip": 1.04973912, "balance_loss_mlp": 1.02953053, "epoch": 0.2670674883511198, "flos": 28366054277760.0, "grad_norm": 3.888081439634283, "language_loss": 0.76102316, "learning_rate": 3.4388479278689486e-06, "loss": 0.78288412, "num_input_tokens_seen": 96016340, "step": 4442, "time_per_iteration": 2.6413686275482178 }, { "auxiliary_loss_clip": 0.0100646, "auxiliary_loss_mlp": 0.0105848, "balance_loss_clip": 1.02694619, "balance_loss_mlp": 1.05538034, "epoch": 0.2671276116037878, "flos": 58971319430400.0, "grad_norm": 0.9410220376713593, "language_loss": 0.61210632, "learning_rate": 3.4385773917202637e-06, "loss": 0.63275576, "num_input_tokens_seen": 96071205, "step": 4443, "time_per_iteration": 3.2342116832733154 }, { "auxiliary_loss_clip": 0.01123665, "auxiliary_loss_mlp": 0.01039982, "balance_loss_clip": 1.05413401, "balance_loss_mlp": 1.02239847, "epoch": 0.26718773485645575, "flos": 43945072053120.0, "grad_norm": 1.5620381861600383, "language_loss": 0.76195556, "learning_rate": 3.4383068010205793e-06, "loss": 0.78359205, "num_input_tokens_seen": 96094240, "step": 4444, "time_per_iteration": 3.136178731918335 }, { "auxiliary_loss_clip": 0.01142711, "auxiliary_loss_mlp": 0.01040756, "balance_loss_clip": 1.05331576, "balance_loss_mlp": 1.0213964, "epoch": 0.2672478581091237, "flos": 25228323665280.0, "grad_norm": 1.6750833182703528, "language_loss": 0.80892444, "learning_rate": 3.438036155780158e-06, "loss": 0.83075905, "num_input_tokens_seen": 96114105, "step": 4445, "time_per_iteration": 2.660952091217041 }, { "auxiliary_loss_clip": 0.01124381, "auxiliary_loss_mlp": 0.01048514, "balance_loss_clip": 1.05190587, "balance_loss_mlp": 1.02901077, "epoch": 0.2673079813617917, "flos": 15268176455040.0, "grad_norm": 2.1125172985353533, "language_loss": 0.89060926, "learning_rate": 3.43776545600926e-06, "loss": 0.9123382, "num_input_tokens_seen": 96132140, "step": 4446, "time_per_iteration": 2.6609115600585938 }, { "auxiliary_loss_clip": 0.011447, "auxiliary_loss_mlp": 0.01053132, "balance_loss_clip": 1.05528426, "balance_loss_mlp": 1.03541803, "epoch": 0.26736810461445965, "flos": 25812733944960.0, "grad_norm": 2.4310086382368783, "language_loss": 0.67756736, "learning_rate": 3.437494701718153e-06, "loss": 0.69954574, "num_input_tokens_seen": 96152090, "step": 4447, "time_per_iteration": 2.6696949005126953 }, { "auxiliary_loss_clip": 0.01144309, "auxiliary_loss_mlp": 0.0104489, "balance_loss_clip": 1.05496442, "balance_loss_mlp": 1.02572155, "epoch": 0.2674282278671276, "flos": 24312709054080.0, "grad_norm": 1.9687667134305082, "language_loss": 0.830899, "learning_rate": 3.4372238929171026e-06, "loss": 0.85279107, "num_input_tokens_seen": 96170015, "step": 4448, "time_per_iteration": 2.639463424682617 }, { "auxiliary_loss_clip": 0.0111564, "auxiliary_loss_mlp": 0.01054364, "balance_loss_clip": 1.05101895, "balance_loss_mlp": 1.03557646, "epoch": 0.2674883511197956, "flos": 22815521337600.0, "grad_norm": 1.479052407292424, "language_loss": 0.84231561, "learning_rate": 3.436953029616378e-06, "loss": 0.8640157, "num_input_tokens_seen": 96188065, "step": 4449, "time_per_iteration": 2.812290906906128 }, { "auxiliary_loss_clip": 0.0113237, "auxiliary_loss_mlp": 0.01055905, "balance_loss_clip": 1.05103493, "balance_loss_mlp": 1.03552055, "epoch": 0.26754847437246354, "flos": 25370170473600.0, "grad_norm": 1.7379167843341312, "language_loss": 0.84231997, "learning_rate": 3.4366821118262506e-06, "loss": 0.86420268, "num_input_tokens_seen": 96205780, "step": 4450, "time_per_iteration": 2.7598626613616943 }, { "auxiliary_loss_clip": 0.01109743, "auxiliary_loss_mlp": 0.01057779, "balance_loss_clip": 1.04833305, "balance_loss_mlp": 1.04044628, "epoch": 0.2676085976251315, "flos": 20230420446720.0, "grad_norm": 8.035146429526597, "language_loss": 0.80842566, "learning_rate": 3.4364111395569937e-06, "loss": 0.83010095, "num_input_tokens_seen": 96224990, "step": 4451, "time_per_iteration": 2.7467129230499268 }, { "auxiliary_loss_clip": 0.01141732, "auxiliary_loss_mlp": 0.01055516, "balance_loss_clip": 1.0553689, "balance_loss_mlp": 1.0379324, "epoch": 0.26766872087779947, "flos": 28038225824640.0, "grad_norm": 1.6378235408468254, "language_loss": 0.86285019, "learning_rate": 3.436140112818882e-06, "loss": 0.88482267, "num_input_tokens_seen": 96245345, "step": 4452, "time_per_iteration": 2.7442660331726074 }, { "auxiliary_loss_clip": 0.01134475, "auxiliary_loss_mlp": 0.01047993, "balance_loss_clip": 1.05496478, "balance_loss_mlp": 1.02926481, "epoch": 0.26772884413046744, "flos": 18325179250560.0, "grad_norm": 2.119384740597093, "language_loss": 0.83521158, "learning_rate": 3.435869031622194e-06, "loss": 0.85703623, "num_input_tokens_seen": 96259000, "step": 4453, "time_per_iteration": 2.659623146057129 }, { "auxiliary_loss_clip": 0.01141347, "auxiliary_loss_mlp": 0.01063496, "balance_loss_clip": 1.05624223, "balance_loss_mlp": 1.04485118, "epoch": 0.2677889673831354, "flos": 22127509255680.0, "grad_norm": 1.8460317519144305, "language_loss": 0.79565918, "learning_rate": 3.435597895977208e-06, "loss": 0.8177076, "num_input_tokens_seen": 96277000, "step": 4454, "time_per_iteration": 2.6458942890167236 }, { "auxiliary_loss_clip": 0.01130641, "auxiliary_loss_mlp": 0.01056871, "balance_loss_clip": 1.05338597, "balance_loss_mlp": 1.03869116, "epoch": 0.2678490906358034, "flos": 23729699404800.0, "grad_norm": 1.5255880946203295, "language_loss": 0.7241919, "learning_rate": 3.435326705894206e-06, "loss": 0.74606699, "num_input_tokens_seen": 96297010, "step": 4455, "time_per_iteration": 2.7328429222106934 }, { "auxiliary_loss_clip": 0.01112613, "auxiliary_loss_mlp": 0.01052208, "balance_loss_clip": 1.04858243, "balance_loss_mlp": 1.03508949, "epoch": 0.2679092138884714, "flos": 21762872340480.0, "grad_norm": 1.5657028408886426, "language_loss": 0.74017322, "learning_rate": 3.435055461383471e-06, "loss": 0.76182139, "num_input_tokens_seen": 96315780, "step": 4456, "time_per_iteration": 2.700190544128418 }, { "auxiliary_loss_clip": 0.0114232, "auxiliary_loss_mlp": 0.01048809, "balance_loss_clip": 1.05394006, "balance_loss_mlp": 1.03033149, "epoch": 0.26796933714113935, "flos": 19861186590720.0, "grad_norm": 2.4373070589767774, "language_loss": 0.70647967, "learning_rate": 3.4347841624552896e-06, "loss": 0.72839093, "num_input_tokens_seen": 96333465, "step": 4457, "time_per_iteration": 2.6334941387176514 }, { "auxiliary_loss_clip": 0.01112923, "auxiliary_loss_mlp": 0.01063608, "balance_loss_clip": 1.05205595, "balance_loss_mlp": 1.04513049, "epoch": 0.2680294603938073, "flos": 20047886507520.0, "grad_norm": 1.8228045543818674, "language_loss": 0.7903617, "learning_rate": 3.4345128091199493e-06, "loss": 0.81212699, "num_input_tokens_seen": 96352005, "step": 4458, "time_per_iteration": 2.7377572059631348 }, { "auxiliary_loss_clip": 0.01030327, "auxiliary_loss_mlp": 0.01043883, "balance_loss_clip": 1.0366354, "balance_loss_mlp": 1.0414269, "epoch": 0.2680895836464753, "flos": 72113763052800.0, "grad_norm": 0.9600198584891941, "language_loss": 0.58691025, "learning_rate": 3.434241401387739e-06, "loss": 0.60765231, "num_input_tokens_seen": 96406265, "step": 4459, "time_per_iteration": 3.2385354042053223 }, { "auxiliary_loss_clip": 0.0108842, "auxiliary_loss_mlp": 0.01056025, "balance_loss_clip": 1.04306948, "balance_loss_mlp": 1.0379889, "epoch": 0.26814970689914325, "flos": 20449044576000.0, "grad_norm": 2.1196386888642382, "language_loss": 0.84988648, "learning_rate": 3.4339699392689507e-06, "loss": 0.87133086, "num_input_tokens_seen": 96425225, "step": 4460, "time_per_iteration": 2.767054319381714 }, { "auxiliary_loss_clip": 0.01134128, "auxiliary_loss_mlp": 0.01059054, "balance_loss_clip": 1.0525527, "balance_loss_mlp": 1.03916979, "epoch": 0.2682098301518112, "flos": 17566674727680.0, "grad_norm": 1.6839260392555548, "language_loss": 0.68334675, "learning_rate": 3.4336984227738796e-06, "loss": 0.70527858, "num_input_tokens_seen": 96443780, "step": 4461, "time_per_iteration": 2.7217342853546143 }, { "auxiliary_loss_clip": 0.0111525, "auxiliary_loss_mlp": 0.01054739, "balance_loss_clip": 1.05045152, "balance_loss_mlp": 1.03649962, "epoch": 0.2682699534044792, "flos": 18333259810560.0, "grad_norm": 1.7146103847032579, "language_loss": 0.67240328, "learning_rate": 3.43342685191282e-06, "loss": 0.69410318, "num_input_tokens_seen": 96464530, "step": 4462, "time_per_iteration": 2.730682134628296 }, { "auxiliary_loss_clip": 0.01116667, "auxiliary_loss_mlp": 0.01046675, "balance_loss_clip": 1.05230319, "balance_loss_mlp": 1.02710128, "epoch": 0.26833007665714714, "flos": 25301294144640.0, "grad_norm": 1.7796857642272712, "language_loss": 0.69503593, "learning_rate": 3.4331552266960705e-06, "loss": 0.71666932, "num_input_tokens_seen": 96483345, "step": 4463, "time_per_iteration": 2.738046407699585 }, { "auxiliary_loss_clip": 0.01118676, "auxiliary_loss_mlp": 0.01049589, "balance_loss_clip": 1.0492326, "balance_loss_mlp": 1.02862048, "epoch": 0.2683901999098151, "flos": 16099759198080.0, "grad_norm": 2.5866232358274277, "language_loss": 0.77943784, "learning_rate": 3.432883547133931e-06, "loss": 0.80112046, "num_input_tokens_seen": 96498305, "step": 4464, "time_per_iteration": 2.6794681549072266 }, { "auxiliary_loss_clip": 0.01133564, "auxiliary_loss_mlp": 0.01042879, "balance_loss_clip": 1.05244994, "balance_loss_mlp": 1.02410388, "epoch": 0.2684503231624831, "flos": 27308054154240.0, "grad_norm": 2.2986867036088285, "language_loss": 0.71375966, "learning_rate": 3.432611813236704e-06, "loss": 0.73552406, "num_input_tokens_seen": 96519740, "step": 4465, "time_per_iteration": 2.699575662612915 }, { "auxiliary_loss_clip": 0.01042347, "auxiliary_loss_mlp": 0.01001834, "balance_loss_clip": 1.02813911, "balance_loss_mlp": 0.9993788, "epoch": 0.26851044641515104, "flos": 71858007239040.0, "grad_norm": 0.7242654721351415, "language_loss": 0.53150702, "learning_rate": 3.4323400250146943e-06, "loss": 0.5519489, "num_input_tokens_seen": 96588870, "step": 4466, "time_per_iteration": 3.3984062671661377 }, { "auxiliary_loss_clip": 0.01118674, "auxiliary_loss_mlp": 0.0105552, "balance_loss_clip": 1.04732478, "balance_loss_mlp": 1.03381157, "epoch": 0.268570569667819, "flos": 18733771434240.0, "grad_norm": 2.1738333593055796, "language_loss": 0.74038142, "learning_rate": 3.4320681824782057e-06, "loss": 0.76212335, "num_input_tokens_seen": 96605100, "step": 4467, "time_per_iteration": 2.6631343364715576 }, { "auxiliary_loss_clip": 0.01126618, "auxiliary_loss_mlp": 0.00777879, "balance_loss_clip": 1.05088973, "balance_loss_mlp": 1.00093102, "epoch": 0.268630692920487, "flos": 18178376365440.0, "grad_norm": 3.586661477808892, "language_loss": 0.80481976, "learning_rate": 3.4317962856375493e-06, "loss": 0.82386476, "num_input_tokens_seen": 96621410, "step": 4468, "time_per_iteration": 2.64806866645813 }, { "auxiliary_loss_clip": 0.01059326, "auxiliary_loss_mlp": 0.01006331, "balance_loss_clip": 1.02527809, "balance_loss_mlp": 1.0036248, "epoch": 0.268690816173155, "flos": 68731768978560.0, "grad_norm": 0.8399316740346766, "language_loss": 0.59498715, "learning_rate": 3.4315243345030334e-06, "loss": 0.61564374, "num_input_tokens_seen": 96684810, "step": 4469, "time_per_iteration": 3.1989517211914062 }, { "auxiliary_loss_clip": 0.01156531, "auxiliary_loss_mlp": 0.01048741, "balance_loss_clip": 1.05689096, "balance_loss_mlp": 1.02854705, "epoch": 0.26875093942582295, "flos": 23293636295040.0, "grad_norm": 2.165956170420043, "language_loss": 0.82055074, "learning_rate": 3.431252329084972e-06, "loss": 0.84260345, "num_input_tokens_seen": 96701920, "step": 4470, "time_per_iteration": 2.6167352199554443 }, { "auxiliary_loss_clip": 0.01117064, "auxiliary_loss_mlp": 0.01054605, "balance_loss_clip": 1.04794455, "balance_loss_mlp": 1.03563929, "epoch": 0.2688110626784909, "flos": 21543458112000.0, "grad_norm": 1.6543166375172473, "language_loss": 0.82841349, "learning_rate": 3.4309802693936786e-06, "loss": 0.8501302, "num_input_tokens_seen": 96721260, "step": 4471, "time_per_iteration": 4.177881956100464 }, { "auxiliary_loss_clip": 0.01133274, "auxiliary_loss_mlp": 0.01045934, "balance_loss_clip": 1.05339766, "balance_loss_mlp": 1.02762365, "epoch": 0.2688711859311589, "flos": 28400600183040.0, "grad_norm": 2.017001756898941, "language_loss": 0.69309431, "learning_rate": 3.43070815543947e-06, "loss": 0.71488637, "num_input_tokens_seen": 96740385, "step": 4472, "time_per_iteration": 2.6611149311065674 }, { "auxiliary_loss_clip": 0.01150636, "auxiliary_loss_mlp": 0.01046679, "balance_loss_clip": 1.05448234, "balance_loss_mlp": 1.02882099, "epoch": 0.26893130918382685, "flos": 25994944661760.0, "grad_norm": 1.889152474147147, "language_loss": 0.67809618, "learning_rate": 3.4304359872326656e-06, "loss": 0.70006931, "num_input_tokens_seen": 96761860, "step": 4473, "time_per_iteration": 2.6570448875427246 }, { "auxiliary_loss_clip": 0.01123821, "auxiliary_loss_mlp": 0.01056077, "balance_loss_clip": 1.05778623, "balance_loss_mlp": 1.03800452, "epoch": 0.2689914324364948, "flos": 20339624770560.0, "grad_norm": 2.20378943201051, "language_loss": 0.82835853, "learning_rate": 3.4301637647835843e-06, "loss": 0.8501575, "num_input_tokens_seen": 96781890, "step": 4474, "time_per_iteration": 5.79376220703125 }, { "auxiliary_loss_clip": 0.01138349, "auxiliary_loss_mlp": 0.01055982, "balance_loss_clip": 1.05353034, "balance_loss_mlp": 1.03841054, "epoch": 0.2690515556891628, "flos": 19464553635840.0, "grad_norm": 2.404484364093812, "language_loss": 0.71004206, "learning_rate": 3.4298914881025494e-06, "loss": 0.73198539, "num_input_tokens_seen": 96800390, "step": 4475, "time_per_iteration": 2.5969674587249756 }, { "auxiliary_loss_clip": 0.01112288, "auxiliary_loss_mlp": 0.00776382, "balance_loss_clip": 1.05001771, "balance_loss_mlp": 1.00081563, "epoch": 0.26911167894183075, "flos": 18146631720960.0, "grad_norm": 1.8574153972172647, "language_loss": 0.73638999, "learning_rate": 3.4296191571998863e-06, "loss": 0.75527668, "num_input_tokens_seen": 96816685, "step": 4476, "time_per_iteration": 2.70358943939209 }, { "auxiliary_loss_clip": 0.01119256, "auxiliary_loss_mlp": 0.01043783, "balance_loss_clip": 1.05050373, "balance_loss_mlp": 1.02605665, "epoch": 0.2691718021944987, "flos": 19975131509760.0, "grad_norm": 1.5040704863343832, "language_loss": 0.80439913, "learning_rate": 3.429346772085922e-06, "loss": 0.82602954, "num_input_tokens_seen": 96836285, "step": 4477, "time_per_iteration": 4.313180208206177 }, { "auxiliary_loss_clip": 0.01097359, "auxiliary_loss_mlp": 0.0104976, "balance_loss_clip": 1.04965031, "balance_loss_mlp": 1.0309844, "epoch": 0.2692319254471667, "flos": 37447215770880.0, "grad_norm": 1.7971929656919947, "language_loss": 0.65181434, "learning_rate": 3.429074332770984e-06, "loss": 0.67328548, "num_input_tokens_seen": 96857745, "step": 4478, "time_per_iteration": 2.8882603645324707 }, { "auxiliary_loss_clip": 0.01130488, "auxiliary_loss_mlp": 0.01050401, "balance_loss_clip": 1.04841042, "balance_loss_mlp": 1.03163743, "epoch": 0.26929204869983464, "flos": 22127796564480.0, "grad_norm": 1.933707281531851, "language_loss": 0.80987537, "learning_rate": 3.4288018392654047e-06, "loss": 0.83168429, "num_input_tokens_seen": 96877295, "step": 4479, "time_per_iteration": 2.670370578765869 }, { "auxiliary_loss_clip": 0.01127626, "auxiliary_loss_mlp": 0.00776143, "balance_loss_clip": 1.05010593, "balance_loss_mlp": 1.0010041, "epoch": 0.2693521719525026, "flos": 19792813052160.0, "grad_norm": 16.364114673072947, "language_loss": 0.81205857, "learning_rate": 3.4285292915795166e-06, "loss": 0.83109629, "num_input_tokens_seen": 96896160, "step": 4480, "time_per_iteration": 2.687922954559326 }, { "auxiliary_loss_clip": 0.01098242, "auxiliary_loss_mlp": 0.01051142, "balance_loss_clip": 1.04720628, "balance_loss_mlp": 1.03243792, "epoch": 0.2694122952051706, "flos": 20994383836800.0, "grad_norm": 1.5167677573266813, "language_loss": 0.77982032, "learning_rate": 3.4282566897236543e-06, "loss": 0.80131412, "num_input_tokens_seen": 96915410, "step": 4481, "time_per_iteration": 2.783400058746338 }, { "auxiliary_loss_clip": 0.01138325, "auxiliary_loss_mlp": 0.01055373, "balance_loss_clip": 1.05098486, "balance_loss_mlp": 1.03693104, "epoch": 0.2694724184578386, "flos": 25849291011840.0, "grad_norm": 1.817845708033507, "language_loss": 0.74072635, "learning_rate": 3.4279840337081547e-06, "loss": 0.76266336, "num_input_tokens_seen": 96937865, "step": 4482, "time_per_iteration": 2.704923629760742 }, { "auxiliary_loss_clip": 0.01124372, "auxiliary_loss_mlp": 0.01046467, "balance_loss_clip": 1.05258846, "balance_loss_mlp": 1.02826333, "epoch": 0.26953254171050656, "flos": 21726961718400.0, "grad_norm": 2.016330221700464, "language_loss": 0.72562164, "learning_rate": 3.4277113235433584e-06, "loss": 0.74733007, "num_input_tokens_seen": 96957710, "step": 4483, "time_per_iteration": 2.697889804840088 }, { "auxiliary_loss_clip": 0.0113896, "auxiliary_loss_mlp": 0.01056121, "balance_loss_clip": 1.04867983, "balance_loss_mlp": 1.03658295, "epoch": 0.2695926649631745, "flos": 19682926369920.0, "grad_norm": 2.3663265895203356, "language_loss": 0.86904967, "learning_rate": 3.427438559239605e-06, "loss": 0.89100051, "num_input_tokens_seen": 96975890, "step": 4484, "time_per_iteration": 2.6893441677093506 }, { "auxiliary_loss_clip": 0.01139698, "auxiliary_loss_mlp": 0.01049025, "balance_loss_clip": 1.05224931, "balance_loss_mlp": 1.03148949, "epoch": 0.2696527882158425, "flos": 32886596724480.0, "grad_norm": 1.783447205979712, "language_loss": 0.6663093, "learning_rate": 3.427165740807239e-06, "loss": 0.68819648, "num_input_tokens_seen": 96998595, "step": 4485, "time_per_iteration": 2.795172929763794 }, { "auxiliary_loss_clip": 0.01112833, "auxiliary_loss_mlp": 0.01053324, "balance_loss_clip": 1.04507363, "balance_loss_mlp": 1.03475094, "epoch": 0.26971291146851045, "flos": 12124843320960.0, "grad_norm": 2.5437851063433743, "language_loss": 0.73155308, "learning_rate": 3.426892868256604e-06, "loss": 0.75321472, "num_input_tokens_seen": 97013715, "step": 4486, "time_per_iteration": 2.6854116916656494 }, { "auxiliary_loss_clip": 0.01156209, "auxiliary_loss_mlp": 0.01047906, "balance_loss_clip": 1.05688012, "balance_loss_mlp": 1.03062034, "epoch": 0.2697730347211784, "flos": 22634459856000.0, "grad_norm": 2.2389379935408456, "language_loss": 0.84326887, "learning_rate": 3.4266199415980495e-06, "loss": 0.86531007, "num_input_tokens_seen": 97031570, "step": 4487, "time_per_iteration": 2.6117801666259766 }, { "auxiliary_loss_clip": 0.01127332, "auxiliary_loss_mlp": 0.0105083, "balance_loss_clip": 1.05733204, "balance_loss_mlp": 1.03228104, "epoch": 0.2698331579738464, "flos": 23513050523520.0, "grad_norm": 2.345170862120161, "language_loss": 0.7189706, "learning_rate": 3.4263469608419234e-06, "loss": 0.74075222, "num_input_tokens_seen": 97049815, "step": 4488, "time_per_iteration": 2.7384660243988037 }, { "auxiliary_loss_clip": 0.01074601, "auxiliary_loss_mlp": 0.01061378, "balance_loss_clip": 1.0494225, "balance_loss_mlp": 1.04040885, "epoch": 0.26989328122651435, "flos": 24641040297600.0, "grad_norm": 1.6359957516545125, "language_loss": 0.83725536, "learning_rate": 3.426073925998578e-06, "loss": 0.85861516, "num_input_tokens_seen": 97067570, "step": 4489, "time_per_iteration": 2.9274613857269287 }, { "auxiliary_loss_clip": 0.01129648, "auxiliary_loss_mlp": 0.01061235, "balance_loss_clip": 1.05630314, "balance_loss_mlp": 1.04203057, "epoch": 0.2699534044791823, "flos": 10772555068800.0, "grad_norm": 2.6678463269995785, "language_loss": 0.90056908, "learning_rate": 3.4258008370783656e-06, "loss": 0.9224779, "num_input_tokens_seen": 97082180, "step": 4490, "time_per_iteration": 2.9096486568450928 }, { "auxiliary_loss_clip": 0.01075397, "auxiliary_loss_mlp": 0.01052666, "balance_loss_clip": 1.04493999, "balance_loss_mlp": 1.03319883, "epoch": 0.2700135277318503, "flos": 36171597098880.0, "grad_norm": 2.0876908666200573, "language_loss": 0.73380542, "learning_rate": 3.4255276940916434e-06, "loss": 0.75508606, "num_input_tokens_seen": 97103470, "step": 4491, "time_per_iteration": 2.9016802310943604 }, { "auxiliary_loss_clip": 0.01156852, "auxiliary_loss_mlp": 0.01052294, "balance_loss_clip": 1.05944943, "balance_loss_mlp": 1.03453195, "epoch": 0.27007365098451824, "flos": 17418614866560.0, "grad_norm": 2.7575700534068783, "language_loss": 0.74795783, "learning_rate": 3.4252544970487676e-06, "loss": 0.77004933, "num_input_tokens_seen": 97118100, "step": 4492, "time_per_iteration": 2.6685187816619873 }, { "auxiliary_loss_clip": 0.01130467, "auxiliary_loss_mlp": 0.01050253, "balance_loss_clip": 1.05300546, "balance_loss_mlp": 1.03205013, "epoch": 0.2701337742371862, "flos": 23185688947200.0, "grad_norm": 3.551039047250381, "language_loss": 0.89015245, "learning_rate": 3.4249812459600986e-06, "loss": 0.91195965, "num_input_tokens_seen": 97136765, "step": 4493, "time_per_iteration": 2.7044742107391357 }, { "auxiliary_loss_clip": 0.01142037, "auxiliary_loss_mlp": 0.0104825, "balance_loss_clip": 1.05408192, "balance_loss_mlp": 1.03079772, "epoch": 0.2701938974898542, "flos": 24389450461440.0, "grad_norm": 1.665337194117132, "language_loss": 0.71139705, "learning_rate": 3.424707940835998e-06, "loss": 0.73329991, "num_input_tokens_seen": 97157470, "step": 4494, "time_per_iteration": 2.6299519538879395 }, { "auxiliary_loss_clip": 0.01120214, "auxiliary_loss_mlp": 0.01045805, "balance_loss_clip": 1.05193532, "balance_loss_mlp": 1.02893662, "epoch": 0.2702540207425222, "flos": 26214322976640.0, "grad_norm": 2.4718809008283045, "language_loss": 0.8642354, "learning_rate": 3.42443458168683e-06, "loss": 0.88589561, "num_input_tokens_seen": 97176905, "step": 4495, "time_per_iteration": 2.627389907836914 }, { "auxiliary_loss_clip": 0.01151814, "auxiliary_loss_mlp": 0.0105053, "balance_loss_clip": 1.05591631, "balance_loss_mlp": 1.03308964, "epoch": 0.27031414399519016, "flos": 22926377687040.0, "grad_norm": 2.1521214825296844, "language_loss": 0.76781964, "learning_rate": 3.424161168522959e-06, "loss": 0.78984308, "num_input_tokens_seen": 97196380, "step": 4496, "time_per_iteration": 2.5360703468322754 }, { "auxiliary_loss_clip": 0.01064272, "auxiliary_loss_mlp": 0.01049575, "balance_loss_clip": 1.03151321, "balance_loss_mlp": 1.04716671, "epoch": 0.2703742672478581, "flos": 63019780404480.0, "grad_norm": 0.7153442156657138, "language_loss": 0.50134224, "learning_rate": 3.423887701354754e-06, "loss": 0.52248067, "num_input_tokens_seen": 97260100, "step": 4497, "time_per_iteration": 3.1133949756622314 }, { "auxiliary_loss_clip": 0.01106563, "auxiliary_loss_mlp": 0.01051954, "balance_loss_clip": 1.05492568, "balance_loss_mlp": 1.03482318, "epoch": 0.2704343905005261, "flos": 18840820942080.0, "grad_norm": 2.421164292554959, "language_loss": 0.72386497, "learning_rate": 3.4236141801925847e-06, "loss": 0.74545014, "num_input_tokens_seen": 97277935, "step": 4498, "time_per_iteration": 2.7409775257110596 }, { "auxiliary_loss_clip": 0.01038432, "auxiliary_loss_mlp": 0.01028244, "balance_loss_clip": 1.0322926, "balance_loss_mlp": 1.02582395, "epoch": 0.27049451375319405, "flos": 71233412618880.0, "grad_norm": 0.7537228186848703, "language_loss": 0.5917033, "learning_rate": 3.4233406050468237e-06, "loss": 0.61237001, "num_input_tokens_seen": 97338845, "step": 4499, "time_per_iteration": 3.2331602573394775 }, { "auxiliary_loss_clip": 0.01124574, "auxiliary_loss_mlp": 0.01044613, "balance_loss_clip": 1.05154204, "balance_loss_mlp": 1.02593243, "epoch": 0.270554637005862, "flos": 24278594112000.0, "grad_norm": 2.1159538878254756, "language_loss": 0.73629957, "learning_rate": 3.4230669759278438e-06, "loss": 0.75799143, "num_input_tokens_seen": 97356640, "step": 4500, "time_per_iteration": 2.7513487339019775 }, { "auxiliary_loss_clip": 0.01116688, "auxiliary_loss_mlp": 0.01047016, "balance_loss_clip": 1.04657793, "balance_loss_mlp": 1.02878881, "epoch": 0.27061476025853, "flos": 17632318832640.0, "grad_norm": 2.8997006330289925, "language_loss": 0.81041664, "learning_rate": 3.4227932928460215e-06, "loss": 0.83205366, "num_input_tokens_seen": 97372585, "step": 4501, "time_per_iteration": 2.703014850616455 }, { "auxiliary_loss_clip": 0.01104056, "auxiliary_loss_mlp": 0.01053779, "balance_loss_clip": 1.04828477, "balance_loss_mlp": 1.03331053, "epoch": 0.27067488351119795, "flos": 22710123855360.0, "grad_norm": 4.2139696132912565, "language_loss": 0.7261312, "learning_rate": 3.422519555811735e-06, "loss": 0.74770957, "num_input_tokens_seen": 97393315, "step": 4502, "time_per_iteration": 2.732167959213257 }, { "auxiliary_loss_clip": 0.01129704, "auxiliary_loss_mlp": 0.01047167, "balance_loss_clip": 1.04821455, "balance_loss_mlp": 1.0268774, "epoch": 0.2707350067638659, "flos": 41719616087040.0, "grad_norm": 1.748421457410976, "language_loss": 0.67973912, "learning_rate": 3.4222457648353642e-06, "loss": 0.70150787, "num_input_tokens_seen": 97417860, "step": 4503, "time_per_iteration": 2.7950186729431152 }, { "auxiliary_loss_clip": 0.01100008, "auxiliary_loss_mlp": 0.01051668, "balance_loss_clip": 1.04750037, "balance_loss_mlp": 1.03180754, "epoch": 0.2707951300165339, "flos": 20193037367040.0, "grad_norm": 1.847411158173202, "language_loss": 0.67971921, "learning_rate": 3.4219719199272918e-06, "loss": 0.70123595, "num_input_tokens_seen": 97436780, "step": 4504, "time_per_iteration": 2.7830374240875244 }, { "auxiliary_loss_clip": 0.01142201, "auxiliary_loss_mlp": 0.01052204, "balance_loss_clip": 1.05604792, "balance_loss_mlp": 1.03451371, "epoch": 0.27085525326920185, "flos": 21433966479360.0, "grad_norm": 1.4870002594081857, "language_loss": 0.75395846, "learning_rate": 3.421698021097902e-06, "loss": 0.77590245, "num_input_tokens_seen": 97456190, "step": 4505, "time_per_iteration": 2.6758666038513184 }, { "auxiliary_loss_clip": 0.01155407, "auxiliary_loss_mlp": 0.01064618, "balance_loss_clip": 1.05439496, "balance_loss_mlp": 1.04436409, "epoch": 0.2709153765218698, "flos": 17675232606720.0, "grad_norm": 2.0635482699578254, "language_loss": 0.73474276, "learning_rate": 3.42142406835758e-06, "loss": 0.75694299, "num_input_tokens_seen": 97474545, "step": 4506, "time_per_iteration": 2.652395009994507 }, { "auxiliary_loss_clip": 0.01130629, "auxiliary_loss_mlp": 0.01053462, "balance_loss_clip": 1.05147469, "balance_loss_mlp": 1.0338285, "epoch": 0.2709754997745378, "flos": 24456243801600.0, "grad_norm": 2.6352592870517144, "language_loss": 0.80730569, "learning_rate": 3.421150061716715e-06, "loss": 0.82914662, "num_input_tokens_seen": 97494520, "step": 4507, "time_per_iteration": 2.7858307361602783 }, { "auxiliary_loss_clip": 0.01041671, "auxiliary_loss_mlp": 0.010698, "balance_loss_clip": 1.0261147, "balance_loss_mlp": 1.0667243, "epoch": 0.2710356230272058, "flos": 65210798206080.0, "grad_norm": 0.7655673562950965, "language_loss": 0.5085085, "learning_rate": 3.420876001185698e-06, "loss": 0.52962321, "num_input_tokens_seen": 97552455, "step": 4508, "time_per_iteration": 3.144418716430664 }, { "auxiliary_loss_clip": 0.01072779, "auxiliary_loss_mlp": 0.01046589, "balance_loss_clip": 1.04359698, "balance_loss_mlp": 1.02843356, "epoch": 0.27109574627987376, "flos": 25484438615040.0, "grad_norm": 1.9710162430227722, "language_loss": 0.74710357, "learning_rate": 3.4206018867749197e-06, "loss": 0.76829731, "num_input_tokens_seen": 97572650, "step": 4509, "time_per_iteration": 2.8052053451538086 }, { "auxiliary_loss_clip": 0.01130819, "auxiliary_loss_mlp": 0.01042284, "balance_loss_clip": 1.05107474, "balance_loss_mlp": 1.0254159, "epoch": 0.2711558695325417, "flos": 19682782715520.0, "grad_norm": 2.0468089657674353, "language_loss": 0.70937192, "learning_rate": 3.4203277184947757e-06, "loss": 0.73110294, "num_input_tokens_seen": 97591150, "step": 4510, "time_per_iteration": 2.6244139671325684 }, { "auxiliary_loss_clip": 0.01135912, "auxiliary_loss_mlp": 0.0103914, "balance_loss_clip": 1.05330467, "balance_loss_mlp": 1.02156901, "epoch": 0.2712159927852097, "flos": 18587758648320.0, "grad_norm": 2.4701723872261256, "language_loss": 0.70409644, "learning_rate": 3.4200534963556627e-06, "loss": 0.72584701, "num_input_tokens_seen": 97607410, "step": 4511, "time_per_iteration": 4.112820863723755 }, { "auxiliary_loss_clip": 0.0112023, "auxiliary_loss_mlp": 0.01049105, "balance_loss_clip": 1.048491, "balance_loss_mlp": 1.03115225, "epoch": 0.27127611603787766, "flos": 25630235919360.0, "grad_norm": 6.028868725677894, "language_loss": 0.81324005, "learning_rate": 3.419779220367979e-06, "loss": 0.83493352, "num_input_tokens_seen": 97626870, "step": 4512, "time_per_iteration": 4.285844087600708 }, { "auxiliary_loss_clip": 0.01147816, "auxiliary_loss_mlp": 0.01038614, "balance_loss_clip": 1.05365086, "balance_loss_mlp": 1.02323616, "epoch": 0.2713362392905456, "flos": 23148952312320.0, "grad_norm": 2.7707983308205053, "language_loss": 0.80467856, "learning_rate": 3.419504890542124e-06, "loss": 0.82654285, "num_input_tokens_seen": 97646595, "step": 4513, "time_per_iteration": 4.415290117263794 }, { "auxiliary_loss_clip": 0.01119685, "auxiliary_loss_mlp": 0.01044412, "balance_loss_clip": 1.04594898, "balance_loss_mlp": 1.02709103, "epoch": 0.2713963625432136, "flos": 18366045949440.0, "grad_norm": 1.8005970142501413, "language_loss": 0.88150048, "learning_rate": 3.4192305068885026e-06, "loss": 0.90314144, "num_input_tokens_seen": 97665485, "step": 4514, "time_per_iteration": 2.691697835922241 }, { "auxiliary_loss_clip": 0.01129072, "auxiliary_loss_mlp": 0.01051817, "balance_loss_clip": 1.05358005, "balance_loss_mlp": 1.03337574, "epoch": 0.27145648579588155, "flos": 22491751121280.0, "grad_norm": 1.6419144417830658, "language_loss": 0.91461927, "learning_rate": 3.418956069417517e-06, "loss": 0.93642819, "num_input_tokens_seen": 97683800, "step": 4515, "time_per_iteration": 2.6709890365600586 }, { "auxiliary_loss_clip": 0.01100451, "auxiliary_loss_mlp": 0.01057835, "balance_loss_clip": 1.04920852, "balance_loss_mlp": 1.03761721, "epoch": 0.2715166090485495, "flos": 19239177749760.0, "grad_norm": 2.0250040358395944, "language_loss": 0.74093282, "learning_rate": 3.4186815781395756e-06, "loss": 0.76251566, "num_input_tokens_seen": 97700505, "step": 4516, "time_per_iteration": 2.7001607418060303 }, { "auxiliary_loss_clip": 0.01136738, "auxiliary_loss_mlp": 0.01052795, "balance_loss_clip": 1.05046439, "balance_loss_mlp": 1.03483033, "epoch": 0.2715767323012175, "flos": 17709598944000.0, "grad_norm": 2.811509606055916, "language_loss": 0.75989574, "learning_rate": 3.4184070330650866e-06, "loss": 0.78179109, "num_input_tokens_seen": 97717410, "step": 4517, "time_per_iteration": 4.207966089248657 }, { "auxiliary_loss_clip": 0.01097642, "auxiliary_loss_mlp": 0.01058771, "balance_loss_clip": 1.04378986, "balance_loss_mlp": 1.03962636, "epoch": 0.27163685555388545, "flos": 22382834106240.0, "grad_norm": 2.3161178488466097, "language_loss": 0.77046895, "learning_rate": 3.4181324342044607e-06, "loss": 0.79203308, "num_input_tokens_seen": 97734545, "step": 4518, "time_per_iteration": 2.754009246826172 }, { "auxiliary_loss_clip": 0.01118909, "auxiliary_loss_mlp": 0.01047823, "balance_loss_clip": 1.05136919, "balance_loss_mlp": 1.03077579, "epoch": 0.2716969788065534, "flos": 22346708002560.0, "grad_norm": 2.717268994046331, "language_loss": 0.68388188, "learning_rate": 3.41785778156811e-06, "loss": 0.70554924, "num_input_tokens_seen": 97754000, "step": 4519, "time_per_iteration": 2.7800872325897217 }, { "auxiliary_loss_clip": 0.01134075, "auxiliary_loss_mlp": 0.01053278, "balance_loss_clip": 1.05009973, "balance_loss_mlp": 1.03611171, "epoch": 0.2717571020592214, "flos": 25228467319680.0, "grad_norm": 2.367483937305651, "language_loss": 0.75572526, "learning_rate": 3.417583075166451e-06, "loss": 0.7775988, "num_input_tokens_seen": 97772080, "step": 4520, "time_per_iteration": 2.694591760635376 }, { "auxiliary_loss_clip": 0.01138275, "auxiliary_loss_mlp": 0.0106095, "balance_loss_clip": 1.05209494, "balance_loss_mlp": 1.04226971, "epoch": 0.2718172253118894, "flos": 20189769229440.0, "grad_norm": 3.3698654303080935, "language_loss": 0.76434267, "learning_rate": 3.4173083150099e-06, "loss": 0.78633487, "num_input_tokens_seen": 97789370, "step": 4521, "time_per_iteration": 2.675443649291992 }, { "auxiliary_loss_clip": 0.01117262, "auxiliary_loss_mlp": 0.0106414, "balance_loss_clip": 1.04636955, "balance_loss_mlp": 1.04578209, "epoch": 0.27187734856455736, "flos": 14319129260160.0, "grad_norm": 2.1933848209734936, "language_loss": 0.75041616, "learning_rate": 3.417033501108875e-06, "loss": 0.77223015, "num_input_tokens_seen": 97807385, "step": 4522, "time_per_iteration": 2.769519329071045 }, { "auxiliary_loss_clip": 0.01151707, "auxiliary_loss_mlp": 0.01045506, "balance_loss_clip": 1.05433989, "balance_loss_mlp": 1.02813768, "epoch": 0.27193747181722533, "flos": 21107682311040.0, "grad_norm": 1.9328965147806931, "language_loss": 0.73074079, "learning_rate": 3.416758633473798e-06, "loss": 0.75271285, "num_input_tokens_seen": 97827930, "step": 4523, "time_per_iteration": 2.6642134189605713 }, { "auxiliary_loss_clip": 0.01120278, "auxiliary_loss_mlp": 0.01048373, "balance_loss_clip": 1.05034256, "balance_loss_mlp": 1.03014588, "epoch": 0.2719975950698933, "flos": 19682782715520.0, "grad_norm": 1.3899676528871532, "language_loss": 0.74113363, "learning_rate": 3.4164837121150915e-06, "loss": 0.76282012, "num_input_tokens_seen": 97847440, "step": 4524, "time_per_iteration": 2.6365647315979004 }, { "auxiliary_loss_clip": 0.0115251, "auxiliary_loss_mlp": 0.01059779, "balance_loss_clip": 1.05642283, "balance_loss_mlp": 1.04233861, "epoch": 0.27205771832256126, "flos": 24754482426240.0, "grad_norm": 1.6567279945506783, "language_loss": 0.7639389, "learning_rate": 3.4162087370431803e-06, "loss": 0.78606176, "num_input_tokens_seen": 97867620, "step": 4525, "time_per_iteration": 2.7116904258728027 }, { "auxiliary_loss_clip": 0.01133976, "auxiliary_loss_mlp": 0.01063183, "balance_loss_clip": 1.05110538, "balance_loss_mlp": 1.0458858, "epoch": 0.2721178415752292, "flos": 21755581879680.0, "grad_norm": 1.8049087044415455, "language_loss": 0.81449121, "learning_rate": 3.4159337082684926e-06, "loss": 0.8364628, "num_input_tokens_seen": 97884345, "step": 4526, "time_per_iteration": 2.583151340484619 }, { "auxiliary_loss_clip": 0.01150721, "auxiliary_loss_mlp": 0.01050593, "balance_loss_clip": 1.05157495, "balance_loss_mlp": 1.03235435, "epoch": 0.2721779648278972, "flos": 12676826597760.0, "grad_norm": 2.689071598576449, "language_loss": 0.77230763, "learning_rate": 3.4156586258014566e-06, "loss": 0.79432082, "num_input_tokens_seen": 97901500, "step": 4527, "time_per_iteration": 2.6060924530029297 }, { "auxiliary_loss_clip": 0.01109469, "auxiliary_loss_mlp": 0.00777538, "balance_loss_clip": 1.04898691, "balance_loss_mlp": 1.00073338, "epoch": 0.27223808808056515, "flos": 16253206099200.0, "grad_norm": 2.5564103940467313, "language_loss": 0.8187297, "learning_rate": 3.415383489652503e-06, "loss": 0.83759975, "num_input_tokens_seen": 97917800, "step": 4528, "time_per_iteration": 2.697845458984375 }, { "auxiliary_loss_clip": 0.01116518, "auxiliary_loss_mlp": 0.01058829, "balance_loss_clip": 1.05005443, "balance_loss_mlp": 1.04094744, "epoch": 0.2722982113332331, "flos": 27745805203200.0, "grad_norm": 1.774189879269534, "language_loss": 0.77156031, "learning_rate": 3.4151082998320666e-06, "loss": 0.7933138, "num_input_tokens_seen": 97937225, "step": 4529, "time_per_iteration": 2.75425124168396 }, { "auxiliary_loss_clip": 0.01123493, "auxiliary_loss_mlp": 0.01053103, "balance_loss_clip": 1.0518961, "balance_loss_mlp": 1.03634179, "epoch": 0.2723583345859011, "flos": 21726243446400.0, "grad_norm": 2.104422440945624, "language_loss": 0.82359695, "learning_rate": 3.4148330563505805e-06, "loss": 0.84536296, "num_input_tokens_seen": 97956845, "step": 4530, "time_per_iteration": 2.6822023391723633 }, { "auxiliary_loss_clip": 0.01136812, "auxiliary_loss_mlp": 0.01047087, "balance_loss_clip": 1.05334496, "balance_loss_mlp": 1.02971828, "epoch": 0.27241845783856905, "flos": 17347260499200.0, "grad_norm": 2.321764638586046, "language_loss": 0.91554427, "learning_rate": 3.4145577592184838e-06, "loss": 0.93738323, "num_input_tokens_seen": 97972465, "step": 4531, "time_per_iteration": 2.6979331970214844 }, { "auxiliary_loss_clip": 0.01138188, "auxiliary_loss_mlp": 0.01046663, "balance_loss_clip": 1.05187678, "balance_loss_mlp": 1.02856672, "epoch": 0.272478581091237, "flos": 24754302858240.0, "grad_norm": 1.9110068503115385, "language_loss": 0.76398945, "learning_rate": 3.4142824084462155e-06, "loss": 0.78583801, "num_input_tokens_seen": 97990770, "step": 4532, "time_per_iteration": 2.6663877964019775 }, { "auxiliary_loss_clip": 0.01113354, "auxiliary_loss_mlp": 0.01040904, "balance_loss_clip": 1.05224109, "balance_loss_mlp": 1.02386856, "epoch": 0.272538704343905, "flos": 17890624512000.0, "grad_norm": 2.311201731752709, "language_loss": 0.88514459, "learning_rate": 3.4140070040442162e-06, "loss": 0.90668714, "num_input_tokens_seen": 98005775, "step": 4533, "time_per_iteration": 2.693161725997925 }, { "auxiliary_loss_clip": 0.01122748, "auxiliary_loss_mlp": 0.01040937, "balance_loss_clip": 1.05127299, "balance_loss_mlp": 1.02398562, "epoch": 0.272598827596573, "flos": 22932016122240.0, "grad_norm": 2.2174577403643245, "language_loss": 0.71288157, "learning_rate": 3.413731546022929e-06, "loss": 0.73451841, "num_input_tokens_seen": 98025750, "step": 4534, "time_per_iteration": 2.7371840476989746 }, { "auxiliary_loss_clip": 0.01121649, "auxiliary_loss_mlp": 0.01040323, "balance_loss_clip": 1.05089378, "balance_loss_mlp": 1.02177453, "epoch": 0.27265895084924097, "flos": 24238409771520.0, "grad_norm": 1.6997646677502514, "language_loss": 0.91605014, "learning_rate": 3.4134560343928005e-06, "loss": 0.93766987, "num_input_tokens_seen": 98044955, "step": 4535, "time_per_iteration": 2.72127103805542 }, { "auxiliary_loss_clip": 0.0113065, "auxiliary_loss_mlp": 0.01045251, "balance_loss_clip": 1.05495596, "balance_loss_mlp": 1.02739298, "epoch": 0.27271907410190893, "flos": 27013155494400.0, "grad_norm": 1.6448383128638457, "language_loss": 0.72919363, "learning_rate": 3.4131804691642778e-06, "loss": 0.7509526, "num_input_tokens_seen": 98065860, "step": 4536, "time_per_iteration": 2.778991460800171 }, { "auxiliary_loss_clip": 0.01137601, "auxiliary_loss_mlp": 0.01044231, "balance_loss_clip": 1.05134857, "balance_loss_mlp": 1.02601612, "epoch": 0.2727791973545769, "flos": 34452588942720.0, "grad_norm": 1.7760428855271044, "language_loss": 0.71682841, "learning_rate": 3.41290485034781e-06, "loss": 0.73864675, "num_input_tokens_seen": 98085450, "step": 4537, "time_per_iteration": 2.7746009826660156 }, { "auxiliary_loss_clip": 0.01119602, "auxiliary_loss_mlp": 0.01042982, "balance_loss_clip": 1.04899096, "balance_loss_mlp": 1.02455187, "epoch": 0.27283932060724486, "flos": 15041723160960.0, "grad_norm": 2.103574663853892, "language_loss": 0.77419543, "learning_rate": 3.4126291779538485e-06, "loss": 0.79582125, "num_input_tokens_seen": 98099115, "step": 4538, "time_per_iteration": 2.6432113647460938 }, { "auxiliary_loss_clip": 0.011333, "auxiliary_loss_mlp": 0.01044735, "balance_loss_clip": 1.05075216, "balance_loss_mlp": 1.02784324, "epoch": 0.2728994438599128, "flos": 21652411040640.0, "grad_norm": 1.824827492408775, "language_loss": 0.90160263, "learning_rate": 3.412353451992847e-06, "loss": 0.923383, "num_input_tokens_seen": 98118415, "step": 4539, "time_per_iteration": 2.620088815689087 }, { "auxiliary_loss_clip": 0.0112346, "auxiliary_loss_mlp": 0.01044264, "balance_loss_clip": 1.04970992, "balance_loss_mlp": 1.0250001, "epoch": 0.2729595671125808, "flos": 17488424949120.0, "grad_norm": 1.7778813807473632, "language_loss": 0.88033229, "learning_rate": 3.4120776724752607e-06, "loss": 0.90200949, "num_input_tokens_seen": 98136300, "step": 4540, "time_per_iteration": 2.7115092277526855 }, { "auxiliary_loss_clip": 0.01139055, "auxiliary_loss_mlp": 0.00775653, "balance_loss_clip": 1.0515871, "balance_loss_mlp": 1.00068974, "epoch": 0.27301969036524876, "flos": 19318145800320.0, "grad_norm": 3.2240434674097758, "language_loss": 0.82471287, "learning_rate": 3.4118018394115476e-06, "loss": 0.84385997, "num_input_tokens_seen": 98154580, "step": 4541, "time_per_iteration": 2.6112682819366455 }, { "auxiliary_loss_clip": 0.01123955, "auxiliary_loss_mlp": 0.01045117, "balance_loss_clip": 1.05166435, "balance_loss_mlp": 1.02798617, "epoch": 0.2730798136179167, "flos": 21065666376960.0, "grad_norm": 2.102491799578544, "language_loss": 0.79535306, "learning_rate": 3.4115259528121678e-06, "loss": 0.81704378, "num_input_tokens_seen": 98173115, "step": 4542, "time_per_iteration": 2.7202932834625244 }, { "auxiliary_loss_clip": 0.01130053, "auxiliary_loss_mlp": 0.0103993, "balance_loss_clip": 1.05406725, "balance_loss_mlp": 1.02263296, "epoch": 0.2731399368705847, "flos": 19171737964800.0, "grad_norm": 1.955696716620197, "language_loss": 0.89326978, "learning_rate": 3.411250012687582e-06, "loss": 0.91496956, "num_input_tokens_seen": 98190260, "step": 4543, "time_per_iteration": 2.6846654415130615 }, { "auxiliary_loss_clip": 0.01118776, "auxiliary_loss_mlp": 0.00776653, "balance_loss_clip": 1.04913735, "balance_loss_mlp": 1.00080073, "epoch": 0.27320006012325265, "flos": 18290130554880.0, "grad_norm": 2.4410785724718997, "language_loss": 0.64012986, "learning_rate": 3.410974019048255e-06, "loss": 0.65908414, "num_input_tokens_seen": 98207115, "step": 4544, "time_per_iteration": 2.6373775005340576 }, { "auxiliary_loss_clip": 0.01123945, "auxiliary_loss_mlp": 0.01044578, "balance_loss_clip": 1.05455351, "balance_loss_mlp": 1.02582633, "epoch": 0.2732601833759206, "flos": 34860929731200.0, "grad_norm": 3.5876362405970643, "language_loss": 0.69788039, "learning_rate": 3.410697971904651e-06, "loss": 0.71956557, "num_input_tokens_seen": 98230610, "step": 4545, "time_per_iteration": 2.7943291664123535 }, { "auxiliary_loss_clip": 0.0103839, "auxiliary_loss_mlp": 0.01023664, "balance_loss_clip": 1.02576709, "balance_loss_mlp": 1.02123213, "epoch": 0.2733203066285886, "flos": 53910824762880.0, "grad_norm": 0.7314456658795918, "language_loss": 0.61636353, "learning_rate": 3.4104218712672383e-06, "loss": 0.63698411, "num_input_tokens_seen": 98293585, "step": 4546, "time_per_iteration": 3.2244455814361572 }, { "auxiliary_loss_clip": 0.0105925, "auxiliary_loss_mlp": 0.01053726, "balance_loss_clip": 1.04915786, "balance_loss_mlp": 1.03472424, "epoch": 0.2733804298812566, "flos": 20660378244480.0, "grad_norm": 1.905103737754333, "language_loss": 0.6467241, "learning_rate": 3.410145717146488e-06, "loss": 0.66785389, "num_input_tokens_seen": 98311680, "step": 4547, "time_per_iteration": 2.7815287113189697 }, { "auxiliary_loss_clip": 0.01123347, "auxiliary_loss_mlp": 0.00774125, "balance_loss_clip": 1.05267262, "balance_loss_mlp": 1.00081313, "epoch": 0.27344055313392457, "flos": 25884339707520.0, "grad_norm": 1.90846373489731, "language_loss": 0.77248073, "learning_rate": 3.4098695095528694e-06, "loss": 0.79145551, "num_input_tokens_seen": 98330770, "step": 4548, "time_per_iteration": 2.8113017082214355 }, { "auxiliary_loss_clip": 0.01122557, "auxiliary_loss_mlp": 0.01050902, "balance_loss_clip": 1.05430245, "balance_loss_mlp": 1.03526139, "epoch": 0.27350067638659253, "flos": 22929753565440.0, "grad_norm": 1.9713428286290122, "language_loss": 0.82792878, "learning_rate": 3.4095932484968585e-06, "loss": 0.84966338, "num_input_tokens_seen": 98349860, "step": 4549, "time_per_iteration": 2.6938650608062744 }, { "auxiliary_loss_clip": 0.01135405, "auxiliary_loss_mlp": 0.01048728, "balance_loss_clip": 1.04898036, "balance_loss_mlp": 1.02902281, "epoch": 0.2735607996392605, "flos": 16574821499520.0, "grad_norm": 3.4543610040263655, "language_loss": 0.71193838, "learning_rate": 3.4093169339889305e-06, "loss": 0.73377967, "num_input_tokens_seen": 98367040, "step": 4550, "time_per_iteration": 2.638643503189087 }, { "auxiliary_loss_clip": 0.01107347, "auxiliary_loss_mlp": 0.01042242, "balance_loss_clip": 1.05066109, "balance_loss_mlp": 1.02569556, "epoch": 0.27362092289192846, "flos": 19645291895040.0, "grad_norm": 3.3050607953849576, "language_loss": 0.78899491, "learning_rate": 3.409040566039563e-06, "loss": 0.81049079, "num_input_tokens_seen": 98384010, "step": 4551, "time_per_iteration": 4.352613210678101 }, { "auxiliary_loss_clip": 0.01107945, "auxiliary_loss_mlp": 0.01052105, "balance_loss_clip": 1.04898548, "balance_loss_mlp": 1.03342533, "epoch": 0.27368104614459643, "flos": 17639142416640.0, "grad_norm": 2.480443972085862, "language_loss": 0.71220398, "learning_rate": 3.4087641446592362e-06, "loss": 0.73380452, "num_input_tokens_seen": 98399625, "step": 4552, "time_per_iteration": 4.194540739059448 }, { "auxiliary_loss_clip": 0.01123037, "auxiliary_loss_mlp": 0.01045225, "balance_loss_clip": 1.05144608, "balance_loss_mlp": 1.0275104, "epoch": 0.2737411693972644, "flos": 21580015178880.0, "grad_norm": 2.1026303213651967, "language_loss": 0.71636003, "learning_rate": 3.408487669858431e-06, "loss": 0.73804259, "num_input_tokens_seen": 98417310, "step": 4553, "time_per_iteration": 2.7323882579803467 }, { "auxiliary_loss_clip": 0.01134032, "auxiliary_loss_mlp": 0.01045217, "balance_loss_clip": 1.05039358, "balance_loss_mlp": 1.02658415, "epoch": 0.27380129264993236, "flos": 25484043565440.0, "grad_norm": 1.7325126580228065, "language_loss": 0.58917797, "learning_rate": 3.4082111416476337e-06, "loss": 0.6109705, "num_input_tokens_seen": 98438670, "step": 4554, "time_per_iteration": 2.7384533882141113 }, { "auxiliary_loss_clip": 0.01129927, "auxiliary_loss_mlp": 0.01042216, "balance_loss_clip": 1.05440903, "balance_loss_mlp": 1.02400088, "epoch": 0.2738614159026003, "flos": 18661196004480.0, "grad_norm": 1.7915916386168997, "language_loss": 0.73645991, "learning_rate": 3.4079345600373275e-06, "loss": 0.75818133, "num_input_tokens_seen": 98456060, "step": 4555, "time_per_iteration": 2.742417335510254 }, { "auxiliary_loss_clip": 0.01141373, "auxiliary_loss_mlp": 0.01039158, "balance_loss_clip": 1.0561738, "balance_loss_mlp": 1.02152658, "epoch": 0.2739215391552683, "flos": 23477139901440.0, "grad_norm": 2.8904145278515303, "language_loss": 0.77755523, "learning_rate": 3.407657925038002e-06, "loss": 0.79936051, "num_input_tokens_seen": 98473765, "step": 4556, "time_per_iteration": 4.419378280639648 }, { "auxiliary_loss_clip": 0.01150896, "auxiliary_loss_mlp": 0.01049261, "balance_loss_clip": 1.05645621, "balance_loss_mlp": 1.02959132, "epoch": 0.27398166240793626, "flos": 17128636369920.0, "grad_norm": 7.460972643049535, "language_loss": 0.82236463, "learning_rate": 3.4073812366601473e-06, "loss": 0.84436619, "num_input_tokens_seen": 98490590, "step": 4557, "time_per_iteration": 2.6087756156921387 }, { "auxiliary_loss_clip": 0.01089746, "auxiliary_loss_mlp": 0.01046447, "balance_loss_clip": 1.04229808, "balance_loss_mlp": 1.02811229, "epoch": 0.2740417856606042, "flos": 23404744039680.0, "grad_norm": 2.034332886344347, "language_loss": 0.7293033, "learning_rate": 3.4071044949142547e-06, "loss": 0.75066525, "num_input_tokens_seen": 98510590, "step": 4558, "time_per_iteration": 2.7908921241760254 }, { "auxiliary_loss_clip": 0.0112554, "auxiliary_loss_mlp": 0.01051481, "balance_loss_clip": 1.05215442, "balance_loss_mlp": 1.03334939, "epoch": 0.2741019089132722, "flos": 12780428400000.0, "grad_norm": 2.134307291688894, "language_loss": 0.67842996, "learning_rate": 3.406827699810819e-06, "loss": 0.70020014, "num_input_tokens_seen": 98527875, "step": 4559, "time_per_iteration": 2.7246246337890625 }, { "auxiliary_loss_clip": 0.01121642, "auxiliary_loss_mlp": 0.01055203, "balance_loss_clip": 1.04958165, "balance_loss_mlp": 1.03646374, "epoch": 0.27416203216594015, "flos": 20631542601600.0, "grad_norm": 2.095192605103166, "language_loss": 0.7249226, "learning_rate": 3.4065508513603353e-06, "loss": 0.74669105, "num_input_tokens_seen": 98547575, "step": 4560, "time_per_iteration": 2.634526252746582 }, { "auxiliary_loss_clip": 0.01131443, "auxiliary_loss_mlp": 0.01049928, "balance_loss_clip": 1.05592251, "balance_loss_mlp": 1.03115225, "epoch": 0.27422215541860817, "flos": 26541576812160.0, "grad_norm": 2.095026193088577, "language_loss": 0.81413525, "learning_rate": 3.406273949573303e-06, "loss": 0.83594894, "num_input_tokens_seen": 98566290, "step": 4561, "time_per_iteration": 2.711106538772583 }, { "auxiliary_loss_clip": 0.01156737, "auxiliary_loss_mlp": 0.01043903, "balance_loss_clip": 1.05919766, "balance_loss_mlp": 1.02688003, "epoch": 0.27428227867127614, "flos": 23331163029120.0, "grad_norm": 1.7066421621801435, "language_loss": 0.75436246, "learning_rate": 3.4059969944602214e-06, "loss": 0.77636886, "num_input_tokens_seen": 98586255, "step": 4562, "time_per_iteration": 2.699544668197632 }, { "auxiliary_loss_clip": 0.01155238, "auxiliary_loss_mlp": 0.01038722, "balance_loss_clip": 1.06035113, "balance_loss_mlp": 1.02138865, "epoch": 0.2743424019239441, "flos": 23035115134080.0, "grad_norm": 1.784616644228294, "language_loss": 0.74751598, "learning_rate": 3.4057199860315928e-06, "loss": 0.76945561, "num_input_tokens_seen": 98606030, "step": 4563, "time_per_iteration": 2.788313627243042 }, { "auxiliary_loss_clip": 0.01119321, "auxiliary_loss_mlp": 0.01048987, "balance_loss_clip": 1.04918432, "balance_loss_mlp": 1.02912664, "epoch": 0.27440252517661207, "flos": 21981101420160.0, "grad_norm": 1.7657560231579414, "language_loss": 0.63026172, "learning_rate": 3.4054429242979213e-06, "loss": 0.65194476, "num_input_tokens_seen": 98625225, "step": 4564, "time_per_iteration": 2.810922145843506 }, { "auxiliary_loss_clip": 0.01128901, "auxiliary_loss_mlp": 0.01046032, "balance_loss_clip": 1.05438292, "balance_loss_mlp": 1.02732766, "epoch": 0.27446264842928003, "flos": 40187451502080.0, "grad_norm": 1.9571814389681148, "language_loss": 0.78683448, "learning_rate": 3.4051658092697135e-06, "loss": 0.8085838, "num_input_tokens_seen": 98649470, "step": 4565, "time_per_iteration": 2.846803665161133 }, { "auxiliary_loss_clip": 0.01095875, "auxiliary_loss_mlp": 0.01050978, "balance_loss_clip": 1.04981828, "balance_loss_mlp": 1.03370428, "epoch": 0.274522771681948, "flos": 13479681438720.0, "grad_norm": 2.4708024317398003, "language_loss": 0.68715227, "learning_rate": 3.404888640957477e-06, "loss": 0.70862079, "num_input_tokens_seen": 98666915, "step": 4566, "time_per_iteration": 2.714352607727051 }, { "auxiliary_loss_clip": 0.01142259, "auxiliary_loss_mlp": 0.01049797, "balance_loss_clip": 1.05835438, "balance_loss_mlp": 1.03326273, "epoch": 0.27458289493461596, "flos": 28622133313920.0, "grad_norm": 2.1203833431876435, "language_loss": 0.60966527, "learning_rate": 3.404611419371723e-06, "loss": 0.63158584, "num_input_tokens_seen": 98688240, "step": 4567, "time_per_iteration": 2.71791934967041 }, { "auxiliary_loss_clip": 0.01135855, "auxiliary_loss_mlp": 0.01047435, "balance_loss_clip": 1.05527198, "balance_loss_mlp": 1.02756321, "epoch": 0.2746430181872839, "flos": 20119815492480.0, "grad_norm": 4.134990661591929, "language_loss": 0.82529241, "learning_rate": 3.4043341445229627e-06, "loss": 0.84712529, "num_input_tokens_seen": 98708245, "step": 4568, "time_per_iteration": 2.6779236793518066 }, { "auxiliary_loss_clip": 0.01141648, "auxiliary_loss_mlp": 0.01037451, "balance_loss_clip": 1.06012177, "balance_loss_mlp": 1.01916456, "epoch": 0.2747031414399519, "flos": 20193468330240.0, "grad_norm": 2.0524329167860254, "language_loss": 0.68425417, "learning_rate": 3.4040568164217117e-06, "loss": 0.70604521, "num_input_tokens_seen": 98724575, "step": 4569, "time_per_iteration": 2.6595280170440674 }, { "auxiliary_loss_clip": 0.0111585, "auxiliary_loss_mlp": 0.01047943, "balance_loss_clip": 1.04627442, "balance_loss_mlp": 1.02938235, "epoch": 0.27476326469261986, "flos": 13516346246400.0, "grad_norm": 2.9457223850766283, "language_loss": 0.70966327, "learning_rate": 3.4037794350784848e-06, "loss": 0.73130119, "num_input_tokens_seen": 98740700, "step": 4570, "time_per_iteration": 2.7404215335845947 }, { "auxiliary_loss_clip": 0.01035018, "auxiliary_loss_mlp": 0.01027544, "balance_loss_clip": 1.03062916, "balance_loss_mlp": 1.02521896, "epoch": 0.2748233879452878, "flos": 65937127121280.0, "grad_norm": 0.7294499123437721, "language_loss": 0.55835986, "learning_rate": 3.4035020005038014e-06, "loss": 0.57898545, "num_input_tokens_seen": 98803030, "step": 4571, "time_per_iteration": 3.369403123855591 }, { "auxiliary_loss_clip": 0.01096573, "auxiliary_loss_mlp": 0.0104917, "balance_loss_clip": 1.0493505, "balance_loss_mlp": 1.03134847, "epoch": 0.2748835111979558, "flos": 17384212615680.0, "grad_norm": 2.8212366896407772, "language_loss": 0.78388298, "learning_rate": 3.4032245127081812e-06, "loss": 0.80534041, "num_input_tokens_seen": 98820505, "step": 4572, "time_per_iteration": 2.835817813873291 }, { "auxiliary_loss_clip": 0.01145371, "auxiliary_loss_mlp": 0.01038852, "balance_loss_clip": 1.05474758, "balance_loss_mlp": 1.02365255, "epoch": 0.27494363445062375, "flos": 23587565287680.0, "grad_norm": 3.882915196153325, "language_loss": 0.8126958, "learning_rate": 3.402946971702147e-06, "loss": 0.83453798, "num_input_tokens_seen": 98842150, "step": 4573, "time_per_iteration": 2.709415912628174 }, { "auxiliary_loss_clip": 0.01135124, "auxiliary_loss_mlp": 0.01042886, "balance_loss_clip": 1.0529685, "balance_loss_mlp": 1.0252434, "epoch": 0.2750037577032918, "flos": 17164582905600.0, "grad_norm": 1.740498780022663, "language_loss": 0.79043669, "learning_rate": 3.402669377496223e-06, "loss": 0.81221676, "num_input_tokens_seen": 98861050, "step": 4574, "time_per_iteration": 2.651921272277832 }, { "auxiliary_loss_clip": 0.01104251, "auxiliary_loss_mlp": 0.01052183, "balance_loss_clip": 1.05164313, "balance_loss_mlp": 1.03518367, "epoch": 0.27506388095595974, "flos": 24491903028480.0, "grad_norm": 2.03666793953709, "language_loss": 0.74517256, "learning_rate": 3.402391730100936e-06, "loss": 0.76673687, "num_input_tokens_seen": 98879695, "step": 4575, "time_per_iteration": 2.7622992992401123 }, { "auxiliary_loss_clip": 0.01126178, "auxiliary_loss_mlp": 0.01042992, "balance_loss_clip": 1.05188203, "balance_loss_mlp": 1.02700627, "epoch": 0.2751240042086277, "flos": 38764706722560.0, "grad_norm": 2.5671977719319745, "language_loss": 0.71951419, "learning_rate": 3.402114029526814e-06, "loss": 0.74120593, "num_input_tokens_seen": 98902035, "step": 4576, "time_per_iteration": 2.85740065574646 }, { "auxiliary_loss_clip": 0.01102681, "auxiliary_loss_mlp": 0.00778132, "balance_loss_clip": 1.0506314, "balance_loss_mlp": 1.00075579, "epoch": 0.27518412746129567, "flos": 26907039740160.0, "grad_norm": 1.8050360629969575, "language_loss": 0.73217857, "learning_rate": 3.4018362757843866e-06, "loss": 0.7509867, "num_input_tokens_seen": 98921835, "step": 4577, "time_per_iteration": 2.9024770259857178 }, { "auxiliary_loss_clip": 0.01130618, "auxiliary_loss_mlp": 0.01043838, "balance_loss_clip": 1.05657601, "balance_loss_mlp": 1.02571797, "epoch": 0.27524425071396363, "flos": 24900531125760.0, "grad_norm": 1.7818656930434014, "language_loss": 0.76073247, "learning_rate": 3.401558468884188e-06, "loss": 0.78247702, "num_input_tokens_seen": 98939610, "step": 4578, "time_per_iteration": 2.7173874378204346 }, { "auxiliary_loss_clip": 0.01120877, "auxiliary_loss_mlp": 0.01047646, "balance_loss_clip": 1.05252147, "balance_loss_mlp": 1.02741659, "epoch": 0.2753043739666316, "flos": 26288047641600.0, "grad_norm": 2.6134371594901773, "language_loss": 0.66563278, "learning_rate": 3.4012806088367516e-06, "loss": 0.68731803, "num_input_tokens_seen": 98962250, "step": 4579, "time_per_iteration": 2.730104446411133 }, { "auxiliary_loss_clip": 0.01113502, "auxiliary_loss_mlp": 0.01058443, "balance_loss_clip": 1.04683816, "balance_loss_mlp": 1.03911948, "epoch": 0.27536449721929956, "flos": 24206772867840.0, "grad_norm": 1.8779975195253575, "language_loss": 0.80174518, "learning_rate": 3.4010026956526137e-06, "loss": 0.82346463, "num_input_tokens_seen": 98981845, "step": 4580, "time_per_iteration": 2.8395349979400635 }, { "auxiliary_loss_clip": 0.01141995, "auxiliary_loss_mlp": 0.01050029, "balance_loss_clip": 1.05684924, "balance_loss_mlp": 1.02942991, "epoch": 0.27542462047196753, "flos": 19537272720000.0, "grad_norm": 1.5301552660019138, "language_loss": 0.67242241, "learning_rate": 3.4007247293423137e-06, "loss": 0.69434267, "num_input_tokens_seen": 99001855, "step": 4581, "time_per_iteration": 2.788644552230835 }, { "auxiliary_loss_clip": 0.01132258, "auxiliary_loss_mlp": 0.0104746, "balance_loss_clip": 1.0560689, "balance_loss_mlp": 1.03050864, "epoch": 0.2754847437246355, "flos": 14319165173760.0, "grad_norm": 1.785645052077455, "language_loss": 0.77915615, "learning_rate": 3.400446709916392e-06, "loss": 0.80095327, "num_input_tokens_seen": 99019880, "step": 4582, "time_per_iteration": 2.730393409729004 }, { "auxiliary_loss_clip": 0.0110084, "auxiliary_loss_mlp": 0.01042256, "balance_loss_clip": 1.05119133, "balance_loss_mlp": 1.02575767, "epoch": 0.27554486697730346, "flos": 18838773866880.0, "grad_norm": 1.737971373642785, "language_loss": 0.84479475, "learning_rate": 3.4001686373853895e-06, "loss": 0.86622572, "num_input_tokens_seen": 99037570, "step": 4583, "time_per_iteration": 2.7274270057678223 }, { "auxiliary_loss_clip": 0.01139632, "auxiliary_loss_mlp": 0.01044098, "balance_loss_clip": 1.05364764, "balance_loss_mlp": 1.02693176, "epoch": 0.2756049902299714, "flos": 22382295402240.0, "grad_norm": 1.6883560409679848, "language_loss": 0.67007428, "learning_rate": 3.3998905117598528e-06, "loss": 0.69191158, "num_input_tokens_seen": 99056875, "step": 4584, "time_per_iteration": 2.643176794052124 }, { "auxiliary_loss_clip": 0.01080495, "auxiliary_loss_mlp": 0.01054092, "balance_loss_clip": 1.04106402, "balance_loss_mlp": 1.03475666, "epoch": 0.2756651134826394, "flos": 19573901614080.0, "grad_norm": 1.8352571769398758, "language_loss": 0.77349764, "learning_rate": 3.399612333050327e-06, "loss": 0.79484355, "num_input_tokens_seen": 99074685, "step": 4585, "time_per_iteration": 2.6824886798858643 }, { "auxiliary_loss_clip": 0.01142822, "auxiliary_loss_mlp": 0.00775816, "balance_loss_clip": 1.05703616, "balance_loss_mlp": 1.00084651, "epoch": 0.27572523673530736, "flos": 23586559706880.0, "grad_norm": 1.697985370469672, "language_loss": 0.7201665, "learning_rate": 3.399334101267362e-06, "loss": 0.73935288, "num_input_tokens_seen": 99095300, "step": 4586, "time_per_iteration": 2.672872304916382 }, { "auxiliary_loss_clip": 0.01125604, "auxiliary_loss_mlp": 0.01038583, "balance_loss_clip": 1.05329537, "balance_loss_mlp": 1.02184618, "epoch": 0.2757853599879754, "flos": 22820118278400.0, "grad_norm": 2.166019285475688, "language_loss": 0.80385983, "learning_rate": 3.3990558164215073e-06, "loss": 0.82550168, "num_input_tokens_seen": 99115965, "step": 4587, "time_per_iteration": 2.716212272644043 }, { "auxiliary_loss_clip": 0.01139286, "auxiliary_loss_mlp": 0.0104661, "balance_loss_clip": 1.05435753, "balance_loss_mlp": 1.02916992, "epoch": 0.27584548324064334, "flos": 18551704371840.0, "grad_norm": 3.416975868515595, "language_loss": 0.83000016, "learning_rate": 3.398777478523316e-06, "loss": 0.85185915, "num_input_tokens_seen": 99134265, "step": 4588, "time_per_iteration": 2.6104485988616943 }, { "auxiliary_loss_clip": 0.01109827, "auxiliary_loss_mlp": 0.01042868, "balance_loss_clip": 1.04756808, "balance_loss_mlp": 1.02567828, "epoch": 0.2759056064933113, "flos": 23769883745280.0, "grad_norm": 1.3306263403060763, "language_loss": 0.75309169, "learning_rate": 3.398499087583342e-06, "loss": 0.77461863, "num_input_tokens_seen": 99156185, "step": 4589, "time_per_iteration": 4.333514928817749 }, { "auxiliary_loss_clip": 0.01138237, "auxiliary_loss_mlp": 0.01046648, "balance_loss_clip": 1.0555464, "balance_loss_mlp": 1.02944636, "epoch": 0.27596572974597927, "flos": 24281898163200.0, "grad_norm": 1.9812216556422375, "language_loss": 0.8860873, "learning_rate": 3.398220643612143e-06, "loss": 0.90793616, "num_input_tokens_seen": 99176735, "step": 4590, "time_per_iteration": 4.256460428237915 }, { "auxiliary_loss_clip": 0.01132985, "auxiliary_loss_mlp": 0.01048634, "balance_loss_clip": 1.05280411, "balance_loss_mlp": 1.03025222, "epoch": 0.27602585299864724, "flos": 35040985632000.0, "grad_norm": 1.594737426944321, "language_loss": 0.71265185, "learning_rate": 3.397942146620277e-06, "loss": 0.7344681, "num_input_tokens_seen": 99199765, "step": 4591, "time_per_iteration": 2.8263018131256104 }, { "auxiliary_loss_clip": 0.01114882, "auxiliary_loss_mlp": 0.01048296, "balance_loss_clip": 1.05395412, "balance_loss_mlp": 1.0301044, "epoch": 0.2760859762513152, "flos": 24309405002880.0, "grad_norm": 3.793452037579163, "language_loss": 0.80017495, "learning_rate": 3.3976635966183046e-06, "loss": 0.82180673, "num_input_tokens_seen": 99218435, "step": 4592, "time_per_iteration": 4.289790153503418 }, { "auxiliary_loss_clip": 0.01051224, "auxiliary_loss_mlp": 0.00755885, "balance_loss_clip": 1.02655387, "balance_loss_mlp": 1.00253439, "epoch": 0.27614609950398317, "flos": 71260739890560.0, "grad_norm": 0.710408868807485, "language_loss": 0.61613023, "learning_rate": 3.3973849936167886e-06, "loss": 0.63420129, "num_input_tokens_seen": 99276200, "step": 4593, "time_per_iteration": 3.201831817626953 }, { "auxiliary_loss_clip": 0.01130969, "auxiliary_loss_mlp": 0.01042983, "balance_loss_clip": 1.05307889, "balance_loss_mlp": 1.02640104, "epoch": 0.27620622275665113, "flos": 29674854138240.0, "grad_norm": 1.9659750468178385, "language_loss": 0.778301, "learning_rate": 3.3971063376262937e-06, "loss": 0.80004054, "num_input_tokens_seen": 99297625, "step": 4594, "time_per_iteration": 2.7222111225128174 }, { "auxiliary_loss_clip": 0.0113791, "auxiliary_loss_mlp": 0.01038649, "balance_loss_clip": 1.05557215, "balance_loss_mlp": 1.02168524, "epoch": 0.2762663460093191, "flos": 15378063137280.0, "grad_norm": 1.5118783378909677, "language_loss": 0.91944981, "learning_rate": 3.3968276286573866e-06, "loss": 0.9412154, "num_input_tokens_seen": 99315790, "step": 4595, "time_per_iteration": 4.290736198425293 }, { "auxiliary_loss_clip": 0.01134891, "auxiliary_loss_mlp": 0.01052323, "balance_loss_clip": 1.05374146, "balance_loss_mlp": 1.03413117, "epoch": 0.27632646926198706, "flos": 20704082117760.0, "grad_norm": 1.7744098894398055, "language_loss": 0.69208467, "learning_rate": 3.3965488667206353e-06, "loss": 0.71395689, "num_input_tokens_seen": 99334615, "step": 4596, "time_per_iteration": 2.7178540229797363 }, { "auxiliary_loss_clip": 0.01125254, "auxiliary_loss_mlp": 0.01048102, "balance_loss_clip": 1.05075955, "balance_loss_mlp": 1.02977943, "epoch": 0.276386592514655, "flos": 32813374849920.0, "grad_norm": 1.7305541104386353, "language_loss": 0.63536781, "learning_rate": 3.3962700518266113e-06, "loss": 0.65710139, "num_input_tokens_seen": 99356685, "step": 4597, "time_per_iteration": 2.7713348865509033 }, { "auxiliary_loss_clip": 0.01150233, "auxiliary_loss_mlp": 0.01046127, "balance_loss_clip": 1.05762243, "balance_loss_mlp": 1.02949786, "epoch": 0.276446715767323, "flos": 18551704371840.0, "grad_norm": 2.077440653118394, "language_loss": 0.86298984, "learning_rate": 3.395991183985887e-06, "loss": 0.8849535, "num_input_tokens_seen": 99374810, "step": 4598, "time_per_iteration": 2.6077804565429688 }, { "auxiliary_loss_clip": 0.01151532, "auxiliary_loss_mlp": 0.01046218, "balance_loss_clip": 1.0559516, "balance_loss_mlp": 1.02790797, "epoch": 0.27650683901999096, "flos": 22819615488000.0, "grad_norm": 2.6195813063936493, "language_loss": 0.79957914, "learning_rate": 3.395712263209037e-06, "loss": 0.82155669, "num_input_tokens_seen": 99391290, "step": 4599, "time_per_iteration": 2.67372989654541 }, { "auxiliary_loss_clip": 0.01127397, "auxiliary_loss_mlp": 0.01049332, "balance_loss_clip": 1.04922533, "balance_loss_mlp": 1.03152239, "epoch": 0.276566962272659, "flos": 21361534704000.0, "grad_norm": 1.7492576371751551, "language_loss": 0.78788924, "learning_rate": 3.395433289506639e-06, "loss": 0.80965656, "num_input_tokens_seen": 99409120, "step": 4600, "time_per_iteration": 2.7197396755218506 }, { "auxiliary_loss_clip": 0.01119636, "auxiliary_loss_mlp": 0.01049981, "balance_loss_clip": 1.05458808, "balance_loss_mlp": 1.03226674, "epoch": 0.27662708552532694, "flos": 17710604524800.0, "grad_norm": 2.9827767838021906, "language_loss": 0.7372371, "learning_rate": 3.3951542628892694e-06, "loss": 0.75893331, "num_input_tokens_seen": 99426180, "step": 4601, "time_per_iteration": 2.7212698459625244 }, { "auxiliary_loss_clip": 0.01137986, "auxiliary_loss_mlp": 0.01053484, "balance_loss_clip": 1.05503917, "balance_loss_mlp": 1.03514934, "epoch": 0.2766872087779949, "flos": 21252725429760.0, "grad_norm": 1.7018676665174548, "language_loss": 0.80055201, "learning_rate": 3.3948751833675113e-06, "loss": 0.82246667, "num_input_tokens_seen": 99447720, "step": 4602, "time_per_iteration": 2.6929776668548584 }, { "auxiliary_loss_clip": 0.01131471, "auxiliary_loss_mlp": 0.01060998, "balance_loss_clip": 1.05209374, "balance_loss_mlp": 1.04194784, "epoch": 0.2767473320306629, "flos": 12931900053120.0, "grad_norm": 2.3561631161543986, "language_loss": 0.77018148, "learning_rate": 3.3945960509519455e-06, "loss": 0.79210615, "num_input_tokens_seen": 99464720, "step": 4603, "time_per_iteration": 2.7761597633361816 }, { "auxiliary_loss_clip": 0.01118804, "auxiliary_loss_mlp": 0.01044782, "balance_loss_clip": 1.05331254, "balance_loss_mlp": 1.02858686, "epoch": 0.27680745528333084, "flos": 15012851604480.0, "grad_norm": 1.686999686787164, "language_loss": 0.81469357, "learning_rate": 3.3943168656531585e-06, "loss": 0.83632934, "num_input_tokens_seen": 99482310, "step": 4604, "time_per_iteration": 2.6715614795684814 }, { "auxiliary_loss_clip": 0.01096642, "auxiliary_loss_mlp": 0.0104217, "balance_loss_clip": 1.04733086, "balance_loss_mlp": 1.02428889, "epoch": 0.2768675785359988, "flos": 22637835734400.0, "grad_norm": 1.8500484413544072, "language_loss": 0.7021662, "learning_rate": 3.3940376274817363e-06, "loss": 0.72355425, "num_input_tokens_seen": 99501255, "step": 4605, "time_per_iteration": 2.824810266494751 }, { "auxiliary_loss_clip": 0.01051326, "auxiliary_loss_mlp": 0.01005015, "balance_loss_clip": 1.02826095, "balance_loss_mlp": 1.00244009, "epoch": 0.27692770178866677, "flos": 66130542881280.0, "grad_norm": 0.7013581781305706, "language_loss": 0.57222801, "learning_rate": 3.3937583364482673e-06, "loss": 0.59279138, "num_input_tokens_seen": 99568925, "step": 4606, "time_per_iteration": 3.288269519805908 }, { "auxiliary_loss_clip": 0.01125032, "auxiliary_loss_mlp": 0.01050719, "balance_loss_clip": 1.05177283, "balance_loss_mlp": 1.03280139, "epoch": 0.27698782504133473, "flos": 26464979059200.0, "grad_norm": 1.9503980757161308, "language_loss": 0.69579148, "learning_rate": 3.3934789925633424e-06, "loss": 0.71754897, "num_input_tokens_seen": 99588455, "step": 4607, "time_per_iteration": 2.7865042686462402 }, { "auxiliary_loss_clip": 0.0113039, "auxiliary_loss_mlp": 0.01040949, "balance_loss_clip": 1.05402029, "balance_loss_mlp": 1.0242002, "epoch": 0.2770479482940027, "flos": 25884806584320.0, "grad_norm": 1.5552750364168406, "language_loss": 0.69727945, "learning_rate": 3.393199595837555e-06, "loss": 0.71899283, "num_input_tokens_seen": 99609355, "step": 4608, "time_per_iteration": 2.7139909267425537 }, { "auxiliary_loss_clip": 0.0109619, "auxiliary_loss_mlp": 0.01041619, "balance_loss_clip": 1.04789758, "balance_loss_mlp": 1.024894, "epoch": 0.27710807154667066, "flos": 22857249962880.0, "grad_norm": 1.922338327624115, "language_loss": 0.73170602, "learning_rate": 3.392920146281499e-06, "loss": 0.75308412, "num_input_tokens_seen": 99628780, "step": 4609, "time_per_iteration": 2.8674490451812744 }, { "auxiliary_loss_clip": 0.01105896, "auxiliary_loss_mlp": 0.01054215, "balance_loss_clip": 1.04444993, "balance_loss_mlp": 1.03615475, "epoch": 0.27716819479933863, "flos": 17711071401600.0, "grad_norm": 2.284482242639661, "language_loss": 0.84028268, "learning_rate": 3.3926406439057714e-06, "loss": 0.86188376, "num_input_tokens_seen": 99644545, "step": 4610, "time_per_iteration": 2.6861605644226074 }, { "auxiliary_loss_clip": 0.01074905, "auxiliary_loss_mlp": 0.00781444, "balance_loss_clip": 1.04093325, "balance_loss_mlp": 1.00102568, "epoch": 0.2772283180520066, "flos": 19646046080640.0, "grad_norm": 2.0943450829127044, "language_loss": 0.68915951, "learning_rate": 3.3923610887209705e-06, "loss": 0.70772296, "num_input_tokens_seen": 99663125, "step": 4611, "time_per_iteration": 2.799345016479492 }, { "auxiliary_loss_clip": 0.01144902, "auxiliary_loss_mlp": 0.01042567, "balance_loss_clip": 1.05466819, "balance_loss_mlp": 1.02591395, "epoch": 0.27728844130467456, "flos": 21032628842880.0, "grad_norm": 2.6988182686748785, "language_loss": 0.73646772, "learning_rate": 3.392081480737698e-06, "loss": 0.75834239, "num_input_tokens_seen": 99682645, "step": 4612, "time_per_iteration": 2.643157720565796 }, { "auxiliary_loss_clip": 0.01139286, "auxiliary_loss_mlp": 0.00775997, "balance_loss_clip": 1.05283117, "balance_loss_mlp": 1.00099993, "epoch": 0.2773485645573425, "flos": 18989204025600.0, "grad_norm": 2.0654093622255436, "language_loss": 0.66356897, "learning_rate": 3.3918018199665563e-06, "loss": 0.68272179, "num_input_tokens_seen": 99700520, "step": 4613, "time_per_iteration": 2.6685144901275635 }, { "auxiliary_loss_clip": 0.01096758, "auxiliary_loss_mlp": 0.01051618, "balance_loss_clip": 1.04526055, "balance_loss_mlp": 1.03354573, "epoch": 0.27740868781001055, "flos": 21468440557440.0, "grad_norm": 1.5160858700983233, "language_loss": 0.79385912, "learning_rate": 3.39152210641815e-06, "loss": 0.8153429, "num_input_tokens_seen": 99720355, "step": 4614, "time_per_iteration": 2.82061505317688 }, { "auxiliary_loss_clip": 0.01129896, "auxiliary_loss_mlp": 0.01047714, "balance_loss_clip": 1.04873419, "balance_loss_mlp": 1.02978539, "epoch": 0.2774688110626785, "flos": 19827825834240.0, "grad_norm": 2.763943164845673, "language_loss": 0.80632633, "learning_rate": 3.3912423401030865e-06, "loss": 0.82810241, "num_input_tokens_seen": 99736090, "step": 4615, "time_per_iteration": 2.607448101043701 }, { "auxiliary_loss_clip": 0.01114657, "auxiliary_loss_mlp": 0.01051705, "balance_loss_clip": 1.04532576, "balance_loss_mlp": 1.03447962, "epoch": 0.2775289343153465, "flos": 18216226321920.0, "grad_norm": 2.3373471978129543, "language_loss": 0.646945, "learning_rate": 3.3909625210319735e-06, "loss": 0.66860855, "num_input_tokens_seen": 99751805, "step": 4616, "time_per_iteration": 2.693556308746338 }, { "auxiliary_loss_clip": 0.01133374, "auxiliary_loss_mlp": 0.01047225, "balance_loss_clip": 1.0536505, "balance_loss_mlp": 1.03001153, "epoch": 0.27758905756801444, "flos": 16472476673280.0, "grad_norm": 2.175848824107301, "language_loss": 0.82324976, "learning_rate": 3.3906826492154226e-06, "loss": 0.84505582, "num_input_tokens_seen": 99770610, "step": 4617, "time_per_iteration": 2.64677357673645 }, { "auxiliary_loss_clip": 0.01147475, "auxiliary_loss_mlp": 0.01049438, "balance_loss_clip": 1.05210304, "balance_loss_mlp": 1.03261721, "epoch": 0.2776491808206824, "flos": 18728240739840.0, "grad_norm": 2.8579401527932236, "language_loss": 0.77031851, "learning_rate": 3.3904027246640458e-06, "loss": 0.79228759, "num_input_tokens_seen": 99787305, "step": 4618, "time_per_iteration": 2.555001735687256 }, { "auxiliary_loss_clip": 0.01151182, "auxiliary_loss_mlp": 0.01042958, "balance_loss_clip": 1.05599475, "balance_loss_mlp": 1.0268048, "epoch": 0.27770930407335037, "flos": 28038189911040.0, "grad_norm": 1.6850470881083441, "language_loss": 0.85102153, "learning_rate": 3.390122747388459e-06, "loss": 0.87296283, "num_input_tokens_seen": 99808940, "step": 4619, "time_per_iteration": 2.753230094909668 }, { "auxiliary_loss_clip": 0.01121872, "auxiliary_loss_mlp": 0.01041506, "balance_loss_clip": 1.05075216, "balance_loss_mlp": 1.02592564, "epoch": 0.27776942732601834, "flos": 23549823072000.0, "grad_norm": 1.6763124645732197, "language_loss": 0.7707957, "learning_rate": 3.3898427173992778e-06, "loss": 0.79242951, "num_input_tokens_seen": 99829575, "step": 4620, "time_per_iteration": 2.7764816284179688 }, { "auxiliary_loss_clip": 0.01091863, "auxiliary_loss_mlp": 0.01042513, "balance_loss_clip": 1.04290819, "balance_loss_mlp": 1.02517962, "epoch": 0.2778295505786863, "flos": 23908713811200.0, "grad_norm": 1.985202794634515, "language_loss": 0.78144193, "learning_rate": 3.389562634707122e-06, "loss": 0.80278563, "num_input_tokens_seen": 99847575, "step": 4621, "time_per_iteration": 2.740419387817383 }, { "auxiliary_loss_clip": 0.01113871, "auxiliary_loss_mlp": 0.01054223, "balance_loss_clip": 1.04857588, "balance_loss_mlp": 1.03642535, "epoch": 0.27788967383135427, "flos": 25554571920000.0, "grad_norm": 2.864120631038579, "language_loss": 0.87357259, "learning_rate": 3.389282499322611e-06, "loss": 0.89525354, "num_input_tokens_seen": 99864995, "step": 4622, "time_per_iteration": 2.8351151943206787 }, { "auxiliary_loss_clip": 0.01096216, "auxiliary_loss_mlp": 0.01052098, "balance_loss_clip": 1.0477345, "balance_loss_mlp": 1.0349195, "epoch": 0.27794979708402223, "flos": 16252631481600.0, "grad_norm": 1.7857472181098575, "language_loss": 0.81315404, "learning_rate": 3.389002311256369e-06, "loss": 0.83463717, "num_input_tokens_seen": 99881540, "step": 4623, "time_per_iteration": 2.7112133502960205 }, { "auxiliary_loss_clip": 0.01119674, "auxiliary_loss_mlp": 0.01043259, "balance_loss_clip": 1.05434608, "balance_loss_mlp": 1.02628374, "epoch": 0.2780099203366902, "flos": 20667632791680.0, "grad_norm": 2.1551340516102897, "language_loss": 0.80889726, "learning_rate": 3.3887220705190204e-06, "loss": 0.83052659, "num_input_tokens_seen": 99899595, "step": 4624, "time_per_iteration": 2.6492481231689453 }, { "auxiliary_loss_clip": 0.01112812, "auxiliary_loss_mlp": 0.0077763, "balance_loss_clip": 1.05008531, "balance_loss_mlp": 1.00092447, "epoch": 0.27807004358935816, "flos": 17739583822080.0, "grad_norm": 2.21671742511245, "language_loss": 0.76949263, "learning_rate": 3.388441777121191e-06, "loss": 0.78839707, "num_input_tokens_seen": 99913020, "step": 4625, "time_per_iteration": 2.6312057971954346 }, { "auxiliary_loss_clip": 0.01106879, "auxiliary_loss_mlp": 0.01046687, "balance_loss_clip": 1.04205859, "balance_loss_mlp": 1.02767277, "epoch": 0.2781301668420261, "flos": 16727119165440.0, "grad_norm": 1.790813282848893, "language_loss": 0.69947815, "learning_rate": 3.388161431073511e-06, "loss": 0.72101378, "num_input_tokens_seen": 99931405, "step": 4626, "time_per_iteration": 2.7656819820404053 }, { "auxiliary_loss_clip": 0.0110548, "auxiliary_loss_mlp": 0.01041917, "balance_loss_clip": 1.04827905, "balance_loss_mlp": 1.02385652, "epoch": 0.27819029009469415, "flos": 13844749317120.0, "grad_norm": 2.1086116607571546, "language_loss": 0.92367601, "learning_rate": 3.38788103238661e-06, "loss": 0.94515002, "num_input_tokens_seen": 99948100, "step": 4627, "time_per_iteration": 2.8608667850494385 }, { "auxiliary_loss_clip": 0.01149683, "auxiliary_loss_mlp": 0.01040775, "balance_loss_clip": 1.05388021, "balance_loss_mlp": 1.0248611, "epoch": 0.2782504133473621, "flos": 27089286370560.0, "grad_norm": 1.7290354122756755, "language_loss": 0.85490036, "learning_rate": 3.387600581071121e-06, "loss": 0.87680495, "num_input_tokens_seen": 99966470, "step": 4628, "time_per_iteration": 2.6468069553375244 }, { "auxiliary_loss_clip": 0.01114712, "auxiliary_loss_mlp": 0.0104202, "balance_loss_clip": 1.0482378, "balance_loss_mlp": 1.02509212, "epoch": 0.2783105366000301, "flos": 21068826773760.0, "grad_norm": 1.5106040860694088, "language_loss": 0.79246545, "learning_rate": 3.387320077137679e-06, "loss": 0.81403273, "num_input_tokens_seen": 99985930, "step": 4629, "time_per_iteration": 5.656833648681641 }, { "auxiliary_loss_clip": 0.01100825, "auxiliary_loss_mlp": 0.01040328, "balance_loss_clip": 1.04602218, "balance_loss_mlp": 1.02339983, "epoch": 0.27837065985269804, "flos": 26501823434880.0, "grad_norm": 1.5125577415085874, "language_loss": 0.84574991, "learning_rate": 3.3870395205969208e-06, "loss": 0.86716145, "num_input_tokens_seen": 100006235, "step": 4630, "time_per_iteration": 2.70917010307312 }, { "auxiliary_loss_clip": 0.01123828, "auxiliary_loss_mlp": 0.01038547, "balance_loss_clip": 1.04848623, "balance_loss_mlp": 1.02099967, "epoch": 0.278430783105366, "flos": 20223201813120.0, "grad_norm": 2.1016222667741857, "language_loss": 0.81134796, "learning_rate": 3.386758911459485e-06, "loss": 0.83297169, "num_input_tokens_seen": 100023655, "step": 4631, "time_per_iteration": 4.19342041015625 }, { "auxiliary_loss_clip": 0.01149092, "auxiliary_loss_mlp": 0.01049428, "balance_loss_clip": 1.05402875, "balance_loss_mlp": 1.03257155, "epoch": 0.278490906358034, "flos": 25592888753280.0, "grad_norm": 3.9436500565538295, "language_loss": 0.71196103, "learning_rate": 3.3864782497360126e-06, "loss": 0.7339462, "num_input_tokens_seen": 100043280, "step": 4632, "time_per_iteration": 2.620439291000366 }, { "auxiliary_loss_clip": 0.01132813, "auxiliary_loss_mlp": 0.01044268, "balance_loss_clip": 1.05435467, "balance_loss_mlp": 1.02798355, "epoch": 0.27855102961070194, "flos": 16171544528640.0, "grad_norm": 1.8243983980851597, "language_loss": 0.82563186, "learning_rate": 3.386197535437145e-06, "loss": 0.84740269, "num_input_tokens_seen": 100057690, "step": 4633, "time_per_iteration": 2.6531693935394287 }, { "auxiliary_loss_clip": 0.01122775, "auxiliary_loss_mlp": 0.01039803, "balance_loss_clip": 1.04714537, "balance_loss_mlp": 1.02130151, "epoch": 0.2786111528633699, "flos": 22927598749440.0, "grad_norm": 1.6667943176882647, "language_loss": 0.87727869, "learning_rate": 3.385916768573529e-06, "loss": 0.89890444, "num_input_tokens_seen": 100075875, "step": 4634, "time_per_iteration": 4.391691446304321 }, { "auxiliary_loss_clip": 0.01118626, "auxiliary_loss_mlp": 0.01042889, "balance_loss_clip": 1.04900146, "balance_loss_mlp": 1.02503181, "epoch": 0.27867127611603787, "flos": 23404205335680.0, "grad_norm": 1.8664238108113964, "language_loss": 0.7701081, "learning_rate": 3.38563594915581e-06, "loss": 0.79172325, "num_input_tokens_seen": 100092930, "step": 4635, "time_per_iteration": 2.7107748985290527 }, { "auxiliary_loss_clip": 0.01148262, "auxiliary_loss_mlp": 0.01044984, "balance_loss_clip": 1.05233121, "balance_loss_mlp": 1.02705491, "epoch": 0.27873139936870583, "flos": 19829010983040.0, "grad_norm": 1.6280540509164947, "language_loss": 0.65174443, "learning_rate": 3.385355077194637e-06, "loss": 0.67367697, "num_input_tokens_seen": 100110790, "step": 4636, "time_per_iteration": 2.660099744796753 }, { "auxiliary_loss_clip": 0.01134021, "auxiliary_loss_mlp": 0.01042528, "balance_loss_clip": 1.048437, "balance_loss_mlp": 1.0243845, "epoch": 0.2787915226213738, "flos": 17707659609600.0, "grad_norm": 2.8501862977667667, "language_loss": 0.83485681, "learning_rate": 3.3850741527006604e-06, "loss": 0.85662234, "num_input_tokens_seen": 100126970, "step": 4637, "time_per_iteration": 2.6234302520751953 }, { "auxiliary_loss_clip": 0.01117465, "auxiliary_loss_mlp": 0.01043194, "balance_loss_clip": 1.04580319, "balance_loss_mlp": 1.02658796, "epoch": 0.27885164587404176, "flos": 22090557139200.0, "grad_norm": 1.4481958644660236, "language_loss": 0.75996393, "learning_rate": 3.384793175684533e-06, "loss": 0.78157055, "num_input_tokens_seen": 100146720, "step": 4638, "time_per_iteration": 2.6488263607025146 }, { "auxiliary_loss_clip": 0.0113367, "auxiliary_loss_mlp": 0.01047522, "balance_loss_clip": 1.04905438, "balance_loss_mlp": 1.02935445, "epoch": 0.27891176912670973, "flos": 19207684500480.0, "grad_norm": 1.973043880665722, "language_loss": 0.71658665, "learning_rate": 3.38451214615691e-06, "loss": 0.73839855, "num_input_tokens_seen": 100165920, "step": 4639, "time_per_iteration": 2.606290817260742 }, { "auxiliary_loss_clip": 0.01134631, "auxiliary_loss_mlp": 0.01040486, "balance_loss_clip": 1.04905224, "balance_loss_mlp": 1.02213931, "epoch": 0.27897189237937775, "flos": 27600007898880.0, "grad_norm": 1.9413688357819885, "language_loss": 0.6546669, "learning_rate": 3.384231064128447e-06, "loss": 0.67641807, "num_input_tokens_seen": 100185525, "step": 4640, "time_per_iteration": 2.670572280883789 }, { "auxiliary_loss_clip": 0.01134835, "auxiliary_loss_mlp": 0.01040753, "balance_loss_clip": 1.05033112, "balance_loss_mlp": 1.02394438, "epoch": 0.2790320156320457, "flos": 21178210665600.0, "grad_norm": 2.0528630099938385, "language_loss": 0.72150993, "learning_rate": 3.383949929609804e-06, "loss": 0.74326581, "num_input_tokens_seen": 100204850, "step": 4641, "time_per_iteration": 2.693377733230591 }, { "auxiliary_loss_clip": 0.01112862, "auxiliary_loss_mlp": 0.01043132, "balance_loss_clip": 1.05076349, "balance_loss_mlp": 1.02322423, "epoch": 0.2790921388847137, "flos": 22783920347520.0, "grad_norm": 1.7365449070814052, "language_loss": 0.74695385, "learning_rate": 3.383668742611641e-06, "loss": 0.7685138, "num_input_tokens_seen": 100224520, "step": 4642, "time_per_iteration": 2.7462241649627686 }, { "auxiliary_loss_clip": 0.0111075, "auxiliary_loss_mlp": 0.01045242, "balance_loss_clip": 1.04543257, "balance_loss_mlp": 1.02603781, "epoch": 0.27915226213738165, "flos": 23400649889280.0, "grad_norm": 1.8272594017764643, "language_loss": 0.85924351, "learning_rate": 3.3833875031446205e-06, "loss": 0.88080341, "num_input_tokens_seen": 100243935, "step": 4643, "time_per_iteration": 2.725135564804077 }, { "auxiliary_loss_clip": 0.01105223, "auxiliary_loss_mlp": 0.01045051, "balance_loss_clip": 1.04933143, "balance_loss_mlp": 1.02697933, "epoch": 0.2792123853900496, "flos": 22747794243840.0, "grad_norm": 1.7474380366240072, "language_loss": 0.83161986, "learning_rate": 3.383106211219407e-06, "loss": 0.85312265, "num_input_tokens_seen": 100262290, "step": 4644, "time_per_iteration": 2.7356133460998535 }, { "auxiliary_loss_clip": 0.01135825, "auxiliary_loss_mlp": 0.01044339, "balance_loss_clip": 1.04996896, "balance_loss_mlp": 1.02672005, "epoch": 0.2792725086427176, "flos": 15049372757760.0, "grad_norm": 1.8326156585035789, "language_loss": 0.79077673, "learning_rate": 3.3828248668466673e-06, "loss": 0.81257844, "num_input_tokens_seen": 100280015, "step": 4645, "time_per_iteration": 2.6605966091156006 }, { "auxiliary_loss_clip": 0.01043101, "auxiliary_loss_mlp": 0.01005168, "balance_loss_clip": 1.02972245, "balance_loss_mlp": 1.00273657, "epoch": 0.27933263189538554, "flos": 62544861757440.0, "grad_norm": 0.7804050577208047, "language_loss": 0.62298429, "learning_rate": 3.3825434700370705e-06, "loss": 0.64346695, "num_input_tokens_seen": 100338935, "step": 4646, "time_per_iteration": 3.203944206237793 }, { "auxiliary_loss_clip": 0.01116876, "auxiliary_loss_mlp": 0.01036795, "balance_loss_clip": 1.05170095, "balance_loss_mlp": 1.02054703, "epoch": 0.2793927551480535, "flos": 25118365155840.0, "grad_norm": 1.6679902986930268, "language_loss": 0.89280778, "learning_rate": 3.3822620208012865e-06, "loss": 0.91434449, "num_input_tokens_seen": 100359905, "step": 4647, "time_per_iteration": 2.829617500305176 }, { "auxiliary_loss_clip": 0.0113911, "auxiliary_loss_mlp": 0.01047084, "balance_loss_clip": 1.05125523, "balance_loss_mlp": 1.02880919, "epoch": 0.27945287840072147, "flos": 21324582587520.0, "grad_norm": 1.8012650128540075, "language_loss": 0.86784112, "learning_rate": 3.381980519149988e-06, "loss": 0.88970304, "num_input_tokens_seen": 100376955, "step": 4648, "time_per_iteration": 2.632321357727051 }, { "auxiliary_loss_clip": 0.01134603, "auxiliary_loss_mlp": 0.01044893, "balance_loss_clip": 1.05110133, "balance_loss_mlp": 1.02733302, "epoch": 0.27951300165338944, "flos": 27450547407360.0, "grad_norm": 2.0026822782024705, "language_loss": 0.73003638, "learning_rate": 3.38169896509385e-06, "loss": 0.75183129, "num_input_tokens_seen": 100397545, "step": 4649, "time_per_iteration": 2.7211172580718994 }, { "auxiliary_loss_clip": 0.01111127, "auxiliary_loss_mlp": 0.01044981, "balance_loss_clip": 1.04752195, "balance_loss_mlp": 1.02557421, "epoch": 0.2795731249060574, "flos": 15159008044800.0, "grad_norm": 2.1164331968139325, "language_loss": 0.80629992, "learning_rate": 3.381417358643549e-06, "loss": 0.82786095, "num_input_tokens_seen": 100415080, "step": 4650, "time_per_iteration": 2.7502310276031494 }, { "auxiliary_loss_clip": 0.01039445, "auxiliary_loss_mlp": 0.00754956, "balance_loss_clip": 1.03124094, "balance_loss_mlp": 1.00203133, "epoch": 0.27963324815872537, "flos": 60120103178880.0, "grad_norm": 0.8151234776797575, "language_loss": 0.58806145, "learning_rate": 3.3811356998097624e-06, "loss": 0.60600549, "num_input_tokens_seen": 100471105, "step": 4651, "time_per_iteration": 3.2224526405334473 }, { "auxiliary_loss_clip": 0.01135312, "auxiliary_loss_mlp": 0.01047398, "balance_loss_clip": 1.04708123, "balance_loss_mlp": 1.02753818, "epoch": 0.27969337141139333, "flos": 21765960910080.0, "grad_norm": 1.7351399642666463, "language_loss": 0.74332011, "learning_rate": 3.3808539886031726e-06, "loss": 0.76514727, "num_input_tokens_seen": 100492520, "step": 4652, "time_per_iteration": 2.685736894607544 }, { "auxiliary_loss_clip": 0.01148943, "auxiliary_loss_mlp": 0.01045678, "balance_loss_clip": 1.05235481, "balance_loss_mlp": 1.02742696, "epoch": 0.27975349466406135, "flos": 39851398834560.0, "grad_norm": 2.2003219434248633, "language_loss": 0.79789567, "learning_rate": 3.380572225034461e-06, "loss": 0.81984192, "num_input_tokens_seen": 100512870, "step": 4653, "time_per_iteration": 2.7558584213256836 }, { "auxiliary_loss_clip": 0.01121239, "auxiliary_loss_mlp": 0.01050268, "balance_loss_clip": 1.04883742, "balance_loss_mlp": 1.03280401, "epoch": 0.2798136179167293, "flos": 21579799697280.0, "grad_norm": 2.080129868341082, "language_loss": 0.78903222, "learning_rate": 3.380290409114312e-06, "loss": 0.81074733, "num_input_tokens_seen": 100531655, "step": 4654, "time_per_iteration": 2.6496095657348633 }, { "auxiliary_loss_clip": 0.01101836, "auxiliary_loss_mlp": 0.01052085, "balance_loss_clip": 1.04982615, "balance_loss_mlp": 1.03267753, "epoch": 0.2798737411693973, "flos": 21537676022400.0, "grad_norm": 2.0985102630300134, "language_loss": 0.81319463, "learning_rate": 3.3800085408534127e-06, "loss": 0.83473378, "num_input_tokens_seen": 100548005, "step": 4655, "time_per_iteration": 2.742586135864258 }, { "auxiliary_loss_clip": 0.01112605, "auxiliary_loss_mlp": 0.00776867, "balance_loss_clip": 1.04759109, "balance_loss_mlp": 1.00071263, "epoch": 0.27993386442206525, "flos": 26981051713920.0, "grad_norm": 1.7515804597190672, "language_loss": 0.81455064, "learning_rate": 3.3797266202624506e-06, "loss": 0.83344543, "num_input_tokens_seen": 100567980, "step": 4656, "time_per_iteration": 2.796480894088745 }, { "auxiliary_loss_clip": 0.01120191, "auxiliary_loss_mlp": 0.01050328, "balance_loss_clip": 1.05115008, "balance_loss_mlp": 1.03204143, "epoch": 0.2799939876747332, "flos": 24349876652160.0, "grad_norm": 2.044588364139205, "language_loss": 0.83203471, "learning_rate": 3.3794446473521176e-06, "loss": 0.85373986, "num_input_tokens_seen": 100588630, "step": 4657, "time_per_iteration": 2.6785871982574463 }, { "auxiliary_loss_clip": 0.01111476, "auxiliary_loss_mlp": 0.01052182, "balance_loss_clip": 1.04937756, "balance_loss_mlp": 1.03294206, "epoch": 0.2800541109274012, "flos": 33656988648960.0, "grad_norm": 2.165484252442401, "language_loss": 0.63694274, "learning_rate": 3.379162622133105e-06, "loss": 0.65857935, "num_input_tokens_seen": 100608775, "step": 4658, "time_per_iteration": 2.879409074783325 }, { "auxiliary_loss_clip": 0.01136248, "auxiliary_loss_mlp": 0.010462, "balance_loss_clip": 1.0495683, "balance_loss_mlp": 1.02822304, "epoch": 0.28011423418006914, "flos": 21614417429760.0, "grad_norm": 1.7192056687926605, "language_loss": 0.78342974, "learning_rate": 3.3788805446161073e-06, "loss": 0.80525422, "num_input_tokens_seen": 100627975, "step": 4659, "time_per_iteration": 2.6989047527313232 }, { "auxiliary_loss_clip": 0.0111004, "auxiliary_loss_mlp": 0.01054733, "balance_loss_clip": 1.04974771, "balance_loss_mlp": 1.03588593, "epoch": 0.2801743574327371, "flos": 23112431159040.0, "grad_norm": 1.755148683242289, "language_loss": 0.79341501, "learning_rate": 3.3785984148118215e-06, "loss": 0.8150627, "num_input_tokens_seen": 100645430, "step": 4660, "time_per_iteration": 2.715477705001831 }, { "auxiliary_loss_clip": 0.01108147, "auxiliary_loss_mlp": 0.01046506, "balance_loss_clip": 1.05007386, "balance_loss_mlp": 1.02897, "epoch": 0.2802344806854051, "flos": 12641418766080.0, "grad_norm": 2.2526204230687115, "language_loss": 0.80604905, "learning_rate": 3.3783162327309453e-06, "loss": 0.82759559, "num_input_tokens_seen": 100663775, "step": 4661, "time_per_iteration": 2.7715258598327637 }, { "auxiliary_loss_clip": 0.01125452, "auxiliary_loss_mlp": 0.01056292, "balance_loss_clip": 1.05232596, "balance_loss_mlp": 1.03836262, "epoch": 0.28029460393807304, "flos": 37267878142080.0, "grad_norm": 1.5529278028038542, "language_loss": 0.79010582, "learning_rate": 3.3780339983841794e-06, "loss": 0.81192333, "num_input_tokens_seen": 100686085, "step": 4662, "time_per_iteration": 2.81427264213562 }, { "auxiliary_loss_clip": 0.01133119, "auxiliary_loss_mlp": 0.01052014, "balance_loss_clip": 1.05226839, "balance_loss_mlp": 1.03252363, "epoch": 0.280354727190741, "flos": 20741106061440.0, "grad_norm": 1.6202884167711182, "language_loss": 0.69617724, "learning_rate": 3.377751711782227e-06, "loss": 0.71802866, "num_input_tokens_seen": 100705135, "step": 4663, "time_per_iteration": 2.697368860244751 }, { "auxiliary_loss_clip": 0.01124677, "auxiliary_loss_mlp": 0.01049339, "balance_loss_clip": 1.05170035, "balance_loss_mlp": 1.03104067, "epoch": 0.28041485044340897, "flos": 21471026336640.0, "grad_norm": 1.9196144000248758, "language_loss": 0.77708608, "learning_rate": 3.377469372935791e-06, "loss": 0.79882622, "num_input_tokens_seen": 100724960, "step": 4664, "time_per_iteration": 2.7275149822235107 }, { "auxiliary_loss_clip": 0.01107718, "auxiliary_loss_mlp": 0.01048769, "balance_loss_clip": 1.0480299, "balance_loss_mlp": 1.03099537, "epoch": 0.28047497369607693, "flos": 14794263388800.0, "grad_norm": 1.999889511399453, "language_loss": 0.79593849, "learning_rate": 3.377186981855578e-06, "loss": 0.81750339, "num_input_tokens_seen": 100741995, "step": 4665, "time_per_iteration": 2.710507392883301 }, { "auxiliary_loss_clip": 0.01132609, "auxiliary_loss_mlp": 0.01044622, "balance_loss_clip": 1.04908824, "balance_loss_mlp": 1.02724159, "epoch": 0.2805350969487449, "flos": 23070738447360.0, "grad_norm": 1.8624041004678782, "language_loss": 0.81080002, "learning_rate": 3.3769045385522968e-06, "loss": 0.83257234, "num_input_tokens_seen": 100758985, "step": 4666, "time_per_iteration": 2.6129403114318848 }, { "auxiliary_loss_clip": 0.01108409, "auxiliary_loss_mlp": 0.01071225, "balance_loss_clip": 1.04823136, "balance_loss_mlp": 1.05097127, "epoch": 0.2805952202014129, "flos": 20479855466880.0, "grad_norm": 2.103406835637469, "language_loss": 0.84507895, "learning_rate": 3.376622043036658e-06, "loss": 0.86687529, "num_input_tokens_seen": 100777820, "step": 4667, "time_per_iteration": 2.7332448959350586 }, { "auxiliary_loss_clip": 0.01123034, "auxiliary_loss_mlp": 0.00775483, "balance_loss_clip": 1.05581784, "balance_loss_mlp": 1.00072694, "epoch": 0.2806553434540809, "flos": 27417330305280.0, "grad_norm": 3.1307253624061486, "language_loss": 0.79295927, "learning_rate": 3.376339495319373e-06, "loss": 0.81194448, "num_input_tokens_seen": 100798205, "step": 4668, "time_per_iteration": 5.80406928062439 }, { "auxiliary_loss_clip": 0.01086886, "auxiliary_loss_mlp": 0.01042603, "balance_loss_clip": 1.04659402, "balance_loss_mlp": 1.02432859, "epoch": 0.28071546670674885, "flos": 26505019745280.0, "grad_norm": 1.6340052887006857, "language_loss": 0.76323926, "learning_rate": 3.3760568954111563e-06, "loss": 0.7845341, "num_input_tokens_seen": 100819800, "step": 4669, "time_per_iteration": 2.909986734390259 }, { "auxiliary_loss_clip": 0.01135126, "auxiliary_loss_mlp": 0.01048727, "balance_loss_clip": 1.05091906, "balance_loss_mlp": 1.03104806, "epoch": 0.2807755899594168, "flos": 20558679863040.0, "grad_norm": 2.509610012971093, "language_loss": 0.79246378, "learning_rate": 3.375774243322725e-06, "loss": 0.81430233, "num_input_tokens_seen": 100837880, "step": 4670, "time_per_iteration": 4.177394866943359 }, { "auxiliary_loss_clip": 0.01106377, "auxiliary_loss_mlp": 0.01050214, "balance_loss_clip": 1.04797912, "balance_loss_mlp": 1.03053236, "epoch": 0.2808357132120848, "flos": 24313319585280.0, "grad_norm": 2.7368773080153455, "language_loss": 0.79247916, "learning_rate": 3.3754915390647955e-06, "loss": 0.81404507, "num_input_tokens_seen": 100856350, "step": 4671, "time_per_iteration": 2.711390256881714 }, { "auxiliary_loss_clip": 0.01127751, "auxiliary_loss_mlp": 0.01045588, "balance_loss_clip": 1.05121446, "balance_loss_mlp": 1.02806473, "epoch": 0.28089583646475275, "flos": 26432408401920.0, "grad_norm": 1.6750085767967255, "language_loss": 0.74537772, "learning_rate": 3.37520878264809e-06, "loss": 0.76711112, "num_input_tokens_seen": 100876135, "step": 4672, "time_per_iteration": 2.661121129989624 }, { "auxiliary_loss_clip": 0.01124033, "auxiliary_loss_mlp": 0.01050888, "balance_loss_clip": 1.04696918, "balance_loss_mlp": 1.03130245, "epoch": 0.2809559597174207, "flos": 23111820627840.0, "grad_norm": 2.8450273884489805, "language_loss": 0.75648308, "learning_rate": 3.3749259740833286e-06, "loss": 0.77823234, "num_input_tokens_seen": 100894790, "step": 4673, "time_per_iteration": 2.672701120376587 }, { "auxiliary_loss_clip": 0.0113134, "auxiliary_loss_mlp": 0.01042591, "balance_loss_clip": 1.04937172, "balance_loss_mlp": 1.02492452, "epoch": 0.2810160829700887, "flos": 20923496346240.0, "grad_norm": 1.8533271967959946, "language_loss": 0.72668427, "learning_rate": 3.374643113381237e-06, "loss": 0.74842358, "num_input_tokens_seen": 100915100, "step": 4674, "time_per_iteration": 4.2516560554504395 }, { "auxiliary_loss_clip": 0.01138771, "auxiliary_loss_mlp": 0.01046386, "balance_loss_clip": 1.05174136, "balance_loss_mlp": 1.02751493, "epoch": 0.28107620622275664, "flos": 14355901808640.0, "grad_norm": 2.0688845921593377, "language_loss": 0.77195638, "learning_rate": 3.374360200552541e-06, "loss": 0.79380798, "num_input_tokens_seen": 100932795, "step": 4675, "time_per_iteration": 2.618218183517456 }, { "auxiliary_loss_clip": 0.01149881, "auxiliary_loss_mlp": 0.01047998, "balance_loss_clip": 1.05321908, "balance_loss_mlp": 1.02948523, "epoch": 0.2811363294754246, "flos": 20919078973440.0, "grad_norm": 1.9283078401930889, "language_loss": 0.70211101, "learning_rate": 3.374077235607968e-06, "loss": 0.7240898, "num_input_tokens_seen": 100950505, "step": 4676, "time_per_iteration": 2.59861159324646 }, { "auxiliary_loss_clip": 0.01144319, "auxiliary_loss_mlp": 0.01042342, "balance_loss_clip": 1.05481541, "balance_loss_mlp": 1.02517629, "epoch": 0.28119645272809257, "flos": 20594841880320.0, "grad_norm": 1.6132814643409343, "language_loss": 0.7048012, "learning_rate": 3.3737942185582487e-06, "loss": 0.72666782, "num_input_tokens_seen": 100968790, "step": 4677, "time_per_iteration": 2.6064453125 }, { "auxiliary_loss_clip": 0.01125461, "auxiliary_loss_mlp": 0.01047839, "balance_loss_clip": 1.04849231, "balance_loss_mlp": 1.02783537, "epoch": 0.28125657598076054, "flos": 25337420248320.0, "grad_norm": 1.5663130673511025, "language_loss": 0.639018, "learning_rate": 3.3735111494141153e-06, "loss": 0.66075099, "num_input_tokens_seen": 100990205, "step": 4678, "time_per_iteration": 2.6609809398651123 }, { "auxiliary_loss_clip": 0.01134563, "auxiliary_loss_mlp": 0.01050264, "balance_loss_clip": 1.05104351, "balance_loss_mlp": 1.03315794, "epoch": 0.2813166992334285, "flos": 24827093769600.0, "grad_norm": 5.827919401990006, "language_loss": 0.70568973, "learning_rate": 3.3732280281863013e-06, "loss": 0.72753799, "num_input_tokens_seen": 101009815, "step": 4679, "time_per_iteration": 2.7039310932159424 }, { "auxiliary_loss_clip": 0.01134537, "auxiliary_loss_mlp": 0.01040896, "balance_loss_clip": 1.05048108, "balance_loss_mlp": 1.02283621, "epoch": 0.2813768224860965, "flos": 21760753438080.0, "grad_norm": 2.2073803144691255, "language_loss": 0.74848735, "learning_rate": 3.3729448548855422e-06, "loss": 0.77024174, "num_input_tokens_seen": 101026780, "step": 4680, "time_per_iteration": 2.6897919178009033 }, { "auxiliary_loss_clip": 0.01149427, "auxiliary_loss_mlp": 0.01039945, "balance_loss_clip": 1.05414999, "balance_loss_mlp": 1.02363694, "epoch": 0.2814369457387645, "flos": 24316803204480.0, "grad_norm": 2.2743778704427267, "language_loss": 0.7719292, "learning_rate": 3.3726616295225774e-06, "loss": 0.793823, "num_input_tokens_seen": 101046215, "step": 4681, "time_per_iteration": 2.6178102493286133 }, { "auxiliary_loss_clip": 0.01138594, "auxiliary_loss_mlp": 0.01037179, "balance_loss_clip": 1.05333447, "balance_loss_mlp": 1.01864183, "epoch": 0.28149706899143245, "flos": 18515326872960.0, "grad_norm": 2.5230258038951723, "language_loss": 0.74197519, "learning_rate": 3.372378352108146e-06, "loss": 0.76373291, "num_input_tokens_seen": 101063365, "step": 4682, "time_per_iteration": 2.5892751216888428 }, { "auxiliary_loss_clip": 0.01145225, "auxiliary_loss_mlp": 0.01043744, "balance_loss_clip": 1.05250573, "balance_loss_mlp": 1.02619636, "epoch": 0.2815571922441004, "flos": 24863255786880.0, "grad_norm": 1.5493572746384299, "language_loss": 0.81096184, "learning_rate": 3.3720950226529894e-06, "loss": 0.83285153, "num_input_tokens_seen": 101083835, "step": 4683, "time_per_iteration": 2.6272947788238525 }, { "auxiliary_loss_clip": 0.01089095, "auxiliary_loss_mlp": 0.01048071, "balance_loss_clip": 1.04691851, "balance_loss_mlp": 1.02916479, "epoch": 0.2816173154967684, "flos": 19901622326400.0, "grad_norm": 1.5570192452178944, "language_loss": 0.76437271, "learning_rate": 3.371811641167852e-06, "loss": 0.78574431, "num_input_tokens_seen": 101101740, "step": 4684, "time_per_iteration": 2.7542243003845215 }, { "auxiliary_loss_clip": 0.01090035, "auxiliary_loss_mlp": 0.01043858, "balance_loss_clip": 1.04495156, "balance_loss_mlp": 1.02659678, "epoch": 0.28167743874943635, "flos": 17491333950720.0, "grad_norm": 3.250404845672824, "language_loss": 0.76287019, "learning_rate": 3.3715282076634807e-06, "loss": 0.78420913, "num_input_tokens_seen": 101120480, "step": 4685, "time_per_iteration": 2.724954843521118 }, { "auxiliary_loss_clip": 0.01116834, "auxiliary_loss_mlp": 0.01045285, "balance_loss_clip": 1.05042076, "balance_loss_mlp": 1.02820265, "epoch": 0.2817375620021043, "flos": 25302120157440.0, "grad_norm": 1.80192319881426, "language_loss": 0.75822544, "learning_rate": 3.3712447221506218e-06, "loss": 0.77984667, "num_input_tokens_seen": 101142910, "step": 4686, "time_per_iteration": 2.7375218868255615 }, { "auxiliary_loss_clip": 0.01113965, "auxiliary_loss_mlp": 0.01054481, "balance_loss_clip": 1.04542971, "balance_loss_mlp": 1.03530002, "epoch": 0.2817976852547723, "flos": 18693227957760.0, "grad_norm": 5.9534421572259095, "language_loss": 0.62298906, "learning_rate": 3.370961184640025e-06, "loss": 0.64467359, "num_input_tokens_seen": 101160030, "step": 4687, "time_per_iteration": 2.7273154258728027 }, { "auxiliary_loss_clip": 0.01125077, "auxiliary_loss_mlp": 0.01052662, "balance_loss_clip": 1.05122471, "balance_loss_mlp": 1.03501928, "epoch": 0.28185780850744024, "flos": 22742263549440.0, "grad_norm": 3.512847657951686, "language_loss": 0.76642895, "learning_rate": 3.3706775951424433e-06, "loss": 0.78820634, "num_input_tokens_seen": 101177675, "step": 4688, "time_per_iteration": 2.6962485313415527 }, { "auxiliary_loss_clip": 0.01111064, "auxiliary_loss_mlp": 0.01038903, "balance_loss_clip": 1.050143, "balance_loss_mlp": 1.0222497, "epoch": 0.2819179317601082, "flos": 14933919467520.0, "grad_norm": 2.029299855452059, "language_loss": 0.78377295, "learning_rate": 3.37039395366863e-06, "loss": 0.80527258, "num_input_tokens_seen": 101192225, "step": 4689, "time_per_iteration": 2.7611160278320312 }, { "auxiliary_loss_clip": 0.01101002, "auxiliary_loss_mlp": 0.01042004, "balance_loss_clip": 1.044873, "balance_loss_mlp": 1.02469492, "epoch": 0.2819780550127762, "flos": 23145325038720.0, "grad_norm": 1.6619977361488503, "language_loss": 0.78151089, "learning_rate": 3.37011026022934e-06, "loss": 0.80294096, "num_input_tokens_seen": 101210870, "step": 4690, "time_per_iteration": 2.8166253566741943 }, { "auxiliary_loss_clip": 0.01144307, "auxiliary_loss_mlp": 0.0077562, "balance_loss_clip": 1.04972041, "balance_loss_mlp": 1.00065684, "epoch": 0.28203817826544414, "flos": 21616356764160.0, "grad_norm": 1.8251699545436237, "language_loss": 0.87835205, "learning_rate": 3.369826514835332e-06, "loss": 0.8975513, "num_input_tokens_seen": 101229965, "step": 4691, "time_per_iteration": 2.755540609359741 }, { "auxiliary_loss_clip": 0.01120177, "auxiliary_loss_mlp": 0.01057161, "balance_loss_clip": 1.0480932, "balance_loss_mlp": 1.03866005, "epoch": 0.2820983015181121, "flos": 24026788794240.0, "grad_norm": 2.0164591316320086, "language_loss": 0.81783265, "learning_rate": 3.3695427174973654e-06, "loss": 0.83960605, "num_input_tokens_seen": 101250980, "step": 4692, "time_per_iteration": 2.766826868057251 }, { "auxiliary_loss_clip": 0.01108273, "auxiliary_loss_mlp": 0.01044592, "balance_loss_clip": 1.05000174, "balance_loss_mlp": 1.02690101, "epoch": 0.2821584247707801, "flos": 30007925976960.0, "grad_norm": 1.5153062693168577, "language_loss": 0.74520338, "learning_rate": 3.3692588682262022e-06, "loss": 0.76673198, "num_input_tokens_seen": 101273335, "step": 4693, "time_per_iteration": 2.833829402923584 }, { "auxiliary_loss_clip": 0.01107692, "auxiliary_loss_mlp": 0.01038565, "balance_loss_clip": 1.04546356, "balance_loss_mlp": 1.02018356, "epoch": 0.2822185480234481, "flos": 21396762967680.0, "grad_norm": 1.6139880108231377, "language_loss": 0.77396065, "learning_rate": 3.3689749670326046e-06, "loss": 0.79542327, "num_input_tokens_seen": 101292110, "step": 4694, "time_per_iteration": 2.6783409118652344 }, { "auxiliary_loss_clip": 0.01131719, "auxiliary_loss_mlp": 0.01043428, "balance_loss_clip": 1.05066633, "balance_loss_mlp": 1.02610695, "epoch": 0.28227867127611606, "flos": 27452809964160.0, "grad_norm": 2.1245298140537354, "language_loss": 0.67171001, "learning_rate": 3.3686910139273392e-06, "loss": 0.69346148, "num_input_tokens_seen": 101312815, "step": 4695, "time_per_iteration": 2.657508373260498 }, { "auxiliary_loss_clip": 0.01129418, "auxiliary_loss_mlp": 0.01047718, "balance_loss_clip": 1.05160189, "balance_loss_mlp": 1.02857292, "epoch": 0.282338794528784, "flos": 22593736811520.0, "grad_norm": 2.1132011275006297, "language_loss": 0.75410438, "learning_rate": 3.3684070089211736e-06, "loss": 0.77587581, "num_input_tokens_seen": 101329045, "step": 4696, "time_per_iteration": 2.6419622898101807 }, { "auxiliary_loss_clip": 0.01108873, "auxiliary_loss_mlp": 0.01050131, "balance_loss_clip": 1.04857826, "balance_loss_mlp": 1.03241634, "epoch": 0.282398917781452, "flos": 42010923386880.0, "grad_norm": 1.6547739374499746, "language_loss": 0.62379837, "learning_rate": 3.368122952024877e-06, "loss": 0.64538848, "num_input_tokens_seen": 101352715, "step": 4697, "time_per_iteration": 2.863271951675415 }, { "auxiliary_loss_clip": 0.01098306, "auxiliary_loss_mlp": 0.01038026, "balance_loss_clip": 1.04702902, "balance_loss_mlp": 1.0213964, "epoch": 0.28245904103411995, "flos": 23224724052480.0, "grad_norm": 1.3648463295211168, "language_loss": 0.73178887, "learning_rate": 3.3678388432492214e-06, "loss": 0.75315219, "num_input_tokens_seen": 101374640, "step": 4698, "time_per_iteration": 2.7437515258789062 }, { "auxiliary_loss_clip": 0.01138661, "auxiliary_loss_mlp": 0.01044687, "balance_loss_clip": 1.04783368, "balance_loss_mlp": 1.02820039, "epoch": 0.2825191642867879, "flos": 25374623760000.0, "grad_norm": 1.73143255072412, "language_loss": 0.75260699, "learning_rate": 3.3675546826049788e-06, "loss": 0.77444041, "num_input_tokens_seen": 101393595, "step": 4699, "time_per_iteration": 2.6352651119232178 }, { "auxiliary_loss_clip": 0.01130406, "auxiliary_loss_mlp": 0.01042781, "balance_loss_clip": 1.04642487, "balance_loss_mlp": 1.02379072, "epoch": 0.2825792875394559, "flos": 17236799199360.0, "grad_norm": 2.939003683920128, "language_loss": 0.80683541, "learning_rate": 3.3672704701029265e-06, "loss": 0.82856727, "num_input_tokens_seen": 101409265, "step": 4700, "time_per_iteration": 2.597543478012085 }, { "auxiliary_loss_clip": 0.01118395, "auxiliary_loss_mlp": 0.01052226, "balance_loss_clip": 1.05168593, "balance_loss_mlp": 1.03699148, "epoch": 0.28263941079212385, "flos": 26723967096960.0, "grad_norm": 1.8973185440197946, "language_loss": 0.82377315, "learning_rate": 3.3669862057538402e-06, "loss": 0.84547931, "num_input_tokens_seen": 101428365, "step": 4701, "time_per_iteration": 2.6613359451293945 }, { "auxiliary_loss_clip": 0.01079732, "auxiliary_loss_mlp": 0.01044955, "balance_loss_clip": 1.04725862, "balance_loss_mlp": 1.02782488, "epoch": 0.2826995340447918, "flos": 25921327737600.0, "grad_norm": 2.6106451650427913, "language_loss": 0.72911763, "learning_rate": 3.3667018895685004e-06, "loss": 0.75036454, "num_input_tokens_seen": 101447280, "step": 4702, "time_per_iteration": 2.927156448364258 }, { "auxiliary_loss_clip": 0.0114189, "auxiliary_loss_mlp": 0.01039287, "balance_loss_clip": 1.05118549, "balance_loss_mlp": 1.02240694, "epoch": 0.2827596572974598, "flos": 22379709623040.0, "grad_norm": 2.1110096252533754, "language_loss": 0.78497601, "learning_rate": 3.3664175215576886e-06, "loss": 0.80678773, "num_input_tokens_seen": 101465435, "step": 4703, "time_per_iteration": 2.603217124938965 }, { "auxiliary_loss_clip": 0.01115372, "auxiliary_loss_mlp": 0.01049407, "balance_loss_clip": 1.04668045, "balance_loss_mlp": 1.03100109, "epoch": 0.28281978055012774, "flos": 33547137880320.0, "grad_norm": 1.6207045759516274, "language_loss": 0.69310379, "learning_rate": 3.3661331017321867e-06, "loss": 0.71475154, "num_input_tokens_seen": 101486355, "step": 4704, "time_per_iteration": 2.737741708755493 }, { "auxiliary_loss_clip": 0.0110991, "auxiliary_loss_mlp": 0.0104005, "balance_loss_clip": 1.05106401, "balance_loss_mlp": 1.02204967, "epoch": 0.2828799038027957, "flos": 23440870143360.0, "grad_norm": 2.0629797483939893, "language_loss": 0.70487976, "learning_rate": 3.3658486301027807e-06, "loss": 0.72637939, "num_input_tokens_seen": 101505875, "step": 4705, "time_per_iteration": 2.7810943126678467 }, { "auxiliary_loss_clip": 0.01051193, "auxiliary_loss_mlp": 0.01011527, "balance_loss_clip": 1.02885246, "balance_loss_mlp": 1.00905895, "epoch": 0.2829400270554637, "flos": 69873690251520.0, "grad_norm": 0.7331461257989402, "language_loss": 0.59262896, "learning_rate": 3.3655641066802577e-06, "loss": 0.6132561, "num_input_tokens_seen": 101565045, "step": 4706, "time_per_iteration": 3.223500967025757 }, { "auxiliary_loss_clip": 0.01117208, "auxiliary_loss_mlp": 0.01042955, "balance_loss_clip": 1.04750693, "balance_loss_mlp": 1.02711248, "epoch": 0.2830001503081317, "flos": 24789028331520.0, "grad_norm": 1.4542369915695899, "language_loss": 0.82314008, "learning_rate": 3.365279531475407e-06, "loss": 0.84474176, "num_input_tokens_seen": 101585825, "step": 4707, "time_per_iteration": 5.995711326599121 }, { "auxiliary_loss_clip": 0.0112325, "auxiliary_loss_mlp": 0.01043198, "balance_loss_clip": 1.04714823, "balance_loss_mlp": 1.02451742, "epoch": 0.28306027356079966, "flos": 27669387018240.0, "grad_norm": 1.6937335335925583, "language_loss": 0.80196846, "learning_rate": 3.36499490449902e-06, "loss": 0.82363296, "num_input_tokens_seen": 101606105, "step": 4708, "time_per_iteration": 2.730365753173828 }, { "auxiliary_loss_clip": 0.01036827, "auxiliary_loss_mlp": 0.01004906, "balance_loss_clip": 1.0241586, "balance_loss_mlp": 1.00274837, "epoch": 0.2831203968134676, "flos": 60527938199040.0, "grad_norm": 0.8797441515413378, "language_loss": 0.62768304, "learning_rate": 3.3647102257618895e-06, "loss": 0.64810038, "num_input_tokens_seen": 101656875, "step": 4709, "time_per_iteration": 3.0734164714813232 }, { "auxiliary_loss_clip": 0.01113275, "auxiliary_loss_mlp": 0.01045412, "balance_loss_clip": 1.04819441, "balance_loss_mlp": 1.02711344, "epoch": 0.2831805200661356, "flos": 22054790171520.0, "grad_norm": 1.4416556980461737, "language_loss": 0.74092108, "learning_rate": 3.3644254952748103e-06, "loss": 0.76250798, "num_input_tokens_seen": 101676225, "step": 4710, "time_per_iteration": 4.214928388595581 }, { "auxiliary_loss_clip": 0.01108833, "auxiliary_loss_mlp": 0.01058426, "balance_loss_clip": 1.04568553, "balance_loss_mlp": 1.0393765, "epoch": 0.28324064331880355, "flos": 22600668136320.0, "grad_norm": 2.192994300890924, "language_loss": 0.7857554, "learning_rate": 3.364140713048579e-06, "loss": 0.80742794, "num_input_tokens_seen": 101693710, "step": 4711, "time_per_iteration": 2.9334824085235596 }, { "auxiliary_loss_clip": 0.01135754, "auxiliary_loss_mlp": 0.00775746, "balance_loss_clip": 1.05244637, "balance_loss_mlp": 1.00072622, "epoch": 0.2833007665714715, "flos": 30404127968640.0, "grad_norm": 2.328121287113732, "language_loss": 0.70832199, "learning_rate": 3.363855879093996e-06, "loss": 0.72743702, "num_input_tokens_seen": 101714010, "step": 4712, "time_per_iteration": 2.8570704460144043 }, { "auxiliary_loss_clip": 0.0114641, "auxiliary_loss_mlp": 0.01050688, "balance_loss_clip": 1.05171633, "balance_loss_mlp": 1.03284216, "epoch": 0.2833608898241395, "flos": 23549499849600.0, "grad_norm": 2.3843934106626157, "language_loss": 0.81725228, "learning_rate": 3.3635709934218605e-06, "loss": 0.83922327, "num_input_tokens_seen": 101732995, "step": 4713, "time_per_iteration": 4.343034029006958 }, { "auxiliary_loss_clip": 0.01120505, "auxiliary_loss_mlp": 0.01048075, "balance_loss_clip": 1.05054498, "balance_loss_mlp": 1.03044379, "epoch": 0.28342101307680745, "flos": 20266726118400.0, "grad_norm": 1.7964609324305687, "language_loss": 0.75316995, "learning_rate": 3.3632860560429766e-06, "loss": 0.77485573, "num_input_tokens_seen": 101751385, "step": 4714, "time_per_iteration": 2.656919479370117 }, { "auxiliary_loss_clip": 0.01129168, "auxiliary_loss_mlp": 0.01051102, "balance_loss_clip": 1.050372, "balance_loss_mlp": 1.03424633, "epoch": 0.2834811363294754, "flos": 30847050576000.0, "grad_norm": 1.4082553086863412, "language_loss": 0.78457153, "learning_rate": 3.3630010669681494e-06, "loss": 0.80637431, "num_input_tokens_seen": 101773825, "step": 4715, "time_per_iteration": 2.721869468688965 }, { "auxiliary_loss_clip": 0.01117334, "auxiliary_loss_mlp": 0.01046437, "balance_loss_clip": 1.04618871, "balance_loss_mlp": 1.0294199, "epoch": 0.2835412595821434, "flos": 22711021695360.0, "grad_norm": 1.791082386208426, "language_loss": 0.73825723, "learning_rate": 3.3627160262081845e-06, "loss": 0.75989497, "num_input_tokens_seen": 101791920, "step": 4716, "time_per_iteration": 2.689964532852173 }, { "auxiliary_loss_clip": 0.0111778, "auxiliary_loss_mlp": 0.01054857, "balance_loss_clip": 1.04580188, "balance_loss_mlp": 1.03397131, "epoch": 0.28360138283481134, "flos": 18077719478400.0, "grad_norm": 2.1425450832247868, "language_loss": 0.74293232, "learning_rate": 3.3624309337738917e-06, "loss": 0.76465869, "num_input_tokens_seen": 101809515, "step": 4717, "time_per_iteration": 2.653107166290283 }, { "auxiliary_loss_clip": 0.01112398, "auxiliary_loss_mlp": 0.01052347, "balance_loss_clip": 1.04736984, "balance_loss_mlp": 1.03526437, "epoch": 0.2836615060874793, "flos": 17854785717120.0, "grad_norm": 1.96982951308544, "language_loss": 0.67022157, "learning_rate": 3.3621457896760813e-06, "loss": 0.69186902, "num_input_tokens_seen": 101827735, "step": 4718, "time_per_iteration": 2.7287323474884033 }, { "auxiliary_loss_clip": 0.01119996, "auxiliary_loss_mlp": 0.01052629, "balance_loss_clip": 1.04606366, "balance_loss_mlp": 1.03479528, "epoch": 0.2837216293401473, "flos": 25740302169600.0, "grad_norm": 1.7409435577223806, "language_loss": 0.72453725, "learning_rate": 3.361860593925566e-06, "loss": 0.7462635, "num_input_tokens_seen": 101845970, "step": 4719, "time_per_iteration": 2.7101874351501465 }, { "auxiliary_loss_clip": 0.01129472, "auxiliary_loss_mlp": 0.01044, "balance_loss_clip": 1.04724336, "balance_loss_mlp": 1.02711964, "epoch": 0.2837817525928153, "flos": 20923532259840.0, "grad_norm": 1.8163652523997504, "language_loss": 0.80517805, "learning_rate": 3.3615753465331605e-06, "loss": 0.82691276, "num_input_tokens_seen": 101865040, "step": 4720, "time_per_iteration": 2.630380392074585 }, { "auxiliary_loss_clip": 0.01130938, "auxiliary_loss_mlp": 0.01047274, "balance_loss_clip": 1.04798317, "balance_loss_mlp": 1.02935672, "epoch": 0.28384187584548326, "flos": 18916700423040.0, "grad_norm": 2.340232614040239, "language_loss": 0.79146183, "learning_rate": 3.3612900475096817e-06, "loss": 0.81324387, "num_input_tokens_seen": 101883735, "step": 4721, "time_per_iteration": 2.6779117584228516 }, { "auxiliary_loss_clip": 0.01091324, "auxiliary_loss_mlp": 0.00778191, "balance_loss_clip": 1.04653215, "balance_loss_mlp": 1.00074911, "epoch": 0.2839019990981512, "flos": 27343964776320.0, "grad_norm": 1.7859505861297744, "language_loss": 0.82514244, "learning_rate": 3.3610046968659474e-06, "loss": 0.84383762, "num_input_tokens_seen": 101903025, "step": 4722, "time_per_iteration": 2.8601412773132324 }, { "auxiliary_loss_clip": 0.0114735, "auxiliary_loss_mlp": 0.0104339, "balance_loss_clip": 1.05396807, "balance_loss_mlp": 1.02641416, "epoch": 0.2839621223508192, "flos": 18114312458880.0, "grad_norm": 1.8976073667217488, "language_loss": 0.70048773, "learning_rate": 3.3607192946127785e-06, "loss": 0.72239512, "num_input_tokens_seen": 101922255, "step": 4723, "time_per_iteration": 2.6259007453918457 }, { "auxiliary_loss_clip": 0.0111455, "auxiliary_loss_mlp": 0.01051142, "balance_loss_clip": 1.04818106, "balance_loss_mlp": 1.03247368, "epoch": 0.28402224560348716, "flos": 26358360514560.0, "grad_norm": 1.540245146059843, "language_loss": 0.78676599, "learning_rate": 3.360433840760998e-06, "loss": 0.80842292, "num_input_tokens_seen": 101943100, "step": 4724, "time_per_iteration": 2.7364859580993652 }, { "auxiliary_loss_clip": 0.01116323, "auxiliary_loss_mlp": 0.01063488, "balance_loss_clip": 1.04846072, "balance_loss_mlp": 1.04442668, "epoch": 0.2840823688561551, "flos": 24060795995520.0, "grad_norm": 1.6728910575536384, "language_loss": 0.92433345, "learning_rate": 3.36014833532143e-06, "loss": 0.94613159, "num_input_tokens_seen": 101963160, "step": 4725, "time_per_iteration": 2.653244733810425 }, { "auxiliary_loss_clip": 0.01137335, "auxiliary_loss_mlp": 0.01047317, "balance_loss_clip": 1.05249703, "balance_loss_mlp": 1.02951932, "epoch": 0.2841424921088231, "flos": 29459821368960.0, "grad_norm": 1.5774329387244128, "language_loss": 0.88881439, "learning_rate": 3.3598627783049e-06, "loss": 0.91066098, "num_input_tokens_seen": 101984300, "step": 4726, "time_per_iteration": 2.6815872192382812 }, { "auxiliary_loss_clip": 0.01132666, "auxiliary_loss_mlp": 0.01049768, "balance_loss_clip": 1.05290008, "balance_loss_mlp": 1.03223181, "epoch": 0.28420261536149105, "flos": 48100367053440.0, "grad_norm": 2.008368257744288, "language_loss": 0.78913373, "learning_rate": 3.359577169722238e-06, "loss": 0.81095803, "num_input_tokens_seen": 102005765, "step": 4727, "time_per_iteration": 2.8668875694274902 }, { "auxiliary_loss_clip": 0.01134036, "auxiliary_loss_mlp": 0.01041813, "balance_loss_clip": 1.05225933, "balance_loss_mlp": 1.02603006, "epoch": 0.284262738614159, "flos": 25666146541440.0, "grad_norm": 2.1196929739552433, "language_loss": 0.66590458, "learning_rate": 3.3592915095842733e-06, "loss": 0.68766308, "num_input_tokens_seen": 102022755, "step": 4728, "time_per_iteration": 2.6871252059936523 }, { "auxiliary_loss_clip": 0.01111522, "auxiliary_loss_mlp": 0.01054966, "balance_loss_clip": 1.04948676, "balance_loss_mlp": 1.03766847, "epoch": 0.284322861866827, "flos": 19718980646400.0, "grad_norm": 1.7247901443745783, "language_loss": 0.76369143, "learning_rate": 3.3590057979018386e-06, "loss": 0.78535628, "num_input_tokens_seen": 102041850, "step": 4729, "time_per_iteration": 2.671739339828491 }, { "auxiliary_loss_clip": 0.01121198, "auxiliary_loss_mlp": 0.01054506, "balance_loss_clip": 1.05166233, "balance_loss_mlp": 1.03707767, "epoch": 0.28438298511949495, "flos": 23915250086400.0, "grad_norm": 1.8284571123244682, "language_loss": 0.67062581, "learning_rate": 3.3587200346857674e-06, "loss": 0.69238287, "num_input_tokens_seen": 102059500, "step": 4730, "time_per_iteration": 2.6957883834838867 }, { "auxiliary_loss_clip": 0.01120949, "auxiliary_loss_mlp": 0.01040777, "balance_loss_clip": 1.05008078, "balance_loss_mlp": 1.02283621, "epoch": 0.2844431083721629, "flos": 26067340523520.0, "grad_norm": 1.8142087038783352, "language_loss": 0.7456513, "learning_rate": 3.3584342199468965e-06, "loss": 0.76726854, "num_input_tokens_seen": 102080460, "step": 4731, "time_per_iteration": 2.7621212005615234 }, { "auxiliary_loss_clip": 0.01100065, "auxiliary_loss_mlp": 0.0104061, "balance_loss_clip": 1.04959893, "balance_loss_mlp": 1.02338386, "epoch": 0.2845032316248309, "flos": 25810435474560.0, "grad_norm": 1.4533231430590194, "language_loss": 0.83672202, "learning_rate": 3.3581483536960638e-06, "loss": 0.85812879, "num_input_tokens_seen": 102100950, "step": 4732, "time_per_iteration": 2.807701587677002 }, { "auxiliary_loss_clip": 0.01135958, "auxiliary_loss_mlp": 0.01049006, "balance_loss_clip": 1.05248308, "balance_loss_mlp": 1.03040957, "epoch": 0.2845633548774989, "flos": 19823192979840.0, "grad_norm": 2.88493918484894, "language_loss": 0.78892827, "learning_rate": 3.357862435944109e-06, "loss": 0.8107779, "num_input_tokens_seen": 102119345, "step": 4733, "time_per_iteration": 2.66524076461792 }, { "auxiliary_loss_clip": 0.01153472, "auxiliary_loss_mlp": 0.01047702, "balance_loss_clip": 1.05533004, "balance_loss_mlp": 1.02984452, "epoch": 0.28462347813016686, "flos": 23182815859200.0, "grad_norm": 2.2364375024988776, "language_loss": 0.71791029, "learning_rate": 3.357576466701875e-06, "loss": 0.73992205, "num_input_tokens_seen": 102139050, "step": 4734, "time_per_iteration": 2.6941637992858887 }, { "auxiliary_loss_clip": 0.01125779, "auxiliary_loss_mlp": 0.01035132, "balance_loss_clip": 1.05455363, "balance_loss_mlp": 1.01766825, "epoch": 0.2846836013828348, "flos": 18660477732480.0, "grad_norm": 1.8491255089189595, "language_loss": 0.73942113, "learning_rate": 3.3572904459802056e-06, "loss": 0.76103032, "num_input_tokens_seen": 102157935, "step": 4735, "time_per_iteration": 2.736027956008911 }, { "auxiliary_loss_clip": 0.01124029, "auxiliary_loss_mlp": 0.01048016, "balance_loss_clip": 1.05248201, "balance_loss_mlp": 1.03177929, "epoch": 0.2847437246355028, "flos": 14173511523840.0, "grad_norm": 1.7217440703764713, "language_loss": 0.79690897, "learning_rate": 3.357004373789946e-06, "loss": 0.81862932, "num_input_tokens_seen": 102175325, "step": 4736, "time_per_iteration": 2.7069075107574463 }, { "auxiliary_loss_clip": 0.01152237, "auxiliary_loss_mlp": 0.01048515, "balance_loss_clip": 1.0569663, "balance_loss_mlp": 1.03019249, "epoch": 0.28480384788817076, "flos": 29278364837760.0, "grad_norm": 2.5331890881723327, "language_loss": 0.59956342, "learning_rate": 3.3567182501419453e-06, "loss": 0.62157094, "num_input_tokens_seen": 102196625, "step": 4737, "time_per_iteration": 2.718904972076416 }, { "auxiliary_loss_clip": 0.01131951, "auxiliary_loss_mlp": 0.0104121, "balance_loss_clip": 1.05099404, "balance_loss_mlp": 1.02437758, "epoch": 0.2848639711408387, "flos": 22601314581120.0, "grad_norm": 1.8696274848062555, "language_loss": 0.86556888, "learning_rate": 3.356432075047052e-06, "loss": 0.88730049, "num_input_tokens_seen": 102214975, "step": 4738, "time_per_iteration": 2.719223976135254 }, { "auxiliary_loss_clip": 0.01127313, "auxiliary_loss_mlp": 0.01051123, "balance_loss_clip": 1.05986989, "balance_loss_mlp": 1.03207278, "epoch": 0.2849240943935067, "flos": 17599460866560.0, "grad_norm": 2.688438536338364, "language_loss": 0.90028232, "learning_rate": 3.356145848516118e-06, "loss": 0.92206669, "num_input_tokens_seen": 102231885, "step": 4739, "time_per_iteration": 2.674363851547241 }, { "auxiliary_loss_clip": 0.01136036, "auxiliary_loss_mlp": 0.01044124, "balance_loss_clip": 1.05522013, "balance_loss_mlp": 1.02627802, "epoch": 0.28498421764617465, "flos": 24862573428480.0, "grad_norm": 1.41783833400805, "language_loss": 0.7216897, "learning_rate": 3.355859570559998e-06, "loss": 0.74349129, "num_input_tokens_seen": 102252725, "step": 4740, "time_per_iteration": 2.688591957092285 }, { "auxiliary_loss_clip": 0.01130927, "auxiliary_loss_mlp": 0.010392, "balance_loss_clip": 1.05868936, "balance_loss_mlp": 1.02229571, "epoch": 0.2850443408988426, "flos": 22782555630720.0, "grad_norm": 3.325446081949271, "language_loss": 0.77782756, "learning_rate": 3.3555732411895477e-06, "loss": 0.79952878, "num_input_tokens_seen": 102271730, "step": 4741, "time_per_iteration": 2.6747119426727295 }, { "auxiliary_loss_clip": 0.01107503, "auxiliary_loss_mlp": 0.01048819, "balance_loss_clip": 1.04771924, "balance_loss_mlp": 1.03065109, "epoch": 0.2851044641515106, "flos": 18844053166080.0, "grad_norm": 1.6557809034578879, "language_loss": 0.75952959, "learning_rate": 3.3552868604156235e-06, "loss": 0.78109288, "num_input_tokens_seen": 102291325, "step": 4742, "time_per_iteration": 2.7584095001220703 }, { "auxiliary_loss_clip": 0.01151989, "auxiliary_loss_mlp": 0.01057399, "balance_loss_clip": 1.05341601, "balance_loss_mlp": 1.03720486, "epoch": 0.28516458740417855, "flos": 18880502492160.0, "grad_norm": 2.0538587827096713, "language_loss": 0.57376975, "learning_rate": 3.355000428249086e-06, "loss": 0.59586358, "num_input_tokens_seen": 102309000, "step": 4743, "time_per_iteration": 2.621572494506836 }, { "auxiliary_loss_clip": 0.01116239, "auxiliary_loss_mlp": 0.01056356, "balance_loss_clip": 1.05067348, "balance_loss_mlp": 1.03747356, "epoch": 0.2852247106568465, "flos": 25299821687040.0, "grad_norm": 1.6259491452975234, "language_loss": 0.74499846, "learning_rate": 3.354713944700797e-06, "loss": 0.76672441, "num_input_tokens_seen": 102329240, "step": 4744, "time_per_iteration": 2.8029959201812744 }, { "auxiliary_loss_clip": 0.01132324, "auxiliary_loss_mlp": 0.01047205, "balance_loss_clip": 1.05420351, "balance_loss_mlp": 1.03014612, "epoch": 0.2852848339095145, "flos": 11655383541120.0, "grad_norm": 2.4725597828733563, "language_loss": 0.77258176, "learning_rate": 3.3544274097816185e-06, "loss": 0.79437709, "num_input_tokens_seen": 102344440, "step": 4745, "time_per_iteration": 2.5961194038391113 }, { "auxiliary_loss_clip": 0.01124474, "auxiliary_loss_mlp": 0.01040571, "balance_loss_clip": 1.05262041, "balance_loss_mlp": 1.02427554, "epoch": 0.2853449571621825, "flos": 12933228856320.0, "grad_norm": 1.9164884333366974, "language_loss": 0.8275286, "learning_rate": 3.3541408235024173e-06, "loss": 0.84917903, "num_input_tokens_seen": 102360985, "step": 4746, "time_per_iteration": 4.211855411529541 }, { "auxiliary_loss_clip": 0.01101779, "auxiliary_loss_mlp": 0.01043428, "balance_loss_clip": 1.0488627, "balance_loss_mlp": 1.02497482, "epoch": 0.28540508041485046, "flos": 20010575255040.0, "grad_norm": 1.8281951571940926, "language_loss": 0.79537141, "learning_rate": 3.3538541858740604e-06, "loss": 0.81682348, "num_input_tokens_seen": 102380320, "step": 4747, "time_per_iteration": 4.276613712310791 }, { "auxiliary_loss_clip": 0.01046154, "auxiliary_loss_mlp": 0.01017989, "balance_loss_clip": 1.02844512, "balance_loss_mlp": 1.01572371, "epoch": 0.28546520366751843, "flos": 68139349966080.0, "grad_norm": 0.7754147669680839, "language_loss": 0.6049211, "learning_rate": 3.3535674969074173e-06, "loss": 0.62556255, "num_input_tokens_seen": 102439140, "step": 4748, "time_per_iteration": 3.0963478088378906 }, { "auxiliary_loss_clip": 0.01148062, "auxiliary_loss_mlp": 0.01048043, "balance_loss_clip": 1.05367923, "balance_loss_mlp": 1.03001821, "epoch": 0.2855253269201864, "flos": 13251540205440.0, "grad_norm": 2.39914017508816, "language_loss": 0.8061412, "learning_rate": 3.3532807566133592e-06, "loss": 0.82810223, "num_input_tokens_seen": 102450990, "step": 4749, "time_per_iteration": 4.199607610702515 }, { "auxiliary_loss_clip": 0.01135936, "auxiliary_loss_mlp": 0.01045252, "balance_loss_clip": 1.05160487, "balance_loss_mlp": 1.02788317, "epoch": 0.28558545017285436, "flos": 28620876337920.0, "grad_norm": 1.92101956988616, "language_loss": 0.70763719, "learning_rate": 3.3529939650027587e-06, "loss": 0.72944903, "num_input_tokens_seen": 102471820, "step": 4750, "time_per_iteration": 2.6975722312927246 }, { "auxiliary_loss_clip": 0.01132057, "auxiliary_loss_mlp": 0.0104367, "balance_loss_clip": 1.05308008, "balance_loss_mlp": 1.02660573, "epoch": 0.2856455734255223, "flos": 34130470752000.0, "grad_norm": 1.619747991653998, "language_loss": 0.81983078, "learning_rate": 3.3527071220864917e-06, "loss": 0.84158808, "num_input_tokens_seen": 102492625, "step": 4751, "time_per_iteration": 2.685194969177246 }, { "auxiliary_loss_clip": 0.01146027, "auxiliary_loss_mlp": 0.01046872, "balance_loss_clip": 1.0541997, "balance_loss_mlp": 1.03009951, "epoch": 0.2857056966781903, "flos": 39786149779200.0, "grad_norm": 2.1857777553010203, "language_loss": 0.80359828, "learning_rate": 3.3524202278754353e-06, "loss": 0.82552731, "num_input_tokens_seen": 102514145, "step": 4752, "time_per_iteration": 4.363154649734497 }, { "auxiliary_loss_clip": 0.01130862, "auxiliary_loss_mlp": 0.010456, "balance_loss_clip": 1.04920304, "balance_loss_mlp": 1.02675319, "epoch": 0.28576581993085826, "flos": 21872292145920.0, "grad_norm": 2.612706759191024, "language_loss": 0.78674287, "learning_rate": 3.3521332823804676e-06, "loss": 0.8085075, "num_input_tokens_seen": 102532365, "step": 4753, "time_per_iteration": 2.6128499507904053 }, { "auxiliary_loss_clip": 0.0114991, "auxiliary_loss_mlp": 0.01051658, "balance_loss_clip": 1.05356765, "balance_loss_mlp": 1.03166628, "epoch": 0.2858259431835262, "flos": 19091656592640.0, "grad_norm": 3.5161743537336596, "language_loss": 0.8947711, "learning_rate": 3.3518462856124704e-06, "loss": 0.91678679, "num_input_tokens_seen": 102548425, "step": 4754, "time_per_iteration": 2.5410687923431396 }, { "auxiliary_loss_clip": 0.01130155, "auxiliary_loss_mlp": 0.010468, "balance_loss_clip": 1.05048347, "balance_loss_mlp": 1.03026593, "epoch": 0.2858860664361942, "flos": 20334309557760.0, "grad_norm": 2.3617926288322724, "language_loss": 0.82039523, "learning_rate": 3.3515592375823267e-06, "loss": 0.84216481, "num_input_tokens_seen": 102566370, "step": 4755, "time_per_iteration": 2.6514527797698975 }, { "auxiliary_loss_clip": 0.01098878, "auxiliary_loss_mlp": 0.01049575, "balance_loss_clip": 1.04732597, "balance_loss_mlp": 1.03233767, "epoch": 0.28594618968886215, "flos": 24461738582400.0, "grad_norm": 1.6385978416895255, "language_loss": 0.83764589, "learning_rate": 3.351272138300922e-06, "loss": 0.8591305, "num_input_tokens_seen": 102588715, "step": 4756, "time_per_iteration": 2.7975916862487793 }, { "auxiliary_loss_clip": 0.01023363, "auxiliary_loss_mlp": 0.01007772, "balance_loss_clip": 1.01913142, "balance_loss_mlp": 1.00524473, "epoch": 0.2860063129415301, "flos": 71652850709760.0, "grad_norm": 0.8721113874523594, "language_loss": 0.6097033, "learning_rate": 3.350984987779142e-06, "loss": 0.63001466, "num_input_tokens_seen": 102656715, "step": 4757, "time_per_iteration": 3.406625986099243 }, { "auxiliary_loss_clip": 0.01147819, "auxiliary_loss_mlp": 0.01038916, "balance_loss_clip": 1.05585599, "balance_loss_mlp": 1.021595, "epoch": 0.2860664361941981, "flos": 20558679863040.0, "grad_norm": 2.030913944398288, "language_loss": 0.66206789, "learning_rate": 3.3506977860278756e-06, "loss": 0.68393528, "num_input_tokens_seen": 102676545, "step": 4758, "time_per_iteration": 2.589768648147583 }, { "auxiliary_loss_clip": 0.01133475, "auxiliary_loss_mlp": 0.01042694, "balance_loss_clip": 1.04988813, "balance_loss_mlp": 1.02581418, "epoch": 0.2861265594468661, "flos": 35996389534080.0, "grad_norm": 2.019963236438103, "language_loss": 0.63374877, "learning_rate": 3.3504105330580143e-06, "loss": 0.65551043, "num_input_tokens_seen": 102702875, "step": 4759, "time_per_iteration": 2.809325695037842 }, { "auxiliary_loss_clip": 0.01129183, "auxiliary_loss_mlp": 0.00777076, "balance_loss_clip": 1.04924989, "balance_loss_mlp": 1.00088644, "epoch": 0.28618668269953407, "flos": 20047419630720.0, "grad_norm": 1.9693348774443893, "language_loss": 0.74033993, "learning_rate": 3.3501232288804496e-06, "loss": 0.75940251, "num_input_tokens_seen": 102723160, "step": 4760, "time_per_iteration": 2.6797397136688232 }, { "auxiliary_loss_clip": 0.01124387, "auxiliary_loss_mlp": 0.01045022, "balance_loss_clip": 1.05517232, "balance_loss_mlp": 1.02849925, "epoch": 0.28624680595220203, "flos": 24971849579520.0, "grad_norm": 2.574168946313644, "language_loss": 0.72227889, "learning_rate": 3.3498358735060773e-06, "loss": 0.74397296, "num_input_tokens_seen": 102743855, "step": 4761, "time_per_iteration": 2.672394275665283 }, { "auxiliary_loss_clip": 0.01079005, "auxiliary_loss_mlp": 0.01049385, "balance_loss_clip": 1.04688287, "balance_loss_mlp": 1.03218305, "epoch": 0.28630692920487, "flos": 22492253911680.0, "grad_norm": 2.095293128310336, "language_loss": 0.74758703, "learning_rate": 3.349548466945793e-06, "loss": 0.76887095, "num_input_tokens_seen": 102761370, "step": 4762, "time_per_iteration": 2.8573946952819824 }, { "auxiliary_loss_clip": 0.01108257, "auxiliary_loss_mlp": 0.01044255, "balance_loss_clip": 1.05117726, "balance_loss_mlp": 1.02725577, "epoch": 0.28636705245753796, "flos": 21249888255360.0, "grad_norm": 1.4714690500952254, "language_loss": 0.76185489, "learning_rate": 3.349261009210496e-06, "loss": 0.78338003, "num_input_tokens_seen": 102780885, "step": 4763, "time_per_iteration": 2.7058494091033936 }, { "auxiliary_loss_clip": 0.01103052, "auxiliary_loss_mlp": 0.01041715, "balance_loss_clip": 1.0442332, "balance_loss_mlp": 1.0234046, "epoch": 0.28642717571020593, "flos": 24095772864000.0, "grad_norm": 2.250941696220621, "language_loss": 0.77264833, "learning_rate": 3.348973500311086e-06, "loss": 0.79409599, "num_input_tokens_seen": 102801000, "step": 4764, "time_per_iteration": 2.7363107204437256 }, { "auxiliary_loss_clip": 0.0111141, "auxiliary_loss_mlp": 0.01044325, "balance_loss_clip": 1.04883742, "balance_loss_mlp": 1.02520347, "epoch": 0.2864872989628739, "flos": 22601386408320.0, "grad_norm": 3.808468667851145, "language_loss": 0.71222258, "learning_rate": 3.348685940258466e-06, "loss": 0.73377991, "num_input_tokens_seen": 102820230, "step": 4765, "time_per_iteration": 2.7225682735443115 }, { "auxiliary_loss_clip": 0.01127531, "auxiliary_loss_mlp": 0.01037638, "balance_loss_clip": 1.0501802, "balance_loss_mlp": 1.02118707, "epoch": 0.28654742221554186, "flos": 32745073138560.0, "grad_norm": 1.6284115173108313, "language_loss": 0.76206756, "learning_rate": 3.3483983290635395e-06, "loss": 0.78371924, "num_input_tokens_seen": 102842670, "step": 4766, "time_per_iteration": 2.724776268005371 }, { "auxiliary_loss_clip": 0.01130255, "auxiliary_loss_mlp": 0.01038205, "balance_loss_clip": 1.0502758, "balance_loss_mlp": 1.02133691, "epoch": 0.2866075454682098, "flos": 26981626331520.0, "grad_norm": 1.7313176116986193, "language_loss": 0.77457404, "learning_rate": 3.348110666737214e-06, "loss": 0.79625863, "num_input_tokens_seen": 102864480, "step": 4767, "time_per_iteration": 2.7313742637634277 }, { "auxiliary_loss_clip": 0.0114162, "auxiliary_loss_mlp": 0.01042697, "balance_loss_clip": 1.05109096, "balance_loss_mlp": 1.02519727, "epoch": 0.2866676687208778, "flos": 23253847004160.0, "grad_norm": 1.7818476838857593, "language_loss": 0.65043855, "learning_rate": 3.3478229532903956e-06, "loss": 0.67228168, "num_input_tokens_seen": 102883740, "step": 4768, "time_per_iteration": 2.6173784732818604 }, { "auxiliary_loss_clip": 0.01123197, "auxiliary_loss_mlp": 0.01041331, "balance_loss_clip": 1.04803848, "balance_loss_mlp": 1.02385533, "epoch": 0.28672779197354575, "flos": 21579727870080.0, "grad_norm": 1.5842392137882455, "language_loss": 0.70497799, "learning_rate": 3.3475351887339967e-06, "loss": 0.7266233, "num_input_tokens_seen": 102902945, "step": 4769, "time_per_iteration": 2.627859115600586 }, { "auxiliary_loss_clip": 0.01078118, "auxiliary_loss_mlp": 0.01033792, "balance_loss_clip": 1.04276228, "balance_loss_mlp": 1.01722169, "epoch": 0.2867879152262137, "flos": 19865568049920.0, "grad_norm": 1.555057890983365, "language_loss": 0.74735439, "learning_rate": 3.3472473730789288e-06, "loss": 0.76847351, "num_input_tokens_seen": 102922405, "step": 4770, "time_per_iteration": 2.807286262512207 }, { "auxiliary_loss_clip": 0.01094623, "auxiliary_loss_mlp": 0.01041164, "balance_loss_clip": 1.04522562, "balance_loss_mlp": 1.02336657, "epoch": 0.2868480384788817, "flos": 28213325648640.0, "grad_norm": 2.2768786529491427, "language_loss": 0.6760053, "learning_rate": 3.3469595063361045e-06, "loss": 0.6973632, "num_input_tokens_seen": 102938980, "step": 4771, "time_per_iteration": 2.7709410190582275 }, { "auxiliary_loss_clip": 0.01041422, "auxiliary_loss_mlp": 0.01015109, "balance_loss_clip": 1.01907253, "balance_loss_mlp": 1.01243877, "epoch": 0.2869081617315497, "flos": 65424286690560.0, "grad_norm": 0.770068198596698, "language_loss": 0.56874299, "learning_rate": 3.3466715885164414e-06, "loss": 0.58930826, "num_input_tokens_seen": 103000405, "step": 4772, "time_per_iteration": 3.0978245735168457 }, { "auxiliary_loss_clip": 0.01067739, "auxiliary_loss_mlp": 0.0077878, "balance_loss_clip": 1.04115915, "balance_loss_mlp": 1.00089169, "epoch": 0.28696828498421767, "flos": 18660729127680.0, "grad_norm": 2.7874039039613345, "language_loss": 0.82870376, "learning_rate": 3.346383619630856e-06, "loss": 0.84716898, "num_input_tokens_seen": 103017970, "step": 4773, "time_per_iteration": 2.7716143131256104 }, { "auxiliary_loss_clip": 0.0114188, "auxiliary_loss_mlp": 0.01043405, "balance_loss_clip": 1.04776216, "balance_loss_mlp": 1.02553546, "epoch": 0.28702840823688563, "flos": 23659745667840.0, "grad_norm": 11.069053071667042, "language_loss": 0.77580261, "learning_rate": 3.34609559969027e-06, "loss": 0.79765546, "num_input_tokens_seen": 103036385, "step": 4774, "time_per_iteration": 2.604790687561035 }, { "auxiliary_loss_clip": 0.01119567, "auxiliary_loss_mlp": 0.01042061, "balance_loss_clip": 1.04915977, "balance_loss_mlp": 1.02414346, "epoch": 0.2870885314895536, "flos": 13804744544640.0, "grad_norm": 1.9103573283121942, "language_loss": 0.73611873, "learning_rate": 3.3458075287056034e-06, "loss": 0.75773501, "num_input_tokens_seen": 103052170, "step": 4775, "time_per_iteration": 2.6234211921691895 }, { "auxiliary_loss_clip": 0.01133151, "auxiliary_loss_mlp": 0.01045326, "balance_loss_clip": 1.04905081, "balance_loss_mlp": 1.02782607, "epoch": 0.28714865474222157, "flos": 17786771314560.0, "grad_norm": 1.6535491049734306, "language_loss": 0.88343942, "learning_rate": 3.34551940668778e-06, "loss": 0.9052242, "num_input_tokens_seen": 103070510, "step": 4776, "time_per_iteration": 2.6941640377044678 }, { "auxiliary_loss_clip": 0.01132773, "auxiliary_loss_mlp": 0.0104327, "balance_loss_clip": 1.05156159, "balance_loss_mlp": 1.02712941, "epoch": 0.28720877799488953, "flos": 15997486199040.0, "grad_norm": 1.7321020140737395, "language_loss": 0.74257779, "learning_rate": 3.345231233647726e-06, "loss": 0.76433825, "num_input_tokens_seen": 103089590, "step": 4777, "time_per_iteration": 2.645650863647461 }, { "auxiliary_loss_clip": 0.01126691, "auxiliary_loss_mlp": 0.01045293, "balance_loss_clip": 1.05245948, "balance_loss_mlp": 1.02812648, "epoch": 0.2872689012475575, "flos": 20923137210240.0, "grad_norm": 1.9446580110028222, "language_loss": 0.80069196, "learning_rate": 3.3449430095963696e-06, "loss": 0.82241178, "num_input_tokens_seen": 103109080, "step": 4778, "time_per_iteration": 2.7606308460235596 }, { "auxiliary_loss_clip": 0.01123482, "auxiliary_loss_mlp": 0.01044505, "balance_loss_clip": 1.05461526, "balance_loss_mlp": 1.02750611, "epoch": 0.28732902450022546, "flos": 21325121291520.0, "grad_norm": 1.7560492266469991, "language_loss": 0.7396307, "learning_rate": 3.3446547345446386e-06, "loss": 0.76131058, "num_input_tokens_seen": 103127755, "step": 4779, "time_per_iteration": 2.831167221069336 }, { "auxiliary_loss_clip": 0.01122102, "auxiliary_loss_mlp": 0.01043876, "balance_loss_clip": 1.04866719, "balance_loss_mlp": 1.0262928, "epoch": 0.2873891477528934, "flos": 20850382212480.0, "grad_norm": 1.5882306223862566, "language_loss": 0.76327771, "learning_rate": 3.3443664085034656e-06, "loss": 0.7849375, "num_input_tokens_seen": 103147035, "step": 4780, "time_per_iteration": 2.6548538208007812 }, { "auxiliary_loss_clip": 0.01102465, "auxiliary_loss_mlp": 0.01042038, "balance_loss_clip": 1.04413557, "balance_loss_mlp": 1.02517641, "epoch": 0.2874492710055614, "flos": 17420051410560.0, "grad_norm": 1.5896497572299877, "language_loss": 0.81445092, "learning_rate": 3.344078031483784e-06, "loss": 0.83589596, "num_input_tokens_seen": 103165410, "step": 4781, "time_per_iteration": 2.6422417163848877 }, { "auxiliary_loss_clip": 0.01109573, "auxiliary_loss_mlp": 0.01045358, "balance_loss_clip": 1.05339658, "balance_loss_mlp": 1.0277034, "epoch": 0.28750939425822936, "flos": 13406818700160.0, "grad_norm": 1.8389370421072637, "language_loss": 0.86738765, "learning_rate": 3.3437896034965283e-06, "loss": 0.888937, "num_input_tokens_seen": 103183710, "step": 4782, "time_per_iteration": 2.7507951259613037 }, { "auxiliary_loss_clip": 0.01113582, "auxiliary_loss_mlp": 0.01043351, "balance_loss_clip": 1.05343366, "balance_loss_mlp": 1.02604771, "epoch": 0.2875695175108973, "flos": 21870029589120.0, "grad_norm": 1.5283433651606986, "language_loss": 0.71153063, "learning_rate": 3.3435011245526357e-06, "loss": 0.73309994, "num_input_tokens_seen": 103203790, "step": 4783, "time_per_iteration": 2.7166218757629395 }, { "auxiliary_loss_clip": 0.0112343, "auxiliary_loss_mlp": 0.01047879, "balance_loss_clip": 1.05475473, "balance_loss_mlp": 1.030761, "epoch": 0.2876296407635653, "flos": 26245457089920.0, "grad_norm": 1.6861942701171202, "language_loss": 0.76872855, "learning_rate": 3.343212594663047e-06, "loss": 0.79044163, "num_input_tokens_seen": 103223925, "step": 4784, "time_per_iteration": 2.693665027618408 }, { "auxiliary_loss_clip": 0.01095423, "auxiliary_loss_mlp": 0.01053931, "balance_loss_clip": 1.04587293, "balance_loss_mlp": 1.03514349, "epoch": 0.28768976401623325, "flos": 25373654092800.0, "grad_norm": 4.596098798847224, "language_loss": 0.75646108, "learning_rate": 3.3429240138387015e-06, "loss": 0.77795458, "num_input_tokens_seen": 103244760, "step": 4785, "time_per_iteration": 4.380687236785889 }, { "auxiliary_loss_clip": 0.01144615, "auxiliary_loss_mlp": 0.01048905, "balance_loss_clip": 1.0532378, "balance_loss_mlp": 1.03213263, "epoch": 0.28774988726890127, "flos": 30664372982400.0, "grad_norm": 2.434913324661012, "language_loss": 0.83660555, "learning_rate": 3.3426353820905425e-06, "loss": 0.85854077, "num_input_tokens_seen": 103261995, "step": 4786, "time_per_iteration": 4.138700723648071 }, { "auxiliary_loss_clip": 0.01113505, "auxiliary_loss_mlp": 0.0077478, "balance_loss_clip": 1.05201936, "balance_loss_mlp": 1.00095487, "epoch": 0.28781001052156924, "flos": 20595452411520.0, "grad_norm": 1.8737605513707083, "language_loss": 0.80388975, "learning_rate": 3.342346699429516e-06, "loss": 0.82277262, "num_input_tokens_seen": 103279780, "step": 4787, "time_per_iteration": 2.7030651569366455 }, { "auxiliary_loss_clip": 0.01120528, "auxiliary_loss_mlp": 0.01039353, "balance_loss_clip": 1.0489651, "balance_loss_mlp": 1.02212751, "epoch": 0.2878701337742372, "flos": 26542330997760.0, "grad_norm": 1.8370986188087255, "language_loss": 0.83052301, "learning_rate": 3.3420579658665677e-06, "loss": 0.85212183, "num_input_tokens_seen": 103300580, "step": 4788, "time_per_iteration": 2.7650442123413086 }, { "auxiliary_loss_clip": 0.01110861, "auxiliary_loss_mlp": 0.01044904, "balance_loss_clip": 1.0567044, "balance_loss_mlp": 1.0279882, "epoch": 0.28793025702690517, "flos": 28146855530880.0, "grad_norm": 7.859878454786593, "language_loss": 0.73045379, "learning_rate": 3.3417691814126468e-06, "loss": 0.75201148, "num_input_tokens_seen": 103320430, "step": 4789, "time_per_iteration": 4.340694189071655 }, { "auxiliary_loss_clip": 0.01123471, "auxiliary_loss_mlp": 0.01042567, "balance_loss_clip": 1.04852343, "balance_loss_mlp": 1.02599669, "epoch": 0.28799038027957313, "flos": 23805471144960.0, "grad_norm": 1.7615007973154742, "language_loss": 0.84425223, "learning_rate": 3.341480346078704e-06, "loss": 0.86591256, "num_input_tokens_seen": 103337695, "step": 4790, "time_per_iteration": 2.6953821182250977 }, { "auxiliary_loss_clip": 0.01136004, "auxiliary_loss_mlp": 0.01049022, "balance_loss_clip": 1.05240703, "balance_loss_mlp": 1.03145027, "epoch": 0.2880505035322411, "flos": 22344122223360.0, "grad_norm": 1.743209341690147, "language_loss": 0.78031182, "learning_rate": 3.3411914598756922e-06, "loss": 0.80216199, "num_input_tokens_seen": 103357010, "step": 4791, "time_per_iteration": 4.299259424209595 }, { "auxiliary_loss_clip": 0.01120123, "auxiliary_loss_mlp": 0.01036962, "balance_loss_clip": 1.05015528, "balance_loss_mlp": 1.01999843, "epoch": 0.28811062678490906, "flos": 18004246208640.0, "grad_norm": 2.2148694233914474, "language_loss": 0.70164073, "learning_rate": 3.3409025228145654e-06, "loss": 0.72321159, "num_input_tokens_seen": 103375600, "step": 4792, "time_per_iteration": 2.646732807159424 }, { "auxiliary_loss_clip": 0.01107079, "auxiliary_loss_mlp": 0.01037734, "balance_loss_clip": 1.05645919, "balance_loss_mlp": 1.02149773, "epoch": 0.28817075003757703, "flos": 22090880361600.0, "grad_norm": 1.9192442052106609, "language_loss": 0.79200894, "learning_rate": 3.3406135349062812e-06, "loss": 0.81345713, "num_input_tokens_seen": 103395225, "step": 4793, "time_per_iteration": 2.765010356903076 }, { "auxiliary_loss_clip": 0.01117839, "auxiliary_loss_mlp": 0.01038019, "balance_loss_clip": 1.05114603, "balance_loss_mlp": 1.02235532, "epoch": 0.288230873290245, "flos": 41683130847360.0, "grad_norm": 1.7689864288971164, "language_loss": 0.78136635, "learning_rate": 3.340324496161797e-06, "loss": 0.80292487, "num_input_tokens_seen": 103417245, "step": 4794, "time_per_iteration": 2.868473529815674 }, { "auxiliary_loss_clip": 0.01134193, "auxiliary_loss_mlp": 0.0104583, "balance_loss_clip": 1.05259347, "balance_loss_mlp": 1.02856886, "epoch": 0.28829099654291296, "flos": 18624423456000.0, "grad_norm": 2.1692523829597063, "language_loss": 0.8320052, "learning_rate": 3.340035406592074e-06, "loss": 0.85380542, "num_input_tokens_seen": 103435500, "step": 4795, "time_per_iteration": 2.6216471195220947 }, { "auxiliary_loss_clip": 0.01126764, "auxiliary_loss_mlp": 0.01043565, "balance_loss_clip": 1.05043364, "balance_loss_mlp": 1.0279845, "epoch": 0.2883511197955809, "flos": 24674832017280.0, "grad_norm": 2.290853867887048, "language_loss": 0.74744678, "learning_rate": 3.339746266208074e-06, "loss": 0.76915002, "num_input_tokens_seen": 103451040, "step": 4796, "time_per_iteration": 2.6819822788238525 }, { "auxiliary_loss_clip": 0.01136938, "auxiliary_loss_mlp": 0.01040822, "balance_loss_clip": 1.05140758, "balance_loss_mlp": 1.02221298, "epoch": 0.2884112430482489, "flos": 23112143850240.0, "grad_norm": 1.9890524806298786, "language_loss": 0.73144913, "learning_rate": 3.3394570750207614e-06, "loss": 0.7532267, "num_input_tokens_seen": 103471330, "step": 4797, "time_per_iteration": 2.666097640991211 }, { "auxiliary_loss_clip": 0.01104454, "auxiliary_loss_mlp": 0.00775335, "balance_loss_clip": 1.04594803, "balance_loss_mlp": 1.00097072, "epoch": 0.28847136630091685, "flos": 16873347432960.0, "grad_norm": 1.9324008515617646, "language_loss": 0.74650872, "learning_rate": 3.3391678330411017e-06, "loss": 0.76530659, "num_input_tokens_seen": 103488060, "step": 4798, "time_per_iteration": 2.7281830310821533 }, { "auxiliary_loss_clip": 0.0113412, "auxiliary_loss_mlp": 0.01043523, "balance_loss_clip": 1.04996431, "balance_loss_mlp": 1.02463984, "epoch": 0.2885314895535849, "flos": 25657527277440.0, "grad_norm": 3.037553219769834, "language_loss": 0.66004431, "learning_rate": 3.3388785402800642e-06, "loss": 0.68182075, "num_input_tokens_seen": 103503600, "step": 4799, "time_per_iteration": 2.6416096687316895 }, { "auxiliary_loss_clip": 0.01144575, "auxiliary_loss_mlp": 0.01049843, "balance_loss_clip": 1.05205584, "balance_loss_mlp": 1.03268862, "epoch": 0.28859161280625284, "flos": 21107251347840.0, "grad_norm": 1.7946911133370596, "language_loss": 0.8231616, "learning_rate": 3.3385891967486178e-06, "loss": 0.84510577, "num_input_tokens_seen": 103524195, "step": 4800, "time_per_iteration": 2.704357624053955 }, { "auxiliary_loss_clip": 0.01105166, "auxiliary_loss_mlp": 0.01040519, "balance_loss_clip": 1.04861474, "balance_loss_mlp": 1.02392507, "epoch": 0.2886517360589208, "flos": 26469540086400.0, "grad_norm": 1.5930665564066124, "language_loss": 0.9080106, "learning_rate": 3.3382998024577347e-06, "loss": 0.92946744, "num_input_tokens_seen": 103545235, "step": 4801, "time_per_iteration": 2.8163902759552 }, { "auxiliary_loss_clip": 0.01119221, "auxiliary_loss_mlp": 0.00775037, "balance_loss_clip": 1.05178905, "balance_loss_mlp": 1.0008862, "epoch": 0.28871185931158877, "flos": 25265275781760.0, "grad_norm": 2.098995863955026, "language_loss": 0.74342406, "learning_rate": 3.33801035741839e-06, "loss": 0.76236671, "num_input_tokens_seen": 103563305, "step": 4802, "time_per_iteration": 2.8244271278381348 }, { "auxiliary_loss_clip": 0.01029511, "auxiliary_loss_mlp": 0.01004263, "balance_loss_clip": 1.02472734, "balance_loss_mlp": 1.00193822, "epoch": 0.28877198256425674, "flos": 66665431284480.0, "grad_norm": 0.7780596068321518, "language_loss": 0.62987334, "learning_rate": 3.337720861641558e-06, "loss": 0.65021104, "num_input_tokens_seen": 103625025, "step": 4803, "time_per_iteration": 3.299269676208496 }, { "auxiliary_loss_clip": 0.01083739, "auxiliary_loss_mlp": 0.01051002, "balance_loss_clip": 1.03981495, "balance_loss_mlp": 1.03369915, "epoch": 0.2888321058169247, "flos": 20303031790080.0, "grad_norm": 1.8528386679599225, "language_loss": 0.71095157, "learning_rate": 3.3374313151382165e-06, "loss": 0.73229897, "num_input_tokens_seen": 103644235, "step": 4804, "time_per_iteration": 2.762883424758911 }, { "auxiliary_loss_clip": 0.01135071, "auxiliary_loss_mlp": 0.01047534, "balance_loss_clip": 1.05108273, "balance_loss_mlp": 1.0289135, "epoch": 0.28889222906959267, "flos": 25516721963520.0, "grad_norm": 1.926588918304246, "language_loss": 0.67916834, "learning_rate": 3.337141717919346e-06, "loss": 0.70099443, "num_input_tokens_seen": 103664700, "step": 4805, "time_per_iteration": 2.6848111152648926 }, { "auxiliary_loss_clip": 0.01135111, "auxiliary_loss_mlp": 0.01046638, "balance_loss_clip": 1.05359602, "balance_loss_mlp": 1.03029394, "epoch": 0.28895235232226063, "flos": 32671312560000.0, "grad_norm": 1.4381182508216341, "language_loss": 0.69720542, "learning_rate": 3.3368520699959272e-06, "loss": 0.71902293, "num_input_tokens_seen": 103686595, "step": 4806, "time_per_iteration": 2.762458562850952 }, { "auxiliary_loss_clip": 0.01120642, "auxiliary_loss_mlp": 0.01052311, "balance_loss_clip": 1.05073118, "balance_loss_mlp": 1.03559768, "epoch": 0.2890124755749286, "flos": 29714679342720.0, "grad_norm": 1.4600495853323927, "language_loss": 0.71255589, "learning_rate": 3.3365623713789443e-06, "loss": 0.73428547, "num_input_tokens_seen": 103707525, "step": 4807, "time_per_iteration": 2.740931987762451 }, { "auxiliary_loss_clip": 0.01106054, "auxiliary_loss_mlp": 0.01043407, "balance_loss_clip": 1.05087459, "balance_loss_mlp": 1.02625299, "epoch": 0.28907259882759656, "flos": 22674464628480.0, "grad_norm": 1.6111027163793539, "language_loss": 0.81489629, "learning_rate": 3.336272622079382e-06, "loss": 0.83639085, "num_input_tokens_seen": 103727905, "step": 4808, "time_per_iteration": 2.722787380218506 }, { "auxiliary_loss_clip": 0.01098162, "auxiliary_loss_mlp": 0.01048507, "balance_loss_clip": 1.04795146, "balance_loss_mlp": 1.03160298, "epoch": 0.2891327220802645, "flos": 22566050403840.0, "grad_norm": 1.7874609682529725, "language_loss": 0.78304112, "learning_rate": 3.3359828221082276e-06, "loss": 0.80450785, "num_input_tokens_seen": 103748335, "step": 4809, "time_per_iteration": 2.742063522338867 }, { "auxiliary_loss_clip": 0.01091743, "auxiliary_loss_mlp": 0.01047553, "balance_loss_clip": 1.04519784, "balance_loss_mlp": 1.02924204, "epoch": 0.2891928453329325, "flos": 21652806090240.0, "grad_norm": 1.7709564567634208, "language_loss": 0.78864932, "learning_rate": 3.3356929714764714e-06, "loss": 0.81004226, "num_input_tokens_seen": 103767020, "step": 4810, "time_per_iteration": 2.7578415870666504 }, { "auxiliary_loss_clip": 0.01090252, "auxiliary_loss_mlp": 0.01039009, "balance_loss_clip": 1.04552603, "balance_loss_mlp": 1.02280235, "epoch": 0.28925296858560046, "flos": 23222102359680.0, "grad_norm": 1.6298276151024105, "language_loss": 0.76974982, "learning_rate": 3.3354030701951032e-06, "loss": 0.79104245, "num_input_tokens_seen": 103786355, "step": 4811, "time_per_iteration": 2.7336831092834473 }, { "auxiliary_loss_clip": 0.01132677, "auxiliary_loss_mlp": 0.01047674, "balance_loss_clip": 1.05356216, "balance_loss_mlp": 1.03038859, "epoch": 0.2893130918382685, "flos": 28621666437120.0, "grad_norm": 1.4740946425962824, "language_loss": 0.77044773, "learning_rate": 3.335113118275117e-06, "loss": 0.79225123, "num_input_tokens_seen": 103809345, "step": 4812, "time_per_iteration": 2.745115280151367 }, { "auxiliary_loss_clip": 0.01024348, "auxiliary_loss_mlp": 0.01009076, "balance_loss_clip": 1.02794337, "balance_loss_mlp": 1.00728762, "epoch": 0.28937321509093644, "flos": 72301288982400.0, "grad_norm": 0.8337141037006477, "language_loss": 0.60292435, "learning_rate": 3.3348231157275085e-06, "loss": 0.62325859, "num_input_tokens_seen": 103871180, "step": 4813, "time_per_iteration": 3.3592262268066406 }, { "auxiliary_loss_clip": 0.01094544, "auxiliary_loss_mlp": 0.01044805, "balance_loss_clip": 1.0431211, "balance_loss_mlp": 1.02734065, "epoch": 0.2894333383436044, "flos": 16216397637120.0, "grad_norm": 3.1340543474440623, "language_loss": 0.82301223, "learning_rate": 3.3345330625632725e-06, "loss": 0.84440577, "num_input_tokens_seen": 103889040, "step": 4814, "time_per_iteration": 2.7069244384765625 }, { "auxiliary_loss_clip": 0.01101478, "auxiliary_loss_mlp": 0.01052591, "balance_loss_clip": 1.05051374, "balance_loss_mlp": 1.03556752, "epoch": 0.2894934615962724, "flos": 24828278918400.0, "grad_norm": 1.6672038490985601, "language_loss": 0.73249441, "learning_rate": 3.3342429587934094e-06, "loss": 0.75403512, "num_input_tokens_seen": 103910380, "step": 4815, "time_per_iteration": 2.764214515686035 }, { "auxiliary_loss_clip": 0.01131126, "auxiliary_loss_mlp": 0.01045124, "balance_loss_clip": 1.05259883, "balance_loss_mlp": 1.02997231, "epoch": 0.28955358484894034, "flos": 20449978329600.0, "grad_norm": 1.9821106518618066, "language_loss": 0.70783043, "learning_rate": 3.3339528044289198e-06, "loss": 0.72959292, "num_input_tokens_seen": 103929955, "step": 4816, "time_per_iteration": 2.7809629440307617 }, { "auxiliary_loss_clip": 0.01119261, "auxiliary_loss_mlp": 0.01048806, "balance_loss_clip": 1.04862189, "balance_loss_mlp": 1.03097248, "epoch": 0.2896137081016083, "flos": 22565188477440.0, "grad_norm": 2.3636227133284122, "language_loss": 0.7445122, "learning_rate": 3.3336625994808055e-06, "loss": 0.76619279, "num_input_tokens_seen": 103948020, "step": 4817, "time_per_iteration": 2.829183578491211 }, { "auxiliary_loss_clip": 0.01108198, "auxiliary_loss_mlp": 0.01054129, "balance_loss_clip": 1.05107522, "balance_loss_mlp": 1.03633142, "epoch": 0.28967383135427627, "flos": 26687948734080.0, "grad_norm": 1.8479613371686012, "language_loss": 0.76190692, "learning_rate": 3.3333723439600723e-06, "loss": 0.78353024, "num_input_tokens_seen": 103968740, "step": 4818, "time_per_iteration": 2.827925443649292 }, { "auxiliary_loss_clip": 0.01074516, "auxiliary_loss_mlp": 0.01041914, "balance_loss_clip": 1.04805899, "balance_loss_mlp": 1.02477193, "epoch": 0.28973395460694423, "flos": 15558262692480.0, "grad_norm": 1.9558897556763024, "language_loss": 0.80060315, "learning_rate": 3.3330820378777263e-06, "loss": 0.82176751, "num_input_tokens_seen": 103986005, "step": 4819, "time_per_iteration": 2.8941574096679688 }, { "auxiliary_loss_clip": 0.01110223, "auxiliary_loss_mlp": 0.01048219, "balance_loss_clip": 1.0494163, "balance_loss_mlp": 1.02931273, "epoch": 0.2897940778596122, "flos": 18697465762560.0, "grad_norm": 1.8074124972104149, "language_loss": 0.78504574, "learning_rate": 3.332791681244776e-06, "loss": 0.80663019, "num_input_tokens_seen": 104005070, "step": 4820, "time_per_iteration": 2.7016515731811523 }, { "auxiliary_loss_clip": 0.01096478, "auxiliary_loss_mlp": 0.01037037, "balance_loss_clip": 1.04924846, "balance_loss_mlp": 1.02028775, "epoch": 0.28985420111228016, "flos": 18770292587520.0, "grad_norm": 2.105369007151224, "language_loss": 0.72925651, "learning_rate": 3.332501274072231e-06, "loss": 0.7505917, "num_input_tokens_seen": 104022945, "step": 4821, "time_per_iteration": 2.743091583251953 }, { "auxiliary_loss_clip": 0.01132782, "auxiliary_loss_mlp": 0.01040556, "balance_loss_clip": 1.05055594, "balance_loss_mlp": 1.02290142, "epoch": 0.28991432436494813, "flos": 23069840607360.0, "grad_norm": 2.331696646407205, "language_loss": 0.71962738, "learning_rate": 3.332210816371104e-06, "loss": 0.74136078, "num_input_tokens_seen": 104042080, "step": 4822, "time_per_iteration": 2.768996477127075 }, { "auxiliary_loss_clip": 0.01128837, "auxiliary_loss_mlp": 0.01048176, "balance_loss_clip": 1.05237818, "balance_loss_mlp": 1.03142738, "epoch": 0.2899744476176161, "flos": 17603195880960.0, "grad_norm": 1.8111020118629353, "language_loss": 0.662521, "learning_rate": 3.3319203081524102e-06, "loss": 0.68429112, "num_input_tokens_seen": 104060975, "step": 4823, "time_per_iteration": 2.733591318130493 }, { "auxiliary_loss_clip": 0.01107872, "auxiliary_loss_mlp": 0.01042255, "balance_loss_clip": 1.04404497, "balance_loss_mlp": 1.02588761, "epoch": 0.29003457087028406, "flos": 22309360836480.0, "grad_norm": 4.579803152663717, "language_loss": 0.81162238, "learning_rate": 3.331629749427164e-06, "loss": 0.83312368, "num_input_tokens_seen": 104081395, "step": 4824, "time_per_iteration": 4.278540849685669 }, { "auxiliary_loss_clip": 0.01143667, "auxiliary_loss_mlp": 0.01043888, "balance_loss_clip": 1.05104661, "balance_loss_mlp": 1.025828, "epoch": 0.2900946941229521, "flos": 21944975316480.0, "grad_norm": 2.265114761106369, "language_loss": 0.72592747, "learning_rate": 3.331339140206385e-06, "loss": 0.74780297, "num_input_tokens_seen": 104099995, "step": 4825, "time_per_iteration": 4.177908658981323 }, { "auxiliary_loss_clip": 0.01147795, "auxiliary_loss_mlp": 0.01036998, "balance_loss_clip": 1.05434549, "balance_loss_mlp": 1.01930714, "epoch": 0.29015481737562004, "flos": 17932173569280.0, "grad_norm": 2.216571865047856, "language_loss": 0.73680669, "learning_rate": 3.331048480501092e-06, "loss": 0.75865459, "num_input_tokens_seen": 104118930, "step": 4826, "time_per_iteration": 2.6371700763702393 }, { "auxiliary_loss_clip": 0.0113072, "auxiliary_loss_mlp": 0.01040585, "balance_loss_clip": 1.05073726, "balance_loss_mlp": 1.02483773, "epoch": 0.290214940628288, "flos": 22783525297920.0, "grad_norm": 2.324527624383577, "language_loss": 0.68556225, "learning_rate": 3.3307577703223073e-06, "loss": 0.70727527, "num_input_tokens_seen": 104136940, "step": 4827, "time_per_iteration": 2.6447484493255615 }, { "auxiliary_loss_clip": 0.01125924, "auxiliary_loss_mlp": 0.0104453, "balance_loss_clip": 1.04981911, "balance_loss_mlp": 1.02650571, "epoch": 0.290275063880956, "flos": 20006481104640.0, "grad_norm": 1.8485927197530279, "language_loss": 0.80266023, "learning_rate": 3.3304670096810545e-06, "loss": 0.82436466, "num_input_tokens_seen": 104154280, "step": 4828, "time_per_iteration": 4.131803274154663 }, { "auxiliary_loss_clip": 0.01144317, "auxiliary_loss_mlp": 0.01049939, "balance_loss_clip": 1.05393863, "balance_loss_mlp": 1.03288054, "epoch": 0.29033518713362394, "flos": 22053605022720.0, "grad_norm": 1.8003854621941846, "language_loss": 0.80658895, "learning_rate": 3.33017619858836e-06, "loss": 0.8285315, "num_input_tokens_seen": 104172605, "step": 4829, "time_per_iteration": 2.760899066925049 }, { "auxiliary_loss_clip": 0.011197, "auxiliary_loss_mlp": 0.01044046, "balance_loss_clip": 1.05093288, "balance_loss_mlp": 1.02680826, "epoch": 0.2903953103862919, "flos": 25630056351360.0, "grad_norm": 1.5734536519128175, "language_loss": 0.82911146, "learning_rate": 3.329885337055249e-06, "loss": 0.85074902, "num_input_tokens_seen": 104194120, "step": 4830, "time_per_iteration": 4.403480529785156 }, { "auxiliary_loss_clip": 0.01137563, "auxiliary_loss_mlp": 0.01048934, "balance_loss_clip": 1.05430257, "balance_loss_mlp": 1.03155351, "epoch": 0.29045543363895987, "flos": 16945851035520.0, "grad_norm": 2.2586543311689486, "language_loss": 0.79236752, "learning_rate": 3.3295944250927546e-06, "loss": 0.81423253, "num_input_tokens_seen": 104210875, "step": 4831, "time_per_iteration": 2.6066412925720215 }, { "auxiliary_loss_clip": 0.01143728, "auxiliary_loss_mlp": 0.01045824, "balance_loss_clip": 1.05470276, "balance_loss_mlp": 1.03000546, "epoch": 0.29051555689162784, "flos": 26395492199040.0, "grad_norm": 1.9694662738232038, "language_loss": 0.7459774, "learning_rate": 3.3293034627119055e-06, "loss": 0.76787293, "num_input_tokens_seen": 104229875, "step": 4832, "time_per_iteration": 2.8411331176757812 }, { "auxiliary_loss_clip": 0.01122405, "auxiliary_loss_mlp": 0.01037758, "balance_loss_clip": 1.05429769, "balance_loss_mlp": 1.02335787, "epoch": 0.2905756801442958, "flos": 21103875469440.0, "grad_norm": 1.979215737756815, "language_loss": 0.76150024, "learning_rate": 3.329012449923736e-06, "loss": 0.78310186, "num_input_tokens_seen": 104250405, "step": 4833, "time_per_iteration": 2.7510006427764893 }, { "auxiliary_loss_clip": 0.01107016, "auxiliary_loss_mlp": 0.01040024, "balance_loss_clip": 1.04580688, "balance_loss_mlp": 1.02383542, "epoch": 0.29063580339696377, "flos": 15706071158400.0, "grad_norm": 1.7715964188803632, "language_loss": 0.64404124, "learning_rate": 3.3287213867392813e-06, "loss": 0.66551173, "num_input_tokens_seen": 104269185, "step": 4834, "time_per_iteration": 2.6475064754486084 }, { "auxiliary_loss_clip": 0.01117159, "auxiliary_loss_mlp": 0.01032155, "balance_loss_clip": 1.05111325, "balance_loss_mlp": 1.01724815, "epoch": 0.29069592664963173, "flos": 24644990793600.0, "grad_norm": 1.4640588842294755, "language_loss": 0.71717769, "learning_rate": 3.3284302731695783e-06, "loss": 0.73867083, "num_input_tokens_seen": 104289400, "step": 4835, "time_per_iteration": 2.6991324424743652 }, { "auxiliary_loss_clip": 0.01117393, "auxiliary_loss_mlp": 0.01037314, "balance_loss_clip": 1.04881835, "balance_loss_mlp": 1.02187634, "epoch": 0.2907560499022997, "flos": 24973753000320.0, "grad_norm": 1.657223137158586, "language_loss": 0.79492378, "learning_rate": 3.3281391092256668e-06, "loss": 0.81647086, "num_input_tokens_seen": 104310485, "step": 4836, "time_per_iteration": 2.7060084342956543 }, { "auxiliary_loss_clip": 0.01107347, "auxiliary_loss_mlp": 0.01045193, "balance_loss_clip": 1.05334711, "balance_loss_mlp": 1.02744293, "epoch": 0.29081617315496766, "flos": 18657496903680.0, "grad_norm": 1.9442300400082562, "language_loss": 0.81372344, "learning_rate": 3.3278478949185865e-06, "loss": 0.83524883, "num_input_tokens_seen": 104327330, "step": 4837, "time_per_iteration": 2.640610933303833 }, { "auxiliary_loss_clip": 0.01116355, "auxiliary_loss_mlp": 0.01039398, "balance_loss_clip": 1.04938102, "balance_loss_mlp": 1.0233283, "epoch": 0.2908762964076356, "flos": 35331035955840.0, "grad_norm": 6.209911556378307, "language_loss": 0.67358792, "learning_rate": 3.327556630259381e-06, "loss": 0.69514549, "num_input_tokens_seen": 104350350, "step": 4838, "time_per_iteration": 2.758422374725342 }, { "auxiliary_loss_clip": 0.01147958, "auxiliary_loss_mlp": 0.00775113, "balance_loss_clip": 1.05402315, "balance_loss_mlp": 1.00096607, "epoch": 0.29093641966030365, "flos": 23076305055360.0, "grad_norm": 1.5628414298261506, "language_loss": 0.71139944, "learning_rate": 3.327265315259095e-06, "loss": 0.73063016, "num_input_tokens_seen": 104369995, "step": 4839, "time_per_iteration": 2.683349132537842 }, { "auxiliary_loss_clip": 0.0114095, "auxiliary_loss_mlp": 0.01036937, "balance_loss_clip": 1.04966319, "balance_loss_mlp": 1.02147555, "epoch": 0.2909965429129716, "flos": 35955415094400.0, "grad_norm": 1.9403130873020338, "language_loss": 0.7539593, "learning_rate": 3.326973949928776e-06, "loss": 0.77573812, "num_input_tokens_seen": 104392285, "step": 4840, "time_per_iteration": 2.696808099746704 }, { "auxiliary_loss_clip": 0.01093571, "auxiliary_loss_mlp": 0.01045095, "balance_loss_clip": 1.04470551, "balance_loss_mlp": 1.02825069, "epoch": 0.2910566661656396, "flos": 30880231764480.0, "grad_norm": 1.7841334294021773, "language_loss": 0.60546595, "learning_rate": 3.326682534279471e-06, "loss": 0.62685257, "num_input_tokens_seen": 104412640, "step": 4841, "time_per_iteration": 2.74575138092041 }, { "auxiliary_loss_clip": 0.01120271, "auxiliary_loss_mlp": 0.01039624, "balance_loss_clip": 1.04983509, "balance_loss_mlp": 1.02288651, "epoch": 0.29111678941830754, "flos": 30010188533760.0, "grad_norm": 1.408353605568525, "language_loss": 0.71321762, "learning_rate": 3.326391068322232e-06, "loss": 0.73481655, "num_input_tokens_seen": 104435245, "step": 4842, "time_per_iteration": 2.7568962574005127 }, { "auxiliary_loss_clip": 0.01130885, "auxiliary_loss_mlp": 0.01037088, "balance_loss_clip": 1.05042899, "balance_loss_mlp": 1.02191257, "epoch": 0.2911769126709755, "flos": 22857393617280.0, "grad_norm": 2.1183002067983585, "language_loss": 0.73610562, "learning_rate": 3.3260995520681098e-06, "loss": 0.75778532, "num_input_tokens_seen": 104455395, "step": 4843, "time_per_iteration": 2.6703171730041504 }, { "auxiliary_loss_clip": 0.0108851, "auxiliary_loss_mlp": 0.01036244, "balance_loss_clip": 1.04775739, "balance_loss_mlp": 1.02058005, "epoch": 0.2912370359236435, "flos": 21650507619840.0, "grad_norm": 4.868884277111801, "language_loss": 0.58445942, "learning_rate": 3.3258079855281602e-06, "loss": 0.60570699, "num_input_tokens_seen": 104473350, "step": 4844, "time_per_iteration": 2.7461965084075928 }, { "auxiliary_loss_clip": 0.01138917, "auxiliary_loss_mlp": 0.01039428, "balance_loss_clip": 1.05586743, "balance_loss_mlp": 1.0222863, "epoch": 0.29129715917631144, "flos": 22893340152960.0, "grad_norm": 1.9200815982611392, "language_loss": 0.86459565, "learning_rate": 3.3255163687134396e-06, "loss": 0.88637912, "num_input_tokens_seen": 104492265, "step": 4845, "time_per_iteration": 2.711101770401001 }, { "auxiliary_loss_clip": 0.01115849, "auxiliary_loss_mlp": 0.01052584, "balance_loss_clip": 1.05018926, "balance_loss_mlp": 1.03505993, "epoch": 0.2913572824289794, "flos": 22674464628480.0, "grad_norm": 1.7226223126663984, "language_loss": 0.67067879, "learning_rate": 3.3252247016350046e-06, "loss": 0.69236308, "num_input_tokens_seen": 104510755, "step": 4846, "time_per_iteration": 2.698076009750366 }, { "auxiliary_loss_clip": 0.01120746, "auxiliary_loss_mlp": 0.01040428, "balance_loss_clip": 1.05198884, "balance_loss_mlp": 1.02457917, "epoch": 0.29141740568164737, "flos": 23107403255040.0, "grad_norm": 1.9884880347168128, "language_loss": 0.70629871, "learning_rate": 3.3249329843039166e-06, "loss": 0.7279104, "num_input_tokens_seen": 104530830, "step": 4847, "time_per_iteration": 2.6693859100341797 }, { "auxiliary_loss_clip": 0.01129385, "auxiliary_loss_mlp": 0.01036362, "balance_loss_clip": 1.0490911, "balance_loss_mlp": 1.02048314, "epoch": 0.29147752893431533, "flos": 23587026583680.0, "grad_norm": 1.4444788582363046, "language_loss": 0.73975939, "learning_rate": 3.324641216731237e-06, "loss": 0.76141691, "num_input_tokens_seen": 104550115, "step": 4848, "time_per_iteration": 2.779012680053711 }, { "auxiliary_loss_clip": 0.0112526, "auxiliary_loss_mlp": 0.01051811, "balance_loss_clip": 1.04831481, "balance_loss_mlp": 1.03391802, "epoch": 0.2915376521869833, "flos": 20591968792320.0, "grad_norm": 3.067540232947916, "language_loss": 0.76738584, "learning_rate": 3.3243493989280295e-06, "loss": 0.7891565, "num_input_tokens_seen": 104566255, "step": 4849, "time_per_iteration": 2.6103999614715576 }, { "auxiliary_loss_clip": 0.01124372, "auxiliary_loss_mlp": 0.01041862, "balance_loss_clip": 1.04718697, "balance_loss_mlp": 1.02541125, "epoch": 0.29159777543965126, "flos": 20811490761600.0, "grad_norm": 1.7266499063872853, "language_loss": 0.78276592, "learning_rate": 3.3240575309053596e-06, "loss": 0.80442822, "num_input_tokens_seen": 104585235, "step": 4850, "time_per_iteration": 2.6395609378814697 }, { "auxiliary_loss_clip": 0.01111964, "auxiliary_loss_mlp": 0.01038044, "balance_loss_clip": 1.04907775, "balance_loss_mlp": 1.0209378, "epoch": 0.29165789869231923, "flos": 24244155947520.0, "grad_norm": 1.8024770323318549, "language_loss": 0.7657702, "learning_rate": 3.323765612674296e-06, "loss": 0.78727031, "num_input_tokens_seen": 104605315, "step": 4851, "time_per_iteration": 2.7265985012054443 }, { "auxiliary_loss_clip": 0.01132156, "auxiliary_loss_mlp": 0.01045641, "balance_loss_clip": 1.052459, "balance_loss_mlp": 1.03083527, "epoch": 0.29171802194498725, "flos": 28949925853440.0, "grad_norm": 1.3639310788782566, "language_loss": 0.77680421, "learning_rate": 3.3234736442459078e-06, "loss": 0.7985822, "num_input_tokens_seen": 104626055, "step": 4852, "time_per_iteration": 2.7161712646484375 }, { "auxiliary_loss_clip": 0.01120344, "auxiliary_loss_mlp": 0.01051407, "balance_loss_clip": 1.05108476, "balance_loss_mlp": 1.03523064, "epoch": 0.2917781451976552, "flos": 22598226011520.0, "grad_norm": 1.6397145219173752, "language_loss": 0.7816534, "learning_rate": 3.3231816256312665e-06, "loss": 0.80337089, "num_input_tokens_seen": 104646005, "step": 4853, "time_per_iteration": 2.748053789138794 }, { "auxiliary_loss_clip": 0.01108012, "auxiliary_loss_mlp": 0.01041349, "balance_loss_clip": 1.04923177, "balance_loss_mlp": 1.02535105, "epoch": 0.2918382684503232, "flos": 21574448570880.0, "grad_norm": 2.273586870261815, "language_loss": 0.8791436, "learning_rate": 3.322889556841445e-06, "loss": 0.90063715, "num_input_tokens_seen": 104661620, "step": 4854, "time_per_iteration": 2.7663791179656982 }, { "auxiliary_loss_clip": 0.01128591, "auxiliary_loss_mlp": 0.01054226, "balance_loss_clip": 1.05255818, "balance_loss_mlp": 1.03502131, "epoch": 0.29189839170299114, "flos": 24353503925760.0, "grad_norm": 1.7143523369489482, "language_loss": 0.86374146, "learning_rate": 3.322597437887519e-06, "loss": 0.88556957, "num_input_tokens_seen": 104681445, "step": 4855, "time_per_iteration": 2.613903284072876 }, { "auxiliary_loss_clip": 0.01039808, "auxiliary_loss_mlp": 0.01005184, "balance_loss_clip": 1.02170599, "balance_loss_mlp": 1.00303864, "epoch": 0.2919585149556591, "flos": 71316726215040.0, "grad_norm": 0.7954079009769616, "language_loss": 0.60148996, "learning_rate": 3.322305268780566e-06, "loss": 0.6219399, "num_input_tokens_seen": 104747945, "step": 4856, "time_per_iteration": 3.273501396179199 }, { "auxiliary_loss_clip": 0.01115701, "auxiliary_loss_mlp": 0.00774991, "balance_loss_clip": 1.04708552, "balance_loss_mlp": 1.00107539, "epoch": 0.2920186382083271, "flos": 15633208419840.0, "grad_norm": 1.7540806356878256, "language_loss": 0.6825304, "learning_rate": 3.322013049531664e-06, "loss": 0.70143735, "num_input_tokens_seen": 104766225, "step": 4857, "time_per_iteration": 2.6799964904785156 }, { "auxiliary_loss_clip": 0.01129839, "auxiliary_loss_mlp": 0.00774071, "balance_loss_clip": 1.05058599, "balance_loss_mlp": 1.00106227, "epoch": 0.29207876146099504, "flos": 28366018364160.0, "grad_norm": 1.9069678720023968, "language_loss": 0.83446503, "learning_rate": 3.321720780151895e-06, "loss": 0.85350412, "num_input_tokens_seen": 104785345, "step": 4858, "time_per_iteration": 2.7004997730255127 }, { "auxiliary_loss_clip": 0.01143419, "auxiliary_loss_mlp": 0.01047414, "balance_loss_clip": 1.05265319, "balance_loss_mlp": 1.03119004, "epoch": 0.292138884713663, "flos": 21870963342720.0, "grad_norm": 1.7162042036272904, "language_loss": 0.77357888, "learning_rate": 3.321428460652342e-06, "loss": 0.79548717, "num_input_tokens_seen": 104804560, "step": 4859, "time_per_iteration": 2.5901620388031006 }, { "auxiliary_loss_clip": 0.01105726, "auxiliary_loss_mlp": 0.01044957, "balance_loss_clip": 1.05237806, "balance_loss_mlp": 1.02816057, "epoch": 0.29219900796633097, "flos": 20992552243200.0, "grad_norm": 2.2554676354860246, "language_loss": 0.68046212, "learning_rate": 3.3211360910440885e-06, "loss": 0.70196903, "num_input_tokens_seen": 104821105, "step": 4860, "time_per_iteration": 2.7831058502197266 }, { "auxiliary_loss_clip": 0.01117304, "auxiliary_loss_mlp": 0.01041096, "balance_loss_clip": 1.05229402, "balance_loss_mlp": 1.02662396, "epoch": 0.29225913121899894, "flos": 35004608133120.0, "grad_norm": 2.539974445673703, "language_loss": 0.75258791, "learning_rate": 3.320843671338222e-06, "loss": 0.77417195, "num_input_tokens_seen": 104841440, "step": 4861, "time_per_iteration": 2.7506070137023926 }, { "auxiliary_loss_clip": 0.01128031, "auxiliary_loss_mlp": 0.0105121, "balance_loss_clip": 1.04845262, "balance_loss_mlp": 1.03620112, "epoch": 0.2923192544716669, "flos": 13515663888000.0, "grad_norm": 3.0942357088370245, "language_loss": 0.91498685, "learning_rate": 3.320551201545832e-06, "loss": 0.93677926, "num_input_tokens_seen": 104858210, "step": 4862, "time_per_iteration": 2.589700937271118 }, { "auxiliary_loss_clip": 0.01131947, "auxiliary_loss_mlp": 0.01042917, "balance_loss_clip": 1.05090141, "balance_loss_mlp": 1.02786124, "epoch": 0.29237937772433487, "flos": 19463512141440.0, "grad_norm": 2.2124063953391464, "language_loss": 0.73112279, "learning_rate": 3.320258681678008e-06, "loss": 0.75287139, "num_input_tokens_seen": 104875620, "step": 4863, "time_per_iteration": 4.142335653305054 }, { "auxiliary_loss_clip": 0.01061699, "auxiliary_loss_mlp": 0.01044676, "balance_loss_clip": 1.04478168, "balance_loss_mlp": 1.02934611, "epoch": 0.29243950097700283, "flos": 20850597694080.0, "grad_norm": 1.893468710780351, "language_loss": 0.77841508, "learning_rate": 3.319966111745842e-06, "loss": 0.79947883, "num_input_tokens_seen": 104894600, "step": 4864, "time_per_iteration": 4.309613943099976 }, { "auxiliary_loss_clip": 0.01102707, "auxiliary_loss_mlp": 0.01050983, "balance_loss_clip": 1.04593945, "balance_loss_mlp": 1.03424644, "epoch": 0.29249962422967085, "flos": 23584225322880.0, "grad_norm": 1.5703024458168264, "language_loss": 0.81861019, "learning_rate": 3.319673491760429e-06, "loss": 0.84014714, "num_input_tokens_seen": 104914530, "step": 4865, "time_per_iteration": 2.762397527694702 }, { "auxiliary_loss_clip": 0.0109576, "auxiliary_loss_mlp": 0.01046651, "balance_loss_clip": 1.05265307, "balance_loss_mlp": 1.02924657, "epoch": 0.2925597474823388, "flos": 22273342473600.0, "grad_norm": 2.2072447614425554, "language_loss": 0.85522473, "learning_rate": 3.3193808217328645e-06, "loss": 0.87664878, "num_input_tokens_seen": 104933460, "step": 4866, "time_per_iteration": 2.8033764362335205 }, { "auxiliary_loss_clip": 0.01110933, "auxiliary_loss_mlp": 0.01039812, "balance_loss_clip": 1.04811919, "balance_loss_mlp": 1.02410054, "epoch": 0.2926198707350068, "flos": 34456108475520.0, "grad_norm": 1.7213351696608077, "language_loss": 0.75498515, "learning_rate": 3.3190881016742476e-06, "loss": 0.7764926, "num_input_tokens_seen": 104954495, "step": 4867, "time_per_iteration": 4.2950732707977295 }, { "auxiliary_loss_clip": 0.01083116, "auxiliary_loss_mlp": 0.01052463, "balance_loss_clip": 1.04825687, "balance_loss_mlp": 1.03576183, "epoch": 0.29267999398767475, "flos": 20704153944960.0, "grad_norm": 1.9203033465249189, "language_loss": 0.73236179, "learning_rate": 3.3187953315956776e-06, "loss": 0.75371754, "num_input_tokens_seen": 104971915, "step": 4868, "time_per_iteration": 2.775538921356201 }, { "auxiliary_loss_clip": 0.01091396, "auxiliary_loss_mlp": 0.01045538, "balance_loss_clip": 1.04888034, "balance_loss_mlp": 1.02836001, "epoch": 0.2927401172403427, "flos": 18368667642240.0, "grad_norm": 1.663889887662616, "language_loss": 0.74540651, "learning_rate": 3.3185025115082566e-06, "loss": 0.76677585, "num_input_tokens_seen": 104991335, "step": 4869, "time_per_iteration": 2.734683036804199 }, { "auxiliary_loss_clip": 0.01116568, "auxiliary_loss_mlp": 0.01040323, "balance_loss_clip": 1.050179, "balance_loss_mlp": 1.02405143, "epoch": 0.2928002404930107, "flos": 26104041244800.0, "grad_norm": 1.5721867242720646, "language_loss": 0.76492888, "learning_rate": 3.318209641423088e-06, "loss": 0.78649783, "num_input_tokens_seen": 105012015, "step": 4870, "time_per_iteration": 4.413575649261475 }, { "auxiliary_loss_clip": 0.01133789, "auxiliary_loss_mlp": 0.0105055, "balance_loss_clip": 1.05237079, "balance_loss_mlp": 1.0328114, "epoch": 0.29286036374567864, "flos": 21324726241920.0, "grad_norm": 2.0174334678237655, "language_loss": 0.6773119, "learning_rate": 3.3179167213512777e-06, "loss": 0.69915527, "num_input_tokens_seen": 105031460, "step": 4871, "time_per_iteration": 2.68796706199646 }, { "auxiliary_loss_clip": 0.01112736, "auxiliary_loss_mlp": 0.01051475, "balance_loss_clip": 1.04638386, "balance_loss_mlp": 1.03515494, "epoch": 0.2929204869983466, "flos": 29569492569600.0, "grad_norm": 4.945083241782643, "language_loss": 0.77463269, "learning_rate": 3.317623751303933e-06, "loss": 0.79627478, "num_input_tokens_seen": 105052965, "step": 4872, "time_per_iteration": 2.7679827213287354 }, { "auxiliary_loss_clip": 0.01078644, "auxiliary_loss_mlp": 0.01045822, "balance_loss_clip": 1.0468123, "balance_loss_mlp": 1.0273211, "epoch": 0.2929806102510146, "flos": 19058259922560.0, "grad_norm": 1.9468785945114855, "language_loss": 0.72814691, "learning_rate": 3.317330731292164e-06, "loss": 0.74939156, "num_input_tokens_seen": 105071840, "step": 4873, "time_per_iteration": 2.8704919815063477 }, { "auxiliary_loss_clip": 0.01135073, "auxiliary_loss_mlp": 0.01044722, "balance_loss_clip": 1.0525651, "balance_loss_mlp": 1.02705503, "epoch": 0.29304073350368254, "flos": 21944221130880.0, "grad_norm": 1.9420707280566882, "language_loss": 0.78093398, "learning_rate": 3.3170376613270812e-06, "loss": 0.80273187, "num_input_tokens_seen": 105089445, "step": 4874, "time_per_iteration": 2.6573073863983154 }, { "auxiliary_loss_clip": 0.01093774, "auxiliary_loss_mlp": 0.01045077, "balance_loss_clip": 1.05151463, "balance_loss_mlp": 1.02790475, "epoch": 0.2931008567563505, "flos": 15450818135040.0, "grad_norm": 1.8901262824755785, "language_loss": 0.77336359, "learning_rate": 3.3167445414197985e-06, "loss": 0.794752, "num_input_tokens_seen": 105106210, "step": 4875, "time_per_iteration": 2.6960959434509277 }, { "auxiliary_loss_clip": 0.01141436, "auxiliary_loss_mlp": 0.01038673, "balance_loss_clip": 1.05718327, "balance_loss_mlp": 1.02218604, "epoch": 0.29316098000901847, "flos": 16983162288000.0, "grad_norm": 1.556341262673854, "language_loss": 0.69037539, "learning_rate": 3.316451371581431e-06, "loss": 0.71217644, "num_input_tokens_seen": 105124200, "step": 4876, "time_per_iteration": 2.6719844341278076 }, { "auxiliary_loss_clip": 0.01121768, "auxiliary_loss_mlp": 0.01047732, "balance_loss_clip": 1.04729414, "balance_loss_mlp": 1.03105509, "epoch": 0.29322110326168643, "flos": 16357705741440.0, "grad_norm": 2.0371531421747466, "language_loss": 0.82111382, "learning_rate": 3.316158151823096e-06, "loss": 0.84280884, "num_input_tokens_seen": 105140400, "step": 4877, "time_per_iteration": 2.632293462753296 }, { "auxiliary_loss_clip": 0.01139233, "auxiliary_loss_mlp": 0.01040634, "balance_loss_clip": 1.05428672, "balance_loss_mlp": 1.02392054, "epoch": 0.29328122651435445, "flos": 13990869843840.0, "grad_norm": 3.614839551588232, "language_loss": 0.67366385, "learning_rate": 3.315864882155911e-06, "loss": 0.69546252, "num_input_tokens_seen": 105157535, "step": 4878, "time_per_iteration": 2.5839362144470215 }, { "auxiliary_loss_clip": 0.01100237, "auxiliary_loss_mlp": 0.01045253, "balance_loss_clip": 1.04628241, "balance_loss_mlp": 1.02817595, "epoch": 0.2933413497670224, "flos": 25264593423360.0, "grad_norm": 2.0985622071445063, "language_loss": 0.73632258, "learning_rate": 3.3155715625909982e-06, "loss": 0.75777751, "num_input_tokens_seen": 105175185, "step": 4879, "time_per_iteration": 2.738429307937622 }, { "auxiliary_loss_clip": 0.01104776, "auxiliary_loss_mlp": 0.00776504, "balance_loss_clip": 1.05266857, "balance_loss_mlp": 1.00116253, "epoch": 0.2934014730196904, "flos": 32123746656000.0, "grad_norm": 1.8172867500477656, "language_loss": 0.66441375, "learning_rate": 3.3152781931394803e-06, "loss": 0.68322659, "num_input_tokens_seen": 105194540, "step": 4880, "time_per_iteration": 2.7889339923858643 }, { "auxiliary_loss_clip": 0.01130875, "auxiliary_loss_mlp": 0.01049004, "balance_loss_clip": 1.05021453, "balance_loss_mlp": 1.03249359, "epoch": 0.29346159627235835, "flos": 24352498344960.0, "grad_norm": 1.9971358437235982, "language_loss": 0.70130688, "learning_rate": 3.314984773812481e-06, "loss": 0.72310567, "num_input_tokens_seen": 105213215, "step": 4881, "time_per_iteration": 2.705906629562378 }, { "auxiliary_loss_clip": 0.01112418, "auxiliary_loss_mlp": 0.00775734, "balance_loss_clip": 1.04823685, "balance_loss_mlp": 1.00119698, "epoch": 0.2935217195250263, "flos": 22746752749440.0, "grad_norm": 1.8949601379230998, "language_loss": 0.83497417, "learning_rate": 3.314691304621127e-06, "loss": 0.85385573, "num_input_tokens_seen": 105231585, "step": 4882, "time_per_iteration": 2.715853691101074 }, { "auxiliary_loss_clip": 0.01148283, "auxiliary_loss_mlp": 0.01045596, "balance_loss_clip": 1.05350292, "balance_loss_mlp": 1.02825117, "epoch": 0.2935818427776943, "flos": 21725561088000.0, "grad_norm": 2.6750396503443827, "language_loss": 0.71433568, "learning_rate": 3.314397785576548e-06, "loss": 0.73627448, "num_input_tokens_seen": 105250120, "step": 4883, "time_per_iteration": 2.629642963409424 }, { "auxiliary_loss_clip": 0.01123143, "auxiliary_loss_mlp": 0.01040743, "balance_loss_clip": 1.05262315, "balance_loss_mlp": 1.0230521, "epoch": 0.29364196603036224, "flos": 23804968354560.0, "grad_norm": 2.1262053984109226, "language_loss": 0.92650437, "learning_rate": 3.3141042166898726e-06, "loss": 0.94814324, "num_input_tokens_seen": 105266065, "step": 4884, "time_per_iteration": 2.727379322052002 }, { "auxiliary_loss_clip": 0.01138638, "auxiliary_loss_mlp": 0.01039707, "balance_loss_clip": 1.05512667, "balance_loss_mlp": 1.0232085, "epoch": 0.2937020892830302, "flos": 23470064922240.0, "grad_norm": 2.19754538449792, "language_loss": 0.73535883, "learning_rate": 3.313810597972234e-06, "loss": 0.75714231, "num_input_tokens_seen": 105282155, "step": 4885, "time_per_iteration": 2.706212043762207 }, { "auxiliary_loss_clip": 0.01124089, "auxiliary_loss_mlp": 0.01045234, "balance_loss_clip": 1.04882109, "balance_loss_mlp": 1.02791286, "epoch": 0.2937622125356982, "flos": 24272740195200.0, "grad_norm": 2.8259058407064566, "language_loss": 0.84815478, "learning_rate": 3.3135169294347655e-06, "loss": 0.86984795, "num_input_tokens_seen": 105299225, "step": 4886, "time_per_iteration": 2.651383876800537 }, { "auxiliary_loss_clip": 0.01112051, "auxiliary_loss_mlp": 0.01040147, "balance_loss_clip": 1.04674077, "balance_loss_mlp": 1.023839, "epoch": 0.29382233578836614, "flos": 20662461233280.0, "grad_norm": 2.312079302728887, "language_loss": 0.77030611, "learning_rate": 3.313223211088603e-06, "loss": 0.7918281, "num_input_tokens_seen": 105315710, "step": 4887, "time_per_iteration": 2.8299317359924316 }, { "auxiliary_loss_clip": 0.01121167, "auxiliary_loss_mlp": 0.01044419, "balance_loss_clip": 1.05137563, "balance_loss_mlp": 1.02809978, "epoch": 0.2938824590410341, "flos": 16545052103040.0, "grad_norm": 4.814706857660641, "language_loss": 0.79822707, "learning_rate": 3.3129294429448855e-06, "loss": 0.81988299, "num_input_tokens_seen": 105333505, "step": 4888, "time_per_iteration": 2.6942543983459473 }, { "auxiliary_loss_clip": 0.01114672, "auxiliary_loss_mlp": 0.01035208, "balance_loss_clip": 1.05101824, "balance_loss_mlp": 1.01886487, "epoch": 0.29394258229370207, "flos": 37925474382720.0, "grad_norm": 1.8060574020422921, "language_loss": 0.55514884, "learning_rate": 3.3126356250147517e-06, "loss": 0.57664764, "num_input_tokens_seen": 105355605, "step": 4889, "time_per_iteration": 2.838529586791992 }, { "auxiliary_loss_clip": 0.01136079, "auxiliary_loss_mlp": 0.01040242, "balance_loss_clip": 1.05230045, "balance_loss_mlp": 1.02257514, "epoch": 0.29400270554637004, "flos": 20044690197120.0, "grad_norm": 1.9006309093473746, "language_loss": 0.84414017, "learning_rate": 3.3123417573093434e-06, "loss": 0.86590338, "num_input_tokens_seen": 105374225, "step": 4890, "time_per_iteration": 2.653601884841919 }, { "auxiliary_loss_clip": 0.01138833, "auxiliary_loss_mlp": 0.01044226, "balance_loss_clip": 1.05449104, "balance_loss_mlp": 1.02767992, "epoch": 0.294062828799038, "flos": 15266380775040.0, "grad_norm": 2.3284792525221625, "language_loss": 0.72417939, "learning_rate": 3.3120478398398046e-06, "loss": 0.74600995, "num_input_tokens_seen": 105391565, "step": 4891, "time_per_iteration": 2.6499764919281006 }, { "auxiliary_loss_clip": 0.01148906, "auxiliary_loss_mlp": 0.01046245, "balance_loss_clip": 1.05517375, "balance_loss_mlp": 1.02797008, "epoch": 0.294122952051706, "flos": 22747147799040.0, "grad_norm": 1.6858898954482169, "language_loss": 0.77310836, "learning_rate": 3.3117538726172797e-06, "loss": 0.7950598, "num_input_tokens_seen": 105409840, "step": 4892, "time_per_iteration": 2.6123669147491455 }, { "auxiliary_loss_clip": 0.01143283, "auxiliary_loss_mlp": 0.01036481, "balance_loss_clip": 1.05147183, "balance_loss_mlp": 1.01932704, "epoch": 0.294183075304374, "flos": 24972891073920.0, "grad_norm": 1.8056938004749827, "language_loss": 0.77826709, "learning_rate": 3.3114598556529164e-06, "loss": 0.80006474, "num_input_tokens_seen": 105428645, "step": 4893, "time_per_iteration": 2.6142194271087646 }, { "auxiliary_loss_clip": 0.01106286, "auxiliary_loss_mlp": 0.01045871, "balance_loss_clip": 1.0508399, "balance_loss_mlp": 1.02912164, "epoch": 0.29424319855704195, "flos": 30952986762240.0, "grad_norm": 3.6552959609210944, "language_loss": 0.85032988, "learning_rate": 3.311165788957864e-06, "loss": 0.87185144, "num_input_tokens_seen": 105447480, "step": 4894, "time_per_iteration": 2.837883234024048 }, { "auxiliary_loss_clip": 0.01131513, "auxiliary_loss_mlp": 0.01038131, "balance_loss_clip": 1.05098557, "balance_loss_mlp": 1.02169216, "epoch": 0.2943033218097099, "flos": 15231583474560.0, "grad_norm": 3.570255241204836, "language_loss": 0.90650308, "learning_rate": 3.310871672543274e-06, "loss": 0.92819947, "num_input_tokens_seen": 105464600, "step": 4895, "time_per_iteration": 2.588153839111328 }, { "auxiliary_loss_clip": 0.01138224, "auxiliary_loss_mlp": 0.01045554, "balance_loss_clip": 1.05338621, "balance_loss_mlp": 1.02777958, "epoch": 0.2943634450623779, "flos": 21725884310400.0, "grad_norm": 1.7548452829513195, "language_loss": 0.86612183, "learning_rate": 3.3105775064202982e-06, "loss": 0.88795966, "num_input_tokens_seen": 105481510, "step": 4896, "time_per_iteration": 2.6405279636383057 }, { "auxiliary_loss_clip": 0.01142594, "auxiliary_loss_mlp": 0.01053714, "balance_loss_clip": 1.05662429, "balance_loss_mlp": 1.03620195, "epoch": 0.29442356831504585, "flos": 22602104680320.0, "grad_norm": 2.0549220420715906, "language_loss": 0.73394442, "learning_rate": 3.3102832906000924e-06, "loss": 0.75590742, "num_input_tokens_seen": 105501390, "step": 4897, "time_per_iteration": 2.6669554710388184 }, { "auxiliary_loss_clip": 0.01128563, "auxiliary_loss_mlp": 0.01050668, "balance_loss_clip": 1.04556203, "balance_loss_mlp": 1.03214252, "epoch": 0.2944836915677138, "flos": 20011401267840.0, "grad_norm": 2.0814872266581426, "language_loss": 0.74344778, "learning_rate": 3.309989025093813e-06, "loss": 0.76524007, "num_input_tokens_seen": 105519600, "step": 4898, "time_per_iteration": 2.6286890506744385 }, { "auxiliary_loss_clip": 0.01140269, "auxiliary_loss_mlp": 0.01047883, "balance_loss_clip": 1.05775058, "balance_loss_mlp": 1.02880955, "epoch": 0.2945438148203818, "flos": 20045875345920.0, "grad_norm": 2.610474436320842, "language_loss": 0.70560962, "learning_rate": 3.309694709912618e-06, "loss": 0.72749114, "num_input_tokens_seen": 105535970, "step": 4899, "time_per_iteration": 2.6050777435302734 }, { "auxiliary_loss_clip": 0.01122842, "auxiliary_loss_mlp": 0.00775757, "balance_loss_clip": 1.05115175, "balance_loss_mlp": 1.00110114, "epoch": 0.29460393807304974, "flos": 23733542160000.0, "grad_norm": 2.6981557529788587, "language_loss": 0.78938496, "learning_rate": 3.3094003450676685e-06, "loss": 0.80837095, "num_input_tokens_seen": 105556735, "step": 4900, "time_per_iteration": 2.7517058849334717 }, { "auxiliary_loss_clip": 0.0110429, "auxiliary_loss_mlp": 0.01059395, "balance_loss_clip": 1.04257679, "balance_loss_mlp": 1.03992808, "epoch": 0.2946640613257177, "flos": 14976079056000.0, "grad_norm": 1.7286923709762618, "language_loss": 0.80861294, "learning_rate": 3.3091059305701268e-06, "loss": 0.83024979, "num_input_tokens_seen": 105574875, "step": 4901, "time_per_iteration": 2.58297061920166 }, { "auxiliary_loss_clip": 0.01114064, "auxiliary_loss_mlp": 0.01035256, "balance_loss_clip": 1.05081403, "balance_loss_mlp": 1.01993775, "epoch": 0.2947241845783857, "flos": 24243904552320.0, "grad_norm": 2.2236242529025954, "language_loss": 0.57768303, "learning_rate": 3.308811466431157e-06, "loss": 0.59917623, "num_input_tokens_seen": 105594225, "step": 4902, "time_per_iteration": 2.6765553951263428 }, { "auxiliary_loss_clip": 0.01122886, "auxiliary_loss_mlp": 0.01044406, "balance_loss_clip": 1.05165744, "balance_loss_mlp": 1.02809834, "epoch": 0.29478430783105364, "flos": 19938394874880.0, "grad_norm": 1.6365628527843905, "language_loss": 0.7553789, "learning_rate": 3.308516952661925e-06, "loss": 0.77705181, "num_input_tokens_seen": 105614000, "step": 4903, "time_per_iteration": 5.72201132774353 }, { "auxiliary_loss_clip": 0.01117125, "auxiliary_loss_mlp": 0.01054328, "balance_loss_clip": 1.05058551, "balance_loss_mlp": 1.03506362, "epoch": 0.2948444310837216, "flos": 27381347856000.0, "grad_norm": 1.79479894391178, "language_loss": 0.62782186, "learning_rate": 3.3082223892736e-06, "loss": 0.64953631, "num_input_tokens_seen": 105634575, "step": 4904, "time_per_iteration": 2.7290875911712646 }, { "auxiliary_loss_clip": 0.01135143, "auxiliary_loss_mlp": 0.01043669, "balance_loss_clip": 1.05146813, "balance_loss_mlp": 1.02669382, "epoch": 0.2949045543363896, "flos": 23405462311680.0, "grad_norm": 1.4755442774564356, "language_loss": 0.73145443, "learning_rate": 3.3079277762773496e-06, "loss": 0.75324261, "num_input_tokens_seen": 105654385, "step": 4905, "time_per_iteration": 2.6482555866241455 }, { "auxiliary_loss_clip": 0.01112476, "auxiliary_loss_mlp": 0.01046266, "balance_loss_clip": 1.05017638, "balance_loss_mlp": 1.028265, "epoch": 0.2949646775890576, "flos": 23951483930880.0, "grad_norm": 1.7800977730713317, "language_loss": 0.8199898, "learning_rate": 3.3076331136843476e-06, "loss": 0.84157723, "num_input_tokens_seen": 105673570, "step": 4906, "time_per_iteration": 2.737182378768921 }, { "auxiliary_loss_clip": 0.01094663, "auxiliary_loss_mlp": 0.01040505, "balance_loss_clip": 1.04579425, "balance_loss_mlp": 1.02372003, "epoch": 0.29502480084172555, "flos": 22784315397120.0, "grad_norm": 2.8763815934933867, "language_loss": 0.87373984, "learning_rate": 3.3073384015057667e-06, "loss": 0.89509153, "num_input_tokens_seen": 105691940, "step": 4907, "time_per_iteration": 4.367825746536255 }, { "auxiliary_loss_clip": 0.01149393, "auxiliary_loss_mlp": 0.01043671, "balance_loss_clip": 1.05400407, "balance_loss_mlp": 1.02501488, "epoch": 0.2950849240943935, "flos": 19646656611840.0, "grad_norm": 2.047818146937445, "language_loss": 0.81910521, "learning_rate": 3.307043639752782e-06, "loss": 0.84103584, "num_input_tokens_seen": 105709825, "step": 4908, "time_per_iteration": 2.578582525253296 }, { "auxiliary_loss_clip": 0.01055582, "auxiliary_loss_mlp": 0.01003419, "balance_loss_clip": 1.02453518, "balance_loss_mlp": 1.00138056, "epoch": 0.2951450473470615, "flos": 71002829260800.0, "grad_norm": 0.7982723827999523, "language_loss": 0.57287854, "learning_rate": 3.3067488284365728e-06, "loss": 0.59346855, "num_input_tokens_seen": 105766880, "step": 4909, "time_per_iteration": 4.640491247177124 }, { "auxiliary_loss_clip": 0.01135445, "auxiliary_loss_mlp": 0.00774301, "balance_loss_clip": 1.05580318, "balance_loss_mlp": 1.00097156, "epoch": 0.29520517059972945, "flos": 22966310632320.0, "grad_norm": 1.756295161453336, "language_loss": 0.87018639, "learning_rate": 3.3064539675683163e-06, "loss": 0.88928384, "num_input_tokens_seen": 105786875, "step": 4910, "time_per_iteration": 2.642312526702881 }, { "auxiliary_loss_clip": 0.01131096, "auxiliary_loss_mlp": 0.0104303, "balance_loss_clip": 1.05359542, "balance_loss_mlp": 1.02744913, "epoch": 0.2952652938523974, "flos": 20485673470080.0, "grad_norm": 1.692596753939278, "language_loss": 0.73332304, "learning_rate": 3.3061590571591946e-06, "loss": 0.75506431, "num_input_tokens_seen": 105805315, "step": 4911, "time_per_iteration": 2.6130573749542236 }, { "auxiliary_loss_clip": 0.01132917, "auxiliary_loss_mlp": 0.01038473, "balance_loss_clip": 1.05330253, "balance_loss_mlp": 1.02193832, "epoch": 0.2953254171050654, "flos": 19646584784640.0, "grad_norm": 1.8009313294920104, "language_loss": 0.89653587, "learning_rate": 3.3058640972203904e-06, "loss": 0.91824973, "num_input_tokens_seen": 105825125, "step": 4912, "time_per_iteration": 2.660090684890747 }, { "auxiliary_loss_clip": 0.01114053, "auxiliary_loss_mlp": 0.010529, "balance_loss_clip": 1.0482899, "balance_loss_mlp": 1.03503084, "epoch": 0.29538554035773334, "flos": 22747973811840.0, "grad_norm": 1.3579869674800176, "language_loss": 0.83175462, "learning_rate": 3.3055690877630894e-06, "loss": 0.85342413, "num_input_tokens_seen": 105846085, "step": 4913, "time_per_iteration": 2.743364095687866 }, { "auxiliary_loss_clip": 0.01142468, "auxiliary_loss_mlp": 0.01043093, "balance_loss_clip": 1.04977608, "balance_loss_mlp": 1.02690446, "epoch": 0.2954456636104013, "flos": 21871861182720.0, "grad_norm": 1.9704695859403116, "language_loss": 0.76919919, "learning_rate": 3.3052740287984765e-06, "loss": 0.79105484, "num_input_tokens_seen": 105865400, "step": 4914, "time_per_iteration": 2.6778385639190674 }, { "auxiliary_loss_clip": 0.01121315, "auxiliary_loss_mlp": 0.01045386, "balance_loss_clip": 1.05064511, "balance_loss_mlp": 1.02818418, "epoch": 0.2955057868630693, "flos": 40442560871040.0, "grad_norm": 1.678810736285401, "language_loss": 0.81829619, "learning_rate": 3.3049789203377424e-06, "loss": 0.8399632, "num_input_tokens_seen": 105887920, "step": 4915, "time_per_iteration": 2.9347212314605713 }, { "auxiliary_loss_clip": 0.01068117, "auxiliary_loss_mlp": 0.01044435, "balance_loss_clip": 1.04405856, "balance_loss_mlp": 1.02722168, "epoch": 0.29556591011573724, "flos": 22564506119040.0, "grad_norm": 2.129336551193515, "language_loss": 0.84701812, "learning_rate": 3.3046837623920772e-06, "loss": 0.86814368, "num_input_tokens_seen": 105904035, "step": 4916, "time_per_iteration": 2.9183273315429688 }, { "auxiliary_loss_clip": 0.01125851, "auxiliary_loss_mlp": 0.01036694, "balance_loss_clip": 1.04655123, "balance_loss_mlp": 1.01975429, "epoch": 0.2956260333684052, "flos": 22089300163200.0, "grad_norm": 2.1082729468541683, "language_loss": 0.69490808, "learning_rate": 3.3043885549726723e-06, "loss": 0.71653348, "num_input_tokens_seen": 105922685, "step": 4917, "time_per_iteration": 2.7400357723236084 }, { "auxiliary_loss_clip": 0.01123659, "auxiliary_loss_mlp": 0.01038633, "balance_loss_clip": 1.05140972, "balance_loss_mlp": 1.02214622, "epoch": 0.2956861566210732, "flos": 16435488643200.0, "grad_norm": 2.699189623646437, "language_loss": 0.91076934, "learning_rate": 3.3040932980907226e-06, "loss": 0.93239224, "num_input_tokens_seen": 105940425, "step": 4918, "time_per_iteration": 2.7343270778656006 }, { "auxiliary_loss_clip": 0.01147937, "auxiliary_loss_mlp": 0.01043258, "balance_loss_clip": 1.0551039, "balance_loss_mlp": 1.02629495, "epoch": 0.2957462798737412, "flos": 25812087500160.0, "grad_norm": 1.9388581576792214, "language_loss": 0.72399175, "learning_rate": 3.303797991757425e-06, "loss": 0.74590373, "num_input_tokens_seen": 105960550, "step": 4919, "time_per_iteration": 2.718583822250366 }, { "auxiliary_loss_clip": 0.01119627, "auxiliary_loss_mlp": 0.01045651, "balance_loss_clip": 1.04843163, "balance_loss_mlp": 1.02838945, "epoch": 0.29580640312640916, "flos": 16690849407360.0, "grad_norm": 1.8826298231205452, "language_loss": 0.75919485, "learning_rate": 3.3035026359839763e-06, "loss": 0.78084767, "num_input_tokens_seen": 105978820, "step": 4920, "time_per_iteration": 2.7425734996795654 }, { "auxiliary_loss_clip": 0.01121739, "auxiliary_loss_mlp": 0.01052293, "balance_loss_clip": 1.05511427, "balance_loss_mlp": 1.03449547, "epoch": 0.2958665263790771, "flos": 23945594100480.0, "grad_norm": 5.307541834842734, "language_loss": 0.69020098, "learning_rate": 3.3032072307815774e-06, "loss": 0.71194124, "num_input_tokens_seen": 105997545, "step": 4921, "time_per_iteration": 2.7755305767059326 }, { "auxiliary_loss_clip": 0.01120164, "auxiliary_loss_mlp": 0.01043, "balance_loss_clip": 1.05075121, "balance_loss_mlp": 1.02453458, "epoch": 0.2959266496317451, "flos": 18478410670080.0, "grad_norm": 1.8488664920888758, "language_loss": 0.7462194, "learning_rate": 3.3029117761614298e-06, "loss": 0.767851, "num_input_tokens_seen": 106015320, "step": 4922, "time_per_iteration": 2.740687131881714 }, { "auxiliary_loss_clip": 0.01152013, "auxiliary_loss_mlp": 0.00775382, "balance_loss_clip": 1.05429566, "balance_loss_mlp": 1.00129843, "epoch": 0.29598677288441305, "flos": 25957489754880.0, "grad_norm": 1.7662799143188246, "language_loss": 0.77148855, "learning_rate": 3.302616272134737e-06, "loss": 0.79076254, "num_input_tokens_seen": 106034555, "step": 4923, "time_per_iteration": 2.664875030517578 }, { "auxiliary_loss_clip": 0.01117655, "auxiliary_loss_mlp": 0.01042537, "balance_loss_clip": 1.05065989, "balance_loss_mlp": 1.0247035, "epoch": 0.296046896137081, "flos": 25155999630720.0, "grad_norm": 1.7775190737024398, "language_loss": 0.86232758, "learning_rate": 3.3023207187127042e-06, "loss": 0.88392955, "num_input_tokens_seen": 106054200, "step": 4924, "time_per_iteration": 2.7413501739501953 }, { "auxiliary_loss_clip": 0.01132544, "auxiliary_loss_mlp": 0.01038356, "balance_loss_clip": 1.05098939, "balance_loss_mlp": 1.02114248, "epoch": 0.296107019389749, "flos": 21761148487680.0, "grad_norm": 1.479657736715748, "language_loss": 0.82050943, "learning_rate": 3.3020251159065396e-06, "loss": 0.84221852, "num_input_tokens_seen": 106074700, "step": 4925, "time_per_iteration": 2.676556348800659 }, { "auxiliary_loss_clip": 0.01078547, "auxiliary_loss_mlp": 0.01051683, "balance_loss_clip": 1.04153097, "balance_loss_mlp": 1.03283572, "epoch": 0.29616714264241695, "flos": 17960039544960.0, "grad_norm": 2.5440905583969697, "language_loss": 0.86138272, "learning_rate": 3.301729463727452e-06, "loss": 0.88268495, "num_input_tokens_seen": 106091415, "step": 4926, "time_per_iteration": 2.675780773162842 }, { "auxiliary_loss_clip": 0.01108502, "auxiliary_loss_mlp": 0.01035423, "balance_loss_clip": 1.04910469, "balance_loss_mlp": 1.0193243, "epoch": 0.2962272658950849, "flos": 15012779777280.0, "grad_norm": 2.332235960138756, "language_loss": 0.85897464, "learning_rate": 3.3014337621866527e-06, "loss": 0.88041389, "num_input_tokens_seen": 106109135, "step": 4927, "time_per_iteration": 2.7407169342041016 }, { "auxiliary_loss_clip": 0.01131541, "auxiliary_loss_mlp": 0.01039363, "balance_loss_clip": 1.05158448, "balance_loss_mlp": 1.02312613, "epoch": 0.2962873891477529, "flos": 14720861946240.0, "grad_norm": 3.581765820174834, "language_loss": 0.80772752, "learning_rate": 3.3011380112953553e-06, "loss": 0.8294366, "num_input_tokens_seen": 106125750, "step": 4928, "time_per_iteration": 2.6719777584075928 }, { "auxiliary_loss_clip": 0.01123889, "auxiliary_loss_mlp": 0.01043191, "balance_loss_clip": 1.04852009, "balance_loss_mlp": 1.02346206, "epoch": 0.29634751240042084, "flos": 26723787528960.0, "grad_norm": 2.79065826833615, "language_loss": 0.7313869, "learning_rate": 3.300842211064773e-06, "loss": 0.75305772, "num_input_tokens_seen": 106142835, "step": 4929, "time_per_iteration": 2.75266695022583 }, { "auxiliary_loss_clip": 0.0112132, "auxiliary_loss_mlp": 0.01054118, "balance_loss_clip": 1.0495156, "balance_loss_mlp": 1.03481805, "epoch": 0.2964076356530888, "flos": 14571293713920.0, "grad_norm": 2.360375509218164, "language_loss": 0.71534413, "learning_rate": 3.3005463615061246e-06, "loss": 0.73709846, "num_input_tokens_seen": 106160680, "step": 4930, "time_per_iteration": 2.799149990081787 }, { "auxiliary_loss_clip": 0.01028509, "auxiliary_loss_mlp": 0.01003992, "balance_loss_clip": 1.03094876, "balance_loss_mlp": 1.00229919, "epoch": 0.29646775890575683, "flos": 63104315063040.0, "grad_norm": 0.8053244370028285, "language_loss": 0.6061247, "learning_rate": 3.3002504626306275e-06, "loss": 0.6264497, "num_input_tokens_seen": 106224415, "step": 4931, "time_per_iteration": 3.218900442123413 }, { "auxiliary_loss_clip": 0.01007041, "auxiliary_loss_mlp": 0.01005936, "balance_loss_clip": 1.02247667, "balance_loss_mlp": 1.00395727, "epoch": 0.2965278821584248, "flos": 63067686168960.0, "grad_norm": 0.7408573754586586, "language_loss": 0.52380091, "learning_rate": 3.2999545144495023e-06, "loss": 0.54393071, "num_input_tokens_seen": 106279140, "step": 4932, "time_per_iteration": 3.26432728767395 }, { "auxiliary_loss_clip": 0.01129633, "auxiliary_loss_mlp": 0.01042438, "balance_loss_clip": 1.04917526, "balance_loss_mlp": 1.02584457, "epoch": 0.29658800541109276, "flos": 23768734510080.0, "grad_norm": 2.012094119717185, "language_loss": 0.81540775, "learning_rate": 3.299658516973972e-06, "loss": 0.83712846, "num_input_tokens_seen": 106298190, "step": 4933, "time_per_iteration": 2.804293155670166 }, { "auxiliary_loss_clip": 0.01092845, "auxiliary_loss_mlp": 0.01036901, "balance_loss_clip": 1.04405773, "balance_loss_mlp": 1.01966333, "epoch": 0.2966481286637607, "flos": 23988543788160.0, "grad_norm": 1.916542141573101, "language_loss": 0.75165296, "learning_rate": 3.299362470215261e-06, "loss": 0.77295041, "num_input_tokens_seen": 106319065, "step": 4934, "time_per_iteration": 2.797697067260742 }, { "auxiliary_loss_clip": 0.01126398, "auxiliary_loss_mlp": 0.01047716, "balance_loss_clip": 1.04985118, "balance_loss_mlp": 1.03013301, "epoch": 0.2967082519164287, "flos": 17165157523200.0, "grad_norm": 1.8491505675561635, "language_loss": 0.62093496, "learning_rate": 3.299066374184594e-06, "loss": 0.64267612, "num_input_tokens_seen": 106338040, "step": 4935, "time_per_iteration": 2.6466407775878906 }, { "auxiliary_loss_clip": 0.01129018, "auxiliary_loss_mlp": 0.01041652, "balance_loss_clip": 1.05052114, "balance_loss_mlp": 1.02452123, "epoch": 0.29676837516909665, "flos": 29387712816000.0, "grad_norm": 1.4269626202910053, "language_loss": 0.79485404, "learning_rate": 3.2987702288932e-06, "loss": 0.81656075, "num_input_tokens_seen": 106358900, "step": 4936, "time_per_iteration": 2.7333009243011475 }, { "auxiliary_loss_clip": 0.01100808, "auxiliary_loss_mlp": 0.01048756, "balance_loss_clip": 1.04970682, "balance_loss_mlp": 1.03040934, "epoch": 0.2968284984217646, "flos": 34751222616960.0, "grad_norm": 1.5951903019521643, "language_loss": 0.73993498, "learning_rate": 3.298474034352309e-06, "loss": 0.76143062, "num_input_tokens_seen": 106381805, "step": 4937, "time_per_iteration": 2.853935718536377 }, { "auxiliary_loss_clip": 0.01094789, "auxiliary_loss_mlp": 0.01038743, "balance_loss_clip": 1.05060768, "balance_loss_mlp": 1.0209924, "epoch": 0.2968886216744326, "flos": 21544104556800.0, "grad_norm": 1.654578873057457, "language_loss": 0.78373563, "learning_rate": 3.2981777905731526e-06, "loss": 0.80507094, "num_input_tokens_seen": 106402365, "step": 4938, "time_per_iteration": 2.803147077560425 }, { "auxiliary_loss_clip": 0.0111878, "auxiliary_loss_mlp": 0.01048023, "balance_loss_clip": 1.05193913, "balance_loss_mlp": 1.02931857, "epoch": 0.29694874492710055, "flos": 12787323811200.0, "grad_norm": 2.4827377035181013, "language_loss": 0.76842266, "learning_rate": 3.297881497566964e-06, "loss": 0.79009068, "num_input_tokens_seen": 106419800, "step": 4939, "time_per_iteration": 2.8867270946502686 }, { "auxiliary_loss_clip": 0.0111051, "auxiliary_loss_mlp": 0.01041172, "balance_loss_clip": 1.04666841, "balance_loss_mlp": 1.02361226, "epoch": 0.2970088681797685, "flos": 24569973239040.0, "grad_norm": 1.8055035581570296, "language_loss": 0.78354549, "learning_rate": 3.297585155344979e-06, "loss": 0.80506229, "num_input_tokens_seen": 106440300, "step": 4940, "time_per_iteration": 2.783046245574951 }, { "auxiliary_loss_clip": 0.01117762, "auxiliary_loss_mlp": 0.01037936, "balance_loss_clip": 1.0486958, "balance_loss_mlp": 1.01876736, "epoch": 0.2970689914324365, "flos": 23659171050240.0, "grad_norm": 1.6305550110852276, "language_loss": 0.75628781, "learning_rate": 3.297288763918435e-06, "loss": 0.77784479, "num_input_tokens_seen": 106460035, "step": 4941, "time_per_iteration": 2.74379825592041 }, { "auxiliary_loss_clip": 0.01138083, "auxiliary_loss_mlp": 0.01051629, "balance_loss_clip": 1.05272233, "balance_loss_mlp": 1.03276968, "epoch": 0.29712911468510445, "flos": 39670301439360.0, "grad_norm": 2.3053326725865313, "language_loss": 0.74158287, "learning_rate": 3.2969923232985712e-06, "loss": 0.76347995, "num_input_tokens_seen": 106481095, "step": 4942, "time_per_iteration": 4.468350410461426 }, { "auxiliary_loss_clip": 0.01111068, "auxiliary_loss_mlp": 0.0104429, "balance_loss_clip": 1.05172181, "balance_loss_mlp": 1.02589595, "epoch": 0.2971892379377724, "flos": 26395312631040.0, "grad_norm": 2.42728921351593, "language_loss": 0.702492, "learning_rate": 3.2966958334966287e-06, "loss": 0.72404563, "num_input_tokens_seen": 106501590, "step": 4943, "time_per_iteration": 4.2555251121521 }, { "auxiliary_loss_clip": 0.01124177, "auxiliary_loss_mlp": 0.01041442, "balance_loss_clip": 1.04988825, "balance_loss_mlp": 1.02360821, "epoch": 0.2972493611904404, "flos": 17603195880960.0, "grad_norm": 2.221197725988377, "language_loss": 0.795506, "learning_rate": 3.2963992945238497e-06, "loss": 0.81716216, "num_input_tokens_seen": 106519430, "step": 4944, "time_per_iteration": 2.6572201251983643 }, { "auxiliary_loss_clip": 0.0111705, "auxiliary_loss_mlp": 0.01041351, "balance_loss_clip": 1.04914248, "balance_loss_mlp": 1.02521038, "epoch": 0.2973094844431084, "flos": 20412774817920.0, "grad_norm": 2.187472317578873, "language_loss": 0.83260202, "learning_rate": 3.2961027063914795e-06, "loss": 0.85418606, "num_input_tokens_seen": 106535870, "step": 4945, "time_per_iteration": 2.6700363159179688 }, { "auxiliary_loss_clip": 0.01090371, "auxiliary_loss_mlp": 0.01039575, "balance_loss_clip": 1.04623246, "balance_loss_mlp": 1.02256417, "epoch": 0.29736960769577636, "flos": 17493488766720.0, "grad_norm": 1.8830005833778707, "language_loss": 0.67067397, "learning_rate": 3.2958060691107654e-06, "loss": 0.69197345, "num_input_tokens_seen": 106553560, "step": 4946, "time_per_iteration": 4.29357385635376 }, { "auxiliary_loss_clip": 0.01127819, "auxiliary_loss_mlp": 0.00777134, "balance_loss_clip": 1.04997563, "balance_loss_mlp": 1.00115252, "epoch": 0.2974297309484443, "flos": 26103969417600.0, "grad_norm": 1.879721590970614, "language_loss": 0.73877805, "learning_rate": 3.2955093826929547e-06, "loss": 0.75782764, "num_input_tokens_seen": 106574115, "step": 4947, "time_per_iteration": 2.657038450241089 }, { "auxiliary_loss_clip": 0.01109701, "auxiliary_loss_mlp": 0.01045546, "balance_loss_clip": 1.04896843, "balance_loss_mlp": 1.02705622, "epoch": 0.2974898542011123, "flos": 25666433850240.0, "grad_norm": 2.0989098852090633, "language_loss": 0.73522758, "learning_rate": 3.2952126471492985e-06, "loss": 0.75678003, "num_input_tokens_seen": 106593070, "step": 4948, "time_per_iteration": 4.4359636306762695 }, { "auxiliary_loss_clip": 0.01139863, "auxiliary_loss_mlp": 0.01040301, "balance_loss_clip": 1.04885721, "balance_loss_mlp": 1.02332592, "epoch": 0.29754997745378026, "flos": 18661339658880.0, "grad_norm": 2.06615582769113, "language_loss": 0.8397494, "learning_rate": 3.2949158624910497e-06, "loss": 0.86155105, "num_input_tokens_seen": 106610695, "step": 4949, "time_per_iteration": 2.6052157878875732 }, { "auxiliary_loss_clip": 0.01128522, "auxiliary_loss_mlp": 0.01041578, "balance_loss_clip": 1.04901218, "balance_loss_mlp": 1.02459633, "epoch": 0.2976101007064482, "flos": 22274599449600.0, "grad_norm": 2.2184783420455814, "language_loss": 0.71360326, "learning_rate": 3.2946190287294603e-06, "loss": 0.73530424, "num_input_tokens_seen": 106631300, "step": 4950, "time_per_iteration": 2.678953170776367 }, { "auxiliary_loss_clip": 0.01095366, "auxiliary_loss_mlp": 0.01039981, "balance_loss_clip": 1.04944646, "balance_loss_mlp": 1.0239712, "epoch": 0.2976702239591162, "flos": 21945657674880.0, "grad_norm": 3.098719098855731, "language_loss": 0.82645297, "learning_rate": 3.294322145875789e-06, "loss": 0.84780639, "num_input_tokens_seen": 106650065, "step": 4951, "time_per_iteration": 2.7566003799438477 }, { "auxiliary_loss_clip": 0.01118264, "auxiliary_loss_mlp": 0.01039186, "balance_loss_clip": 1.04655933, "balance_loss_mlp": 1.02190065, "epoch": 0.29773034721178415, "flos": 24637197542400.0, "grad_norm": 15.690000260498868, "language_loss": 0.74144769, "learning_rate": 3.2940252139412912e-06, "loss": 0.76302218, "num_input_tokens_seen": 106668230, "step": 4952, "time_per_iteration": 2.7019882202148438 }, { "auxiliary_loss_clip": 0.01063128, "auxiliary_loss_mlp": 0.01049349, "balance_loss_clip": 1.0433315, "balance_loss_mlp": 1.03133702, "epoch": 0.2977904704644521, "flos": 20557566541440.0, "grad_norm": 1.6701113978494808, "language_loss": 0.84251344, "learning_rate": 3.293728232937228e-06, "loss": 0.86363828, "num_input_tokens_seen": 106687785, "step": 4953, "time_per_iteration": 2.9622793197631836 }, { "auxiliary_loss_clip": 0.01120636, "auxiliary_loss_mlp": 0.01040588, "balance_loss_clip": 1.04966831, "balance_loss_mlp": 1.02428031, "epoch": 0.2978505937171201, "flos": 18916449027840.0, "grad_norm": 2.301918041259246, "language_loss": 0.74366152, "learning_rate": 3.2934312028748597e-06, "loss": 0.76527375, "num_input_tokens_seen": 106706875, "step": 4954, "time_per_iteration": 2.767455577850342 }, { "auxiliary_loss_clip": 0.01138563, "auxiliary_loss_mlp": 0.01036281, "balance_loss_clip": 1.04899216, "balance_loss_mlp": 1.02028275, "epoch": 0.29791071696978805, "flos": 19317750750720.0, "grad_norm": 2.0603039788066155, "language_loss": 0.75687683, "learning_rate": 3.293134123765452e-06, "loss": 0.77862525, "num_input_tokens_seen": 106725105, "step": 4955, "time_per_iteration": 2.638389825820923 }, { "auxiliary_loss_clip": 0.01094257, "auxiliary_loss_mlp": 0.01042355, "balance_loss_clip": 1.04760742, "balance_loss_mlp": 1.02505171, "epoch": 0.297970840222456, "flos": 18806813740800.0, "grad_norm": 2.358195616275362, "language_loss": 0.72600436, "learning_rate": 3.2928369956202684e-06, "loss": 0.74737054, "num_input_tokens_seen": 106744780, "step": 4956, "time_per_iteration": 2.777873992919922 }, { "auxiliary_loss_clip": 0.01134603, "auxiliary_loss_mlp": 0.0104754, "balance_loss_clip": 1.04957581, "balance_loss_mlp": 1.02930105, "epoch": 0.298030963475124, "flos": 22852760762880.0, "grad_norm": 2.0297274127598435, "language_loss": 0.79068756, "learning_rate": 3.2925398184505754e-06, "loss": 0.81250894, "num_input_tokens_seen": 106764670, "step": 4957, "time_per_iteration": 2.719581365585327 }, { "auxiliary_loss_clip": 0.01134843, "auxiliary_loss_mlp": 0.01041974, "balance_loss_clip": 1.05054235, "balance_loss_mlp": 1.02383018, "epoch": 0.298091086727792, "flos": 21868485304320.0, "grad_norm": 1.706880580606115, "language_loss": 0.70570725, "learning_rate": 3.2922425922676437e-06, "loss": 0.7274754, "num_input_tokens_seen": 106783695, "step": 4958, "time_per_iteration": 2.613697052001953 }, { "auxiliary_loss_clip": 0.01108077, "auxiliary_loss_mlp": 0.0104267, "balance_loss_clip": 1.05166888, "balance_loss_mlp": 1.0253129, "epoch": 0.29815120998045996, "flos": 21175014355200.0, "grad_norm": 1.5383051389102413, "language_loss": 0.78736448, "learning_rate": 3.291945317082743e-06, "loss": 0.80887192, "num_input_tokens_seen": 106803150, "step": 4959, "time_per_iteration": 2.751455545425415 }, { "auxiliary_loss_clip": 0.01129828, "auxiliary_loss_mlp": 0.01045919, "balance_loss_clip": 1.04906321, "balance_loss_mlp": 1.0290029, "epoch": 0.29821133323312793, "flos": 19896271200000.0, "grad_norm": 1.6624120752671379, "language_loss": 0.79747117, "learning_rate": 3.291647992907147e-06, "loss": 0.81922865, "num_input_tokens_seen": 106820705, "step": 4960, "time_per_iteration": 2.6345505714416504 }, { "auxiliary_loss_clip": 0.01110987, "auxiliary_loss_mlp": 0.01052912, "balance_loss_clip": 1.04863763, "balance_loss_mlp": 1.03449416, "epoch": 0.2982714564857959, "flos": 12750766744320.0, "grad_norm": 2.376132196895137, "language_loss": 0.73364639, "learning_rate": 3.291350619752129e-06, "loss": 0.75528538, "num_input_tokens_seen": 106837335, "step": 4961, "time_per_iteration": 2.725008010864258 }, { "auxiliary_loss_clip": 0.01130001, "auxiliary_loss_mlp": 0.0104294, "balance_loss_clip": 1.04824948, "balance_loss_mlp": 1.02640533, "epoch": 0.29833157973846386, "flos": 22271905929600.0, "grad_norm": 2.036560430862295, "language_loss": 0.62106621, "learning_rate": 3.291053197628967e-06, "loss": 0.64279556, "num_input_tokens_seen": 106856250, "step": 4962, "time_per_iteration": 2.690870523452759 }, { "auxiliary_loss_clip": 0.01128362, "auxiliary_loss_mlp": 0.01051341, "balance_loss_clip": 1.05034256, "balance_loss_mlp": 1.03310251, "epoch": 0.2983917029911318, "flos": 15372999319680.0, "grad_norm": 2.046461333274312, "language_loss": 0.82866591, "learning_rate": 3.2907557265489375e-06, "loss": 0.85046291, "num_input_tokens_seen": 106873370, "step": 4963, "time_per_iteration": 2.637723207473755 }, { "auxiliary_loss_clip": 0.01112844, "auxiliary_loss_mlp": 0.01044675, "balance_loss_clip": 1.05338502, "balance_loss_mlp": 1.0272826, "epoch": 0.2984518262437998, "flos": 15377632174080.0, "grad_norm": 2.580714695656121, "language_loss": 0.65933317, "learning_rate": 3.290458206523322e-06, "loss": 0.68090838, "num_input_tokens_seen": 106890330, "step": 4964, "time_per_iteration": 2.7210114002227783 }, { "auxiliary_loss_clip": 0.01128428, "auxiliary_loss_mlp": 0.01039216, "balance_loss_clip": 1.04990005, "balance_loss_mlp": 1.02345669, "epoch": 0.29851194949646775, "flos": 18108458542080.0, "grad_norm": 1.8191471944851214, "language_loss": 0.71093529, "learning_rate": 3.2901606375634015e-06, "loss": 0.73261172, "num_input_tokens_seen": 106909190, "step": 4965, "time_per_iteration": 2.7070064544677734 }, { "auxiliary_loss_clip": 0.01151396, "auxiliary_loss_mlp": 0.01056357, "balance_loss_clip": 1.05813003, "balance_loss_mlp": 1.03827357, "epoch": 0.2985720727491357, "flos": 22018233104640.0, "grad_norm": 2.164601494744612, "language_loss": 0.65952027, "learning_rate": 3.289863019680461e-06, "loss": 0.68159783, "num_input_tokens_seen": 106927825, "step": 4966, "time_per_iteration": 2.5820860862731934 }, { "auxiliary_loss_clip": 0.01148496, "auxiliary_loss_mlp": 0.01042183, "balance_loss_clip": 1.05610132, "balance_loss_mlp": 1.02496934, "epoch": 0.2986321960018037, "flos": 13041355772160.0, "grad_norm": 5.631297794621363, "language_loss": 0.73553479, "learning_rate": 3.289565352885785e-06, "loss": 0.75744158, "num_input_tokens_seen": 106943155, "step": 4967, "time_per_iteration": 2.558378219604492 }, { "auxiliary_loss_clip": 0.01110231, "auxiliary_loss_mlp": 0.01041561, "balance_loss_clip": 1.04339898, "balance_loss_mlp": 1.02440643, "epoch": 0.29869231925447165, "flos": 14465034305280.0, "grad_norm": 2.07351823246568, "language_loss": 0.71246195, "learning_rate": 3.2892676371906614e-06, "loss": 0.73397982, "num_input_tokens_seen": 106960295, "step": 4968, "time_per_iteration": 2.663163900375366 }, { "auxiliary_loss_clip": 0.01124763, "auxiliary_loss_mlp": 0.01043588, "balance_loss_clip": 1.04864979, "balance_loss_mlp": 1.02545607, "epoch": 0.2987524425071396, "flos": 31650228639360.0, "grad_norm": 2.159507035183752, "language_loss": 0.76744419, "learning_rate": 3.2889698726063805e-06, "loss": 0.78912771, "num_input_tokens_seen": 106982870, "step": 4969, "time_per_iteration": 2.729922294616699 }, { "auxiliary_loss_clip": 0.0114364, "auxiliary_loss_mlp": 0.01036255, "balance_loss_clip": 1.05239987, "balance_loss_mlp": 1.02054322, "epoch": 0.2988125657598076, "flos": 21433427775360.0, "grad_norm": 2.2724385668179936, "language_loss": 0.69836891, "learning_rate": 3.2886720591442327e-06, "loss": 0.72016788, "num_input_tokens_seen": 107002405, "step": 4970, "time_per_iteration": 2.6299381256103516 }, { "auxiliary_loss_clip": 0.01135061, "auxiliary_loss_mlp": 0.01048009, "balance_loss_clip": 1.05199289, "balance_loss_mlp": 1.02973413, "epoch": 0.2988726890124756, "flos": 18076965292800.0, "grad_norm": 2.0648779209654258, "language_loss": 0.85228848, "learning_rate": 3.2883741968155103e-06, "loss": 0.87411916, "num_input_tokens_seen": 107017310, "step": 4971, "time_per_iteration": 2.6508536338806152 }, { "auxiliary_loss_clip": 0.01112297, "auxiliary_loss_mlp": 0.01054091, "balance_loss_clip": 1.04895663, "balance_loss_mlp": 1.03510106, "epoch": 0.29893281226514357, "flos": 21755653706880.0, "grad_norm": 2.125047221260382, "language_loss": 0.79404521, "learning_rate": 3.2880762856315107e-06, "loss": 0.81570905, "num_input_tokens_seen": 107034645, "step": 4972, "time_per_iteration": 2.7924270629882812 }, { "auxiliary_loss_clip": 0.01145651, "auxiliary_loss_mlp": 0.01050789, "balance_loss_clip": 1.05367875, "balance_loss_mlp": 1.03427887, "epoch": 0.29899293551781153, "flos": 16836718538880.0, "grad_norm": 2.200462139835186, "language_loss": 0.85242772, "learning_rate": 3.2877783256035285e-06, "loss": 0.87439215, "num_input_tokens_seen": 107051125, "step": 4973, "time_per_iteration": 2.5249850749969482 }, { "auxiliary_loss_clip": 0.011108, "auxiliary_loss_mlp": 0.0104405, "balance_loss_clip": 1.04758012, "balance_loss_mlp": 1.02664554, "epoch": 0.2990530587704795, "flos": 11729215946880.0, "grad_norm": 2.0029664307268664, "language_loss": 0.77612329, "learning_rate": 3.287480316742863e-06, "loss": 0.79767179, "num_input_tokens_seen": 107068815, "step": 4974, "time_per_iteration": 2.6555633544921875 }, { "auxiliary_loss_clip": 0.01115732, "auxiliary_loss_mlp": 0.00779073, "balance_loss_clip": 1.04864824, "balance_loss_mlp": 1.00132942, "epoch": 0.29911318202314746, "flos": 28039877850240.0, "grad_norm": 1.735885031779611, "language_loss": 0.72557616, "learning_rate": 3.287182259060815e-06, "loss": 0.74452424, "num_input_tokens_seen": 107090420, "step": 4975, "time_per_iteration": 2.826773166656494 }, { "auxiliary_loss_clip": 0.01137332, "auxiliary_loss_mlp": 0.01043625, "balance_loss_clip": 1.05628741, "balance_loss_mlp": 1.02561235, "epoch": 0.2991733052758154, "flos": 18733555952640.0, "grad_norm": 2.282255680734404, "language_loss": 0.76357341, "learning_rate": 3.286884152568687e-06, "loss": 0.78538299, "num_input_tokens_seen": 107107255, "step": 4976, "time_per_iteration": 2.7506988048553467 }, { "auxiliary_loss_clip": 0.01130399, "auxiliary_loss_mlp": 0.01046525, "balance_loss_clip": 1.0515976, "balance_loss_mlp": 1.02988303, "epoch": 0.2992334285284834, "flos": 15559160532480.0, "grad_norm": 2.005019372487673, "language_loss": 0.86173046, "learning_rate": 3.2865859972777827e-06, "loss": 0.88349968, "num_input_tokens_seen": 107123840, "step": 4977, "time_per_iteration": 2.665029764175415 }, { "auxiliary_loss_clip": 0.01118345, "auxiliary_loss_mlp": 0.01041325, "balance_loss_clip": 1.05032945, "balance_loss_mlp": 1.02443314, "epoch": 0.29929355178115136, "flos": 21797561900160.0, "grad_norm": 1.7658271873172786, "language_loss": 0.68290305, "learning_rate": 3.2862877931994088e-06, "loss": 0.70449972, "num_input_tokens_seen": 107143475, "step": 4978, "time_per_iteration": 2.8401222229003906 }, { "auxiliary_loss_clip": 0.011259, "auxiliary_loss_mlp": 0.0104045, "balance_loss_clip": 1.05556107, "balance_loss_mlp": 1.02268767, "epoch": 0.2993536750338193, "flos": 21178533888000.0, "grad_norm": 2.254262103488659, "language_loss": 0.76281357, "learning_rate": 3.2859895403448726e-06, "loss": 0.78447711, "num_input_tokens_seen": 107161725, "step": 4979, "time_per_iteration": 2.7814600467681885 }, { "auxiliary_loss_clip": 0.01090165, "auxiliary_loss_mlp": 0.0104942, "balance_loss_clip": 1.04378402, "balance_loss_mlp": 1.03001285, "epoch": 0.2994137982864873, "flos": 32122130544000.0, "grad_norm": 2.1261514095664253, "language_loss": 0.68627954, "learning_rate": 3.285691238725484e-06, "loss": 0.70767546, "num_input_tokens_seen": 107183935, "step": 4980, "time_per_iteration": 2.891620635986328 }, { "auxiliary_loss_clip": 0.01130184, "auxiliary_loss_mlp": 0.00774942, "balance_loss_clip": 1.0525018, "balance_loss_mlp": 1.00121665, "epoch": 0.29947392153915525, "flos": 21105419754240.0, "grad_norm": 2.1372298066204114, "language_loss": 0.73153281, "learning_rate": 3.285392888352555e-06, "loss": 0.75058407, "num_input_tokens_seen": 107204285, "step": 4981, "time_per_iteration": 5.394481420516968 }, { "auxiliary_loss_clip": 0.01131964, "auxiliary_loss_mlp": 0.0103921, "balance_loss_clip": 1.0491364, "balance_loss_mlp": 1.02280653, "epoch": 0.2995340447918232, "flos": 21542632099200.0, "grad_norm": 1.6530173596529, "language_loss": 0.86516619, "learning_rate": 3.2850944892373987e-06, "loss": 0.88687789, "num_input_tokens_seen": 107225265, "step": 4982, "time_per_iteration": 4.269104480743408 }, { "auxiliary_loss_clip": 0.01122605, "auxiliary_loss_mlp": 0.01045235, "balance_loss_clip": 1.05186415, "balance_loss_mlp": 1.02632844, "epoch": 0.2995941680444912, "flos": 16725143917440.0, "grad_norm": 2.446225936700185, "language_loss": 0.86517423, "learning_rate": 3.2847960413913307e-06, "loss": 0.88685262, "num_input_tokens_seen": 107241335, "step": 4983, "time_per_iteration": 2.844748020172119 }, { "auxiliary_loss_clip": 0.01127565, "auxiliary_loss_mlp": 0.01041992, "balance_loss_clip": 1.05255556, "balance_loss_mlp": 1.02594662, "epoch": 0.2996542912971592, "flos": 20923496346240.0, "grad_norm": 2.024163877740881, "language_loss": 0.78712893, "learning_rate": 3.284497544825668e-06, "loss": 0.80882448, "num_input_tokens_seen": 107259375, "step": 4984, "time_per_iteration": 2.6945550441741943 }, { "auxiliary_loss_clip": 0.01110139, "auxiliary_loss_mlp": 0.01046002, "balance_loss_clip": 1.0492574, "balance_loss_mlp": 1.02761972, "epoch": 0.29971441454982717, "flos": 25079868754560.0, "grad_norm": 1.5529534411437271, "language_loss": 0.78736818, "learning_rate": 3.2841989995517303e-06, "loss": 0.8089295, "num_input_tokens_seen": 107279890, "step": 4985, "time_per_iteration": 2.8082690238952637 }, { "auxiliary_loss_clip": 0.01083189, "auxiliary_loss_mlp": 0.01050178, "balance_loss_clip": 1.04330277, "balance_loss_mlp": 1.02925658, "epoch": 0.29977453780249513, "flos": 52555911840000.0, "grad_norm": 2.2301347819864112, "language_loss": 0.72089684, "learning_rate": 3.283900405580837e-06, "loss": 0.74223053, "num_input_tokens_seen": 107303430, "step": 4986, "time_per_iteration": 4.54891562461853 }, { "auxiliary_loss_clip": 0.01119419, "auxiliary_loss_mlp": 0.01047564, "balance_loss_clip": 1.04838538, "balance_loss_mlp": 1.03007603, "epoch": 0.2998346610551631, "flos": 22237144542720.0, "grad_norm": 2.1453051702670787, "language_loss": 0.73143345, "learning_rate": 3.283601762924312e-06, "loss": 0.75310332, "num_input_tokens_seen": 107323700, "step": 4987, "time_per_iteration": 4.324375152587891 }, { "auxiliary_loss_clip": 0.01111213, "auxiliary_loss_mlp": 0.01039103, "balance_loss_clip": 1.04803324, "balance_loss_mlp": 1.0233314, "epoch": 0.29989478430783106, "flos": 16873203778560.0, "grad_norm": 2.095598578062247, "language_loss": 0.80221194, "learning_rate": 3.2833030715934793e-06, "loss": 0.82371509, "num_input_tokens_seen": 107341965, "step": 4988, "time_per_iteration": 2.772221565246582 }, { "auxiliary_loss_clip": 0.01114945, "auxiliary_loss_mlp": 0.00777889, "balance_loss_clip": 1.04905486, "balance_loss_mlp": 1.0013597, "epoch": 0.29995490756049903, "flos": 23768878164480.0, "grad_norm": 1.6966696236855432, "language_loss": 0.70858777, "learning_rate": 3.2830043315996658e-06, "loss": 0.72751617, "num_input_tokens_seen": 107362615, "step": 4989, "time_per_iteration": 2.7470130920410156 }, { "auxiliary_loss_clip": 0.0110827, "auxiliary_loss_mlp": 0.01046589, "balance_loss_clip": 1.0506041, "balance_loss_mlp": 1.02906489, "epoch": 0.300015030813167, "flos": 14465321614080.0, "grad_norm": 1.9545100728262668, "language_loss": 0.85589516, "learning_rate": 3.282705542954199e-06, "loss": 0.87744367, "num_input_tokens_seen": 107378980, "step": 4990, "time_per_iteration": 2.808276414871216 }, { "auxiliary_loss_clip": 0.01133569, "auxiliary_loss_mlp": 0.0103974, "balance_loss_clip": 1.05172086, "balance_loss_mlp": 1.02152538, "epoch": 0.30007515406583496, "flos": 25191982080000.0, "grad_norm": 1.8023870470649808, "language_loss": 0.67019355, "learning_rate": 3.28240670566841e-06, "loss": 0.69192666, "num_input_tokens_seen": 107397640, "step": 4991, "time_per_iteration": 2.7097268104553223 }, { "auxiliary_loss_clip": 0.0112021, "auxiliary_loss_mlp": 0.01041383, "balance_loss_clip": 1.04660511, "balance_loss_mlp": 1.02248883, "epoch": 0.3001352773185029, "flos": 19391188106880.0, "grad_norm": 1.684252307124257, "language_loss": 0.78640115, "learning_rate": 3.28210781975363e-06, "loss": 0.80801708, "num_input_tokens_seen": 107416020, "step": 4992, "time_per_iteration": 2.66925311088562 }, { "auxiliary_loss_clip": 0.01143243, "auxiliary_loss_mlp": 0.01041924, "balance_loss_clip": 1.05240428, "balance_loss_mlp": 1.02457952, "epoch": 0.3001954005711709, "flos": 21543853161600.0, "grad_norm": 2.3134173579188175, "language_loss": 0.82057947, "learning_rate": 3.281808885221193e-06, "loss": 0.84243113, "num_input_tokens_seen": 107436340, "step": 4993, "time_per_iteration": 2.613849639892578 }, { "auxiliary_loss_clip": 0.01096023, "auxiliary_loss_mlp": 0.01048917, "balance_loss_clip": 1.04667079, "balance_loss_mlp": 1.02997458, "epoch": 0.30025552382383885, "flos": 17384320356480.0, "grad_norm": 2.1042579138834197, "language_loss": 0.86142659, "learning_rate": 3.2815099020824345e-06, "loss": 0.88287598, "num_input_tokens_seen": 107454585, "step": 4994, "time_per_iteration": 2.703126907348633 }, { "auxiliary_loss_clip": 0.01118329, "auxiliary_loss_mlp": 0.01041975, "balance_loss_clip": 1.05592799, "balance_loss_mlp": 1.02504694, "epoch": 0.3003156470765068, "flos": 29533330552320.0, "grad_norm": 1.5905866784601752, "language_loss": 0.80834931, "learning_rate": 3.2812108703486924e-06, "loss": 0.82995236, "num_input_tokens_seen": 107477180, "step": 4995, "time_per_iteration": 2.8100333213806152 }, { "auxiliary_loss_clip": 0.01117939, "auxiliary_loss_mlp": 0.01043612, "balance_loss_clip": 1.05073023, "balance_loss_mlp": 1.02623129, "epoch": 0.3003757703291748, "flos": 43646402465280.0, "grad_norm": 1.9490007813217745, "language_loss": 0.67086798, "learning_rate": 3.2809117900313055e-06, "loss": 0.69248348, "num_input_tokens_seen": 107500250, "step": 4996, "time_per_iteration": 2.989062786102295 }, { "auxiliary_loss_clip": 0.01114657, "auxiliary_loss_mlp": 0.01042055, "balance_loss_clip": 1.04888701, "balance_loss_mlp": 1.02449584, "epoch": 0.30043589358184275, "flos": 22528380015360.0, "grad_norm": 4.4692930536610245, "language_loss": 0.75825363, "learning_rate": 3.280612661141615e-06, "loss": 0.7798208, "num_input_tokens_seen": 107520070, "step": 4997, "time_per_iteration": 2.733402967453003 }, { "auxiliary_loss_clip": 0.01131118, "auxiliary_loss_mlp": 0.0104737, "balance_loss_clip": 1.05176449, "balance_loss_mlp": 1.03149128, "epoch": 0.30049601683451077, "flos": 20995892208000.0, "grad_norm": 2.0588160995259197, "language_loss": 0.78425241, "learning_rate": 3.2803134836909646e-06, "loss": 0.80603731, "num_input_tokens_seen": 107539285, "step": 4998, "time_per_iteration": 2.7973837852478027 }, { "auxiliary_loss_clip": 0.011392, "auxiliary_loss_mlp": 0.01044927, "balance_loss_clip": 1.05180395, "balance_loss_mlp": 1.0287745, "epoch": 0.30055614008717874, "flos": 23916004272000.0, "grad_norm": 18.871291300313036, "language_loss": 0.73622382, "learning_rate": 3.2800142576906985e-06, "loss": 0.7580651, "num_input_tokens_seen": 107560260, "step": 4999, "time_per_iteration": 2.7197916507720947 }, { "auxiliary_loss_clip": 0.01131684, "auxiliary_loss_mlp": 0.01044515, "balance_loss_clip": 1.05033612, "balance_loss_mlp": 1.02750361, "epoch": 0.3006162633398467, "flos": 19169798630400.0, "grad_norm": 1.6090337016392804, "language_loss": 0.75454789, "learning_rate": 3.2797149831521626e-06, "loss": 0.77630985, "num_input_tokens_seen": 107579260, "step": 5000, "time_per_iteration": 2.688054323196411 }, { "auxiliary_loss_clip": 0.01138443, "auxiliary_loss_mlp": 0.01041074, "balance_loss_clip": 1.0505259, "balance_loss_mlp": 1.02564812, "epoch": 0.30067638659251467, "flos": 14679241061760.0, "grad_norm": 1.7985326326547535, "language_loss": 0.81841409, "learning_rate": 3.2794156600867073e-06, "loss": 0.84020931, "num_input_tokens_seen": 107595245, "step": 5001, "time_per_iteration": 2.6519837379455566 }, { "auxiliary_loss_clip": 0.01128756, "auxiliary_loss_mlp": 0.01048602, "balance_loss_clip": 1.05139947, "balance_loss_mlp": 1.03068447, "epoch": 0.30073650984518263, "flos": 23368007404800.0, "grad_norm": 1.8684342377814658, "language_loss": 0.7999261, "learning_rate": 3.2791162885056815e-06, "loss": 0.82169974, "num_input_tokens_seen": 107613985, "step": 5002, "time_per_iteration": 2.6749327182769775 }, { "auxiliary_loss_clip": 0.01091983, "auxiliary_loss_mlp": 0.0104282, "balance_loss_clip": 1.04869151, "balance_loss_mlp": 1.02431834, "epoch": 0.3007966330978506, "flos": 22966633854720.0, "grad_norm": 1.9577039368374018, "language_loss": 0.70993537, "learning_rate": 3.2788168684204376e-06, "loss": 0.73128337, "num_input_tokens_seen": 107631435, "step": 5003, "time_per_iteration": 2.908494472503662 }, { "auxiliary_loss_clip": 0.01110546, "auxiliary_loss_mlp": 0.01043883, "balance_loss_clip": 1.05014396, "balance_loss_mlp": 1.02643037, "epoch": 0.30085675635051856, "flos": 27818452460160.0, "grad_norm": 1.956987555909332, "language_loss": 0.70556092, "learning_rate": 3.27851739984233e-06, "loss": 0.72710526, "num_input_tokens_seen": 107650530, "step": 5004, "time_per_iteration": 2.8064236640930176 }, { "auxiliary_loss_clip": 0.01119172, "auxiliary_loss_mlp": 0.01045143, "balance_loss_clip": 1.05067444, "balance_loss_mlp": 1.02800083, "epoch": 0.3009168796031865, "flos": 10882729059840.0, "grad_norm": 2.8453259041050805, "language_loss": 0.81459486, "learning_rate": 3.278217882782715e-06, "loss": 0.83623803, "num_input_tokens_seen": 107662240, "step": 5005, "time_per_iteration": 2.633951425552368 }, { "auxiliary_loss_clip": 0.01130639, "auxiliary_loss_mlp": 0.01043853, "balance_loss_clip": 1.0514015, "balance_loss_mlp": 1.02742577, "epoch": 0.3009770028558545, "flos": 23805399317760.0, "grad_norm": 3.7156546302240043, "language_loss": 0.74672973, "learning_rate": 3.2779183172529497e-06, "loss": 0.76847464, "num_input_tokens_seen": 107680330, "step": 5006, "time_per_iteration": 2.7556662559509277 }, { "auxiliary_loss_clip": 0.01101239, "auxiliary_loss_mlp": 0.00775371, "balance_loss_clip": 1.04850578, "balance_loss_mlp": 1.00104856, "epoch": 0.30103712610852246, "flos": 26468211283200.0, "grad_norm": 2.0504029481480153, "language_loss": 0.71090448, "learning_rate": 3.2776187032643932e-06, "loss": 0.72967064, "num_input_tokens_seen": 107700020, "step": 5007, "time_per_iteration": 2.83591365814209 }, { "auxiliary_loss_clip": 0.01129575, "auxiliary_loss_mlp": 0.01038114, "balance_loss_clip": 1.05173922, "balance_loss_mlp": 1.0206027, "epoch": 0.3010972493611904, "flos": 22856459863680.0, "grad_norm": 2.302333802055736, "language_loss": 0.76504552, "learning_rate": 3.2773190408284075e-06, "loss": 0.78672242, "num_input_tokens_seen": 107718575, "step": 5008, "time_per_iteration": 2.7624082565307617 }, { "auxiliary_loss_clip": 0.0112694, "auxiliary_loss_mlp": 0.01039735, "balance_loss_clip": 1.05119205, "balance_loss_mlp": 1.02284265, "epoch": 0.3011573726138584, "flos": 24053685102720.0, "grad_norm": 1.840633361886899, "language_loss": 0.84215975, "learning_rate": 3.2770193299563564e-06, "loss": 0.86382657, "num_input_tokens_seen": 107738635, "step": 5009, "time_per_iteration": 2.7053475379943848 }, { "auxiliary_loss_clip": 0.01135722, "auxiliary_loss_mlp": 0.0104281, "balance_loss_clip": 1.05079174, "balance_loss_mlp": 1.02389145, "epoch": 0.30121749586652635, "flos": 20259687052800.0, "grad_norm": 1.970244045667646, "language_loss": 0.83804011, "learning_rate": 3.276719570659604e-06, "loss": 0.85982549, "num_input_tokens_seen": 107753415, "step": 5010, "time_per_iteration": 2.677002429962158 }, { "auxiliary_loss_clip": 0.01108582, "auxiliary_loss_mlp": 0.01038214, "balance_loss_clip": 1.04942024, "balance_loss_mlp": 1.02294374, "epoch": 0.3012776191191944, "flos": 26943058103040.0, "grad_norm": 2.3216326772862246, "language_loss": 0.85401523, "learning_rate": 3.2764197629495176e-06, "loss": 0.87548327, "num_input_tokens_seen": 107773840, "step": 5011, "time_per_iteration": 2.807887077331543 }, { "auxiliary_loss_clip": 0.01119452, "auxiliary_loss_mlp": 0.01044648, "balance_loss_clip": 1.04522014, "balance_loss_mlp": 1.02680194, "epoch": 0.30133774237186234, "flos": 20412307941120.0, "grad_norm": 2.58081844210284, "language_loss": 0.72122502, "learning_rate": 3.2761199068374656e-06, "loss": 0.74286604, "num_input_tokens_seen": 107792020, "step": 5012, "time_per_iteration": 2.689375400543213 }, { "auxiliary_loss_clip": 0.01127162, "auxiliary_loss_mlp": 0.01042946, "balance_loss_clip": 1.04826403, "balance_loss_mlp": 1.02628016, "epoch": 0.3013978656245303, "flos": 19792453916160.0, "grad_norm": 2.871668468467944, "language_loss": 0.88278735, "learning_rate": 3.275820002334819e-06, "loss": 0.90448833, "num_input_tokens_seen": 107809595, "step": 5013, "time_per_iteration": 2.6482350826263428 }, { "auxiliary_loss_clip": 0.01110184, "auxiliary_loss_mlp": 0.01050326, "balance_loss_clip": 1.04318821, "balance_loss_mlp": 1.0286417, "epoch": 0.30145798887719827, "flos": 16249650652800.0, "grad_norm": 1.8756845710135603, "language_loss": 0.82593644, "learning_rate": 3.2755200494529496e-06, "loss": 0.84754151, "num_input_tokens_seen": 107827230, "step": 5014, "time_per_iteration": 2.6681008338928223 }, { "auxiliary_loss_clip": 0.01092673, "auxiliary_loss_mlp": 0.01047692, "balance_loss_clip": 1.04461288, "balance_loss_mlp": 1.03045392, "epoch": 0.30151811212986623, "flos": 24571733005440.0, "grad_norm": 1.7101695757694795, "language_loss": 0.68239003, "learning_rate": 3.2752200482032323e-06, "loss": 0.7037937, "num_input_tokens_seen": 107847195, "step": 5015, "time_per_iteration": 2.725411891937256 }, { "auxiliary_loss_clip": 0.01110447, "auxiliary_loss_mlp": 0.01043819, "balance_loss_clip": 1.0448432, "balance_loss_mlp": 1.02652168, "epoch": 0.3015782353825342, "flos": 21872076664320.0, "grad_norm": 2.2766913154728625, "language_loss": 0.74497074, "learning_rate": 3.2749199985970436e-06, "loss": 0.76651341, "num_input_tokens_seen": 107866420, "step": 5016, "time_per_iteration": 2.710721492767334 }, { "auxiliary_loss_clip": 0.01133464, "auxiliary_loss_mlp": 0.01041604, "balance_loss_clip": 1.05026031, "balance_loss_mlp": 1.02444994, "epoch": 0.30163835863520216, "flos": 28769331248640.0, "grad_norm": 1.7847015072033203, "language_loss": 0.65504754, "learning_rate": 3.2746199006457603e-06, "loss": 0.67679822, "num_input_tokens_seen": 107889090, "step": 5017, "time_per_iteration": 2.7239317893981934 }, { "auxiliary_loss_clip": 0.01091977, "auxiliary_loss_mlp": 0.01057247, "balance_loss_clip": 1.04233074, "balance_loss_mlp": 1.03813791, "epoch": 0.30169848188787013, "flos": 22966202891520.0, "grad_norm": 2.1696927992492783, "language_loss": 0.68739498, "learning_rate": 3.2743197543607628e-06, "loss": 0.70888722, "num_input_tokens_seen": 107907520, "step": 5018, "time_per_iteration": 2.6655359268188477 }, { "auxiliary_loss_clip": 0.01135218, "auxiliary_loss_mlp": 0.01042787, "balance_loss_clip": 1.0482893, "balance_loss_mlp": 1.02783799, "epoch": 0.3017586051405381, "flos": 21835268202240.0, "grad_norm": 1.9457029488983892, "language_loss": 0.78853333, "learning_rate": 3.2740195597534327e-06, "loss": 0.8103134, "num_input_tokens_seen": 107925650, "step": 5019, "time_per_iteration": 2.669679641723633 }, { "auxiliary_loss_clip": 0.01112458, "auxiliary_loss_mlp": 0.01044161, "balance_loss_clip": 1.04863656, "balance_loss_mlp": 1.02766263, "epoch": 0.30181872839320606, "flos": 22160403135360.0, "grad_norm": 3.674249330665847, "language_loss": 0.70038712, "learning_rate": 3.2737193168351527e-06, "loss": 0.72195333, "num_input_tokens_seen": 107943975, "step": 5020, "time_per_iteration": 2.704000234603882 }, { "auxiliary_loss_clip": 0.01143422, "auxiliary_loss_mlp": 0.01049684, "balance_loss_clip": 1.05071819, "balance_loss_mlp": 1.03320909, "epoch": 0.301878851645874, "flos": 18114168804480.0, "grad_norm": 5.641410405732297, "language_loss": 0.78549969, "learning_rate": 3.2734190256173085e-06, "loss": 0.80743068, "num_input_tokens_seen": 107962950, "step": 5021, "time_per_iteration": 4.521278142929077 }, { "auxiliary_loss_clip": 0.01129372, "auxiliary_loss_mlp": 0.01031797, "balance_loss_clip": 1.04859924, "balance_loss_mlp": 1.01572752, "epoch": 0.301938974898542, "flos": 17602226213760.0, "grad_norm": 3.308202374048827, "language_loss": 0.75482392, "learning_rate": 3.2731186861112877e-06, "loss": 0.77643561, "num_input_tokens_seen": 107979700, "step": 5022, "time_per_iteration": 4.1478235721588135 }, { "auxiliary_loss_clip": 0.01141828, "auxiliary_loss_mlp": 0.01043797, "balance_loss_clip": 1.04905522, "balance_loss_mlp": 1.02676249, "epoch": 0.30199909815120995, "flos": 11181219079680.0, "grad_norm": 1.7715139184612991, "language_loss": 0.69534874, "learning_rate": 3.2728182983284793e-06, "loss": 0.71720505, "num_input_tokens_seen": 107996645, "step": 5023, "time_per_iteration": 2.582491636276245 }, { "auxiliary_loss_clip": 0.01112614, "auxiliary_loss_mlp": 0.01040881, "balance_loss_clip": 1.04434311, "balance_loss_mlp": 1.02471602, "epoch": 0.302059221403878, "flos": 21907843632000.0, "grad_norm": 4.128865002464027, "language_loss": 0.71400636, "learning_rate": 3.2725178622802724e-06, "loss": 0.73554134, "num_input_tokens_seen": 108015020, "step": 5024, "time_per_iteration": 2.6789708137512207 }, { "auxiliary_loss_clip": 0.01125475, "auxiliary_loss_mlp": 0.01051317, "balance_loss_clip": 1.04789031, "balance_loss_mlp": 1.03441346, "epoch": 0.30211934465654594, "flos": 26396390039040.0, "grad_norm": 2.5352325664815396, "language_loss": 0.73949707, "learning_rate": 3.272217377978061e-06, "loss": 0.76126498, "num_input_tokens_seen": 108036430, "step": 5025, "time_per_iteration": 2.7021281719207764 }, { "auxiliary_loss_clip": 0.01129438, "auxiliary_loss_mlp": 0.01049255, "balance_loss_clip": 1.05115628, "balance_loss_mlp": 1.03333473, "epoch": 0.3021794679092139, "flos": 23400470321280.0, "grad_norm": 1.5312912087399582, "language_loss": 0.67339373, "learning_rate": 3.2719168454332387e-06, "loss": 0.69518065, "num_input_tokens_seen": 108054250, "step": 5026, "time_per_iteration": 4.172817230224609 }, { "auxiliary_loss_clip": 0.01131398, "auxiliary_loss_mlp": 0.01045765, "balance_loss_clip": 1.05058789, "balance_loss_mlp": 1.02871835, "epoch": 0.30223959116188187, "flos": 20260979942400.0, "grad_norm": 1.8656003857402752, "language_loss": 0.84821522, "learning_rate": 3.2716162646572034e-06, "loss": 0.86998689, "num_input_tokens_seen": 108071495, "step": 5027, "time_per_iteration": 2.66186785697937 }, { "auxiliary_loss_clip": 0.01104085, "auxiliary_loss_mlp": 0.01045706, "balance_loss_clip": 1.04686451, "balance_loss_mlp": 1.03030431, "epoch": 0.30229971441454984, "flos": 26687840993280.0, "grad_norm": 1.633485895123786, "language_loss": 0.78574622, "learning_rate": 3.271315635661351e-06, "loss": 0.80724418, "num_input_tokens_seen": 108092135, "step": 5028, "time_per_iteration": 4.454678297042847 }, { "auxiliary_loss_clip": 0.01113383, "auxiliary_loss_mlp": 0.01048022, "balance_loss_clip": 1.04682207, "balance_loss_mlp": 1.03115392, "epoch": 0.3023598376672178, "flos": 34345323953280.0, "grad_norm": 1.9340935936746968, "language_loss": 0.77085543, "learning_rate": 3.2710149584570826e-06, "loss": 0.79246956, "num_input_tokens_seen": 108112945, "step": 5029, "time_per_iteration": 2.841707229614258 }, { "auxiliary_loss_clip": 0.01111921, "auxiliary_loss_mlp": 0.01048937, "balance_loss_clip": 1.04846191, "balance_loss_mlp": 1.02920818, "epoch": 0.30241996091988577, "flos": 23112143850240.0, "grad_norm": 2.1432001376374257, "language_loss": 0.8240397, "learning_rate": 3.2707142330557993e-06, "loss": 0.84564829, "num_input_tokens_seen": 108130325, "step": 5030, "time_per_iteration": 2.8557751178741455 }, { "auxiliary_loss_clip": 0.01090897, "auxiliary_loss_mlp": 0.00775419, "balance_loss_clip": 1.04519463, "balance_loss_mlp": 1.00112486, "epoch": 0.30248008417255373, "flos": 19390002958080.0, "grad_norm": 2.2374457582531098, "language_loss": 0.6987617, "learning_rate": 3.270413459468905e-06, "loss": 0.71742487, "num_input_tokens_seen": 108150300, "step": 5031, "time_per_iteration": 2.7827746868133545 }, { "auxiliary_loss_clip": 0.01121676, "auxiliary_loss_mlp": 0.01044463, "balance_loss_clip": 1.04549253, "balance_loss_mlp": 1.02800059, "epoch": 0.3025402074252217, "flos": 23769704177280.0, "grad_norm": 1.8685207024800563, "language_loss": 0.82324117, "learning_rate": 3.2701126377078047e-06, "loss": 0.84490258, "num_input_tokens_seen": 108170330, "step": 5032, "time_per_iteration": 2.6529927253723145 }, { "auxiliary_loss_clip": 0.01104945, "auxiliary_loss_mlp": 0.01059072, "balance_loss_clip": 1.05129266, "balance_loss_mlp": 1.03951025, "epoch": 0.30260033067788966, "flos": 25994118648960.0, "grad_norm": 2.130148669813867, "language_loss": 0.73156881, "learning_rate": 3.269811767783906e-06, "loss": 0.75320899, "num_input_tokens_seen": 108191265, "step": 5033, "time_per_iteration": 2.7259597778320312 }, { "auxiliary_loss_clip": 0.01124221, "auxiliary_loss_mlp": 0.01049397, "balance_loss_clip": 1.04687023, "balance_loss_mlp": 1.03221893, "epoch": 0.3026604539305576, "flos": 25374551932800.0, "grad_norm": 1.564237149834404, "language_loss": 0.74164939, "learning_rate": 3.2695108497086185e-06, "loss": 0.76338559, "num_input_tokens_seen": 108211615, "step": 5034, "time_per_iteration": 2.674745798110962 }, { "auxiliary_loss_clip": 0.01140313, "auxiliary_loss_mlp": 0.01039121, "balance_loss_clip": 1.04939198, "balance_loss_mlp": 1.02224064, "epoch": 0.3027205771832256, "flos": 25812733944960.0, "grad_norm": 1.8295549596836873, "language_loss": 0.72133434, "learning_rate": 3.269209883493352e-06, "loss": 0.74312872, "num_input_tokens_seen": 108231080, "step": 5035, "time_per_iteration": 2.6429855823516846 }, { "auxiliary_loss_clip": 0.01123118, "auxiliary_loss_mlp": 0.01038432, "balance_loss_clip": 1.04499483, "balance_loss_mlp": 1.02267289, "epoch": 0.30278070043589356, "flos": 27344539393920.0, "grad_norm": 2.468501372591198, "language_loss": 0.86918867, "learning_rate": 3.2689088691495196e-06, "loss": 0.89080417, "num_input_tokens_seen": 108251125, "step": 5036, "time_per_iteration": 2.6735007762908936 }, { "auxiliary_loss_clip": 0.01097642, "auxiliary_loss_mlp": 0.01051442, "balance_loss_clip": 1.04504728, "balance_loss_mlp": 1.0331912, "epoch": 0.3028408236885616, "flos": 24786227070720.0, "grad_norm": 2.859596651876304, "language_loss": 0.77406383, "learning_rate": 3.268607806688536e-06, "loss": 0.79555464, "num_input_tokens_seen": 108272545, "step": 5037, "time_per_iteration": 2.7311182022094727 }, { "auxiliary_loss_clip": 0.01102304, "auxiliary_loss_mlp": 0.01044604, "balance_loss_clip": 1.0462358, "balance_loss_mlp": 1.02683008, "epoch": 0.30290094694122954, "flos": 12932474670720.0, "grad_norm": 2.32450780354164, "language_loss": 0.77307165, "learning_rate": 3.268306696121816e-06, "loss": 0.79454064, "num_input_tokens_seen": 108289725, "step": 5038, "time_per_iteration": 2.677525043487549 }, { "auxiliary_loss_clip": 0.01113965, "auxiliary_loss_mlp": 0.01037105, "balance_loss_clip": 1.04819584, "balance_loss_mlp": 1.02067804, "epoch": 0.3029610701938975, "flos": 25916443488000.0, "grad_norm": 2.1234468188232976, "language_loss": 0.74140579, "learning_rate": 3.2680055374607804e-06, "loss": 0.76291645, "num_input_tokens_seen": 108310690, "step": 5039, "time_per_iteration": 2.7086853981018066 }, { "auxiliary_loss_clip": 0.01137739, "auxiliary_loss_mlp": 0.00774651, "balance_loss_clip": 1.05068994, "balance_loss_mlp": 1.00113058, "epoch": 0.3030211934465655, "flos": 21980993679360.0, "grad_norm": 2.3826017374700372, "language_loss": 0.79777801, "learning_rate": 3.267704330716847e-06, "loss": 0.81690192, "num_input_tokens_seen": 108328905, "step": 5040, "time_per_iteration": 2.665175199508667 }, { "auxiliary_loss_clip": 0.01114198, "auxiliary_loss_mlp": 0.01038229, "balance_loss_clip": 1.04937124, "balance_loss_mlp": 1.02279687, "epoch": 0.30308131669923344, "flos": 20991977625600.0, "grad_norm": 1.7800027985776907, "language_loss": 0.81872481, "learning_rate": 3.267403075901438e-06, "loss": 0.84024912, "num_input_tokens_seen": 108346680, "step": 5041, "time_per_iteration": 2.6471712589263916 }, { "auxiliary_loss_clip": 0.01018002, "auxiliary_loss_mlp": 0.01004656, "balance_loss_clip": 1.0244385, "balance_loss_mlp": 1.00277221, "epoch": 0.3031414399519014, "flos": 60548875827840.0, "grad_norm": 0.7715538683836823, "language_loss": 0.59505904, "learning_rate": 3.267101773025978e-06, "loss": 0.61528552, "num_input_tokens_seen": 108413885, "step": 5042, "time_per_iteration": 3.3167309761047363 }, { "auxiliary_loss_clip": 0.0114486, "auxiliary_loss_mlp": 0.01036647, "balance_loss_clip": 1.05319929, "balance_loss_mlp": 1.01940918, "epoch": 0.30320156320456937, "flos": 21907664064000.0, "grad_norm": 1.838538817411587, "language_loss": 0.71149278, "learning_rate": 3.266800422101892e-06, "loss": 0.73330784, "num_input_tokens_seen": 108433640, "step": 5043, "time_per_iteration": 2.6266753673553467 }, { "auxiliary_loss_clip": 0.01095086, "auxiliary_loss_mlp": 0.01036293, "balance_loss_clip": 1.04519725, "balance_loss_mlp": 1.01948404, "epoch": 0.30326168645723733, "flos": 21652770176640.0, "grad_norm": 3.620919115388089, "language_loss": 0.69573802, "learning_rate": 3.266499023140606e-06, "loss": 0.71705186, "num_input_tokens_seen": 108452640, "step": 5044, "time_per_iteration": 2.7561492919921875 }, { "auxiliary_loss_clip": 0.01127659, "auxiliary_loss_mlp": 0.01039805, "balance_loss_clip": 1.05019724, "balance_loss_mlp": 1.02335382, "epoch": 0.3033218097099053, "flos": 21871286565120.0, "grad_norm": 1.3797061223764004, "language_loss": 0.77188826, "learning_rate": 3.2661975761535513e-06, "loss": 0.79356289, "num_input_tokens_seen": 108472470, "step": 5045, "time_per_iteration": 2.6529667377471924 }, { "auxiliary_loss_clip": 0.01141388, "auxiliary_loss_mlp": 0.00775246, "balance_loss_clip": 1.05165195, "balance_loss_mlp": 1.00136316, "epoch": 0.30338193296257326, "flos": 27089717333760.0, "grad_norm": 1.772786200303907, "language_loss": 0.72473782, "learning_rate": 3.2658960811521564e-06, "loss": 0.74390417, "num_input_tokens_seen": 108493025, "step": 5046, "time_per_iteration": 2.8433380126953125 }, { "auxiliary_loss_clip": 0.01131475, "auxiliary_loss_mlp": 0.01040342, "balance_loss_clip": 1.04979491, "balance_loss_mlp": 1.02119732, "epoch": 0.30344205621524123, "flos": 19534363718400.0, "grad_norm": 1.7729778222487513, "language_loss": 0.81406343, "learning_rate": 3.2655945381478564e-06, "loss": 0.83578163, "num_input_tokens_seen": 108513480, "step": 5047, "time_per_iteration": 2.6653506755828857 }, { "auxiliary_loss_clip": 0.01078955, "auxiliary_loss_mlp": 0.01042974, "balance_loss_clip": 1.04126537, "balance_loss_mlp": 1.02565265, "epoch": 0.3035021794679092, "flos": 23910976368000.0, "grad_norm": 2.0012909108595287, "language_loss": 0.7191782, "learning_rate": 3.265292947152084e-06, "loss": 0.74039751, "num_input_tokens_seen": 108533155, "step": 5048, "time_per_iteration": 2.7198410034179688 }, { "auxiliary_loss_clip": 0.01117557, "auxiliary_loss_mlp": 0.01037944, "balance_loss_clip": 1.04860258, "balance_loss_mlp": 1.02263796, "epoch": 0.30356230272057716, "flos": 16143606725760.0, "grad_norm": 1.6260333435769418, "language_loss": 0.75220919, "learning_rate": 3.2649913081762763e-06, "loss": 0.77376425, "num_input_tokens_seen": 108551900, "step": 5049, "time_per_iteration": 2.6649906635284424 }, { "auxiliary_loss_clip": 0.01131404, "auxiliary_loss_mlp": 0.01035526, "balance_loss_clip": 1.04947305, "balance_loss_mlp": 1.01907563, "epoch": 0.3036224259732452, "flos": 28914697589760.0, "grad_norm": 1.5855456549340856, "language_loss": 0.82088244, "learning_rate": 3.2646896212318717e-06, "loss": 0.84255171, "num_input_tokens_seen": 108574005, "step": 5050, "time_per_iteration": 2.657400131225586 }, { "auxiliary_loss_clip": 0.01106158, "auxiliary_loss_mlp": 0.0103828, "balance_loss_clip": 1.05031502, "balance_loss_mlp": 1.02079201, "epoch": 0.30368254922591315, "flos": 21105599322240.0, "grad_norm": 2.7844840544166436, "language_loss": 0.74196702, "learning_rate": 3.2643878863303106e-06, "loss": 0.7634114, "num_input_tokens_seen": 108592715, "step": 5051, "time_per_iteration": 2.8018569946289062 }, { "auxiliary_loss_clip": 0.01079332, "auxiliary_loss_mlp": 0.00775567, "balance_loss_clip": 1.04338145, "balance_loss_mlp": 1.00118661, "epoch": 0.3037426724785811, "flos": 23002293081600.0, "grad_norm": 1.6849730779493737, "language_loss": 0.76015687, "learning_rate": 3.264086103483033e-06, "loss": 0.77870589, "num_input_tokens_seen": 108611770, "step": 5052, "time_per_iteration": 2.9220657348632812 }, { "auxiliary_loss_clip": 0.01143047, "auxiliary_loss_mlp": 0.01043624, "balance_loss_clip": 1.0504849, "balance_loss_mlp": 1.02656555, "epoch": 0.3038027957312491, "flos": 15632705629440.0, "grad_norm": 2.421175308310746, "language_loss": 0.82370055, "learning_rate": 3.2637842727014836e-06, "loss": 0.84556723, "num_input_tokens_seen": 108629070, "step": 5053, "time_per_iteration": 2.5955326557159424 }, { "auxiliary_loss_clip": 0.01113702, "auxiliary_loss_mlp": 0.01042002, "balance_loss_clip": 1.0471338, "balance_loss_mlp": 1.02475214, "epoch": 0.30386291898391704, "flos": 12713994195840.0, "grad_norm": 1.8307418288785484, "language_loss": 0.70979112, "learning_rate": 3.2634823939971083e-06, "loss": 0.73134822, "num_input_tokens_seen": 108646315, "step": 5054, "time_per_iteration": 2.7001569271087646 }, { "auxiliary_loss_clip": 0.01140964, "auxiliary_loss_mlp": 0.01039805, "balance_loss_clip": 1.05088401, "balance_loss_mlp": 1.0225668, "epoch": 0.303923042236585, "flos": 26359437922560.0, "grad_norm": 2.314538095600907, "language_loss": 0.69049591, "learning_rate": 3.2631804673813545e-06, "loss": 0.71230358, "num_input_tokens_seen": 108665920, "step": 5055, "time_per_iteration": 2.6685287952423096 }, { "auxiliary_loss_clip": 0.01113325, "auxiliary_loss_mlp": 0.01036352, "balance_loss_clip": 1.04871488, "balance_loss_mlp": 1.01880479, "epoch": 0.30398316548925297, "flos": 19719232041600.0, "grad_norm": 1.959915959447654, "language_loss": 0.67298615, "learning_rate": 3.2628784928656707e-06, "loss": 0.69448292, "num_input_tokens_seen": 108683485, "step": 5056, "time_per_iteration": 2.6933648586273193 }, { "auxiliary_loss_clip": 0.01110454, "auxiliary_loss_mlp": 0.01043223, "balance_loss_clip": 1.04604077, "balance_loss_mlp": 1.02673686, "epoch": 0.30404328874192094, "flos": 24239846315520.0, "grad_norm": 1.7045430221851803, "language_loss": 0.82544303, "learning_rate": 3.262576470461507e-06, "loss": 0.84697986, "num_input_tokens_seen": 108702700, "step": 5057, "time_per_iteration": 2.740187406539917 }, { "auxiliary_loss_clip": 0.01115402, "auxiliary_loss_mlp": 0.01039139, "balance_loss_clip": 1.04719019, "balance_loss_mlp": 1.0222472, "epoch": 0.3041034119945889, "flos": 24498942094080.0, "grad_norm": 1.8459128585017135, "language_loss": 0.88849652, "learning_rate": 3.2622744001803176e-06, "loss": 0.91004193, "num_input_tokens_seen": 108721860, "step": 5058, "time_per_iteration": 2.7015340328216553 }, { "auxiliary_loss_clip": 0.01102971, "auxiliary_loss_mlp": 0.01047692, "balance_loss_clip": 1.04598641, "balance_loss_mlp": 1.03040063, "epoch": 0.30416353524725687, "flos": 28288881907200.0, "grad_norm": 7.837576661900421, "language_loss": 0.71809238, "learning_rate": 3.2619722820335564e-06, "loss": 0.73959899, "num_input_tokens_seen": 108743215, "step": 5059, "time_per_iteration": 2.7542827129364014 }, { "auxiliary_loss_clip": 0.01083101, "auxiliary_loss_mlp": 0.01042605, "balance_loss_clip": 1.04435182, "balance_loss_mlp": 1.02670228, "epoch": 0.30422365849992483, "flos": 23660392112640.0, "grad_norm": 2.424944175434462, "language_loss": 0.73316336, "learning_rate": 3.26167011603268e-06, "loss": 0.7544204, "num_input_tokens_seen": 108765505, "step": 5060, "time_per_iteration": 4.655209541320801 }, { "auxiliary_loss_clip": 0.01140365, "auxiliary_loss_mlp": 0.01038221, "balance_loss_clip": 1.05072367, "balance_loss_mlp": 1.02234221, "epoch": 0.3042837817525928, "flos": 22998773548800.0, "grad_norm": 2.6284704346086, "language_loss": 0.77279079, "learning_rate": 3.2613679021891463e-06, "loss": 0.79457664, "num_input_tokens_seen": 108783370, "step": 5061, "time_per_iteration": 4.1857099533081055 }, { "auxiliary_loss_clip": 0.01105214, "auxiliary_loss_mlp": 0.01039505, "balance_loss_clip": 1.05216312, "balance_loss_mlp": 1.02225542, "epoch": 0.30434390500526076, "flos": 22082332924800.0, "grad_norm": 1.9238999634605745, "language_loss": 0.81891274, "learning_rate": 3.261065640514415e-06, "loss": 0.84035993, "num_input_tokens_seen": 108797430, "step": 5062, "time_per_iteration": 2.7250373363494873 }, { "auxiliary_loss_clip": 0.01132809, "auxiliary_loss_mlp": 0.01036348, "balance_loss_clip": 1.04662633, "balance_loss_mlp": 1.02098203, "epoch": 0.3044040282579287, "flos": 25483504861440.0, "grad_norm": 1.8479376829176948, "language_loss": 0.74707627, "learning_rate": 3.2607633310199483e-06, "loss": 0.76876783, "num_input_tokens_seen": 108816945, "step": 5063, "time_per_iteration": 2.6387155055999756 }, { "auxiliary_loss_clip": 0.01126143, "auxiliary_loss_mlp": 0.00775405, "balance_loss_clip": 1.04923415, "balance_loss_mlp": 1.00135541, "epoch": 0.30446415151059675, "flos": 21945478106880.0, "grad_norm": 1.691336757602503, "language_loss": 0.84400523, "learning_rate": 3.26046097371721e-06, "loss": 0.86302078, "num_input_tokens_seen": 108836615, "step": 5064, "time_per_iteration": 2.645256519317627 }, { "auxiliary_loss_clip": 0.01125608, "auxiliary_loss_mlp": 0.01040172, "balance_loss_clip": 1.04725182, "balance_loss_mlp": 1.02311337, "epoch": 0.3045242747632647, "flos": 16435416816000.0, "grad_norm": 2.198572989748056, "language_loss": 0.76257896, "learning_rate": 3.2601585686176655e-06, "loss": 0.78423673, "num_input_tokens_seen": 108855165, "step": 5065, "time_per_iteration": 4.119553565979004 }, { "auxiliary_loss_clip": 0.01110206, "auxiliary_loss_mlp": 0.01043438, "balance_loss_clip": 1.04441273, "balance_loss_mlp": 1.0260098, "epoch": 0.3045843980159327, "flos": 31540341957120.0, "grad_norm": 1.985168773674731, "language_loss": 0.62328786, "learning_rate": 3.2598561157327814e-06, "loss": 0.64482433, "num_input_tokens_seen": 108874690, "step": 5066, "time_per_iteration": 4.380331516265869 }, { "auxiliary_loss_clip": 0.01112307, "auxiliary_loss_mlp": 0.0104907, "balance_loss_clip": 1.04790235, "balance_loss_mlp": 1.03186774, "epoch": 0.30464452126860064, "flos": 17853636481920.0, "grad_norm": 2.188592288059769, "language_loss": 0.83193344, "learning_rate": 3.2595536150740265e-06, "loss": 0.85354722, "num_input_tokens_seen": 108893140, "step": 5067, "time_per_iteration": 2.628598213195801 }, { "auxiliary_loss_clip": 0.01136833, "auxiliary_loss_mlp": 0.01045137, "balance_loss_clip": 1.04994464, "balance_loss_mlp": 1.02904344, "epoch": 0.3047046445212686, "flos": 20631398947200.0, "grad_norm": 4.883769852075586, "language_loss": 0.62878895, "learning_rate": 3.259251066652873e-06, "loss": 0.65060866, "num_input_tokens_seen": 108911880, "step": 5068, "time_per_iteration": 2.583193302154541 }, { "auxiliary_loss_clip": 0.01127244, "auxiliary_loss_mlp": 0.01039272, "balance_loss_clip": 1.04866779, "balance_loss_mlp": 1.02316117, "epoch": 0.3047647677739366, "flos": 21287594557440.0, "grad_norm": 4.297243307498397, "language_loss": 0.74780715, "learning_rate": 3.258948470480793e-06, "loss": 0.7694723, "num_input_tokens_seen": 108930440, "step": 5069, "time_per_iteration": 2.643608570098877 }, { "auxiliary_loss_clip": 0.01103787, "auxiliary_loss_mlp": 0.01045252, "balance_loss_clip": 1.04608154, "balance_loss_mlp": 1.02922475, "epoch": 0.30482489102660454, "flos": 20995928121600.0, "grad_norm": 1.9753352797934713, "language_loss": 0.75726902, "learning_rate": 3.258645826569261e-06, "loss": 0.77875942, "num_input_tokens_seen": 108949125, "step": 5070, "time_per_iteration": 2.715672016143799 }, { "auxiliary_loss_clip": 0.01140483, "auxiliary_loss_mlp": 0.0077507, "balance_loss_clip": 1.04843533, "balance_loss_mlp": 1.0012939, "epoch": 0.3048850142792725, "flos": 26290812988800.0, "grad_norm": 1.7281078039111346, "language_loss": 0.81636953, "learning_rate": 3.2583431349297527e-06, "loss": 0.83552504, "num_input_tokens_seen": 108972190, "step": 5071, "time_per_iteration": 2.635542869567871 }, { "auxiliary_loss_clip": 0.01108476, "auxiliary_loss_mlp": 0.01045674, "balance_loss_clip": 1.04286063, "balance_loss_mlp": 1.02776885, "epoch": 0.30494513753194047, "flos": 22346241125760.0, "grad_norm": 2.0085610287172173, "language_loss": 0.76208484, "learning_rate": 3.2580403955737467e-06, "loss": 0.78362632, "num_input_tokens_seen": 108990325, "step": 5072, "time_per_iteration": 2.6662180423736572 }, { "auxiliary_loss_clip": 0.01099158, "auxiliary_loss_mlp": 0.01044752, "balance_loss_clip": 1.04694605, "balance_loss_mlp": 1.02821743, "epoch": 0.30500526078460843, "flos": 19537667769600.0, "grad_norm": 1.8424983506970039, "language_loss": 0.70873296, "learning_rate": 3.257737608512723e-06, "loss": 0.7301721, "num_input_tokens_seen": 109009505, "step": 5073, "time_per_iteration": 2.815281867980957 }, { "auxiliary_loss_clip": 0.01133011, "auxiliary_loss_mlp": 0.01055026, "balance_loss_clip": 1.05032837, "balance_loss_mlp": 1.03757334, "epoch": 0.3050653840372764, "flos": 14465321614080.0, "grad_norm": 2.0666195830085434, "language_loss": 0.76370406, "learning_rate": 3.257434773758163e-06, "loss": 0.78558439, "num_input_tokens_seen": 109026350, "step": 5074, "time_per_iteration": 2.748568534851074 }, { "auxiliary_loss_clip": 0.01115721, "auxiliary_loss_mlp": 0.01037599, "balance_loss_clip": 1.04921389, "balance_loss_mlp": 1.02149391, "epoch": 0.30512550728994436, "flos": 24243796811520.0, "grad_norm": 1.8649350467458667, "language_loss": 0.74393201, "learning_rate": 3.25713189132155e-06, "loss": 0.76546526, "num_input_tokens_seen": 109044165, "step": 5075, "time_per_iteration": 2.7015154361724854 }, { "auxiliary_loss_clip": 0.01141745, "auxiliary_loss_mlp": 0.01047345, "balance_loss_clip": 1.0498178, "balance_loss_mlp": 1.02825916, "epoch": 0.30518563054261233, "flos": 16360542915840.0, "grad_norm": 2.030111139920667, "language_loss": 0.75904357, "learning_rate": 3.2568289612143703e-06, "loss": 0.78093445, "num_input_tokens_seen": 109060665, "step": 5076, "time_per_iteration": 2.5811965465545654 }, { "auxiliary_loss_clip": 0.01116901, "auxiliary_loss_mlp": 0.01040641, "balance_loss_clip": 1.04864156, "balance_loss_mlp": 1.02466679, "epoch": 0.30524575379528035, "flos": 21579584215680.0, "grad_norm": 1.6479970241835653, "language_loss": 0.79240596, "learning_rate": 3.25652598344811e-06, "loss": 0.81398141, "num_input_tokens_seen": 109080035, "step": 5077, "time_per_iteration": 2.680205821990967 }, { "auxiliary_loss_clip": 0.01087088, "auxiliary_loss_mlp": 0.01033699, "balance_loss_clip": 1.04356635, "balance_loss_mlp": 1.01881564, "epoch": 0.3053058770479483, "flos": 16545231671040.0, "grad_norm": 1.6765288024346336, "language_loss": 0.74525034, "learning_rate": 3.256222958034259e-06, "loss": 0.76645821, "num_input_tokens_seen": 109097385, "step": 5078, "time_per_iteration": 2.7247111797332764 }, { "auxiliary_loss_clip": 0.01085086, "auxiliary_loss_mlp": 0.01054049, "balance_loss_clip": 1.04356313, "balance_loss_mlp": 1.03728211, "epoch": 0.3053660003006163, "flos": 12312907954560.0, "grad_norm": 1.7442741256404064, "language_loss": 0.66648543, "learning_rate": 3.255919884984307e-06, "loss": 0.68787676, "num_input_tokens_seen": 109115495, "step": 5079, "time_per_iteration": 2.746490716934204 }, { "auxiliary_loss_clip": 0.01127155, "auxiliary_loss_mlp": 0.01040504, "balance_loss_clip": 1.04811811, "balance_loss_mlp": 1.0248282, "epoch": 0.30542612355328425, "flos": 23112287504640.0, "grad_norm": 2.3583709354228213, "language_loss": 0.79841697, "learning_rate": 3.2556167643097477e-06, "loss": 0.82009357, "num_input_tokens_seen": 109134235, "step": 5080, "time_per_iteration": 2.7156612873077393 }, { "auxiliary_loss_clip": 0.01124116, "auxiliary_loss_mlp": 0.00772863, "balance_loss_clip": 1.04919219, "balance_loss_mlp": 1.00125837, "epoch": 0.3054862468059522, "flos": 24389450461440.0, "grad_norm": 2.2636550763480074, "language_loss": 0.81280053, "learning_rate": 3.255313596022074e-06, "loss": 0.8317703, "num_input_tokens_seen": 109152760, "step": 5081, "time_per_iteration": 2.6763248443603516 }, { "auxiliary_loss_clip": 0.01120003, "auxiliary_loss_mlp": 0.01044443, "balance_loss_clip": 1.04644883, "balance_loss_mlp": 1.02843297, "epoch": 0.3055463700586202, "flos": 29386096704000.0, "grad_norm": 7.924214405919456, "language_loss": 0.71839154, "learning_rate": 3.255010380132783e-06, "loss": 0.74003601, "num_input_tokens_seen": 109173925, "step": 5082, "time_per_iteration": 2.7159903049468994 }, { "auxiliary_loss_clip": 0.0112721, "auxiliary_loss_mlp": 0.01043614, "balance_loss_clip": 1.04611564, "balance_loss_mlp": 1.02554226, "epoch": 0.30560649331128814, "flos": 25591775431680.0, "grad_norm": 2.25447896755926, "language_loss": 0.73108822, "learning_rate": 3.2547071166533736e-06, "loss": 0.75279647, "num_input_tokens_seen": 109192510, "step": 5083, "time_per_iteration": 2.646739959716797 }, { "auxiliary_loss_clip": 0.01107487, "auxiliary_loss_mlp": 0.00775151, "balance_loss_clip": 1.04263341, "balance_loss_mlp": 1.00127327, "epoch": 0.3056666165639561, "flos": 19128321400320.0, "grad_norm": 1.7470718607902291, "language_loss": 0.71378291, "learning_rate": 3.254403805595344e-06, "loss": 0.73260927, "num_input_tokens_seen": 109210885, "step": 5084, "time_per_iteration": 2.6846230030059814 }, { "auxiliary_loss_clip": 0.01099017, "auxiliary_loss_mlp": 0.01047221, "balance_loss_clip": 1.04366112, "balance_loss_mlp": 1.02929187, "epoch": 0.30572673981662407, "flos": 15523860441600.0, "grad_norm": 1.8852357422602322, "language_loss": 0.78966236, "learning_rate": 3.2541004469701962e-06, "loss": 0.81112474, "num_input_tokens_seen": 109229180, "step": 5085, "time_per_iteration": 2.7193636894226074 }, { "auxiliary_loss_clip": 0.01130512, "auxiliary_loss_mlp": 0.01034677, "balance_loss_clip": 1.04483652, "balance_loss_mlp": 1.01910806, "epoch": 0.30578686306929204, "flos": 21506541909120.0, "grad_norm": 1.9742516674355037, "language_loss": 0.78476739, "learning_rate": 3.2537970407894342e-06, "loss": 0.80641937, "num_input_tokens_seen": 109249510, "step": 5086, "time_per_iteration": 2.5860135555267334 }, { "auxiliary_loss_clip": 0.01103374, "auxiliary_loss_mlp": 0.01052848, "balance_loss_clip": 1.04314184, "balance_loss_mlp": 1.03509736, "epoch": 0.30584698632196, "flos": 20954271323520.0, "grad_norm": 1.8682002339545791, "language_loss": 0.76727784, "learning_rate": 3.253493587064563e-06, "loss": 0.78884006, "num_input_tokens_seen": 109268200, "step": 5087, "time_per_iteration": 2.732639789581299 }, { "auxiliary_loss_clip": 0.01125241, "auxiliary_loss_mlp": 0.01041401, "balance_loss_clip": 1.04509556, "balance_loss_mlp": 1.02450943, "epoch": 0.30590710957462797, "flos": 24681116897280.0, "grad_norm": 2.048016576932303, "language_loss": 0.72534674, "learning_rate": 3.2531900858070885e-06, "loss": 0.74701315, "num_input_tokens_seen": 109288370, "step": 5088, "time_per_iteration": 2.66654109954834 }, { "auxiliary_loss_clip": 0.01128516, "auxiliary_loss_mlp": 0.01043444, "balance_loss_clip": 1.04584277, "balance_loss_mlp": 1.02587295, "epoch": 0.30596723282729593, "flos": 17086907744640.0, "grad_norm": 2.359735204382993, "language_loss": 0.79327172, "learning_rate": 3.252886537028521e-06, "loss": 0.8149913, "num_input_tokens_seen": 109306730, "step": 5089, "time_per_iteration": 2.613231897354126 }, { "auxiliary_loss_clip": 0.01110444, "auxiliary_loss_mlp": 0.01041514, "balance_loss_clip": 1.04634953, "balance_loss_mlp": 1.02470577, "epoch": 0.30602735607996395, "flos": 22857106308480.0, "grad_norm": 1.8271327477144206, "language_loss": 0.77158219, "learning_rate": 3.2525829407403703e-06, "loss": 0.79310179, "num_input_tokens_seen": 109327360, "step": 5090, "time_per_iteration": 2.7469358444213867 }, { "auxiliary_loss_clip": 0.01116264, "auxiliary_loss_mlp": 0.01050158, "balance_loss_clip": 1.04506445, "balance_loss_mlp": 1.03317034, "epoch": 0.3060874793326319, "flos": 29861482227840.0, "grad_norm": 1.7853121536190235, "language_loss": 0.76108491, "learning_rate": 3.2522792969541488e-06, "loss": 0.78274912, "num_input_tokens_seen": 109348135, "step": 5091, "time_per_iteration": 2.7344727516174316 }, { "auxiliary_loss_clip": 0.01076722, "auxiliary_loss_mlp": 0.01049007, "balance_loss_clip": 1.04582906, "balance_loss_mlp": 1.02905178, "epoch": 0.3061476025852999, "flos": 20448577699200.0, "grad_norm": 1.9985396703734173, "language_loss": 0.71938324, "learning_rate": 3.2519756056813705e-06, "loss": 0.74064058, "num_input_tokens_seen": 109366220, "step": 5092, "time_per_iteration": 2.767212390899658 }, { "auxiliary_loss_clip": 0.01114871, "auxiliary_loss_mlp": 0.01040516, "balance_loss_clip": 1.04740167, "balance_loss_mlp": 1.0246855, "epoch": 0.30620772583796785, "flos": 19391475415680.0, "grad_norm": 3.231748461445431, "language_loss": 0.82655406, "learning_rate": 3.2516718669335522e-06, "loss": 0.84810787, "num_input_tokens_seen": 109385260, "step": 5093, "time_per_iteration": 2.705643892288208 }, { "auxiliary_loss_clip": 0.01136927, "auxiliary_loss_mlp": 0.00773786, "balance_loss_clip": 1.04842925, "balance_loss_mlp": 1.00142932, "epoch": 0.3062678490906358, "flos": 24024562151040.0, "grad_norm": 1.6185046249293755, "language_loss": 0.75340986, "learning_rate": 3.2513680807222114e-06, "loss": 0.77251703, "num_input_tokens_seen": 109405025, "step": 5094, "time_per_iteration": 2.6171963214874268 }, { "auxiliary_loss_clip": 0.01112613, "auxiliary_loss_mlp": 0.01042135, "balance_loss_clip": 1.04798305, "balance_loss_mlp": 1.02639914, "epoch": 0.3063279723433038, "flos": 19754639873280.0, "grad_norm": 2.1053112950674824, "language_loss": 0.75988996, "learning_rate": 3.251064247058868e-06, "loss": 0.7814374, "num_input_tokens_seen": 109422465, "step": 5095, "time_per_iteration": 2.7002673149108887 }, { "auxiliary_loss_clip": 0.0112272, "auxiliary_loss_mlp": 0.01043966, "balance_loss_clip": 1.04654729, "balance_loss_mlp": 1.0278492, "epoch": 0.30638809559597174, "flos": 22450022496000.0, "grad_norm": 8.237851994820396, "language_loss": 0.80608332, "learning_rate": 3.250760365955042e-06, "loss": 0.82775021, "num_input_tokens_seen": 109440575, "step": 5096, "time_per_iteration": 2.675551414489746 }, { "auxiliary_loss_clip": 0.01125431, "auxiliary_loss_mlp": 0.01036388, "balance_loss_clip": 1.04639602, "balance_loss_mlp": 1.02030659, "epoch": 0.3064482188486397, "flos": 17165157523200.0, "grad_norm": 3.1166257890970566, "language_loss": 0.81695235, "learning_rate": 3.250456437422258e-06, "loss": 0.83857059, "num_input_tokens_seen": 109459050, "step": 5097, "time_per_iteration": 2.6616358757019043 }, { "auxiliary_loss_clip": 0.01138165, "auxiliary_loss_mlp": 0.01042971, "balance_loss_clip": 1.04782009, "balance_loss_mlp": 1.02522099, "epoch": 0.3065083421013077, "flos": 23768483114880.0, "grad_norm": 2.1722798378639663, "language_loss": 0.78152639, "learning_rate": 3.250152461472041e-06, "loss": 0.80333775, "num_input_tokens_seen": 109475860, "step": 5098, "time_per_iteration": 2.581339120864868 }, { "auxiliary_loss_clip": 0.01093696, "auxiliary_loss_mlp": 0.01039814, "balance_loss_clip": 1.04763365, "balance_loss_mlp": 1.02302897, "epoch": 0.30656846535397564, "flos": 26431833784320.0, "grad_norm": 1.8342329708039284, "language_loss": 0.84488571, "learning_rate": 3.249848438115917e-06, "loss": 0.86622083, "num_input_tokens_seen": 109494760, "step": 5099, "time_per_iteration": 2.761580467224121 }, { "auxiliary_loss_clip": 0.0113763, "auxiliary_loss_mlp": 0.01044142, "balance_loss_clip": 1.04598331, "balance_loss_mlp": 1.02683902, "epoch": 0.3066285886066436, "flos": 26651786716800.0, "grad_norm": 1.7645297710058767, "language_loss": 0.85650218, "learning_rate": 3.2495443673654148e-06, "loss": 0.87831986, "num_input_tokens_seen": 109516480, "step": 5100, "time_per_iteration": 4.130753517150879 }, { "auxiliary_loss_clip": 0.01099546, "auxiliary_loss_mlp": 0.01040494, "balance_loss_clip": 1.04097986, "balance_loss_mlp": 1.02268374, "epoch": 0.30668871185931157, "flos": 15049947375360.0, "grad_norm": 1.8121599631247622, "language_loss": 0.78980827, "learning_rate": 3.249240249232065e-06, "loss": 0.81120867, "num_input_tokens_seen": 109534615, "step": 5101, "time_per_iteration": 4.324965000152588 }, { "auxiliary_loss_clip": 0.01102347, "auxiliary_loss_mlp": 0.01054476, "balance_loss_clip": 1.04654586, "balance_loss_mlp": 1.03549778, "epoch": 0.30674883511197953, "flos": 20082109190400.0, "grad_norm": 3.103169454759946, "language_loss": 0.8002606, "learning_rate": 3.2489360837273998e-06, "loss": 0.82182884, "num_input_tokens_seen": 109554040, "step": 5102, "time_per_iteration": 2.6799395084381104 }, { "auxiliary_loss_clip": 0.01142197, "auxiliary_loss_mlp": 0.01041215, "balance_loss_clip": 1.05097044, "balance_loss_mlp": 1.02254653, "epoch": 0.30680895836464755, "flos": 22893807029760.0, "grad_norm": 2.1213785434731416, "language_loss": 0.88774347, "learning_rate": 3.2486318708629532e-06, "loss": 0.90957761, "num_input_tokens_seen": 109574345, "step": 5103, "time_per_iteration": 2.65173077583313 }, { "auxiliary_loss_clip": 0.01117159, "auxiliary_loss_mlp": 0.01047865, "balance_loss_clip": 1.04379106, "balance_loss_mlp": 1.03051972, "epoch": 0.3068690816173155, "flos": 23696159080320.0, "grad_norm": 1.7904968866721789, "language_loss": 0.73977435, "learning_rate": 3.2483276106502607e-06, "loss": 0.7614246, "num_input_tokens_seen": 109593670, "step": 5104, "time_per_iteration": 4.15887975692749 }, { "auxiliary_loss_clip": 0.01124364, "auxiliary_loss_mlp": 0.00776702, "balance_loss_clip": 1.04378068, "balance_loss_mlp": 1.00128829, "epoch": 0.3069292048699835, "flos": 23551044134400.0, "grad_norm": 3.7241561762804496, "language_loss": 0.72777617, "learning_rate": 3.2480233031008605e-06, "loss": 0.74678683, "num_input_tokens_seen": 109613385, "step": 5105, "time_per_iteration": 2.657212972640991 }, { "auxiliary_loss_clip": 0.01112354, "auxiliary_loss_mlp": 0.01041782, "balance_loss_clip": 1.0451684, "balance_loss_mlp": 1.02401972, "epoch": 0.30698932812265145, "flos": 24531656405760.0, "grad_norm": 1.9297281358185925, "language_loss": 0.87290782, "learning_rate": 3.2477189482262916e-06, "loss": 0.89444917, "num_input_tokens_seen": 109632395, "step": 5106, "time_per_iteration": 4.409428119659424 }, { "auxiliary_loss_clip": 0.0110831, "auxiliary_loss_mlp": 0.01052851, "balance_loss_clip": 1.04540682, "balance_loss_mlp": 1.03390849, "epoch": 0.3070494513753194, "flos": 20996430912000.0, "grad_norm": 2.254355123120303, "language_loss": 0.71420276, "learning_rate": 3.2474145460380945e-06, "loss": 0.73581433, "num_input_tokens_seen": 109651380, "step": 5107, "time_per_iteration": 2.7320871353149414 }, { "auxiliary_loss_clip": 0.01101295, "auxiliary_loss_mlp": 0.0104767, "balance_loss_clip": 1.04618347, "balance_loss_mlp": 1.03034878, "epoch": 0.3071095746279874, "flos": 19025940660480.0, "grad_norm": 2.1230574515432705, "language_loss": 0.72282934, "learning_rate": 3.247110096547814e-06, "loss": 0.74431896, "num_input_tokens_seen": 109670240, "step": 5108, "time_per_iteration": 2.720196485519409 }, { "auxiliary_loss_clip": 0.01112658, "auxiliary_loss_mlp": 0.01040837, "balance_loss_clip": 1.04619241, "balance_loss_mlp": 1.02325416, "epoch": 0.30716969788065535, "flos": 21215521918080.0, "grad_norm": 3.0053852764205695, "language_loss": 0.8601433, "learning_rate": 3.2468055997669926e-06, "loss": 0.88167822, "num_input_tokens_seen": 109690810, "step": 5109, "time_per_iteration": 2.715580940246582 }, { "auxiliary_loss_clip": 0.01109383, "auxiliary_loss_mlp": 0.01036759, "balance_loss_clip": 1.04432368, "balance_loss_mlp": 1.02017736, "epoch": 0.3072298211333233, "flos": 25772765086080.0, "grad_norm": 1.7463183423202828, "language_loss": 0.67169911, "learning_rate": 3.2465010557071788e-06, "loss": 0.69316053, "num_input_tokens_seen": 109711145, "step": 5110, "time_per_iteration": 2.7133336067199707 }, { "auxiliary_loss_clip": 0.01126653, "auxiliary_loss_mlp": 0.01033414, "balance_loss_clip": 1.04854119, "balance_loss_mlp": 1.01736796, "epoch": 0.3072899443859913, "flos": 25848931875840.0, "grad_norm": 1.4548971516988844, "language_loss": 0.76673061, "learning_rate": 3.246196464379919e-06, "loss": 0.78833127, "num_input_tokens_seen": 109731425, "step": 5111, "time_per_iteration": 2.692505121231079 }, { "auxiliary_loss_clip": 0.01140411, "auxiliary_loss_mlp": 0.0103997, "balance_loss_clip": 1.04979658, "balance_loss_mlp": 1.02360249, "epoch": 0.30735006763865924, "flos": 25922800195200.0, "grad_norm": 3.7694679470365244, "language_loss": 0.67143333, "learning_rate": 3.245891825796765e-06, "loss": 0.69323719, "num_input_tokens_seen": 109752720, "step": 5112, "time_per_iteration": 2.6441125869750977 }, { "auxiliary_loss_clip": 0.01133822, "auxiliary_loss_mlp": 0.01044497, "balance_loss_clip": 1.05147326, "balance_loss_mlp": 1.02482784, "epoch": 0.3074101908913272, "flos": 30917004312960.0, "grad_norm": 2.062737517485213, "language_loss": 0.79524493, "learning_rate": 3.2455871399692678e-06, "loss": 0.81702805, "num_input_tokens_seen": 109772840, "step": 5113, "time_per_iteration": 2.7166647911071777 }, { "auxiliary_loss_clip": 0.01102438, "auxiliary_loss_mlp": 0.00774651, "balance_loss_clip": 1.04638815, "balance_loss_mlp": 1.00138378, "epoch": 0.30747031414399517, "flos": 18401058731520.0, "grad_norm": 2.08885217843665, "language_loss": 0.76926446, "learning_rate": 3.2452824069089815e-06, "loss": 0.78803539, "num_input_tokens_seen": 109790150, "step": 5114, "time_per_iteration": 2.6842217445373535 }, { "auxiliary_loss_clip": 0.01100955, "auxiliary_loss_mlp": 0.01034415, "balance_loss_clip": 1.0446732, "balance_loss_mlp": 1.01589036, "epoch": 0.30753043739666314, "flos": 22633166966400.0, "grad_norm": 2.179333764681939, "language_loss": 0.62607706, "learning_rate": 3.2449776266274623e-06, "loss": 0.64743078, "num_input_tokens_seen": 109807985, "step": 5115, "time_per_iteration": 2.7709848880767822 }, { "auxiliary_loss_clip": 0.0113067, "auxiliary_loss_mlp": 0.01041883, "balance_loss_clip": 1.04829907, "balance_loss_mlp": 1.02557516, "epoch": 0.3075905606493311, "flos": 27344072517120.0, "grad_norm": 2.4707888757665684, "language_loss": 0.82835108, "learning_rate": 3.2446727991362657e-06, "loss": 0.85007656, "num_input_tokens_seen": 109825920, "step": 5116, "time_per_iteration": 2.6891255378723145 }, { "auxiliary_loss_clip": 0.01115169, "auxiliary_loss_mlp": 0.01050095, "balance_loss_clip": 1.04928303, "balance_loss_mlp": 1.03291702, "epoch": 0.3076506839019991, "flos": 22090808534400.0, "grad_norm": 1.792550086960714, "language_loss": 0.75943851, "learning_rate": 3.244367924446952e-06, "loss": 0.78109109, "num_input_tokens_seen": 109846220, "step": 5117, "time_per_iteration": 2.6685919761657715 }, { "auxiliary_loss_clip": 0.01096356, "auxiliary_loss_mlp": 0.010422, "balance_loss_clip": 1.04583359, "balance_loss_mlp": 1.02309084, "epoch": 0.3077108071546671, "flos": 21289533891840.0, "grad_norm": 2.509228810910763, "language_loss": 0.71450555, "learning_rate": 3.2440630025710826e-06, "loss": 0.7358911, "num_input_tokens_seen": 109863870, "step": 5118, "time_per_iteration": 2.7360472679138184 }, { "auxiliary_loss_clip": 0.0109679, "auxiliary_loss_mlp": 0.01040047, "balance_loss_clip": 1.05069757, "balance_loss_mlp": 1.02279758, "epoch": 0.30777093040733505, "flos": 21430985650560.0, "grad_norm": 1.6950758291291428, "language_loss": 0.74499059, "learning_rate": 3.243758033520219e-06, "loss": 0.76635897, "num_input_tokens_seen": 109883500, "step": 5119, "time_per_iteration": 2.7963552474975586 }, { "auxiliary_loss_clip": 0.01133391, "auxiliary_loss_mlp": 0.01054336, "balance_loss_clip": 1.05088997, "balance_loss_mlp": 1.03520322, "epoch": 0.307831053660003, "flos": 23149275534720.0, "grad_norm": 2.3083726349779785, "language_loss": 0.79968077, "learning_rate": 3.243453017305926e-06, "loss": 0.821558, "num_input_tokens_seen": 109904620, "step": 5120, "time_per_iteration": 2.7600536346435547 }, { "auxiliary_loss_clip": 0.01127117, "auxiliary_loss_mlp": 0.01045491, "balance_loss_clip": 1.04772663, "balance_loss_mlp": 1.02994657, "epoch": 0.307891176912671, "flos": 17019755268480.0, "grad_norm": 1.7119475154385397, "language_loss": 0.79864663, "learning_rate": 3.24314795393977e-06, "loss": 0.8203727, "num_input_tokens_seen": 109922275, "step": 5121, "time_per_iteration": 2.6204211711883545 }, { "auxiliary_loss_clip": 0.01105091, "auxiliary_loss_mlp": 0.01039616, "balance_loss_clip": 1.04669154, "balance_loss_mlp": 1.02292657, "epoch": 0.30795130016533895, "flos": 27705046245120.0, "grad_norm": 1.4682711249191758, "language_loss": 0.82526803, "learning_rate": 3.242842843433319e-06, "loss": 0.84671509, "num_input_tokens_seen": 109944265, "step": 5122, "time_per_iteration": 2.7210805416107178 }, { "auxiliary_loss_clip": 0.01052784, "auxiliary_loss_mlp": 0.01010188, "balance_loss_clip": 1.03048515, "balance_loss_mlp": 1.00826919, "epoch": 0.3080114234180069, "flos": 69058699591680.0, "grad_norm": 0.7449761063336078, "language_loss": 0.58609217, "learning_rate": 3.242537685798143e-06, "loss": 0.60672188, "num_input_tokens_seen": 110014160, "step": 5123, "time_per_iteration": 3.303093433380127 }, { "auxiliary_loss_clip": 0.01133855, "auxiliary_loss_mlp": 0.00776294, "balance_loss_clip": 1.04937184, "balance_loss_mlp": 1.00136161, "epoch": 0.3080715466706749, "flos": 24060221377920.0, "grad_norm": 1.5927838238117058, "language_loss": 0.83550704, "learning_rate": 3.242232481045813e-06, "loss": 0.85460854, "num_input_tokens_seen": 110034865, "step": 5124, "time_per_iteration": 2.7226438522338867 }, { "auxiliary_loss_clip": 0.01143185, "auxiliary_loss_mlp": 0.01038734, "balance_loss_clip": 1.05123234, "balance_loss_mlp": 1.02206898, "epoch": 0.30813166992334284, "flos": 25848680480640.0, "grad_norm": 2.0767599752543657, "language_loss": 0.79332423, "learning_rate": 3.2419272291879035e-06, "loss": 0.81514347, "num_input_tokens_seen": 110052930, "step": 5125, "time_per_iteration": 2.6514153480529785 }, { "auxiliary_loss_clip": 0.01125892, "auxiliary_loss_mlp": 0.01035278, "balance_loss_clip": 1.04636812, "balance_loss_mlp": 1.01694369, "epoch": 0.3081917931760108, "flos": 20449619193600.0, "grad_norm": 1.764828299724452, "language_loss": 0.64689863, "learning_rate": 3.241621930235989e-06, "loss": 0.66851032, "num_input_tokens_seen": 110071765, "step": 5126, "time_per_iteration": 2.6408963203430176 }, { "auxiliary_loss_clip": 0.01099238, "auxiliary_loss_mlp": 0.01044536, "balance_loss_clip": 1.05009556, "balance_loss_mlp": 1.02698874, "epoch": 0.3082519164286788, "flos": 22166257052160.0, "grad_norm": 1.5302214532460006, "language_loss": 0.86800975, "learning_rate": 3.241316584201646e-06, "loss": 0.88944745, "num_input_tokens_seen": 110092660, "step": 5127, "time_per_iteration": 2.793318748474121 }, { "auxiliary_loss_clip": 0.01086461, "auxiliary_loss_mlp": 0.01045743, "balance_loss_clip": 1.04368591, "balance_loss_mlp": 1.02862501, "epoch": 0.30831203968134674, "flos": 28913404700160.0, "grad_norm": 1.6968110238499217, "language_loss": 0.69155616, "learning_rate": 3.2410111910964538e-06, "loss": 0.71287817, "num_input_tokens_seen": 110114960, "step": 5128, "time_per_iteration": 2.777060031890869 }, { "auxiliary_loss_clip": 0.01130807, "auxiliary_loss_mlp": 0.00775186, "balance_loss_clip": 1.05044532, "balance_loss_mlp": 1.00153518, "epoch": 0.3083721629340147, "flos": 25667726739840.0, "grad_norm": 1.7900045405252538, "language_loss": 0.71075535, "learning_rate": 3.240705750931993e-06, "loss": 0.7298153, "num_input_tokens_seen": 110135750, "step": 5129, "time_per_iteration": 2.7317588329315186 }, { "auxiliary_loss_clip": 0.01030892, "auxiliary_loss_mlp": 0.01007708, "balance_loss_clip": 1.0286324, "balance_loss_mlp": 1.00588405, "epoch": 0.3084322861866827, "flos": 68212679581440.0, "grad_norm": 0.8221299931057983, "language_loss": 0.59160221, "learning_rate": 3.240400263719846e-06, "loss": 0.61198819, "num_input_tokens_seen": 110189480, "step": 5130, "time_per_iteration": 3.2141849994659424 }, { "auxiliary_loss_clip": 0.01115906, "auxiliary_loss_mlp": 0.01041214, "balance_loss_clip": 1.04513061, "balance_loss_mlp": 1.02297497, "epoch": 0.3084924094393507, "flos": 20296495514880.0, "grad_norm": 2.986922621878904, "language_loss": 0.73292506, "learning_rate": 3.2400947294715957e-06, "loss": 0.75449622, "num_input_tokens_seen": 110206445, "step": 5131, "time_per_iteration": 2.6520204544067383 }, { "auxiliary_loss_clip": 0.01099541, "auxiliary_loss_mlp": 0.010345, "balance_loss_clip": 1.04438055, "balance_loss_mlp": 1.01822817, "epoch": 0.30855253269201866, "flos": 23949831905280.0, "grad_norm": 1.569237882810685, "language_loss": 0.71420097, "learning_rate": 3.2397891481988303e-06, "loss": 0.73554134, "num_input_tokens_seen": 110226845, "step": 5132, "time_per_iteration": 2.8439948558807373 }, { "auxiliary_loss_clip": 0.01134935, "auxiliary_loss_mlp": 0.00774998, "balance_loss_clip": 1.04922795, "balance_loss_mlp": 1.00131333, "epoch": 0.3086126559446866, "flos": 19281876042240.0, "grad_norm": 1.9070570981004293, "language_loss": 0.89846021, "learning_rate": 3.239483519913136e-06, "loss": 0.91755956, "num_input_tokens_seen": 110244095, "step": 5133, "time_per_iteration": 2.5872273445129395 }, { "auxiliary_loss_clip": 0.01122429, "auxiliary_loss_mlp": 0.01043613, "balance_loss_clip": 1.04856205, "balance_loss_mlp": 1.02580321, "epoch": 0.3086727791973546, "flos": 33760770019200.0, "grad_norm": 1.7209646054950307, "language_loss": 0.67267555, "learning_rate": 3.239177844626102e-06, "loss": 0.69433594, "num_input_tokens_seen": 110264240, "step": 5134, "time_per_iteration": 2.7872183322906494 }, { "auxiliary_loss_clip": 0.01124541, "auxiliary_loss_mlp": 0.01041364, "balance_loss_clip": 1.04777277, "balance_loss_mlp": 1.02393556, "epoch": 0.30873290245002255, "flos": 16034151006720.0, "grad_norm": 1.9145067593542924, "language_loss": 0.82794344, "learning_rate": 3.2388721223493197e-06, "loss": 0.84960246, "num_input_tokens_seen": 110282450, "step": 5135, "time_per_iteration": 2.6355140209198 }, { "auxiliary_loss_clip": 0.01026512, "auxiliary_loss_mlp": 0.01003035, "balance_loss_clip": 1.02417064, "balance_loss_mlp": 1.00113988, "epoch": 0.3087930257026905, "flos": 65048304055680.0, "grad_norm": 0.6923211570832432, "language_loss": 0.55314827, "learning_rate": 3.2385663530943824e-06, "loss": 0.57344365, "num_input_tokens_seen": 110343715, "step": 5136, "time_per_iteration": 3.31300687789917 }, { "auxiliary_loss_clip": 0.01118007, "auxiliary_loss_mlp": 0.00775624, "balance_loss_clip": 1.04826593, "balance_loss_mlp": 1.00124264, "epoch": 0.3088531489553585, "flos": 74738829824640.0, "grad_norm": 2.038560176689262, "language_loss": 0.76524079, "learning_rate": 3.2382605368728852e-06, "loss": 0.78417706, "num_input_tokens_seen": 110368430, "step": 5137, "time_per_iteration": 3.1237831115722656 }, { "auxiliary_loss_clip": 0.01102933, "auxiliary_loss_mlp": 0.010362, "balance_loss_clip": 1.04592168, "balance_loss_mlp": 1.02058411, "epoch": 0.30891327220802645, "flos": 21142300043520.0, "grad_norm": 1.655645044155811, "language_loss": 0.80083114, "learning_rate": 3.237954673696424e-06, "loss": 0.82222247, "num_input_tokens_seen": 110386735, "step": 5138, "time_per_iteration": 2.775902509689331 }, { "auxiliary_loss_clip": 0.01078807, "auxiliary_loss_mlp": 0.0104514, "balance_loss_clip": 1.03953338, "balance_loss_mlp": 1.02583957, "epoch": 0.3089733954606944, "flos": 25664494515840.0, "grad_norm": 1.3823165076112356, "language_loss": 0.81288958, "learning_rate": 3.2376487635765983e-06, "loss": 0.8341291, "num_input_tokens_seen": 110406820, "step": 5139, "time_per_iteration": 4.48141074180603 }, { "auxiliary_loss_clip": 0.01127056, "auxiliary_loss_mlp": 0.01044845, "balance_loss_clip": 1.04565382, "balance_loss_mlp": 1.02575994, "epoch": 0.3090335187133624, "flos": 19427350124160.0, "grad_norm": 2.1511159973406593, "language_loss": 0.77260494, "learning_rate": 3.2373428065250067e-06, "loss": 0.79432398, "num_input_tokens_seen": 110424225, "step": 5140, "time_per_iteration": 4.1141037940979 }, { "auxiliary_loss_clip": 0.01099157, "auxiliary_loss_mlp": 0.01048812, "balance_loss_clip": 1.04282403, "balance_loss_mlp": 1.03233695, "epoch": 0.30909364196603034, "flos": 20011329440640.0, "grad_norm": 1.77105935640331, "language_loss": 0.78806967, "learning_rate": 3.237036802553252e-06, "loss": 0.80954939, "num_input_tokens_seen": 110443310, "step": 5141, "time_per_iteration": 2.6497676372528076 }, { "auxiliary_loss_clip": 0.01119702, "auxiliary_loss_mlp": 0.0104967, "balance_loss_clip": 1.04679799, "balance_loss_mlp": 1.03138292, "epoch": 0.3091537652186983, "flos": 19677575243520.0, "grad_norm": 2.261971688212118, "language_loss": 0.86853915, "learning_rate": 3.2367307516729377e-06, "loss": 0.89023286, "num_input_tokens_seen": 110460215, "step": 5142, "time_per_iteration": 2.635495662689209 }, { "auxiliary_loss_clip": 0.01127738, "auxiliary_loss_mlp": 0.01048033, "balance_loss_clip": 1.04709148, "balance_loss_mlp": 1.03136778, "epoch": 0.3092138884713663, "flos": 17020042577280.0, "grad_norm": 1.7222677689082588, "language_loss": 0.79352587, "learning_rate": 3.23642465389567e-06, "loss": 0.81528366, "num_input_tokens_seen": 110479385, "step": 5143, "time_per_iteration": 2.672196388244629 }, { "auxiliary_loss_clip": 0.01108121, "auxiliary_loss_mlp": 0.01046466, "balance_loss_clip": 1.04830873, "balance_loss_mlp": 1.02858496, "epoch": 0.3092740117240343, "flos": 25009986844800.0, "grad_norm": 1.849759687088619, "language_loss": 0.72079581, "learning_rate": 3.236118509233055e-06, "loss": 0.7423417, "num_input_tokens_seen": 110499885, "step": 5144, "time_per_iteration": 4.2138121128082275 }, { "auxiliary_loss_clip": 0.01130266, "auxiliary_loss_mlp": 0.0105055, "balance_loss_clip": 1.04617548, "balance_loss_mlp": 1.03297877, "epoch": 0.30933413497670226, "flos": 25590410714880.0, "grad_norm": 1.9804845877808144, "language_loss": 0.74328083, "learning_rate": 3.235812317696702e-06, "loss": 0.76508898, "num_input_tokens_seen": 110519690, "step": 5145, "time_per_iteration": 4.315273761749268 }, { "auxiliary_loss_clip": 0.01110927, "auxiliary_loss_mlp": 0.01045527, "balance_loss_clip": 1.04372048, "balance_loss_mlp": 1.02788365, "epoch": 0.3093942582293702, "flos": 24389665943040.0, "grad_norm": 1.6657569174801012, "language_loss": 0.76391518, "learning_rate": 3.2355060792982224e-06, "loss": 0.78547978, "num_input_tokens_seen": 110540520, "step": 5146, "time_per_iteration": 2.7259135246276855 }, { "auxiliary_loss_clip": 0.0111122, "auxiliary_loss_mlp": 0.01042459, "balance_loss_clip": 1.04380584, "balance_loss_mlp": 1.02553141, "epoch": 0.3094543814820382, "flos": 19646441130240.0, "grad_norm": 2.148705061921787, "language_loss": 0.66899967, "learning_rate": 3.2351997940492286e-06, "loss": 0.6905365, "num_input_tokens_seen": 110557950, "step": 5147, "time_per_iteration": 2.6804444789886475 }, { "auxiliary_loss_clip": 0.01132642, "auxiliary_loss_mlp": 0.0104049, "balance_loss_clip": 1.04998684, "balance_loss_mlp": 1.0238843, "epoch": 0.30951450473470615, "flos": 25663812157440.0, "grad_norm": 2.0634223914225585, "language_loss": 0.74823105, "learning_rate": 3.2348934619613346e-06, "loss": 0.76996237, "num_input_tokens_seen": 110578215, "step": 5148, "time_per_iteration": 2.637509346008301 }, { "auxiliary_loss_clip": 0.0113505, "auxiliary_loss_mlp": 0.01047495, "balance_loss_clip": 1.0492146, "balance_loss_mlp": 1.02901721, "epoch": 0.3095746279873741, "flos": 12020415505920.0, "grad_norm": 2.1367843023537287, "language_loss": 0.73082036, "learning_rate": 3.2345870830461567e-06, "loss": 0.75264585, "num_input_tokens_seen": 110592990, "step": 5149, "time_per_iteration": 2.6134157180786133 }, { "auxiliary_loss_clip": 0.01097892, "auxiliary_loss_mlp": 0.0104428, "balance_loss_clip": 1.04601955, "balance_loss_mlp": 1.02615988, "epoch": 0.3096347512400421, "flos": 23623044946560.0, "grad_norm": 2.0797901111423274, "language_loss": 0.845025, "learning_rate": 3.2342806573153132e-06, "loss": 0.86644673, "num_input_tokens_seen": 110612130, "step": 5150, "time_per_iteration": 2.7804181575775146 }, { "auxiliary_loss_clip": 0.01086512, "auxiliary_loss_mlp": 0.01047133, "balance_loss_clip": 1.04168093, "balance_loss_mlp": 1.02820301, "epoch": 0.30969487449271005, "flos": 22529313768960.0, "grad_norm": 1.8768941622145223, "language_loss": 0.78431082, "learning_rate": 3.233974184780424e-06, "loss": 0.80564725, "num_input_tokens_seen": 110632045, "step": 5151, "time_per_iteration": 2.7539470195770264 }, { "auxiliary_loss_clip": 0.01131879, "auxiliary_loss_mlp": 0.01041443, "balance_loss_clip": 1.04880977, "balance_loss_mlp": 1.02362132, "epoch": 0.309754997745378, "flos": 15267925059840.0, "grad_norm": 1.9606136965084777, "language_loss": 0.67416716, "learning_rate": 3.2336676654531084e-06, "loss": 0.69590038, "num_input_tokens_seen": 110649340, "step": 5152, "time_per_iteration": 2.579238176345825 }, { "auxiliary_loss_clip": 0.01080518, "auxiliary_loss_mlp": 0.01045921, "balance_loss_clip": 1.04402971, "balance_loss_mlp": 1.02807546, "epoch": 0.309815120998046, "flos": 26979291947520.0, "grad_norm": 5.6670540450328355, "language_loss": 0.8251189, "learning_rate": 3.2333610993449926e-06, "loss": 0.84638333, "num_input_tokens_seen": 110668450, "step": 5153, "time_per_iteration": 2.792285203933716 }, { "auxiliary_loss_clip": 0.01113849, "auxiliary_loss_mlp": 0.00775793, "balance_loss_clip": 1.04663801, "balance_loss_mlp": 1.00127769, "epoch": 0.30987524425071394, "flos": 21143161969920.0, "grad_norm": 1.937189485762574, "language_loss": 0.73793215, "learning_rate": 3.2330544864676997e-06, "loss": 0.75682855, "num_input_tokens_seen": 110689410, "step": 5154, "time_per_iteration": 2.678454875946045 }, { "auxiliary_loss_clip": 0.01132509, "auxiliary_loss_mlp": 0.0103738, "balance_loss_clip": 1.0507983, "balance_loss_mlp": 1.02009416, "epoch": 0.3099353675033819, "flos": 15268284195840.0, "grad_norm": 2.1601099672999586, "language_loss": 0.76069349, "learning_rate": 3.232747826832858e-06, "loss": 0.78239238, "num_input_tokens_seen": 110707350, "step": 5155, "time_per_iteration": 2.577634334564209 }, { "auxiliary_loss_clip": 0.01131155, "auxiliary_loss_mlp": 0.01040429, "balance_loss_clip": 1.05483913, "balance_loss_mlp": 1.02283418, "epoch": 0.30999549075604993, "flos": 15413794191360.0, "grad_norm": 2.044896457109867, "language_loss": 0.79096609, "learning_rate": 3.232441120452094e-06, "loss": 0.81268191, "num_input_tokens_seen": 110724910, "step": 5156, "time_per_iteration": 2.628363609313965 }, { "auxiliary_loss_clip": 0.01127429, "auxiliary_loss_mlp": 0.01047381, "balance_loss_clip": 1.04775023, "balance_loss_mlp": 1.02779543, "epoch": 0.3100556140087179, "flos": 23184539712000.0, "grad_norm": 2.468311845454126, "language_loss": 0.74950963, "learning_rate": 3.23213436733704e-06, "loss": 0.77125776, "num_input_tokens_seen": 110744010, "step": 5157, "time_per_iteration": 2.6231181621551514 }, { "auxiliary_loss_clip": 0.01108321, "auxiliary_loss_mlp": 0.01042715, "balance_loss_clip": 1.04868615, "balance_loss_mlp": 1.02634752, "epoch": 0.31011573726138586, "flos": 25742169676800.0, "grad_norm": 1.6453166696914168, "language_loss": 0.69648343, "learning_rate": 3.231827567499327e-06, "loss": 0.71799374, "num_input_tokens_seen": 110765835, "step": 5158, "time_per_iteration": 2.734889030456543 }, { "auxiliary_loss_clip": 0.01095116, "auxiliary_loss_mlp": 0.01046106, "balance_loss_clip": 1.04443944, "balance_loss_mlp": 1.0301435, "epoch": 0.3101758605140538, "flos": 20011329440640.0, "grad_norm": 1.9329481500014836, "language_loss": 0.84861457, "learning_rate": 3.2315207209505896e-06, "loss": 0.87002677, "num_input_tokens_seen": 110784655, "step": 5159, "time_per_iteration": 2.665311813354492 }, { "auxiliary_loss_clip": 0.01116498, "auxiliary_loss_mlp": 0.01046065, "balance_loss_clip": 1.04710639, "balance_loss_mlp": 1.02877951, "epoch": 0.3102359837667218, "flos": 19135683688320.0, "grad_norm": 1.9614748869944683, "language_loss": 0.85129201, "learning_rate": 3.231213827702462e-06, "loss": 0.87291765, "num_input_tokens_seen": 110802545, "step": 5160, "time_per_iteration": 2.597130298614502 }, { "auxiliary_loss_clip": 0.01133056, "auxiliary_loss_mlp": 0.01042602, "balance_loss_clip": 1.0520395, "balance_loss_mlp": 1.02582884, "epoch": 0.31029610701938976, "flos": 22265405568000.0, "grad_norm": 1.9459577302566504, "language_loss": 0.75555152, "learning_rate": 3.230906887766584e-06, "loss": 0.77730811, "num_input_tokens_seen": 110820265, "step": 5161, "time_per_iteration": 2.583240032196045 }, { "auxiliary_loss_clip": 0.0113313, "auxiliary_loss_mlp": 0.01045414, "balance_loss_clip": 1.05046988, "balance_loss_mlp": 1.02797401, "epoch": 0.3103562302720577, "flos": 20805349536000.0, "grad_norm": 1.9938857241338979, "language_loss": 0.8156144, "learning_rate": 3.2305999011545924e-06, "loss": 0.83739984, "num_input_tokens_seen": 110836195, "step": 5162, "time_per_iteration": 2.495689630508423 }, { "auxiliary_loss_clip": 0.01128762, "auxiliary_loss_mlp": 0.01039959, "balance_loss_clip": 1.04903293, "balance_loss_mlp": 1.02450919, "epoch": 0.3104163535247257, "flos": 22344158136960.0, "grad_norm": 1.777649785974679, "language_loss": 0.82892883, "learning_rate": 3.2302928678781295e-06, "loss": 0.85061604, "num_input_tokens_seen": 110856420, "step": 5163, "time_per_iteration": 2.591036081314087 }, { "auxiliary_loss_clip": 0.01147486, "auxiliary_loss_mlp": 0.01044526, "balance_loss_clip": 1.05307984, "balance_loss_mlp": 1.0273242, "epoch": 0.31047647677739365, "flos": 21689363157120.0, "grad_norm": 1.875247009463239, "language_loss": 0.76131678, "learning_rate": 3.2299857879488376e-06, "loss": 0.78323686, "num_input_tokens_seen": 110876650, "step": 5164, "time_per_iteration": 2.5745677947998047 }, { "auxiliary_loss_clip": 0.01103275, "auxiliary_loss_mlp": 0.01046349, "balance_loss_clip": 1.04969811, "balance_loss_mlp": 1.02880108, "epoch": 0.3105366000300616, "flos": 18917275040640.0, "grad_norm": 3.462886730904856, "language_loss": 0.74514711, "learning_rate": 3.2296786613783626e-06, "loss": 0.7666434, "num_input_tokens_seen": 110894445, "step": 5165, "time_per_iteration": 2.724846124649048 }, { "auxiliary_loss_clip": 0.01100578, "auxiliary_loss_mlp": 0.01057021, "balance_loss_clip": 1.04695523, "balance_loss_mlp": 1.03841233, "epoch": 0.3105967232827296, "flos": 18260397072000.0, "grad_norm": 1.6273273492295701, "language_loss": 0.75827682, "learning_rate": 3.229371488178348e-06, "loss": 0.77985275, "num_input_tokens_seen": 110912855, "step": 5166, "time_per_iteration": 2.7309961318969727 }, { "auxiliary_loss_clip": 0.01121318, "auxiliary_loss_mlp": 0.01043526, "balance_loss_clip": 1.04969096, "balance_loss_mlp": 1.02665818, "epoch": 0.31065684653539755, "flos": 17672144037120.0, "grad_norm": 2.1635307284170833, "language_loss": 0.73621917, "learning_rate": 3.229064268360444e-06, "loss": 0.75786763, "num_input_tokens_seen": 110928025, "step": 5167, "time_per_iteration": 2.623375654220581 }, { "auxiliary_loss_clip": 0.01007539, "auxiliary_loss_mlp": 0.01008435, "balance_loss_clip": 1.02476823, "balance_loss_mlp": 1.0059557, "epoch": 0.3107169697880655, "flos": 68531996511360.0, "grad_norm": 0.7113763854018822, "language_loss": 0.53030008, "learning_rate": 3.2287570019362997e-06, "loss": 0.55045986, "num_input_tokens_seen": 110992215, "step": 5168, "time_per_iteration": 3.3115129470825195 }, { "auxiliary_loss_clip": 0.01138497, "auxiliary_loss_mlp": 0.01050074, "balance_loss_clip": 1.05561399, "balance_loss_mlp": 1.03151321, "epoch": 0.3107770930407335, "flos": 13188733274880.0, "grad_norm": 3.621905149464154, "language_loss": 0.79032969, "learning_rate": 3.2284496889175668e-06, "loss": 0.81221539, "num_input_tokens_seen": 111010400, "step": 5169, "time_per_iteration": 2.595463514328003 }, { "auxiliary_loss_clip": 0.01121822, "auxiliary_loss_mlp": 0.01047209, "balance_loss_clip": 1.04804373, "balance_loss_mlp": 1.02937579, "epoch": 0.3108372162934015, "flos": 31580849520000.0, "grad_norm": 1.57130024638105, "language_loss": 0.64071, "learning_rate": 3.2281423293158986e-06, "loss": 0.66240036, "num_input_tokens_seen": 111033960, "step": 5170, "time_per_iteration": 2.746469497680664 }, { "auxiliary_loss_clip": 0.0110491, "auxiliary_loss_mlp": 0.00776539, "balance_loss_clip": 1.04874384, "balance_loss_mlp": 1.00120461, "epoch": 0.31089733954606946, "flos": 28729829266560.0, "grad_norm": 2.172069963879317, "language_loss": 0.7723515, "learning_rate": 3.22783492314295e-06, "loss": 0.79116607, "num_input_tokens_seen": 111053265, "step": 5171, "time_per_iteration": 2.776974678039551 }, { "auxiliary_loss_clip": 0.01100832, "auxiliary_loss_mlp": 0.01048172, "balance_loss_clip": 1.049088, "balance_loss_mlp": 1.03055298, "epoch": 0.3109574627987374, "flos": 19683249592320.0, "grad_norm": 1.830523579545495, "language_loss": 0.84020013, "learning_rate": 3.2275274704103785e-06, "loss": 0.86169016, "num_input_tokens_seen": 111071130, "step": 5172, "time_per_iteration": 2.718118906021118 }, { "auxiliary_loss_clip": 0.01091688, "auxiliary_loss_mlp": 0.01045541, "balance_loss_clip": 1.04622412, "balance_loss_mlp": 1.02706313, "epoch": 0.3110175860514054, "flos": 14683981656960.0, "grad_norm": 1.9540355263753015, "language_loss": 0.83730888, "learning_rate": 3.227219971129842e-06, "loss": 0.8586812, "num_input_tokens_seen": 111089560, "step": 5173, "time_per_iteration": 2.735163927078247 }, { "auxiliary_loss_clip": 0.01145239, "auxiliary_loss_mlp": 0.01042621, "balance_loss_clip": 1.05589437, "balance_loss_mlp": 1.02656341, "epoch": 0.31107770930407336, "flos": 25739655724800.0, "grad_norm": 3.2612368513370495, "language_loss": 0.83354348, "learning_rate": 3.226912425313001e-06, "loss": 0.85542202, "num_input_tokens_seen": 111109960, "step": 5174, "time_per_iteration": 2.65226411819458 }, { "auxiliary_loss_clip": 0.01122854, "auxiliary_loss_mlp": 0.01046101, "balance_loss_clip": 1.05162597, "balance_loss_mlp": 1.02928042, "epoch": 0.3111378325567413, "flos": 19208259118080.0, "grad_norm": 1.9777752297496725, "language_loss": 0.85181922, "learning_rate": 3.2266048329715183e-06, "loss": 0.87350869, "num_input_tokens_seen": 111127960, "step": 5175, "time_per_iteration": 2.6930692195892334 }, { "auxiliary_loss_clip": 0.01087659, "auxiliary_loss_mlp": 0.01044685, "balance_loss_clip": 1.04638839, "balance_loss_mlp": 1.02623129, "epoch": 0.3111979558094093, "flos": 23696374561920.0, "grad_norm": 1.845729409399547, "language_loss": 0.82990116, "learning_rate": 3.2262971941170575e-06, "loss": 0.8512246, "num_input_tokens_seen": 111146730, "step": 5176, "time_per_iteration": 2.7975289821624756 }, { "auxiliary_loss_clip": 0.01126555, "auxiliary_loss_mlp": 0.01042513, "balance_loss_clip": 1.04662132, "balance_loss_mlp": 1.02361798, "epoch": 0.31125807906207725, "flos": 21033023892480.0, "grad_norm": 1.9258407965023028, "language_loss": 0.8096348, "learning_rate": 3.2259895087612837e-06, "loss": 0.83132547, "num_input_tokens_seen": 111166295, "step": 5177, "time_per_iteration": 2.6275687217712402 }, { "auxiliary_loss_clip": 0.01134117, "auxiliary_loss_mlp": 0.0077682, "balance_loss_clip": 1.05381465, "balance_loss_mlp": 1.00119591, "epoch": 0.3113182023147452, "flos": 23076628277760.0, "grad_norm": 1.6855068015846089, "language_loss": 0.80707169, "learning_rate": 3.2256817769158657e-06, "loss": 0.82618099, "num_input_tokens_seen": 111185665, "step": 5178, "time_per_iteration": 4.142611742019653 }, { "auxiliary_loss_clip": 0.01119942, "auxiliary_loss_mlp": 0.01047667, "balance_loss_clip": 1.05289316, "balance_loss_mlp": 1.03076327, "epoch": 0.3113783255674132, "flos": 11838994888320.0, "grad_norm": 2.5880769767242633, "language_loss": 0.80990803, "learning_rate": 3.225373998592471e-06, "loss": 0.83158416, "num_input_tokens_seen": 111201615, "step": 5179, "time_per_iteration": 2.6429331302642822 }, { "auxiliary_loss_clip": 0.01112505, "auxiliary_loss_mlp": 0.01048581, "balance_loss_clip": 1.05353093, "balance_loss_mlp": 1.03139079, "epoch": 0.31143844882008115, "flos": 16289547684480.0, "grad_norm": 2.4201759029551813, "language_loss": 0.78532577, "learning_rate": 3.2250661738027715e-06, "loss": 0.80693662, "num_input_tokens_seen": 111220515, "step": 5180, "time_per_iteration": 4.1918723583221436 }, { "auxiliary_loss_clip": 0.01107686, "auxiliary_loss_mlp": 0.01037212, "balance_loss_clip": 1.05114985, "balance_loss_mlp": 1.02011788, "epoch": 0.3114985720727491, "flos": 23217792727680.0, "grad_norm": 1.6775849826612523, "language_loss": 0.83088589, "learning_rate": 3.22475830255844e-06, "loss": 0.85233486, "num_input_tokens_seen": 111240395, "step": 5181, "time_per_iteration": 2.760340929031372 }, { "auxiliary_loss_clip": 0.01110614, "auxiliary_loss_mlp": 0.01044232, "balance_loss_clip": 1.04879427, "balance_loss_mlp": 1.02881861, "epoch": 0.3115586953254171, "flos": 30044626698240.0, "grad_norm": 1.766790552230027, "language_loss": 0.74396992, "learning_rate": 3.2244503848711516e-06, "loss": 0.76551843, "num_input_tokens_seen": 111261100, "step": 5182, "time_per_iteration": 2.7501730918884277 }, { "auxiliary_loss_clip": 0.01093489, "auxiliary_loss_mlp": 0.00776946, "balance_loss_clip": 1.04811049, "balance_loss_mlp": 1.00152898, "epoch": 0.3116188185780851, "flos": 25666326109440.0, "grad_norm": 2.03695228940596, "language_loss": 0.70169222, "learning_rate": 3.2241424207525815e-06, "loss": 0.72039658, "num_input_tokens_seen": 111281320, "step": 5183, "time_per_iteration": 4.26041579246521 }, { "auxiliary_loss_clip": 0.01017812, "auxiliary_loss_mlp": 0.01006564, "balance_loss_clip": 1.01984847, "balance_loss_mlp": 1.00418019, "epoch": 0.31167894183075306, "flos": 69510058917120.0, "grad_norm": 0.9394459872440335, "language_loss": 0.59573013, "learning_rate": 3.223834410214408e-06, "loss": 0.61597383, "num_input_tokens_seen": 111341405, "step": 5184, "time_per_iteration": 4.992337226867676 }, { "auxiliary_loss_clip": 0.01115495, "auxiliary_loss_mlp": 0.01050891, "balance_loss_clip": 1.04588842, "balance_loss_mlp": 1.03422523, "epoch": 0.31173906508342103, "flos": 14939845211520.0, "grad_norm": 2.48453112640368, "language_loss": 0.70156622, "learning_rate": 3.223526353268311e-06, "loss": 0.72323, "num_input_tokens_seen": 111358975, "step": 5185, "time_per_iteration": 2.6406824588775635 }, { "auxiliary_loss_clip": 0.01122412, "auxiliary_loss_mlp": 0.01051261, "balance_loss_clip": 1.05447555, "balance_loss_mlp": 1.03405905, "epoch": 0.311799188336089, "flos": 16176033728640.0, "grad_norm": 2.8983279272522853, "language_loss": 0.63588691, "learning_rate": 3.2232182499259725e-06, "loss": 0.65762365, "num_input_tokens_seen": 111375845, "step": 5186, "time_per_iteration": 2.683971881866455 }, { "auxiliary_loss_clip": 0.01126858, "auxiliary_loss_mlp": 0.01049881, "balance_loss_clip": 1.05240881, "balance_loss_mlp": 1.03145099, "epoch": 0.31185931158875696, "flos": 25009627708800.0, "grad_norm": 2.2127415604209335, "language_loss": 0.86427295, "learning_rate": 3.2229101001990747e-06, "loss": 0.88604033, "num_input_tokens_seen": 111394150, "step": 5187, "time_per_iteration": 2.6983299255371094 }, { "auxiliary_loss_clip": 0.01146114, "auxiliary_loss_mlp": 0.0077496, "balance_loss_clip": 1.05417776, "balance_loss_mlp": 1.00131774, "epoch": 0.3119194348414249, "flos": 37232901273600.0, "grad_norm": 1.653121843679143, "language_loss": 0.63481069, "learning_rate": 3.2226019040993036e-06, "loss": 0.6540215, "num_input_tokens_seen": 111418355, "step": 5188, "time_per_iteration": 2.6974728107452393 }, { "auxiliary_loss_clip": 0.01106256, "auxiliary_loss_mlp": 0.01044626, "balance_loss_clip": 1.05064225, "balance_loss_mlp": 1.02799582, "epoch": 0.3119795580940929, "flos": 15012779777280.0, "grad_norm": 2.578497111530561, "language_loss": 0.83241487, "learning_rate": 3.222293661638346e-06, "loss": 0.85392368, "num_input_tokens_seen": 111435445, "step": 5189, "time_per_iteration": 2.6956889629364014 }, { "auxiliary_loss_clip": 0.01031008, "auxiliary_loss_mlp": 0.01045956, "balance_loss_clip": 1.03804195, "balance_loss_mlp": 1.02812243, "epoch": 0.31203968134676086, "flos": 15998168557440.0, "grad_norm": 1.8156368008577992, "language_loss": 0.79266763, "learning_rate": 3.22198537282789e-06, "loss": 0.81343722, "num_input_tokens_seen": 111453430, "step": 5190, "time_per_iteration": 3.0180671215057373 }, { "auxiliary_loss_clip": 0.01086186, "auxiliary_loss_mlp": 0.01053443, "balance_loss_clip": 1.04333639, "balance_loss_mlp": 1.03413141, "epoch": 0.3120998045994288, "flos": 23837359443840.0, "grad_norm": 1.571307617405072, "language_loss": 0.75174087, "learning_rate": 3.2216770376796262e-06, "loss": 0.77313721, "num_input_tokens_seen": 111475325, "step": 5191, "time_per_iteration": 3.0170204639434814 }, { "auxiliary_loss_clip": 0.01043661, "auxiliary_loss_mlp": 0.00755081, "balance_loss_clip": 1.02154636, "balance_loss_mlp": 1.00261629, "epoch": 0.3121599278520968, "flos": 69184205712000.0, "grad_norm": 0.8534965117798614, "language_loss": 0.63942307, "learning_rate": 3.221368656205247e-06, "loss": 0.6574105, "num_input_tokens_seen": 111533960, "step": 5192, "time_per_iteration": 3.288938045501709 }, { "auxiliary_loss_clip": 0.01133662, "auxiliary_loss_mlp": 0.01043466, "balance_loss_clip": 1.05246997, "balance_loss_mlp": 1.02569187, "epoch": 0.31222005110476475, "flos": 23806368984960.0, "grad_norm": 1.9226654053779162, "language_loss": 0.7976644, "learning_rate": 3.221060228416446e-06, "loss": 0.81943566, "num_input_tokens_seen": 111554055, "step": 5193, "time_per_iteration": 2.758859157562256 }, { "auxiliary_loss_clip": 0.01117628, "auxiliary_loss_mlp": 0.01054751, "balance_loss_clip": 1.04916263, "balance_loss_mlp": 1.03508139, "epoch": 0.3122801743574327, "flos": 25226132935680.0, "grad_norm": 2.5170295869133024, "language_loss": 0.72488689, "learning_rate": 3.2207517543249183e-06, "loss": 0.74661064, "num_input_tokens_seen": 111574305, "step": 5194, "time_per_iteration": 2.69765567779541 }, { "auxiliary_loss_clip": 0.01144699, "auxiliary_loss_mlp": 0.01044476, "balance_loss_clip": 1.05394197, "balance_loss_mlp": 1.02819204, "epoch": 0.3123402976101007, "flos": 22966490200320.0, "grad_norm": 1.775027795968239, "language_loss": 0.76423192, "learning_rate": 3.2204432339423616e-06, "loss": 0.78612363, "num_input_tokens_seen": 111595680, "step": 5195, "time_per_iteration": 2.665656566619873 }, { "auxiliary_loss_clip": 0.01144607, "auxiliary_loss_mlp": 0.01042079, "balance_loss_clip": 1.05148935, "balance_loss_mlp": 1.02544916, "epoch": 0.3124004208627687, "flos": 25192089820800.0, "grad_norm": 1.4414001308378115, "language_loss": 0.78089559, "learning_rate": 3.220134667280476e-06, "loss": 0.80276251, "num_input_tokens_seen": 111618135, "step": 5196, "time_per_iteration": 2.682476282119751 }, { "auxiliary_loss_clip": 0.01032618, "auxiliary_loss_mlp": 0.00755246, "balance_loss_clip": 1.02237272, "balance_loss_mlp": 1.00273037, "epoch": 0.31246054411543667, "flos": 67485165517440.0, "grad_norm": 0.794984063014186, "language_loss": 0.54770386, "learning_rate": 3.2198260543509613e-06, "loss": 0.56558245, "num_input_tokens_seen": 111682220, "step": 5197, "time_per_iteration": 3.24509334564209 }, { "auxiliary_loss_clip": 0.01144094, "auxiliary_loss_mlp": 0.01042495, "balance_loss_clip": 1.0547365, "balance_loss_mlp": 1.02586555, "epoch": 0.31252066736810463, "flos": 17858520731520.0, "grad_norm": 1.8260094290654212, "language_loss": 0.66137004, "learning_rate": 3.21951739516552e-06, "loss": 0.68323588, "num_input_tokens_seen": 111700815, "step": 5198, "time_per_iteration": 2.5970942974090576 }, { "auxiliary_loss_clip": 0.01102297, "auxiliary_loss_mlp": 0.01047482, "balance_loss_clip": 1.0459094, "balance_loss_mlp": 1.02898037, "epoch": 0.3125807906207726, "flos": 18475034791680.0, "grad_norm": 2.530729988117139, "language_loss": 0.6949119, "learning_rate": 3.219208689735857e-06, "loss": 0.71640968, "num_input_tokens_seen": 111718195, "step": 5199, "time_per_iteration": 2.6682288646698 }, { "auxiliary_loss_clip": 0.01132634, "auxiliary_loss_mlp": 0.01050152, "balance_loss_clip": 1.04906189, "balance_loss_mlp": 1.03258061, "epoch": 0.31264091387344056, "flos": 18946541646720.0, "grad_norm": 1.8087592578592666, "language_loss": 0.78480452, "learning_rate": 3.2188999380736785e-06, "loss": 0.8066324, "num_input_tokens_seen": 111734440, "step": 5200, "time_per_iteration": 2.6664814949035645 }, { "auxiliary_loss_clip": 0.01132139, "auxiliary_loss_mlp": 0.01037041, "balance_loss_clip": 1.05233109, "balance_loss_mlp": 1.02036345, "epoch": 0.3127010371261085, "flos": 21468512384640.0, "grad_norm": 2.0480479984687214, "language_loss": 0.83231741, "learning_rate": 3.2185911401906917e-06, "loss": 0.85400921, "num_input_tokens_seen": 111751960, "step": 5201, "time_per_iteration": 2.674558401107788 }, { "auxiliary_loss_clip": 0.01144703, "auxiliary_loss_mlp": 0.01045083, "balance_loss_clip": 1.05244124, "balance_loss_mlp": 1.02697527, "epoch": 0.3127611603787765, "flos": 15336047203200.0, "grad_norm": 3.6217323271444037, "language_loss": 0.6910159, "learning_rate": 3.2182822960986072e-06, "loss": 0.71291375, "num_input_tokens_seen": 111769585, "step": 5202, "time_per_iteration": 2.563164710998535 }, { "auxiliary_loss_clip": 0.01146715, "auxiliary_loss_mlp": 0.01041598, "balance_loss_clip": 1.05293012, "balance_loss_mlp": 1.02608871, "epoch": 0.31282128363144446, "flos": 17602980399360.0, "grad_norm": 1.898082303559049, "language_loss": 0.84124672, "learning_rate": 3.2179734058091358e-06, "loss": 0.86312985, "num_input_tokens_seen": 111787880, "step": 5203, "time_per_iteration": 2.6024506092071533 }, { "auxiliary_loss_clip": 0.01086755, "auxiliary_loss_mlp": 0.01049344, "balance_loss_clip": 1.04461396, "balance_loss_mlp": 1.03139079, "epoch": 0.3128814068841124, "flos": 26756753235840.0, "grad_norm": 2.246749233698224, "language_loss": 0.61165982, "learning_rate": 3.2176644693339913e-06, "loss": 0.63302082, "num_input_tokens_seen": 111805950, "step": 5204, "time_per_iteration": 2.748486042022705 }, { "auxiliary_loss_clip": 0.01105223, "auxiliary_loss_mlp": 0.01043537, "balance_loss_clip": 1.04439998, "balance_loss_mlp": 1.02722907, "epoch": 0.3129415301367804, "flos": 22272372806400.0, "grad_norm": 1.6432390116063589, "language_loss": 0.65875763, "learning_rate": 3.217355486684887e-06, "loss": 0.68024528, "num_input_tokens_seen": 111826135, "step": 5205, "time_per_iteration": 2.717499256134033 }, { "auxiliary_loss_clip": 0.01134026, "auxiliary_loss_mlp": 0.01046734, "balance_loss_clip": 1.05126929, "balance_loss_mlp": 1.02849531, "epoch": 0.31300165338944835, "flos": 26464907232000.0, "grad_norm": 1.6106510494401134, "language_loss": 0.76811433, "learning_rate": 3.2170464578735414e-06, "loss": 0.78992188, "num_input_tokens_seen": 111844700, "step": 5206, "time_per_iteration": 2.642439603805542 }, { "auxiliary_loss_clip": 0.01140688, "auxiliary_loss_mlp": 0.01041131, "balance_loss_clip": 1.04956853, "balance_loss_mlp": 1.02448893, "epoch": 0.3130617766421163, "flos": 21944652094080.0, "grad_norm": 2.214530025407602, "language_loss": 0.83204615, "learning_rate": 3.216737382911672e-06, "loss": 0.85386431, "num_input_tokens_seen": 111861585, "step": 5207, "time_per_iteration": 2.616652727127075 }, { "auxiliary_loss_clip": 0.01127002, "auxiliary_loss_mlp": 0.0104831, "balance_loss_clip": 1.0502398, "balance_loss_mlp": 1.0328126, "epoch": 0.3131218998947843, "flos": 23292774368640.0, "grad_norm": 1.5207985149404841, "language_loss": 0.71359724, "learning_rate": 3.216428261810999e-06, "loss": 0.73535037, "num_input_tokens_seen": 111882950, "step": 5208, "time_per_iteration": 2.674813747406006 }, { "auxiliary_loss_clip": 0.01120564, "auxiliary_loss_mlp": 0.01045064, "balance_loss_clip": 1.04862344, "balance_loss_mlp": 1.02827978, "epoch": 0.3131820231474523, "flos": 21139642437120.0, "grad_norm": 1.848256205390157, "language_loss": 0.74558908, "learning_rate": 3.2161190945832445e-06, "loss": 0.76724535, "num_input_tokens_seen": 111901640, "step": 5209, "time_per_iteration": 2.7193644046783447 }, { "auxiliary_loss_clip": 0.01140035, "auxiliary_loss_mlp": 0.01045727, "balance_loss_clip": 1.04733396, "balance_loss_mlp": 1.02937174, "epoch": 0.31324214640012027, "flos": 23909863046400.0, "grad_norm": 2.0633998475681135, "language_loss": 0.77254915, "learning_rate": 3.2158098812401325e-06, "loss": 0.79440677, "num_input_tokens_seen": 111919615, "step": 5210, "time_per_iteration": 2.6212270259857178 }, { "auxiliary_loss_clip": 0.01125553, "auxiliary_loss_mlp": 0.01039925, "balance_loss_clip": 1.047261, "balance_loss_mlp": 1.02385592, "epoch": 0.31330226965278823, "flos": 22236929061120.0, "grad_norm": 1.9577389211395706, "language_loss": 0.79128736, "learning_rate": 3.2155006217933874e-06, "loss": 0.81294215, "num_input_tokens_seen": 111938485, "step": 5211, "time_per_iteration": 2.6618316173553467 }, { "auxiliary_loss_clip": 0.01132257, "auxiliary_loss_mlp": 0.01042587, "balance_loss_clip": 1.05107522, "balance_loss_mlp": 1.02768588, "epoch": 0.3133623929054562, "flos": 19753993428480.0, "grad_norm": 2.4581961413264195, "language_loss": 0.79612064, "learning_rate": 3.2151913162547367e-06, "loss": 0.81786901, "num_input_tokens_seen": 111956425, "step": 5212, "time_per_iteration": 2.81793475151062 }, { "auxiliary_loss_clip": 0.01125931, "auxiliary_loss_mlp": 0.01053393, "balance_loss_clip": 1.05156052, "balance_loss_mlp": 1.03576159, "epoch": 0.31342251615812416, "flos": 27162256849920.0, "grad_norm": 2.69561664367352, "language_loss": 0.71024299, "learning_rate": 3.2148819646359097e-06, "loss": 0.73203623, "num_input_tokens_seen": 111975915, "step": 5213, "time_per_iteration": 2.6739485263824463 }, { "auxiliary_loss_clip": 0.01132672, "auxiliary_loss_mlp": 0.01045903, "balance_loss_clip": 1.05284989, "balance_loss_mlp": 1.02961898, "epoch": 0.31348263941079213, "flos": 20229809915520.0, "grad_norm": 1.9828215257111186, "language_loss": 0.77684069, "learning_rate": 3.2145725669486374e-06, "loss": 0.79862642, "num_input_tokens_seen": 111995055, "step": 5214, "time_per_iteration": 2.6108171939849854 }, { "auxiliary_loss_clip": 0.01099316, "auxiliary_loss_mlp": 0.01038553, "balance_loss_clip": 1.0522778, "balance_loss_mlp": 1.02317524, "epoch": 0.3135427626634601, "flos": 24607643627520.0, "grad_norm": 2.2634840816113075, "language_loss": 0.8300609, "learning_rate": 3.2142631232046517e-06, "loss": 0.8514396, "num_input_tokens_seen": 112015830, "step": 5215, "time_per_iteration": 2.77897047996521 }, { "auxiliary_loss_clip": 0.01131919, "auxiliary_loss_mlp": 0.01040929, "balance_loss_clip": 1.05089617, "balance_loss_mlp": 1.02375078, "epoch": 0.31360288591612806, "flos": 20959873845120.0, "grad_norm": 2.280765330466862, "language_loss": 0.79540187, "learning_rate": 3.213953633415686e-06, "loss": 0.81713033, "num_input_tokens_seen": 112035065, "step": 5216, "time_per_iteration": 2.675492763519287 }, { "auxiliary_loss_clip": 0.01119434, "auxiliary_loss_mlp": 0.01049814, "balance_loss_clip": 1.04817545, "balance_loss_mlp": 1.03174222, "epoch": 0.313663009168796, "flos": 26980513009920.0, "grad_norm": 1.97082305961493, "language_loss": 0.69007474, "learning_rate": 3.213644097593477e-06, "loss": 0.7117672, "num_input_tokens_seen": 112058405, "step": 5217, "time_per_iteration": 2.7360196113586426 }, { "auxiliary_loss_clip": 0.01121348, "auxiliary_loss_mlp": 0.01038659, "balance_loss_clip": 1.04833519, "balance_loss_mlp": 1.02275062, "epoch": 0.313723132421464, "flos": 18040911016320.0, "grad_norm": 1.7253432561329243, "language_loss": 0.81228399, "learning_rate": 3.2133345157497624e-06, "loss": 0.83388406, "num_input_tokens_seen": 112076420, "step": 5218, "time_per_iteration": 4.393778562545776 }, { "auxiliary_loss_clip": 0.01139073, "auxiliary_loss_mlp": 0.01041023, "balance_loss_clip": 1.04819143, "balance_loss_mlp": 1.02422082, "epoch": 0.31378325567413196, "flos": 22488913946880.0, "grad_norm": 2.6452768271158167, "language_loss": 0.69128895, "learning_rate": 3.2130248878962813e-06, "loss": 0.71308994, "num_input_tokens_seen": 112090775, "step": 5219, "time_per_iteration": 4.162578344345093 }, { "auxiliary_loss_clip": 0.01117748, "auxiliary_loss_mlp": 0.01044298, "balance_loss_clip": 1.04879618, "balance_loss_mlp": 1.0287652, "epoch": 0.3138433789267999, "flos": 22419247518720.0, "grad_norm": 5.057996341652072, "language_loss": 0.80019122, "learning_rate": 3.2127152140447747e-06, "loss": 0.82181168, "num_input_tokens_seen": 112110980, "step": 5220, "time_per_iteration": 2.693300247192383 }, { "auxiliary_loss_clip": 0.01133002, "auxiliary_loss_mlp": 0.01038024, "balance_loss_clip": 1.05214572, "balance_loss_mlp": 1.0220139, "epoch": 0.3139035021794679, "flos": 13005912026880.0, "grad_norm": 1.7918234828134079, "language_loss": 0.72575235, "learning_rate": 3.212405494206986e-06, "loss": 0.74746263, "num_input_tokens_seen": 112129020, "step": 5221, "time_per_iteration": 2.6918861865997314 }, { "auxiliary_loss_clip": 0.01105754, "auxiliary_loss_mlp": 0.0104005, "balance_loss_clip": 1.04538214, "balance_loss_mlp": 1.02435017, "epoch": 0.31396362543213585, "flos": 16945994689920.0, "grad_norm": 1.7850671432610508, "language_loss": 0.82097268, "learning_rate": 3.2120957283946588e-06, "loss": 0.84243071, "num_input_tokens_seen": 112147865, "step": 5222, "time_per_iteration": 4.193262100219727 }, { "auxiliary_loss_clip": 0.01136096, "auxiliary_loss_mlp": 0.01044943, "balance_loss_clip": 1.05302894, "balance_loss_mlp": 1.02764595, "epoch": 0.31402374868480387, "flos": 20156731695360.0, "grad_norm": 2.3946225731958073, "language_loss": 0.70159894, "learning_rate": 3.2117859166195407e-06, "loss": 0.7234093, "num_input_tokens_seen": 112166745, "step": 5223, "time_per_iteration": 2.642608642578125 }, { "auxiliary_loss_clip": 0.01120375, "auxiliary_loss_mlp": 0.00773089, "balance_loss_clip": 1.04545665, "balance_loss_mlp": 1.0012387, "epoch": 0.31408387193747184, "flos": 21251073404160.0, "grad_norm": 1.5662600408509175, "language_loss": 0.80818307, "learning_rate": 3.211476058893379e-06, "loss": 0.82711768, "num_input_tokens_seen": 112185895, "step": 5224, "time_per_iteration": 4.334134101867676 }, { "auxiliary_loss_clip": 0.0113849, "auxiliary_loss_mlp": 0.01044903, "balance_loss_clip": 1.05376673, "balance_loss_mlp": 1.02807033, "epoch": 0.3141439951901398, "flos": 27484267299840.0, "grad_norm": 2.581635190586104, "language_loss": 0.57647121, "learning_rate": 3.2111661552279243e-06, "loss": 0.59830517, "num_input_tokens_seen": 112204465, "step": 5225, "time_per_iteration": 2.680227041244507 }, { "auxiliary_loss_clip": 0.01086502, "auxiliary_loss_mlp": 0.01032759, "balance_loss_clip": 1.04252625, "balance_loss_mlp": 1.0179472, "epoch": 0.31420411844280777, "flos": 17852235851520.0, "grad_norm": 2.0500851879408577, "language_loss": 0.81726074, "learning_rate": 3.2108562056349273e-06, "loss": 0.83845341, "num_input_tokens_seen": 112221635, "step": 5226, "time_per_iteration": 2.8080878257751465 }, { "auxiliary_loss_clip": 0.01123539, "auxiliary_loss_mlp": 0.01053238, "balance_loss_clip": 1.04718053, "balance_loss_mlp": 1.03557122, "epoch": 0.31426424169547573, "flos": 21616967295360.0, "grad_norm": 1.8156350578732643, "language_loss": 0.7435357, "learning_rate": 3.210546210126141e-06, "loss": 0.76530349, "num_input_tokens_seen": 112241240, "step": 5227, "time_per_iteration": 2.6420040130615234 }, { "auxiliary_loss_clip": 0.01128154, "auxiliary_loss_mlp": 0.01036288, "balance_loss_clip": 1.05315053, "balance_loss_mlp": 1.01981306, "epoch": 0.3143243649481437, "flos": 30920631586560.0, "grad_norm": 1.9798889840887306, "language_loss": 0.6779027, "learning_rate": 3.2102361687133213e-06, "loss": 0.69954711, "num_input_tokens_seen": 112262350, "step": 5228, "time_per_iteration": 2.6904454231262207 }, { "auxiliary_loss_clip": 0.01116854, "auxiliary_loss_mlp": 0.01042698, "balance_loss_clip": 1.04812217, "balance_loss_mlp": 1.02755868, "epoch": 0.31438448820081166, "flos": 22821411168000.0, "grad_norm": 2.2592581290101648, "language_loss": 0.802086, "learning_rate": 3.2099260814082254e-06, "loss": 0.82368147, "num_input_tokens_seen": 112283710, "step": 5229, "time_per_iteration": 2.720972776412964 }, { "auxiliary_loss_clip": 0.01116185, "auxiliary_loss_mlp": 0.01034979, "balance_loss_clip": 1.04888391, "balance_loss_mlp": 1.01917148, "epoch": 0.3144446114534796, "flos": 23292127923840.0, "grad_norm": 2.206396959728329, "language_loss": 0.69972271, "learning_rate": 3.209615948222611e-06, "loss": 0.72123438, "num_input_tokens_seen": 112304285, "step": 5230, "time_per_iteration": 2.69555401802063 }, { "auxiliary_loss_clip": 0.01094216, "auxiliary_loss_mlp": 0.01051308, "balance_loss_clip": 1.042889, "balance_loss_mlp": 1.03331971, "epoch": 0.3145047347061476, "flos": 31355976424320.0, "grad_norm": 11.083232715551919, "language_loss": 0.79441226, "learning_rate": 3.209305769168239e-06, "loss": 0.81586754, "num_input_tokens_seen": 112325110, "step": 5231, "time_per_iteration": 2.742414712905884 }, { "auxiliary_loss_clip": 0.01111136, "auxiliary_loss_mlp": 0.01044032, "balance_loss_clip": 1.05004621, "balance_loss_mlp": 1.02751017, "epoch": 0.31456485795881556, "flos": 10889552643840.0, "grad_norm": 68.21693219117104, "language_loss": 0.84846044, "learning_rate": 3.2089955442568704e-06, "loss": 0.87001216, "num_input_tokens_seen": 112339855, "step": 5232, "time_per_iteration": 2.681541919708252 }, { "auxiliary_loss_clip": 0.01082351, "auxiliary_loss_mlp": 0.01063678, "balance_loss_clip": 1.04169703, "balance_loss_mlp": 1.04589176, "epoch": 0.3146249812114835, "flos": 17092438439040.0, "grad_norm": 1.732593505271442, "language_loss": 0.79899549, "learning_rate": 3.2086852735002692e-06, "loss": 0.82045579, "num_input_tokens_seen": 112358480, "step": 5233, "time_per_iteration": 2.7261524200439453 }, { "auxiliary_loss_clip": 0.01095476, "auxiliary_loss_mlp": 0.01043701, "balance_loss_clip": 1.04795146, "balance_loss_mlp": 1.02775121, "epoch": 0.3146851044641515, "flos": 55291442889600.0, "grad_norm": 1.8884411146751285, "language_loss": 0.71124369, "learning_rate": 3.2083749569102024e-06, "loss": 0.73263544, "num_input_tokens_seen": 112382350, "step": 5234, "time_per_iteration": 3.0071427822113037 }, { "auxiliary_loss_clip": 0.01105209, "auxiliary_loss_mlp": 0.01036666, "balance_loss_clip": 1.05008078, "balance_loss_mlp": 1.02060878, "epoch": 0.31474522771681945, "flos": 27015884928000.0, "grad_norm": 2.1537517260325396, "language_loss": 0.72106552, "learning_rate": 3.2080645944984356e-06, "loss": 0.74248433, "num_input_tokens_seen": 112400260, "step": 5235, "time_per_iteration": 2.7347464561462402 }, { "auxiliary_loss_clip": 0.011281, "auxiliary_loss_mlp": 0.0103842, "balance_loss_clip": 1.0479089, "balance_loss_mlp": 1.0225656, "epoch": 0.3148053509694875, "flos": 21251935330560.0, "grad_norm": 2.047935998004664, "language_loss": 0.78640145, "learning_rate": 3.2077541862767384e-06, "loss": 0.80806667, "num_input_tokens_seen": 112419400, "step": 5236, "time_per_iteration": 2.6480181217193604 }, { "auxiliary_loss_clip": 0.01142531, "auxiliary_loss_mlp": 0.0104222, "balance_loss_clip": 1.04929006, "balance_loss_mlp": 1.02536416, "epoch": 0.31486547422215544, "flos": 31248675521280.0, "grad_norm": 1.8469097199945863, "language_loss": 0.75903904, "learning_rate": 3.207443732256881e-06, "loss": 0.78088653, "num_input_tokens_seen": 112440825, "step": 5237, "time_per_iteration": 2.7113847732543945 }, { "auxiliary_loss_clip": 0.01133953, "auxiliary_loss_mlp": 0.01035749, "balance_loss_clip": 1.04817045, "balance_loss_mlp": 1.02128255, "epoch": 0.3149255974748234, "flos": 19828615933440.0, "grad_norm": 2.176202072112168, "language_loss": 0.79725033, "learning_rate": 3.2071332324506372e-06, "loss": 0.81894737, "num_input_tokens_seen": 112459180, "step": 5238, "time_per_iteration": 2.649968147277832 }, { "auxiliary_loss_clip": 0.01046118, "auxiliary_loss_mlp": 0.01018852, "balance_loss_clip": 1.02561212, "balance_loss_mlp": 1.01676548, "epoch": 0.31498572072749137, "flos": 67683965339520.0, "grad_norm": 0.8324046464960934, "language_loss": 0.67913729, "learning_rate": 3.2068226868697795e-06, "loss": 0.69978696, "num_input_tokens_seen": 112516680, "step": 5239, "time_per_iteration": 3.130643606185913 }, { "auxiliary_loss_clip": 0.01121581, "auxiliary_loss_mlp": 0.01043617, "balance_loss_clip": 1.04828835, "balance_loss_mlp": 1.02528274, "epoch": 0.31504584398015933, "flos": 19793136274560.0, "grad_norm": 2.4702861290170235, "language_loss": 0.82906926, "learning_rate": 3.2065120955260846e-06, "loss": 0.85072124, "num_input_tokens_seen": 112535895, "step": 5240, "time_per_iteration": 2.6314027309417725 }, { "auxiliary_loss_clip": 0.0111196, "auxiliary_loss_mlp": 0.0077379, "balance_loss_clip": 1.04708409, "balance_loss_mlp": 1.00132334, "epoch": 0.3151059672328273, "flos": 26615409217920.0, "grad_norm": 1.6854261536361361, "language_loss": 0.81405544, "learning_rate": 3.2062014584313302e-06, "loss": 0.83291298, "num_input_tokens_seen": 112557490, "step": 5241, "time_per_iteration": 2.7245657444000244 }, { "auxiliary_loss_clip": 0.01138561, "auxiliary_loss_mlp": 0.01038584, "balance_loss_clip": 1.05094576, "balance_loss_mlp": 1.0230633, "epoch": 0.31516609048549526, "flos": 24204438483840.0, "grad_norm": 1.7554610875937957, "language_loss": 0.74513441, "learning_rate": 3.2058907755972956e-06, "loss": 0.7669059, "num_input_tokens_seen": 112577075, "step": 5242, "time_per_iteration": 2.5925803184509277 }, { "auxiliary_loss_clip": 0.01106752, "auxiliary_loss_mlp": 0.01039069, "balance_loss_clip": 1.04686832, "balance_loss_mlp": 1.02230775, "epoch": 0.31522621373816323, "flos": 25958710817280.0, "grad_norm": 12.905078117761404, "language_loss": 0.73457384, "learning_rate": 3.2055800470357626e-06, "loss": 0.75603199, "num_input_tokens_seen": 112597620, "step": 5243, "time_per_iteration": 2.721261739730835 }, { "auxiliary_loss_clip": 0.01126602, "auxiliary_loss_mlp": 0.01041378, "balance_loss_clip": 1.04783881, "balance_loss_mlp": 1.02524936, "epoch": 0.3152863369908312, "flos": 21908813299200.0, "grad_norm": 2.079273463581607, "language_loss": 0.6462577, "learning_rate": 3.205269272758513e-06, "loss": 0.66793752, "num_input_tokens_seen": 112617150, "step": 5244, "time_per_iteration": 2.6753153800964355 }, { "auxiliary_loss_clip": 0.01087107, "auxiliary_loss_mlp": 0.01037472, "balance_loss_clip": 1.04454994, "balance_loss_mlp": 1.02158141, "epoch": 0.31534646024349916, "flos": 16281072074880.0, "grad_norm": 2.126512737541558, "language_loss": 0.91117549, "learning_rate": 3.2049584527773313e-06, "loss": 0.93242127, "num_input_tokens_seen": 112631090, "step": 5245, "time_per_iteration": 2.717316150665283 }, { "auxiliary_loss_clip": 0.01129236, "auxiliary_loss_mlp": 0.01046116, "balance_loss_clip": 1.04892504, "balance_loss_mlp": 1.02911687, "epoch": 0.3154065834961671, "flos": 24717243000960.0, "grad_norm": 2.0341104694483296, "language_loss": 0.75199413, "learning_rate": 3.2046475871040048e-06, "loss": 0.77374756, "num_input_tokens_seen": 112651220, "step": 5246, "time_per_iteration": 2.738969564437866 }, { "auxiliary_loss_clip": 0.01139621, "auxiliary_loss_mlp": 0.01044826, "balance_loss_clip": 1.04860735, "balance_loss_mlp": 1.027946, "epoch": 0.3154667067488351, "flos": 35371148469120.0, "grad_norm": 1.7161631839732394, "language_loss": 0.61524433, "learning_rate": 3.204336675750321e-06, "loss": 0.63708878, "num_input_tokens_seen": 112671560, "step": 5247, "time_per_iteration": 2.714258909225464 }, { "auxiliary_loss_clip": 0.01129569, "auxiliary_loss_mlp": 0.0104508, "balance_loss_clip": 1.04842138, "balance_loss_mlp": 1.0283072, "epoch": 0.31552683000150306, "flos": 17456464823040.0, "grad_norm": 2.438581052681848, "language_loss": 0.82096362, "learning_rate": 3.2040257187280693e-06, "loss": 0.84271014, "num_input_tokens_seen": 112689790, "step": 5248, "time_per_iteration": 2.6235198974609375 }, { "auxiliary_loss_clip": 0.01121718, "auxiliary_loss_mlp": 0.01047358, "balance_loss_clip": 1.04964209, "balance_loss_mlp": 1.0292145, "epoch": 0.3155869532541711, "flos": 18405763413120.0, "grad_norm": 5.654706808285272, "language_loss": 0.84601712, "learning_rate": 3.2037147160490423e-06, "loss": 0.86770785, "num_input_tokens_seen": 112708265, "step": 5249, "time_per_iteration": 2.664454698562622 }, { "auxiliary_loss_clip": 0.01105599, "auxiliary_loss_mlp": 0.01040266, "balance_loss_clip": 1.04724038, "balance_loss_mlp": 1.02252758, "epoch": 0.31564707650683904, "flos": 21579763783680.0, "grad_norm": 2.1333510394712034, "language_loss": 0.85412121, "learning_rate": 3.2034036677250322e-06, "loss": 0.87557989, "num_input_tokens_seen": 112727820, "step": 5250, "time_per_iteration": 2.7892768383026123 }, { "auxiliary_loss_clip": 0.01110748, "auxiliary_loss_mlp": 0.01044305, "balance_loss_clip": 1.04626083, "balance_loss_mlp": 1.02721059, "epoch": 0.315707199759507, "flos": 21030976817280.0, "grad_norm": 3.250818956981283, "language_loss": 0.68651402, "learning_rate": 3.203092573767835e-06, "loss": 0.70806456, "num_input_tokens_seen": 112743140, "step": 5251, "time_per_iteration": 2.660738468170166 }, { "auxiliary_loss_clip": 0.01141131, "auxiliary_loss_mlp": 0.01040852, "balance_loss_clip": 1.05063367, "balance_loss_mlp": 1.02374566, "epoch": 0.31576732301217497, "flos": 26828861788800.0, "grad_norm": 1.6959923935223091, "language_loss": 0.79367268, "learning_rate": 3.202781434189246e-06, "loss": 0.81549257, "num_input_tokens_seen": 112764705, "step": 5252, "time_per_iteration": 2.6600146293640137 }, { "auxiliary_loss_clip": 0.01123952, "auxiliary_loss_mlp": 0.01055554, "balance_loss_clip": 1.04919744, "balance_loss_mlp": 1.03742182, "epoch": 0.31582744626484294, "flos": 22711165349760.0, "grad_norm": 1.5850214403847396, "language_loss": 0.74167955, "learning_rate": 3.202470249001066e-06, "loss": 0.76347458, "num_input_tokens_seen": 112785310, "step": 5253, "time_per_iteration": 2.6831557750701904 }, { "auxiliary_loss_clip": 0.01117625, "auxiliary_loss_mlp": 0.01042879, "balance_loss_clip": 1.04685211, "balance_loss_mlp": 1.02571261, "epoch": 0.3158875695175109, "flos": 23951914894080.0, "grad_norm": 1.8578399335985847, "language_loss": 0.73295557, "learning_rate": 3.2021590182150924e-06, "loss": 0.75456059, "num_input_tokens_seen": 112802905, "step": 5254, "time_per_iteration": 2.664445161819458 }, { "auxiliary_loss_clip": 0.0112999, "auxiliary_loss_mlp": 0.0104166, "balance_loss_clip": 1.04998255, "balance_loss_mlp": 1.02442837, "epoch": 0.31594769277017887, "flos": 13261883322240.0, "grad_norm": 1.9116991379626416, "language_loss": 0.77497417, "learning_rate": 3.201847741843128e-06, "loss": 0.7966907, "num_input_tokens_seen": 112820305, "step": 5255, "time_per_iteration": 2.5817084312438965 }, { "auxiliary_loss_clip": 0.01116092, "auxiliary_loss_mlp": 0.01045862, "balance_loss_clip": 1.0481391, "balance_loss_mlp": 1.02718151, "epoch": 0.31600781602284683, "flos": 23368258800000.0, "grad_norm": 2.396272573281143, "language_loss": 0.7821492, "learning_rate": 3.2015364198969772e-06, "loss": 0.80376875, "num_input_tokens_seen": 112841185, "step": 5256, "time_per_iteration": 2.6798577308654785 }, { "auxiliary_loss_clip": 0.0109858, "auxiliary_loss_mlp": 0.01042238, "balance_loss_clip": 1.04874921, "balance_loss_mlp": 1.02676511, "epoch": 0.3160679392755148, "flos": 19828580019840.0, "grad_norm": 1.575034121408654, "language_loss": 0.71175283, "learning_rate": 3.2012250523884453e-06, "loss": 0.73316103, "num_input_tokens_seen": 112860570, "step": 5257, "time_per_iteration": 4.252342462539673 }, { "auxiliary_loss_clip": 0.01132481, "auxiliary_loss_mlp": 0.01043271, "balance_loss_clip": 1.05120182, "balance_loss_mlp": 1.02524674, "epoch": 0.31612806252818276, "flos": 20193216935040.0, "grad_norm": 2.0196036815267036, "language_loss": 0.76539034, "learning_rate": 3.2009136393293393e-06, "loss": 0.78714788, "num_input_tokens_seen": 112877975, "step": 5258, "time_per_iteration": 4.240477085113525 }, { "auxiliary_loss_clip": 0.01110908, "auxiliary_loss_mlp": 0.01047088, "balance_loss_clip": 1.04727268, "balance_loss_mlp": 1.02917099, "epoch": 0.31618818578085073, "flos": 24235967646720.0, "grad_norm": 3.2354010090655403, "language_loss": 0.72901475, "learning_rate": 3.200602180731467e-06, "loss": 0.75059474, "num_input_tokens_seen": 112896170, "step": 5259, "time_per_iteration": 2.726944923400879 }, { "auxiliary_loss_clip": 0.01117115, "auxiliary_loss_mlp": 0.00776982, "balance_loss_clip": 1.04983401, "balance_loss_mlp": 1.0013001, "epoch": 0.3162483090335187, "flos": 25081844002560.0, "grad_norm": 2.1961272089612307, "language_loss": 0.66124642, "learning_rate": 3.20029067660664e-06, "loss": 0.68018734, "num_input_tokens_seen": 112916180, "step": 5260, "time_per_iteration": 2.7605621814727783 }, { "auxiliary_loss_clip": 0.01130372, "auxiliary_loss_mlp": 0.01037108, "balance_loss_clip": 1.04645884, "balance_loss_mlp": 1.02016842, "epoch": 0.31630843228618666, "flos": 26323383646080.0, "grad_norm": 1.8277182943015604, "language_loss": 0.71989, "learning_rate": 3.1999791269666706e-06, "loss": 0.74156475, "num_input_tokens_seen": 112936745, "step": 5261, "time_per_iteration": 4.231431484222412 }, { "auxiliary_loss_clip": 0.01044321, "auxiliary_loss_mlp": 0.01007323, "balance_loss_clip": 1.02311194, "balance_loss_mlp": 1.00424767, "epoch": 0.3163685555388547, "flos": 66758441552640.0, "grad_norm": 0.7429950107461195, "language_loss": 0.50646758, "learning_rate": 3.1996675318233716e-06, "loss": 0.5269841, "num_input_tokens_seen": 112994845, "step": 5262, "time_per_iteration": 3.232384443283081 }, { "auxiliary_loss_clip": 0.01131333, "auxiliary_loss_mlp": 0.01046761, "balance_loss_clip": 1.05222106, "balance_loss_mlp": 1.02932084, "epoch": 0.31642867879152264, "flos": 25995662933760.0, "grad_norm": 1.5863649349069382, "language_loss": 0.85187083, "learning_rate": 3.19935589118856e-06, "loss": 0.8736518, "num_input_tokens_seen": 113015125, "step": 5263, "time_per_iteration": 4.33522629737854 }, { "auxiliary_loss_clip": 0.01112644, "auxiliary_loss_mlp": 0.01048382, "balance_loss_clip": 1.04875994, "balance_loss_mlp": 1.03256297, "epoch": 0.3164888020441906, "flos": 25774955815680.0, "grad_norm": 1.550008856477613, "language_loss": 0.81648135, "learning_rate": 3.1990442050740535e-06, "loss": 0.83809161, "num_input_tokens_seen": 113035535, "step": 5264, "time_per_iteration": 2.8155312538146973 }, { "auxiliary_loss_clip": 0.01121259, "auxiliary_loss_mlp": 0.0104222, "balance_loss_clip": 1.04812968, "balance_loss_mlp": 1.02431464, "epoch": 0.3165489252968586, "flos": 19756220071680.0, "grad_norm": 2.234025317189389, "language_loss": 0.78969181, "learning_rate": 3.19873247349167e-06, "loss": 0.81132656, "num_input_tokens_seen": 113052720, "step": 5265, "time_per_iteration": 2.6533524990081787 }, { "auxiliary_loss_clip": 0.0113452, "auxiliary_loss_mlp": 0.01049591, "balance_loss_clip": 1.05209899, "balance_loss_mlp": 1.03144741, "epoch": 0.31660904854952654, "flos": 23183929180800.0, "grad_norm": 1.789116232573577, "language_loss": 0.74705631, "learning_rate": 3.1984206964532307e-06, "loss": 0.76889741, "num_input_tokens_seen": 113071435, "step": 5266, "time_per_iteration": 2.66683292388916 }, { "auxiliary_loss_clip": 0.01108402, "auxiliary_loss_mlp": 0.0104338, "balance_loss_clip": 1.04636073, "balance_loss_mlp": 1.02660751, "epoch": 0.3166691718021945, "flos": 20408501099520.0, "grad_norm": 2.507852328081816, "language_loss": 0.79178059, "learning_rate": 3.1981088739705585e-06, "loss": 0.81329834, "num_input_tokens_seen": 113088645, "step": 5267, "time_per_iteration": 2.6870310306549072 }, { "auxiliary_loss_clip": 0.0103642, "auxiliary_loss_mlp": 0.01002482, "balance_loss_clip": 1.02563763, "balance_loss_mlp": 1.00002623, "epoch": 0.31672929505486247, "flos": 70144781172480.0, "grad_norm": 0.7343006553516018, "language_loss": 0.57840127, "learning_rate": 3.197797006055478e-06, "loss": 0.59879029, "num_input_tokens_seen": 113152775, "step": 5268, "time_per_iteration": 3.211494207382202 }, { "auxiliary_loss_clip": 0.01144761, "auxiliary_loss_mlp": 0.01044165, "balance_loss_clip": 1.0517385, "balance_loss_mlp": 1.02729666, "epoch": 0.31678941830753043, "flos": 14355758154240.0, "grad_norm": 2.2657818682072146, "language_loss": 0.73009932, "learning_rate": 3.197485092719815e-06, "loss": 0.75198865, "num_input_tokens_seen": 113171410, "step": 5269, "time_per_iteration": 2.5840115547180176 }, { "auxiliary_loss_clip": 0.01108492, "auxiliary_loss_mlp": 0.01049824, "balance_loss_clip": 1.0489136, "balance_loss_mlp": 1.03283644, "epoch": 0.3168495415601984, "flos": 22747722416640.0, "grad_norm": 2.2273308320264995, "language_loss": 0.79972744, "learning_rate": 3.1971731339753973e-06, "loss": 0.82131052, "num_input_tokens_seen": 113189965, "step": 5270, "time_per_iteration": 2.858154535293579 }, { "auxiliary_loss_clip": 0.01146892, "auxiliary_loss_mlp": 0.01050124, "balance_loss_clip": 1.05206418, "balance_loss_mlp": 1.03207529, "epoch": 0.31690966481286637, "flos": 20115254465280.0, "grad_norm": 9.25747726986636, "language_loss": 0.7941646, "learning_rate": 3.1968611298340545e-06, "loss": 0.81613475, "num_input_tokens_seen": 113206355, "step": 5271, "time_per_iteration": 2.6510884761810303 }, { "auxiliary_loss_clip": 0.01144344, "auxiliary_loss_mlp": 0.01040088, "balance_loss_clip": 1.05230093, "balance_loss_mlp": 1.02269578, "epoch": 0.31696978806553433, "flos": 21178928937600.0, "grad_norm": 1.806612869692892, "language_loss": 0.72429144, "learning_rate": 3.1965490803076173e-06, "loss": 0.74613577, "num_input_tokens_seen": 113225440, "step": 5272, "time_per_iteration": 2.6807363033294678 }, { "auxiliary_loss_clip": 0.01123855, "auxiliary_loss_mlp": 0.01052611, "balance_loss_clip": 1.04942703, "balance_loss_mlp": 1.03365636, "epoch": 0.3170299113182023, "flos": 42997030439040.0, "grad_norm": 2.241731745129767, "language_loss": 0.69146693, "learning_rate": 3.1962369854079194e-06, "loss": 0.71323156, "num_input_tokens_seen": 113248840, "step": 5273, "time_per_iteration": 2.9202728271484375 }, { "auxiliary_loss_clip": 0.01128467, "auxiliary_loss_mlp": 0.00775845, "balance_loss_clip": 1.04869509, "balance_loss_mlp": 1.00146461, "epoch": 0.31709003457087026, "flos": 24460158384000.0, "grad_norm": 1.872718303622414, "language_loss": 0.67764306, "learning_rate": 3.195924845146795e-06, "loss": 0.69668615, "num_input_tokens_seen": 113269630, "step": 5274, "time_per_iteration": 2.6541714668273926 }, { "auxiliary_loss_clip": 0.01092683, "auxiliary_loss_mlp": 0.0106112, "balance_loss_clip": 1.04346347, "balance_loss_mlp": 1.04305935, "epoch": 0.3171501578235382, "flos": 24135310759680.0, "grad_norm": 1.7402048894999724, "language_loss": 0.80815518, "learning_rate": 3.195612659536081e-06, "loss": 0.8296932, "num_input_tokens_seen": 113291200, "step": 5275, "time_per_iteration": 2.840696096420288 }, { "auxiliary_loss_clip": 0.0113287, "auxiliary_loss_mlp": 0.01047853, "balance_loss_clip": 1.04862475, "balance_loss_mlp": 1.02979279, "epoch": 0.31721028107620625, "flos": 18879712392960.0, "grad_norm": 2.28886723118271, "language_loss": 0.72418922, "learning_rate": 3.1953004285876147e-06, "loss": 0.74599648, "num_input_tokens_seen": 113310170, "step": 5276, "time_per_iteration": 2.6426591873168945 }, { "auxiliary_loss_clip": 0.01122606, "auxiliary_loss_mlp": 0.01041381, "balance_loss_clip": 1.05439019, "balance_loss_mlp": 1.02588356, "epoch": 0.3172704043288742, "flos": 23147874904320.0, "grad_norm": 1.4542936031710312, "language_loss": 0.77923822, "learning_rate": 3.194988152313236e-06, "loss": 0.80087811, "num_input_tokens_seen": 113331140, "step": 5277, "time_per_iteration": 2.7192864418029785 }, { "auxiliary_loss_clip": 0.01113098, "auxiliary_loss_mlp": 0.01054598, "balance_loss_clip": 1.04708886, "balance_loss_mlp": 1.03432024, "epoch": 0.3173305275815422, "flos": 17858520731520.0, "grad_norm": 2.071832444797603, "language_loss": 0.79029107, "learning_rate": 3.1946758307247878e-06, "loss": 0.81196797, "num_input_tokens_seen": 113350030, "step": 5278, "time_per_iteration": 2.606973648071289 }, { "auxiliary_loss_clip": 0.01041198, "auxiliary_loss_mlp": 0.01006121, "balance_loss_clip": 1.02207565, "balance_loss_mlp": 1.00391531, "epoch": 0.31739065083421014, "flos": 59973476883840.0, "grad_norm": 0.8783580735908582, "language_loss": 0.62817574, "learning_rate": 3.1943634638341114e-06, "loss": 0.64864898, "num_input_tokens_seen": 113395820, "step": 5279, "time_per_iteration": 2.998594284057617 }, { "auxiliary_loss_clip": 0.01146927, "auxiliary_loss_mlp": 0.01055699, "balance_loss_clip": 1.05080009, "balance_loss_mlp": 1.03651857, "epoch": 0.3174507740868781, "flos": 23800981944960.0, "grad_norm": 1.4881688285488497, "language_loss": 0.80855167, "learning_rate": 3.194051051653053e-06, "loss": 0.83057791, "num_input_tokens_seen": 113416835, "step": 5280, "time_per_iteration": 2.662240743637085 }, { "auxiliary_loss_clip": 0.0110603, "auxiliary_loss_mlp": 0.01050191, "balance_loss_clip": 1.04850507, "balance_loss_mlp": 1.0339663, "epoch": 0.31751089733954607, "flos": 27638899349760.0, "grad_norm": 1.6411021360183768, "language_loss": 0.77964067, "learning_rate": 3.19373859419346e-06, "loss": 0.80120289, "num_input_tokens_seen": 113440850, "step": 5281, "time_per_iteration": 2.8303840160369873 }, { "auxiliary_loss_clip": 0.01119054, "auxiliary_loss_mlp": 0.0103955, "balance_loss_clip": 1.04812443, "balance_loss_mlp": 1.02194262, "epoch": 0.31757102059221404, "flos": 23769273214080.0, "grad_norm": 2.6184534699054116, "language_loss": 0.78539747, "learning_rate": 3.193426091467179e-06, "loss": 0.80698353, "num_input_tokens_seen": 113461000, "step": 5282, "time_per_iteration": 2.75915265083313 }, { "auxiliary_loss_clip": 0.01122553, "auxiliary_loss_mlp": 0.01050996, "balance_loss_clip": 1.0517695, "balance_loss_mlp": 1.03284001, "epoch": 0.317631143844882, "flos": 25264521596160.0, "grad_norm": 1.8901773671102746, "language_loss": 0.67857707, "learning_rate": 3.193113543486061e-06, "loss": 0.70031261, "num_input_tokens_seen": 113480820, "step": 5283, "time_per_iteration": 2.710601329803467 }, { "auxiliary_loss_clip": 0.01039071, "auxiliary_loss_mlp": 0.01003581, "balance_loss_clip": 1.02084279, "balance_loss_mlp": 1.00145948, "epoch": 0.31769126709754997, "flos": 55825939221120.0, "grad_norm": 0.7284643981615322, "language_loss": 0.52787578, "learning_rate": 3.192800950261958e-06, "loss": 0.54830229, "num_input_tokens_seen": 113536910, "step": 5284, "time_per_iteration": 3.1312994956970215 }, { "auxiliary_loss_clip": 0.01123508, "auxiliary_loss_mlp": 0.01041652, "balance_loss_clip": 1.05256152, "balance_loss_mlp": 1.02529633, "epoch": 0.31775139035021793, "flos": 16690562098560.0, "grad_norm": 1.6358492252526933, "language_loss": 0.70703542, "learning_rate": 3.1924883118067235e-06, "loss": 0.72868699, "num_input_tokens_seen": 113555480, "step": 5285, "time_per_iteration": 2.66414213180542 }, { "auxiliary_loss_clip": 0.01051594, "auxiliary_loss_mlp": 0.01001353, "balance_loss_clip": 1.02112103, "balance_loss_mlp": 0.99919558, "epoch": 0.3178115136028859, "flos": 64227241019520.0, "grad_norm": 0.8795363824150627, "language_loss": 0.60495377, "learning_rate": 3.1921756281322123e-06, "loss": 0.62548316, "num_input_tokens_seen": 113616790, "step": 5286, "time_per_iteration": 3.1636195182800293 }, { "auxiliary_loss_clip": 0.01145219, "auxiliary_loss_mlp": 0.01047411, "balance_loss_clip": 1.05137587, "balance_loss_mlp": 1.02995849, "epoch": 0.31787163685555386, "flos": 18697465762560.0, "grad_norm": 10.257300688850748, "language_loss": 0.72160053, "learning_rate": 3.1918628992502826e-06, "loss": 0.74352682, "num_input_tokens_seen": 113635320, "step": 5287, "time_per_iteration": 2.628863573074341 }, { "auxiliary_loss_clip": 0.01132987, "auxiliary_loss_mlp": 0.0105662, "balance_loss_clip": 1.04966712, "balance_loss_mlp": 1.03823805, "epoch": 0.31793176010822183, "flos": 21324762155520.0, "grad_norm": 2.3229849512265126, "language_loss": 0.75706261, "learning_rate": 3.191550125172792e-06, "loss": 0.77895868, "num_input_tokens_seen": 113654000, "step": 5288, "time_per_iteration": 2.7565319538116455 }, { "auxiliary_loss_clip": 0.01128698, "auxiliary_loss_mlp": 0.01037369, "balance_loss_clip": 1.04913831, "balance_loss_mlp": 1.02223587, "epoch": 0.31799188336088985, "flos": 20958688696320.0, "grad_norm": 3.550043827117326, "language_loss": 0.87827504, "learning_rate": 3.1912373059116007e-06, "loss": 0.89993572, "num_input_tokens_seen": 113672375, "step": 5289, "time_per_iteration": 2.6671485900878906 }, { "auxiliary_loss_clip": 0.01126628, "auxiliary_loss_mlp": 0.01039655, "balance_loss_clip": 1.05225897, "balance_loss_mlp": 1.02443218, "epoch": 0.3180520066135578, "flos": 22491930689280.0, "grad_norm": 1.767762146387748, "language_loss": 0.68103814, "learning_rate": 3.190924441478572e-06, "loss": 0.70270097, "num_input_tokens_seen": 113692385, "step": 5290, "time_per_iteration": 2.6986947059631348 }, { "auxiliary_loss_clip": 0.01120385, "auxiliary_loss_mlp": 0.01046806, "balance_loss_clip": 1.04791737, "balance_loss_mlp": 1.02924609, "epoch": 0.3181121298662258, "flos": 27235335070080.0, "grad_norm": 2.1353951835610303, "language_loss": 0.80298805, "learning_rate": 3.1906115318855687e-06, "loss": 0.82465994, "num_input_tokens_seen": 113712145, "step": 5291, "time_per_iteration": 2.67692494392395 }, { "auxiliary_loss_clip": 0.01112404, "auxiliary_loss_mlp": 0.01038285, "balance_loss_clip": 1.05768418, "balance_loss_mlp": 1.02066636, "epoch": 0.31817225311889374, "flos": 23180158252800.0, "grad_norm": 4.0426741537939614, "language_loss": 0.79877901, "learning_rate": 3.1902985771444577e-06, "loss": 0.82028592, "num_input_tokens_seen": 113731435, "step": 5292, "time_per_iteration": 2.8386974334716797 }, { "auxiliary_loss_clip": 0.01126783, "auxiliary_loss_mlp": 0.01037968, "balance_loss_clip": 1.05076253, "balance_loss_mlp": 1.0233407, "epoch": 0.3182323763715617, "flos": 23258803080960.0, "grad_norm": 1.5696258430885255, "language_loss": 0.74754488, "learning_rate": 3.1899855772671043e-06, "loss": 0.7691924, "num_input_tokens_seen": 113750825, "step": 5293, "time_per_iteration": 2.651566982269287 }, { "auxiliary_loss_clip": 0.01129161, "auxiliary_loss_mlp": 0.01045458, "balance_loss_clip": 1.05253696, "balance_loss_mlp": 1.03027081, "epoch": 0.3182924996242297, "flos": 29016683280000.0, "grad_norm": 1.9205945835079516, "language_loss": 0.74100351, "learning_rate": 3.189672532265379e-06, "loss": 0.76274973, "num_input_tokens_seen": 113770010, "step": 5294, "time_per_iteration": 2.6593024730682373 }, { "auxiliary_loss_clip": 0.01145372, "auxiliary_loss_mlp": 0.01038723, "balance_loss_clip": 1.05254447, "balance_loss_mlp": 1.02166462, "epoch": 0.31835262287689764, "flos": 20449188230400.0, "grad_norm": 3.618714545146935, "language_loss": 0.76019043, "learning_rate": 3.189359442151152e-06, "loss": 0.78203136, "num_input_tokens_seen": 113788640, "step": 5295, "time_per_iteration": 2.597567558288574 }, { "auxiliary_loss_clip": 0.01110615, "auxiliary_loss_mlp": 0.01046432, "balance_loss_clip": 1.04994202, "balance_loss_mlp": 1.02979052, "epoch": 0.3184127461295656, "flos": 25119478477440.0, "grad_norm": 2.278908740959458, "language_loss": 0.69146252, "learning_rate": 3.189046306936296e-06, "loss": 0.71303296, "num_input_tokens_seen": 113809515, "step": 5296, "time_per_iteration": 4.286029100418091 }, { "auxiliary_loss_clip": 0.01115954, "auxiliary_loss_mlp": 0.01043279, "balance_loss_clip": 1.04866266, "balance_loss_mlp": 1.02709007, "epoch": 0.31847286938223357, "flos": 25551231955200.0, "grad_norm": 1.7786470593469696, "language_loss": 0.77374327, "learning_rate": 3.1887331266326846e-06, "loss": 0.79533565, "num_input_tokens_seen": 113829770, "step": 5297, "time_per_iteration": 4.164870023727417 }, { "auxiliary_loss_clip": 0.0111312, "auxiliary_loss_mlp": 0.01036407, "balance_loss_clip": 1.05341816, "balance_loss_mlp": 1.01857328, "epoch": 0.31853299263490154, "flos": 27782470010880.0, "grad_norm": 2.4185702861431104, "language_loss": 0.79294181, "learning_rate": 3.1884199012521942e-06, "loss": 0.81443709, "num_input_tokens_seen": 113849320, "step": 5298, "time_per_iteration": 2.761035919189453 }, { "auxiliary_loss_clip": 0.01127152, "auxiliary_loss_mlp": 0.01052383, "balance_loss_clip": 1.05250955, "balance_loss_mlp": 1.0361588, "epoch": 0.3185931158875695, "flos": 22706747976960.0, "grad_norm": 2.109744523678234, "language_loss": 0.74082595, "learning_rate": 3.1881066308067016e-06, "loss": 0.76262128, "num_input_tokens_seen": 113867860, "step": 5299, "time_per_iteration": 2.6674296855926514 }, { "auxiliary_loss_clip": 0.01133842, "auxiliary_loss_mlp": 0.01048899, "balance_loss_clip": 1.05652189, "balance_loss_mlp": 1.03213775, "epoch": 0.31865323914023747, "flos": 24571517523840.0, "grad_norm": 2.0125699214837627, "language_loss": 0.78636098, "learning_rate": 3.1877933153080873e-06, "loss": 0.80818832, "num_input_tokens_seen": 113886375, "step": 5300, "time_per_iteration": 2.721202850341797 }, { "auxiliary_loss_clip": 0.01119633, "auxiliary_loss_mlp": 0.01050293, "balance_loss_clip": 1.04830885, "balance_loss_mlp": 1.03297138, "epoch": 0.31871336239290543, "flos": 18186564666240.0, "grad_norm": 1.8639511619571896, "language_loss": 0.83660495, "learning_rate": 3.1874799547682304e-06, "loss": 0.8583042, "num_input_tokens_seen": 113904065, "step": 5301, "time_per_iteration": 4.22704291343689 }, { "auxiliary_loss_clip": 0.01131996, "auxiliary_loss_mlp": 0.01049945, "balance_loss_clip": 1.05371821, "balance_loss_mlp": 1.03263569, "epoch": 0.31877348564557345, "flos": 21826756679040.0, "grad_norm": 2.3173946845583444, "language_loss": 0.77328432, "learning_rate": 3.187166549199015e-06, "loss": 0.79510373, "num_input_tokens_seen": 113918415, "step": 5302, "time_per_iteration": 2.6678919792175293 }, { "auxiliary_loss_clip": 0.011364, "auxiliary_loss_mlp": 0.01039827, "balance_loss_clip": 1.04891157, "balance_loss_mlp": 1.02270818, "epoch": 0.3188336088982414, "flos": 22015252275840.0, "grad_norm": 2.352282677018458, "language_loss": 0.79816842, "learning_rate": 3.1868530986123255e-06, "loss": 0.81993073, "num_input_tokens_seen": 113938135, "step": 5303, "time_per_iteration": 4.289660453796387 }, { "auxiliary_loss_clip": 0.0113563, "auxiliary_loss_mlp": 0.01045445, "balance_loss_clip": 1.05256605, "balance_loss_mlp": 1.02739668, "epoch": 0.3188937321509094, "flos": 20047886507520.0, "grad_norm": 2.03328242361333, "language_loss": 0.72914493, "learning_rate": 3.186539603020047e-06, "loss": 0.7509557, "num_input_tokens_seen": 113957125, "step": 5304, "time_per_iteration": 2.6123225688934326 }, { "auxiliary_loss_clip": 0.01106707, "auxiliary_loss_mlp": 0.01038113, "balance_loss_clip": 1.04701817, "balance_loss_mlp": 1.02234125, "epoch": 0.31895385540357735, "flos": 25848105863040.0, "grad_norm": 2.816339992135166, "language_loss": 0.71918428, "learning_rate": 3.186226062434068e-06, "loss": 0.74063241, "num_input_tokens_seen": 113974875, "step": 5305, "time_per_iteration": 2.7341108322143555 }, { "auxiliary_loss_clip": 0.01120594, "auxiliary_loss_mlp": 0.01042646, "balance_loss_clip": 1.05007052, "balance_loss_mlp": 1.0271126, "epoch": 0.3190139786562453, "flos": 23477714519040.0, "grad_norm": 2.1368418928112067, "language_loss": 0.64082253, "learning_rate": 3.1859124768662778e-06, "loss": 0.66245496, "num_input_tokens_seen": 113994450, "step": 5306, "time_per_iteration": 2.678497791290283 }, { "auxiliary_loss_clip": 0.01113987, "auxiliary_loss_mlp": 0.01046306, "balance_loss_clip": 1.04777002, "balance_loss_mlp": 1.02913976, "epoch": 0.3190741019089133, "flos": 29095543589760.0, "grad_norm": 2.249856956834014, "language_loss": 0.7981708, "learning_rate": 3.1855988463285678e-06, "loss": 0.81977379, "num_input_tokens_seen": 114013945, "step": 5307, "time_per_iteration": 2.684825897216797 }, { "auxiliary_loss_clip": 0.01110939, "auxiliary_loss_mlp": 0.01046246, "balance_loss_clip": 1.04708028, "balance_loss_mlp": 1.02869821, "epoch": 0.31913422516158124, "flos": 17129534209920.0, "grad_norm": 1.891192054321282, "language_loss": 0.77413881, "learning_rate": 3.1852851708328308e-06, "loss": 0.79571068, "num_input_tokens_seen": 114031375, "step": 5308, "time_per_iteration": 2.62485408782959 }, { "auxiliary_loss_clip": 0.01142071, "auxiliary_loss_mlp": 0.01050679, "balance_loss_clip": 1.05399549, "balance_loss_mlp": 1.03109312, "epoch": 0.3191943484142492, "flos": 16069846147200.0, "grad_norm": 3.6914677983836586, "language_loss": 0.73960984, "learning_rate": 3.184971450390961e-06, "loss": 0.76153737, "num_input_tokens_seen": 114048465, "step": 5309, "time_per_iteration": 2.6268463134765625 }, { "auxiliary_loss_clip": 0.01134349, "auxiliary_loss_mlp": 0.01035267, "balance_loss_clip": 1.05286658, "balance_loss_mlp": 1.01932931, "epoch": 0.3192544716669172, "flos": 22966166977920.0, "grad_norm": 1.9182514579370458, "language_loss": 0.82652342, "learning_rate": 3.184657685014856e-06, "loss": 0.84821963, "num_input_tokens_seen": 114068415, "step": 5310, "time_per_iteration": 2.649099111557007 }, { "auxiliary_loss_clip": 0.01116653, "auxiliary_loss_mlp": 0.01039176, "balance_loss_clip": 1.04808259, "balance_loss_mlp": 1.02340484, "epoch": 0.31931459491958514, "flos": 26870339018880.0, "grad_norm": 2.200225110342558, "language_loss": 0.78296745, "learning_rate": 3.184343874716412e-06, "loss": 0.80452585, "num_input_tokens_seen": 114088565, "step": 5311, "time_per_iteration": 2.7054250240325928 }, { "auxiliary_loss_clip": 0.01106724, "auxiliary_loss_mlp": 0.01036895, "balance_loss_clip": 1.04822886, "balance_loss_mlp": 1.01952648, "epoch": 0.3193747181722531, "flos": 21836525178240.0, "grad_norm": 2.0057857548781883, "language_loss": 0.84169972, "learning_rate": 3.1840300195075295e-06, "loss": 0.86313581, "num_input_tokens_seen": 114107160, "step": 5312, "time_per_iteration": 2.749263048171997 }, { "auxiliary_loss_clip": 0.01093899, "auxiliary_loss_mlp": 0.01053441, "balance_loss_clip": 1.04266024, "balance_loss_mlp": 1.03477311, "epoch": 0.31943484142492107, "flos": 18324999682560.0, "grad_norm": 3.6700749085790063, "language_loss": 0.78648412, "learning_rate": 3.1837161194001102e-06, "loss": 0.80795753, "num_input_tokens_seen": 114123420, "step": 5313, "time_per_iteration": 2.720930814743042 }, { "auxiliary_loss_clip": 0.01130677, "auxiliary_loss_mlp": 0.01038161, "balance_loss_clip": 1.05141878, "balance_loss_mlp": 1.0219605, "epoch": 0.31949496467758903, "flos": 21615818060160.0, "grad_norm": 2.386195329240294, "language_loss": 0.86217451, "learning_rate": 3.183402174406057e-06, "loss": 0.88386285, "num_input_tokens_seen": 114139230, "step": 5314, "time_per_iteration": 2.6785764694213867 }, { "auxiliary_loss_clip": 0.01116655, "auxiliary_loss_mlp": 0.01050856, "balance_loss_clip": 1.04983997, "balance_loss_mlp": 1.03231871, "epoch": 0.31955508793025705, "flos": 21760214734080.0, "grad_norm": 1.996028492072791, "language_loss": 0.79866767, "learning_rate": 3.1830881845372747e-06, "loss": 0.82034278, "num_input_tokens_seen": 114159290, "step": 5315, "time_per_iteration": 2.723097085952759 }, { "auxiliary_loss_clip": 0.0110521, "auxiliary_loss_mlp": 0.01063258, "balance_loss_clip": 1.04667854, "balance_loss_mlp": 1.04386258, "epoch": 0.319615211182925, "flos": 17164331510400.0, "grad_norm": 2.2633227615123275, "language_loss": 0.67312729, "learning_rate": 3.18277414980567e-06, "loss": 0.69481194, "num_input_tokens_seen": 114177655, "step": 5316, "time_per_iteration": 2.7841827869415283 }, { "auxiliary_loss_clip": 0.01131119, "auxiliary_loss_mlp": 0.01046731, "balance_loss_clip": 1.05015874, "balance_loss_mlp": 1.03126907, "epoch": 0.319675334435593, "flos": 28112812416000.0, "grad_norm": 1.540647016415601, "language_loss": 0.69375229, "learning_rate": 3.1824600702231515e-06, "loss": 0.71553081, "num_input_tokens_seen": 114200880, "step": 5317, "time_per_iteration": 2.7080705165863037 }, { "auxiliary_loss_clip": 0.01036788, "auxiliary_loss_mlp": 0.01033442, "balance_loss_clip": 1.02571428, "balance_loss_mlp": 1.03117692, "epoch": 0.31973545768826095, "flos": 69501119408640.0, "grad_norm": 0.7974882454120521, "language_loss": 0.53049421, "learning_rate": 3.182145945801628e-06, "loss": 0.55119646, "num_input_tokens_seen": 114267145, "step": 5318, "time_per_iteration": 3.5072765350341797 }, { "auxiliary_loss_clip": 0.0114058, "auxiliary_loss_mlp": 0.01041014, "balance_loss_clip": 1.05322218, "balance_loss_mlp": 1.02509975, "epoch": 0.3197955809409289, "flos": 13699203408000.0, "grad_norm": 3.679429868734815, "language_loss": 0.84239668, "learning_rate": 3.181831776553012e-06, "loss": 0.86421257, "num_input_tokens_seen": 114284630, "step": 5319, "time_per_iteration": 2.6148228645324707 }, { "auxiliary_loss_clip": 0.0112589, "auxiliary_loss_mlp": 0.01041338, "balance_loss_clip": 1.04876614, "balance_loss_mlp": 1.02552485, "epoch": 0.3198557041935969, "flos": 33218124278400.0, "grad_norm": 1.684363339069699, "language_loss": 0.63463295, "learning_rate": 3.1815175624892165e-06, "loss": 0.65630519, "num_input_tokens_seen": 114305830, "step": 5320, "time_per_iteration": 2.7444913387298584 }, { "auxiliary_loss_clip": 0.01120865, "auxiliary_loss_mlp": 0.01042926, "balance_loss_clip": 1.05072045, "balance_loss_mlp": 1.02682114, "epoch": 0.31991582744626484, "flos": 23732033788800.0, "grad_norm": 2.113040492667506, "language_loss": 0.70552826, "learning_rate": 3.1812033036221567e-06, "loss": 0.72716618, "num_input_tokens_seen": 114325165, "step": 5321, "time_per_iteration": 2.7078404426574707 }, { "auxiliary_loss_clip": 0.01151862, "auxiliary_loss_mlp": 0.00776802, "balance_loss_clip": 1.05639851, "balance_loss_mlp": 1.00126243, "epoch": 0.3199759506989328, "flos": 18550842445440.0, "grad_norm": 2.699319417691227, "language_loss": 0.8659147, "learning_rate": 3.180888999963749e-06, "loss": 0.88520133, "num_input_tokens_seen": 114341310, "step": 5322, "time_per_iteration": 2.5562047958374023 }, { "auxiliary_loss_clip": 0.01119411, "auxiliary_loss_mlp": 0.01038951, "balance_loss_clip": 1.05106568, "balance_loss_mlp": 1.02265561, "epoch": 0.3200360739516008, "flos": 22418888382720.0, "grad_norm": 1.7451682184714292, "language_loss": 0.83021653, "learning_rate": 3.1805746515259123e-06, "loss": 0.85180014, "num_input_tokens_seen": 114360355, "step": 5323, "time_per_iteration": 2.6323180198669434 }, { "auxiliary_loss_clip": 0.01129356, "auxiliary_loss_mlp": 0.01041616, "balance_loss_clip": 1.05092812, "balance_loss_mlp": 1.02440214, "epoch": 0.32009619720426874, "flos": 20595236929920.0, "grad_norm": 1.6785162629315, "language_loss": 0.77686846, "learning_rate": 3.1802602583205663e-06, "loss": 0.79857814, "num_input_tokens_seen": 114379220, "step": 5324, "time_per_iteration": 2.6361289024353027 }, { "auxiliary_loss_clip": 0.01115575, "auxiliary_loss_mlp": 0.01035772, "balance_loss_clip": 1.04754376, "balance_loss_mlp": 1.01861751, "epoch": 0.3201563204569367, "flos": 18147637301760.0, "grad_norm": 1.9010400542588533, "language_loss": 0.80500418, "learning_rate": 3.1799458203596333e-06, "loss": 0.82651764, "num_input_tokens_seen": 114396365, "step": 5325, "time_per_iteration": 2.681349277496338 }, { "auxiliary_loss_clip": 0.01133585, "auxiliary_loss_mlp": 0.01039966, "balance_loss_clip": 1.05378425, "balance_loss_mlp": 1.02394414, "epoch": 0.32021644370960467, "flos": 31684235840640.0, "grad_norm": 1.7412856997403743, "language_loss": 0.74817789, "learning_rate": 3.179631337655037e-06, "loss": 0.76991343, "num_input_tokens_seen": 114416780, "step": 5326, "time_per_iteration": 2.6932616233825684 }, { "auxiliary_loss_clip": 0.01103829, "auxiliary_loss_mlp": 0.0104309, "balance_loss_clip": 1.05045807, "balance_loss_mlp": 1.02659154, "epoch": 0.32027656696227264, "flos": 26865921646080.0, "grad_norm": 1.642662123916105, "language_loss": 0.80796289, "learning_rate": 3.179316810218701e-06, "loss": 0.82943213, "num_input_tokens_seen": 114437405, "step": 5327, "time_per_iteration": 2.7527899742126465 }, { "auxiliary_loss_clip": 0.01115203, "auxiliary_loss_mlp": 0.01038297, "balance_loss_clip": 1.05185604, "balance_loss_mlp": 1.02162015, "epoch": 0.32033669021494066, "flos": 24169928492160.0, "grad_norm": 1.846540372387515, "language_loss": 0.77796161, "learning_rate": 3.179002238062554e-06, "loss": 0.79949659, "num_input_tokens_seen": 114458505, "step": 5328, "time_per_iteration": 2.7631096839904785 }, { "auxiliary_loss_clip": 0.01087281, "auxiliary_loss_mlp": 0.01043102, "balance_loss_clip": 1.0453198, "balance_loss_mlp": 1.0245527, "epoch": 0.3203968134676086, "flos": 24460768915200.0, "grad_norm": 1.6837826518335735, "language_loss": 0.74184239, "learning_rate": 3.178687621198524e-06, "loss": 0.76314622, "num_input_tokens_seen": 114479050, "step": 5329, "time_per_iteration": 2.7749221324920654 }, { "auxiliary_loss_clip": 0.01110066, "auxiliary_loss_mlp": 0.01036662, "balance_loss_clip": 1.04650402, "balance_loss_mlp": 1.02133203, "epoch": 0.3204569367202766, "flos": 18004713085440.0, "grad_norm": 1.7163505659405243, "language_loss": 0.71138644, "learning_rate": 3.1783729596385415e-06, "loss": 0.73285371, "num_input_tokens_seen": 114497415, "step": 5330, "time_per_iteration": 2.655578136444092 }, { "auxiliary_loss_clip": 0.01093261, "auxiliary_loss_mlp": 0.01053955, "balance_loss_clip": 1.05082417, "balance_loss_mlp": 1.03379714, "epoch": 0.32051705997294455, "flos": 30589678650240.0, "grad_norm": 1.6854796065505788, "language_loss": 0.80175424, "learning_rate": 3.1780582533945376e-06, "loss": 0.82322645, "num_input_tokens_seen": 114518785, "step": 5331, "time_per_iteration": 2.851639747619629 }, { "auxiliary_loss_clip": 0.01040347, "auxiliary_loss_mlp": 0.01008357, "balance_loss_clip": 1.02573299, "balance_loss_mlp": 1.0059495, "epoch": 0.3205771832256125, "flos": 68417979765120.0, "grad_norm": 0.8321512232204817, "language_loss": 0.57821107, "learning_rate": 3.177743502478447e-06, "loss": 0.59869808, "num_input_tokens_seen": 114577710, "step": 5332, "time_per_iteration": 3.1104307174682617 }, { "auxiliary_loss_clip": 0.01104131, "auxiliary_loss_mlp": 0.01038271, "balance_loss_clip": 1.04842329, "balance_loss_mlp": 1.02194548, "epoch": 0.3206373064782805, "flos": 30443953173120.0, "grad_norm": 1.7127909178457088, "language_loss": 0.72918129, "learning_rate": 3.177428706902205e-06, "loss": 0.75060534, "num_input_tokens_seen": 114598640, "step": 5333, "time_per_iteration": 2.7683963775634766 }, { "auxiliary_loss_clip": 0.01118957, "auxiliary_loss_mlp": 0.01043487, "balance_loss_clip": 1.04778981, "balance_loss_mlp": 1.02685761, "epoch": 0.32069742973094845, "flos": 22054502862720.0, "grad_norm": 2.1728626414536767, "language_loss": 0.70592654, "learning_rate": 3.1771138666777485e-06, "loss": 0.72755098, "num_input_tokens_seen": 114618780, "step": 5334, "time_per_iteration": 2.6861116886138916 }, { "auxiliary_loss_clip": 0.01100969, "auxiliary_loss_mlp": 0.01041644, "balance_loss_clip": 1.04742825, "balance_loss_mlp": 1.02536023, "epoch": 0.3207575529836164, "flos": 22054000072320.0, "grad_norm": 2.526978692505362, "language_loss": 0.77161503, "learning_rate": 3.1767989818170156e-06, "loss": 0.79304117, "num_input_tokens_seen": 114637525, "step": 5335, "time_per_iteration": 4.33164381980896 }, { "auxiliary_loss_clip": 0.01130469, "auxiliary_loss_mlp": 0.01038297, "balance_loss_clip": 1.05087018, "balance_loss_mlp": 1.02213204, "epoch": 0.3208176762362844, "flos": 34057536186240.0, "grad_norm": 1.6997548644452432, "language_loss": 0.68414462, "learning_rate": 3.1764840523319477e-06, "loss": 0.7058323, "num_input_tokens_seen": 114659705, "step": 5336, "time_per_iteration": 2.840373992919922 }, { "auxiliary_loss_clip": 0.01102432, "auxiliary_loss_mlp": 0.01055244, "balance_loss_clip": 1.04495001, "balance_loss_mlp": 1.03862596, "epoch": 0.32087779948895234, "flos": 21798711135360.0, "grad_norm": 1.733261513029939, "language_loss": 0.78828537, "learning_rate": 3.176169078234487e-06, "loss": 0.8098622, "num_input_tokens_seen": 114678340, "step": 5337, "time_per_iteration": 4.268811464309692 }, { "auxiliary_loss_clip": 0.01121282, "auxiliary_loss_mlp": 0.01039712, "balance_loss_clip": 1.04696417, "balance_loss_mlp": 1.02512085, "epoch": 0.3209379227416203, "flos": 21434110133760.0, "grad_norm": 2.1583979373304194, "language_loss": 0.74322718, "learning_rate": 3.1758540595365766e-06, "loss": 0.76483715, "num_input_tokens_seen": 114696980, "step": 5338, "time_per_iteration": 2.6442766189575195 }, { "auxiliary_loss_clip": 0.01119062, "auxiliary_loss_mlp": 0.01047297, "balance_loss_clip": 1.04633641, "balance_loss_mlp": 1.03078675, "epoch": 0.3209980459942883, "flos": 25849075530240.0, "grad_norm": 2.118549362741933, "language_loss": 0.62622869, "learning_rate": 3.1755389962501626e-06, "loss": 0.64789224, "num_input_tokens_seen": 114717330, "step": 5339, "time_per_iteration": 2.684843063354492 }, { "auxiliary_loss_clip": 0.01141698, "auxiliary_loss_mlp": 0.01046177, "balance_loss_clip": 1.05127931, "balance_loss_mlp": 1.02954674, "epoch": 0.32105816924695624, "flos": 19099162535040.0, "grad_norm": 2.480509085809345, "language_loss": 0.81685597, "learning_rate": 3.175223888387192e-06, "loss": 0.83873475, "num_input_tokens_seen": 114736320, "step": 5340, "time_per_iteration": 4.130942344665527 }, { "auxiliary_loss_clip": 0.01110441, "auxiliary_loss_mlp": 0.01050741, "balance_loss_clip": 1.04820514, "balance_loss_mlp": 1.03462362, "epoch": 0.3211182924996242, "flos": 16581860565120.0, "grad_norm": 2.326860742494733, "language_loss": 0.76571834, "learning_rate": 3.1749087359596137e-06, "loss": 0.78733015, "num_input_tokens_seen": 114754575, "step": 5341, "time_per_iteration": 2.7302300930023193 }, { "auxiliary_loss_clip": 0.01101828, "auxiliary_loss_mlp": 0.01044591, "balance_loss_clip": 1.04797173, "balance_loss_mlp": 1.02840281, "epoch": 0.3211784157522922, "flos": 22672202071680.0, "grad_norm": 1.680960149410583, "language_loss": 0.79268491, "learning_rate": 3.1745935389793786e-06, "loss": 0.81414914, "num_input_tokens_seen": 114773590, "step": 5342, "time_per_iteration": 4.462036609649658 }, { "auxiliary_loss_clip": 0.01118478, "auxiliary_loss_mlp": 0.01045941, "balance_loss_clip": 1.05000186, "balance_loss_mlp": 1.02876329, "epoch": 0.3212385390049602, "flos": 20558787603840.0, "grad_norm": 3.232512085646521, "language_loss": 0.74449253, "learning_rate": 3.174278297458438e-06, "loss": 0.76613677, "num_input_tokens_seen": 114790775, "step": 5343, "time_per_iteration": 2.7057244777679443 }, { "auxiliary_loss_clip": 0.01080228, "auxiliary_loss_mlp": 0.0104431, "balance_loss_clip": 1.04317784, "balance_loss_mlp": 1.02704811, "epoch": 0.32129866225762815, "flos": 24791147233920.0, "grad_norm": 1.672847320129023, "language_loss": 0.82661629, "learning_rate": 3.173963011408748e-06, "loss": 0.84786165, "num_input_tokens_seen": 114809835, "step": 5344, "time_per_iteration": 2.801013231277466 }, { "auxiliary_loss_clip": 0.01088811, "auxiliary_loss_mlp": 0.01042568, "balance_loss_clip": 1.04556143, "balance_loss_mlp": 1.02565217, "epoch": 0.3213587855102961, "flos": 18366871962240.0, "grad_norm": 22.33494793204904, "language_loss": 0.79863501, "learning_rate": 3.173647680842262e-06, "loss": 0.81994879, "num_input_tokens_seen": 114826505, "step": 5345, "time_per_iteration": 2.743778944015503 }, { "auxiliary_loss_clip": 0.01114864, "auxiliary_loss_mlp": 0.01041047, "balance_loss_clip": 1.04774046, "balance_loss_mlp": 1.02507281, "epoch": 0.3214189087629641, "flos": 27015992668800.0, "grad_norm": 2.095379605818748, "language_loss": 0.83340824, "learning_rate": 3.1733323057709384e-06, "loss": 0.85496742, "num_input_tokens_seen": 114846140, "step": 5346, "time_per_iteration": 2.8187026977539062 }, { "auxiliary_loss_clip": 0.01110187, "auxiliary_loss_mlp": 0.01045041, "balance_loss_clip": 1.04783988, "balance_loss_mlp": 1.02797008, "epoch": 0.32147903201563205, "flos": 23148269953920.0, "grad_norm": 1.6371928172660764, "language_loss": 0.81853002, "learning_rate": 3.1730168862067366e-06, "loss": 0.84008235, "num_input_tokens_seen": 114866660, "step": 5347, "time_per_iteration": 2.724003553390503 }, { "auxiliary_loss_clip": 0.0112676, "auxiliary_loss_mlp": 0.01047135, "balance_loss_clip": 1.048388, "balance_loss_mlp": 1.02891994, "epoch": 0.3215391552683, "flos": 16580747243520.0, "grad_norm": 4.152516057334243, "language_loss": 0.80263776, "learning_rate": 3.1727014221616164e-06, "loss": 0.8243767, "num_input_tokens_seen": 114882820, "step": 5348, "time_per_iteration": 2.6249122619628906 }, { "auxiliary_loss_clip": 0.01113488, "auxiliary_loss_mlp": 0.0105622, "balance_loss_clip": 1.04640627, "balance_loss_mlp": 1.03931606, "epoch": 0.321599278520968, "flos": 17821820010240.0, "grad_norm": 2.570277900111974, "language_loss": 0.85020632, "learning_rate": 3.172385913647542e-06, "loss": 0.87190342, "num_input_tokens_seen": 114900745, "step": 5349, "time_per_iteration": 2.6685211658477783 }, { "auxiliary_loss_clip": 0.01113139, "auxiliary_loss_mlp": 0.0104332, "balance_loss_clip": 1.04840457, "balance_loss_mlp": 1.02644002, "epoch": 0.32165940177363594, "flos": 16251769555200.0, "grad_norm": 2.7209437086115282, "language_loss": 0.80619532, "learning_rate": 3.172070360676475e-06, "loss": 0.82775992, "num_input_tokens_seen": 114917940, "step": 5350, "time_per_iteration": 2.6857874393463135 }, { "auxiliary_loss_clip": 0.01128309, "auxiliary_loss_mlp": 0.01045442, "balance_loss_clip": 1.05025196, "balance_loss_mlp": 1.02955103, "epoch": 0.3217195250263039, "flos": 27599900158080.0, "grad_norm": 5.5112684101117395, "language_loss": 0.80060112, "learning_rate": 3.1717547632603828e-06, "loss": 0.82233858, "num_input_tokens_seen": 114937735, "step": 5351, "time_per_iteration": 2.68406081199646 }, { "auxiliary_loss_clip": 0.01104774, "auxiliary_loss_mlp": 0.01045518, "balance_loss_clip": 1.04905438, "balance_loss_mlp": 1.02811348, "epoch": 0.3217796482789719, "flos": 21470595373440.0, "grad_norm": 2.189681121413186, "language_loss": 0.75826663, "learning_rate": 3.1714391214112326e-06, "loss": 0.7797696, "num_input_tokens_seen": 114956630, "step": 5352, "time_per_iteration": 2.7035396099090576 }, { "auxiliary_loss_clip": 0.0109763, "auxiliary_loss_mlp": 0.01043305, "balance_loss_clip": 1.04897571, "balance_loss_mlp": 1.02579308, "epoch": 0.32183977153163984, "flos": 21215593745280.0, "grad_norm": 2.4508783518814807, "language_loss": 0.81992233, "learning_rate": 3.1711234351409933e-06, "loss": 0.84133166, "num_input_tokens_seen": 114976470, "step": 5353, "time_per_iteration": 2.731339931488037 }, { "auxiliary_loss_clip": 0.01074627, "auxiliary_loss_mlp": 0.0104331, "balance_loss_clip": 1.04917347, "balance_loss_mlp": 1.02605999, "epoch": 0.3218998947843078, "flos": 24608182331520.0, "grad_norm": 2.2390857397461246, "language_loss": 0.73474252, "learning_rate": 3.1708077044616365e-06, "loss": 0.75592184, "num_input_tokens_seen": 114996710, "step": 5354, "time_per_iteration": 2.8337595462799072 }, { "auxiliary_loss_clip": 0.01103547, "auxiliary_loss_mlp": 0.01039731, "balance_loss_clip": 1.04475546, "balance_loss_mlp": 1.02428102, "epoch": 0.3219600180369758, "flos": 22270577126400.0, "grad_norm": 1.8690515367544651, "language_loss": 0.83792925, "learning_rate": 3.1704919293851334e-06, "loss": 0.85936201, "num_input_tokens_seen": 115015775, "step": 5355, "time_per_iteration": 2.7299652099609375 }, { "auxiliary_loss_clip": 0.01146025, "auxiliary_loss_mlp": 0.01046795, "balance_loss_clip": 1.05450225, "balance_loss_mlp": 1.03032064, "epoch": 0.3220201412896438, "flos": 14939126939520.0, "grad_norm": 1.9705527058452093, "language_loss": 0.70895493, "learning_rate": 3.1701761099234597e-06, "loss": 0.73088312, "num_input_tokens_seen": 115034265, "step": 5356, "time_per_iteration": 2.638268232345581 }, { "auxiliary_loss_clip": 0.01102103, "auxiliary_loss_mlp": 0.01040751, "balance_loss_clip": 1.04954576, "balance_loss_mlp": 1.02245283, "epoch": 0.32208026454231176, "flos": 22667389649280.0, "grad_norm": 2.5241040535813095, "language_loss": 0.67760962, "learning_rate": 3.1698602460885903e-06, "loss": 0.69903815, "num_input_tokens_seen": 115051945, "step": 5357, "time_per_iteration": 2.7816576957702637 }, { "auxiliary_loss_clip": 0.01037625, "auxiliary_loss_mlp": 0.01029071, "balance_loss_clip": 1.0279882, "balance_loss_mlp": 1.02722347, "epoch": 0.3221403877949797, "flos": 64605130053120.0, "grad_norm": 0.7244200234208643, "language_loss": 0.58319688, "learning_rate": 3.1695443378925035e-06, "loss": 0.60386384, "num_input_tokens_seen": 115119090, "step": 5358, "time_per_iteration": 3.3341448307037354 }, { "auxiliary_loss_clip": 0.01076802, "auxiliary_loss_mlp": 0.01044493, "balance_loss_clip": 1.04142976, "balance_loss_mlp": 1.0270052, "epoch": 0.3222005110476477, "flos": 20157019004160.0, "grad_norm": 2.2322811787478427, "language_loss": 0.83184302, "learning_rate": 3.1692283853471777e-06, "loss": 0.85305595, "num_input_tokens_seen": 115137755, "step": 5359, "time_per_iteration": 2.836543083190918 }, { "auxiliary_loss_clip": 0.01129966, "auxiliary_loss_mlp": 0.01035598, "balance_loss_clip": 1.04800034, "balance_loss_mlp": 1.01938617, "epoch": 0.32226063430031565, "flos": 22674177319680.0, "grad_norm": 2.0261007556732964, "language_loss": 0.79563689, "learning_rate": 3.168912388464595e-06, "loss": 0.81729257, "num_input_tokens_seen": 115158150, "step": 5360, "time_per_iteration": 2.66043758392334 }, { "auxiliary_loss_clip": 0.01045199, "auxiliary_loss_mlp": 0.01009155, "balance_loss_clip": 1.02352595, "balance_loss_mlp": 1.00706911, "epoch": 0.3223207575529836, "flos": 63828525075840.0, "grad_norm": 0.6569282603798298, "language_loss": 0.56928504, "learning_rate": 3.168596347256737e-06, "loss": 0.58982855, "num_input_tokens_seen": 115212755, "step": 5361, "time_per_iteration": 3.007119655609131 }, { "auxiliary_loss_clip": 0.01078785, "auxiliary_loss_mlp": 0.01049092, "balance_loss_clip": 1.04366553, "balance_loss_mlp": 1.03166366, "epoch": 0.3223808808056516, "flos": 26870123537280.0, "grad_norm": 3.2787914187636495, "language_loss": 0.71563178, "learning_rate": 3.168280261735588e-06, "loss": 0.73691058, "num_input_tokens_seen": 115233090, "step": 5362, "time_per_iteration": 2.8345048427581787 }, { "auxiliary_loss_clip": 0.0112485, "auxiliary_loss_mlp": 0.01053523, "balance_loss_clip": 1.04899716, "balance_loss_mlp": 1.03670287, "epoch": 0.32244100405831955, "flos": 26761350176640.0, "grad_norm": 2.1292104037374773, "language_loss": 0.74106693, "learning_rate": 3.167964131913135e-06, "loss": 0.76285076, "num_input_tokens_seen": 115252645, "step": 5363, "time_per_iteration": 2.70552659034729 }, { "auxiliary_loss_clip": 0.01134941, "auxiliary_loss_mlp": 0.01042612, "balance_loss_clip": 1.05024791, "balance_loss_mlp": 1.02637601, "epoch": 0.3225011273109875, "flos": 23803029020160.0, "grad_norm": 3.812297759050374, "language_loss": 0.77379405, "learning_rate": 3.167647957801365e-06, "loss": 0.7955696, "num_input_tokens_seen": 115269085, "step": 5364, "time_per_iteration": 2.66058087348938 }, { "auxiliary_loss_clip": 0.01120766, "auxiliary_loss_mlp": 0.01042612, "balance_loss_clip": 1.05058861, "balance_loss_mlp": 1.02468252, "epoch": 0.3225612505636555, "flos": 17274505501440.0, "grad_norm": 3.514939630870356, "language_loss": 0.76727009, "learning_rate": 3.1673317394122672e-06, "loss": 0.78890389, "num_input_tokens_seen": 115286470, "step": 5365, "time_per_iteration": 2.6493194103240967 }, { "auxiliary_loss_clip": 0.01124156, "auxiliary_loss_mlp": 0.01048476, "balance_loss_clip": 1.05429566, "balance_loss_mlp": 1.03201342, "epoch": 0.32262137381632344, "flos": 23366247638400.0, "grad_norm": 7.419360933702927, "language_loss": 0.76938248, "learning_rate": 3.1670154767578333e-06, "loss": 0.79110885, "num_input_tokens_seen": 115307000, "step": 5366, "time_per_iteration": 2.6984689235687256 }, { "auxiliary_loss_clip": 0.01110868, "auxiliary_loss_mlp": 0.01044399, "balance_loss_clip": 1.04554594, "balance_loss_mlp": 1.02792382, "epoch": 0.3226814970689914, "flos": 23258803080960.0, "grad_norm": 2.2843777844497453, "language_loss": 0.71972823, "learning_rate": 3.166699169850055e-06, "loss": 0.74128091, "num_input_tokens_seen": 115325925, "step": 5367, "time_per_iteration": 2.6944496631622314 }, { "auxiliary_loss_clip": 0.01138096, "auxiliary_loss_mlp": 0.01043716, "balance_loss_clip": 1.05035067, "balance_loss_mlp": 1.0286001, "epoch": 0.32274162032165943, "flos": 16395196561920.0, "grad_norm": 13.04054524246424, "language_loss": 0.74414504, "learning_rate": 3.1663828187009274e-06, "loss": 0.76596308, "num_input_tokens_seen": 115343705, "step": 5368, "time_per_iteration": 2.670567750930786 }, { "auxiliary_loss_clip": 0.01103298, "auxiliary_loss_mlp": 0.01049074, "balance_loss_clip": 1.04370904, "balance_loss_mlp": 1.0322659, "epoch": 0.3228017435743274, "flos": 27855081354240.0, "grad_norm": 1.655769512058306, "language_loss": 0.78693509, "learning_rate": 3.1660664233224467e-06, "loss": 0.80845881, "num_input_tokens_seen": 115364170, "step": 5369, "time_per_iteration": 2.777437448501587 }, { "auxiliary_loss_clip": 0.01099309, "auxiliary_loss_mlp": 0.01037821, "balance_loss_clip": 1.04874706, "balance_loss_mlp": 1.0222764, "epoch": 0.32286186682699536, "flos": 19608770741760.0, "grad_norm": 13.189929997499553, "language_loss": 0.83189309, "learning_rate": 3.16574998372661e-06, "loss": 0.85326445, "num_input_tokens_seen": 115382495, "step": 5370, "time_per_iteration": 2.734342336654663 }, { "auxiliary_loss_clip": 0.01141788, "auxiliary_loss_mlp": 0.01044735, "balance_loss_clip": 1.05202413, "balance_loss_mlp": 1.0291779, "epoch": 0.3229219900796633, "flos": 24134017870080.0, "grad_norm": 3.3293058605981614, "language_loss": 0.8288244, "learning_rate": 3.1654334999254177e-06, "loss": 0.85068965, "num_input_tokens_seen": 115399450, "step": 5371, "time_per_iteration": 2.620091676712036 }, { "auxiliary_loss_clip": 0.01133164, "auxiliary_loss_mlp": 0.00776239, "balance_loss_clip": 1.05046356, "balance_loss_mlp": 1.00122416, "epoch": 0.3229821133323313, "flos": 17748705876480.0, "grad_norm": 3.1117013800624993, "language_loss": 0.8852632, "learning_rate": 3.1651169719308695e-06, "loss": 0.90435725, "num_input_tokens_seen": 115417700, "step": 5372, "time_per_iteration": 2.673567056655884 }, { "auxiliary_loss_clip": 0.01140269, "auxiliary_loss_mlp": 0.01049295, "balance_loss_clip": 1.05098414, "balance_loss_mlp": 1.03341591, "epoch": 0.32304223658499925, "flos": 22346025644160.0, "grad_norm": 2.7114986433136727, "language_loss": 0.73388374, "learning_rate": 3.1648003997549694e-06, "loss": 0.75577939, "num_input_tokens_seen": 115435840, "step": 5373, "time_per_iteration": 2.6910293102264404 }, { "auxiliary_loss_clip": 0.0110976, "auxiliary_loss_mlp": 0.01044756, "balance_loss_clip": 1.04653084, "balance_loss_mlp": 1.02873468, "epoch": 0.3231023598376672, "flos": 18478302929280.0, "grad_norm": 2.3161305262959573, "language_loss": 0.81114149, "learning_rate": 3.1644837834097214e-06, "loss": 0.83268672, "num_input_tokens_seen": 115454210, "step": 5374, "time_per_iteration": 2.666707992553711 }, { "auxiliary_loss_clip": 0.01095169, "auxiliary_loss_mlp": 0.01038679, "balance_loss_clip": 1.0438931, "balance_loss_mlp": 1.02254975, "epoch": 0.3231624830903352, "flos": 27636313570560.0, "grad_norm": 2.1309099752285863, "language_loss": 0.87817222, "learning_rate": 3.1641671229071317e-06, "loss": 0.89951062, "num_input_tokens_seen": 115471785, "step": 5375, "time_per_iteration": 4.252593994140625 }, { "auxiliary_loss_clip": 0.01140942, "auxiliary_loss_mlp": 0.01036182, "balance_loss_clip": 1.04865098, "balance_loss_mlp": 1.01960015, "epoch": 0.32322260634300315, "flos": 21726423014400.0, "grad_norm": 2.12002794330764, "language_loss": 0.75837636, "learning_rate": 3.1638504182592076e-06, "loss": 0.78014749, "num_input_tokens_seen": 115491405, "step": 5376, "time_per_iteration": 2.64569091796875 }, { "auxiliary_loss_clip": 0.01100111, "auxiliary_loss_mlp": 0.01037893, "balance_loss_clip": 1.04745007, "balance_loss_mlp": 1.0227654, "epoch": 0.3232827295956711, "flos": 22637656166400.0, "grad_norm": 16.356053535517315, "language_loss": 0.66570163, "learning_rate": 3.1635336694779594e-06, "loss": 0.68708175, "num_input_tokens_seen": 115511555, "step": 5377, "time_per_iteration": 4.228315591812134 }, { "auxiliary_loss_clip": 0.01103406, "auxiliary_loss_mlp": 0.01059488, "balance_loss_clip": 1.04591548, "balance_loss_mlp": 1.04070055, "epoch": 0.3233428528483391, "flos": 26322593546880.0, "grad_norm": 1.5026052482517693, "language_loss": 0.72276354, "learning_rate": 3.1632168765753982e-06, "loss": 0.74439251, "num_input_tokens_seen": 115532860, "step": 5378, "time_per_iteration": 2.7754812240600586 }, { "auxiliary_loss_clip": 0.0112205, "auxiliary_loss_mlp": 0.0103656, "balance_loss_clip": 1.04869092, "balance_loss_mlp": 1.0214678, "epoch": 0.32340297610100704, "flos": 28585217111040.0, "grad_norm": 2.7898138283200344, "language_loss": 0.82221997, "learning_rate": 3.1629000395635357e-06, "loss": 0.84380603, "num_input_tokens_seen": 115553850, "step": 5379, "time_per_iteration": 2.672743320465088 }, { "auxiliary_loss_clip": 0.01130962, "auxiliary_loss_mlp": 0.01035985, "balance_loss_clip": 1.04864693, "balance_loss_mlp": 1.02083325, "epoch": 0.323463099353675, "flos": 30773792787840.0, "grad_norm": 1.5555457678220286, "language_loss": 0.78895414, "learning_rate": 3.162583158454388e-06, "loss": 0.81062359, "num_input_tokens_seen": 115575530, "step": 5380, "time_per_iteration": 4.130786180496216 }, { "auxiliary_loss_clip": 0.01124956, "auxiliary_loss_mlp": 0.01044026, "balance_loss_clip": 1.04988194, "balance_loss_mlp": 1.0286541, "epoch": 0.32352322260634303, "flos": 25228610974080.0, "grad_norm": 1.7365933554134192, "language_loss": 0.76877856, "learning_rate": 3.1622662332599697e-06, "loss": 0.79046834, "num_input_tokens_seen": 115594885, "step": 5381, "time_per_iteration": 2.6297740936279297 }, { "auxiliary_loss_clip": 0.01122723, "auxiliary_loss_mlp": 0.0103758, "balance_loss_clip": 1.0485673, "balance_loss_mlp": 1.02333474, "epoch": 0.323583345859011, "flos": 23330480670720.0, "grad_norm": 1.9510545380996942, "language_loss": 0.71868116, "learning_rate": 3.1619492639922998e-06, "loss": 0.7402842, "num_input_tokens_seen": 115614080, "step": 5382, "time_per_iteration": 4.239168167114258 }, { "auxiliary_loss_clip": 0.01114051, "auxiliary_loss_mlp": 0.01051511, "balance_loss_clip": 1.0454843, "balance_loss_mlp": 1.03392792, "epoch": 0.32364346911167896, "flos": 26207499392640.0, "grad_norm": 2.5669193665709815, "language_loss": 0.70947385, "learning_rate": 3.1616322506633964e-06, "loss": 0.73112947, "num_input_tokens_seen": 115632820, "step": 5383, "time_per_iteration": 2.701462507247925 }, { "auxiliary_loss_clip": 0.01123558, "auxiliary_loss_mlp": 0.01038956, "balance_loss_clip": 1.04770291, "balance_loss_mlp": 1.02382779, "epoch": 0.3237035923643469, "flos": 23695764030720.0, "grad_norm": 1.9442688765107798, "language_loss": 0.78333974, "learning_rate": 3.161315193285283e-06, "loss": 0.8049649, "num_input_tokens_seen": 115652860, "step": 5384, "time_per_iteration": 2.6939637660980225 }, { "auxiliary_loss_clip": 0.01078749, "auxiliary_loss_mlp": 0.01050129, "balance_loss_clip": 1.04298878, "balance_loss_mlp": 1.03203273, "epoch": 0.3237637156170149, "flos": 14428728633600.0, "grad_norm": 2.1298780259276575, "language_loss": 0.75396919, "learning_rate": 3.16099809186998e-06, "loss": 0.77525795, "num_input_tokens_seen": 115670940, "step": 5385, "time_per_iteration": 2.7813403606414795 }, { "auxiliary_loss_clip": 0.0111287, "auxiliary_loss_mlp": 0.01040739, "balance_loss_clip": 1.04995322, "balance_loss_mlp": 1.0248363, "epoch": 0.32382383886968286, "flos": 31062981185280.0, "grad_norm": 2.042597717530735, "language_loss": 0.71488941, "learning_rate": 3.1606809464295145e-06, "loss": 0.73642552, "num_input_tokens_seen": 115691155, "step": 5386, "time_per_iteration": 2.754636526107788 }, { "auxiliary_loss_clip": 0.01142583, "auxiliary_loss_mlp": 0.01040273, "balance_loss_clip": 1.0499016, "balance_loss_mlp": 1.02334547, "epoch": 0.3238839621223508, "flos": 23256935573760.0, "grad_norm": 5.057227062214219, "language_loss": 0.94889075, "learning_rate": 3.1603637569759095e-06, "loss": 0.97071928, "num_input_tokens_seen": 115710340, "step": 5387, "time_per_iteration": 2.6547048091888428 }, { "auxiliary_loss_clip": 0.01133488, "auxiliary_loss_mlp": 0.01044118, "balance_loss_clip": 1.05193102, "balance_loss_mlp": 1.02696419, "epoch": 0.3239440853750188, "flos": 22964658606720.0, "grad_norm": 10.717385990424205, "language_loss": 0.77620786, "learning_rate": 3.1600465235211956e-06, "loss": 0.79798394, "num_input_tokens_seen": 115726745, "step": 5388, "time_per_iteration": 2.657205820083618 }, { "auxiliary_loss_clip": 0.01111832, "auxiliary_loss_mlp": 0.01036701, "balance_loss_clip": 1.04523969, "balance_loss_mlp": 1.01978493, "epoch": 0.32400420862768675, "flos": 36246614653440.0, "grad_norm": 2.237731185409586, "language_loss": 0.71233571, "learning_rate": 3.1597292460774006e-06, "loss": 0.73382103, "num_input_tokens_seen": 115749385, "step": 5389, "time_per_iteration": 2.799731731414795 }, { "auxiliary_loss_clip": 0.01099836, "auxiliary_loss_mlp": 0.01038996, "balance_loss_clip": 1.04759645, "balance_loss_mlp": 1.02302158, "epoch": 0.3240643318803547, "flos": 21616500418560.0, "grad_norm": 1.8547230503773184, "language_loss": 0.80461568, "learning_rate": 3.159411924656557e-06, "loss": 0.82600403, "num_input_tokens_seen": 115768105, "step": 5390, "time_per_iteration": 2.703913450241089 }, { "auxiliary_loss_clip": 0.01112322, "auxiliary_loss_mlp": 0.01050073, "balance_loss_clip": 1.04881656, "balance_loss_mlp": 1.0330621, "epoch": 0.3241244551330227, "flos": 23295611543040.0, "grad_norm": 4.514534114801655, "language_loss": 0.72674775, "learning_rate": 3.1590945592706967e-06, "loss": 0.74837172, "num_input_tokens_seen": 115787340, "step": 5391, "time_per_iteration": 2.8789660930633545 }, { "auxiliary_loss_clip": 0.01110171, "auxiliary_loss_mlp": 0.01040459, "balance_loss_clip": 1.04422975, "balance_loss_mlp": 1.02517664, "epoch": 0.32418457838569065, "flos": 14097236993280.0, "grad_norm": 2.092129040046021, "language_loss": 0.77347648, "learning_rate": 3.158777149931855e-06, "loss": 0.79498285, "num_input_tokens_seen": 115805565, "step": 5392, "time_per_iteration": 2.6689188480377197 }, { "auxiliary_loss_clip": 0.01112252, "auxiliary_loss_mlp": 0.01051929, "balance_loss_clip": 1.04517519, "balance_loss_mlp": 1.03289127, "epoch": 0.3242447016383586, "flos": 29752672953600.0, "grad_norm": 1.9207699243041063, "language_loss": 0.62606925, "learning_rate": 3.158459696652067e-06, "loss": 0.6477111, "num_input_tokens_seen": 115826725, "step": 5393, "time_per_iteration": 2.758423328399658 }, { "auxiliary_loss_clip": 0.01122257, "auxiliary_loss_mlp": 0.01043934, "balance_loss_clip": 1.04730856, "balance_loss_mlp": 1.02770925, "epoch": 0.3243048248910266, "flos": 24351205455360.0, "grad_norm": 1.583732116281239, "language_loss": 0.82284617, "learning_rate": 3.158142199443371e-06, "loss": 0.84450811, "num_input_tokens_seen": 115846955, "step": 5394, "time_per_iteration": 2.6715636253356934 }, { "auxiliary_loss_clip": 0.01111969, "auxiliary_loss_mlp": 0.01045824, "balance_loss_clip": 1.04729748, "balance_loss_mlp": 1.03120947, "epoch": 0.3243649481436946, "flos": 24353037048960.0, "grad_norm": 1.873068954405441, "language_loss": 0.817029, "learning_rate": 3.1578246583178076e-06, "loss": 0.83860689, "num_input_tokens_seen": 115865975, "step": 5395, "time_per_iteration": 2.7120518684387207 }, { "auxiliary_loss_clip": 0.01126983, "auxiliary_loss_mlp": 0.01039478, "balance_loss_clip": 1.0519104, "balance_loss_mlp": 1.02413607, "epoch": 0.32442507139636256, "flos": 22925228451840.0, "grad_norm": 1.8441183317386671, "language_loss": 0.83172363, "learning_rate": 3.157507073287417e-06, "loss": 0.85338825, "num_input_tokens_seen": 115884950, "step": 5396, "time_per_iteration": 2.6589252948760986 }, { "auxiliary_loss_clip": 0.0110371, "auxiliary_loss_mlp": 0.01053141, "balance_loss_clip": 1.04818082, "balance_loss_mlp": 1.03462827, "epoch": 0.32448519464903053, "flos": 22200192426240.0, "grad_norm": 2.3735724483298553, "language_loss": 0.75721765, "learning_rate": 3.1571894443642414e-06, "loss": 0.77878618, "num_input_tokens_seen": 115904170, "step": 5397, "time_per_iteration": 2.7118513584136963 }, { "auxiliary_loss_clip": 0.01104001, "auxiliary_loss_mlp": 0.0104059, "balance_loss_clip": 1.04970932, "balance_loss_mlp": 1.02504468, "epoch": 0.3245453179016985, "flos": 18838450644480.0, "grad_norm": 7.349892433890134, "language_loss": 0.67359912, "learning_rate": 3.1568717715603263e-06, "loss": 0.69504505, "num_input_tokens_seen": 115919255, "step": 5398, "time_per_iteration": 2.690317153930664 }, { "auxiliary_loss_clip": 0.01111486, "auxiliary_loss_mlp": 0.01033579, "balance_loss_clip": 1.04846239, "balance_loss_mlp": 1.01784301, "epoch": 0.32460544115436646, "flos": 21178390233600.0, "grad_norm": 1.692830304346276, "language_loss": 0.73074687, "learning_rate": 3.156554054887718e-06, "loss": 0.7521975, "num_input_tokens_seen": 115938535, "step": 5399, "time_per_iteration": 2.754539728164673 }, { "auxiliary_loss_clip": 0.01101582, "auxiliary_loss_mlp": 0.01036858, "balance_loss_clip": 1.04522848, "balance_loss_mlp": 1.02056217, "epoch": 0.3246655644070344, "flos": 21981137333760.0, "grad_norm": 2.780796864612311, "language_loss": 0.71580744, "learning_rate": 3.1562362943584645e-06, "loss": 0.7371918, "num_input_tokens_seen": 115955005, "step": 5400, "time_per_iteration": 2.707712173461914 }, { "auxiliary_loss_clip": 0.01127225, "auxiliary_loss_mlp": 0.01040347, "balance_loss_clip": 1.0472424, "balance_loss_mlp": 1.02469516, "epoch": 0.3247256876597024, "flos": 32159729105280.0, "grad_norm": 2.1905750946262805, "language_loss": 0.79769576, "learning_rate": 3.155918489984614e-06, "loss": 0.81937146, "num_input_tokens_seen": 115975305, "step": 5401, "time_per_iteration": 2.7813303470611572 }, { "auxiliary_loss_clip": 0.01109499, "auxiliary_loss_mlp": 0.01041329, "balance_loss_clip": 1.04414558, "balance_loss_mlp": 1.02341187, "epoch": 0.32478581091237035, "flos": 20997544233600.0, "grad_norm": 4.743153882711402, "language_loss": 0.87785316, "learning_rate": 3.1556006417782196e-06, "loss": 0.89936143, "num_input_tokens_seen": 115994810, "step": 5402, "time_per_iteration": 2.7685606479644775 }, { "auxiliary_loss_clip": 0.01078796, "auxiliary_loss_mlp": 0.01044786, "balance_loss_clip": 1.03948891, "balance_loss_mlp": 1.02792931, "epoch": 0.3248459341650383, "flos": 17924990849280.0, "grad_norm": 4.964706141121962, "language_loss": 0.84572911, "learning_rate": 3.155282749751332e-06, "loss": 0.86696494, "num_input_tokens_seen": 116011095, "step": 5403, "time_per_iteration": 2.7299063205718994 }, { "auxiliary_loss_clip": 0.01104053, "auxiliary_loss_mlp": 0.01045074, "balance_loss_clip": 1.04597795, "balance_loss_mlp": 1.03049469, "epoch": 0.3249060574177063, "flos": 24535606901760.0, "grad_norm": 3.7265891750540785, "language_loss": 0.87614954, "learning_rate": 3.154964813916007e-06, "loss": 0.89764082, "num_input_tokens_seen": 116028805, "step": 5404, "time_per_iteration": 2.7740931510925293 }, { "auxiliary_loss_clip": 0.01125798, "auxiliary_loss_mlp": 0.01043439, "balance_loss_clip": 1.04930234, "balance_loss_mlp": 1.02685738, "epoch": 0.32496618067037425, "flos": 25994765093760.0, "grad_norm": 2.5497237434599964, "language_loss": 0.72717422, "learning_rate": 3.1546468342843008e-06, "loss": 0.74886656, "num_input_tokens_seen": 116047765, "step": 5405, "time_per_iteration": 2.6756839752197266 }, { "auxiliary_loss_clip": 0.01098309, "auxiliary_loss_mlp": 0.01039466, "balance_loss_clip": 1.04964566, "balance_loss_mlp": 1.02390265, "epoch": 0.3250263039230422, "flos": 19573757959680.0, "grad_norm": 1.6968031771183532, "language_loss": 0.82927752, "learning_rate": 3.1543288108682707e-06, "loss": 0.8506552, "num_input_tokens_seen": 116068385, "step": 5406, "time_per_iteration": 2.728217124938965 }, { "auxiliary_loss_clip": 0.01136878, "auxiliary_loss_mlp": 0.01032192, "balance_loss_clip": 1.05117011, "balance_loss_mlp": 1.01728487, "epoch": 0.3250864271757102, "flos": 16763640318720.0, "grad_norm": 1.9312900503750694, "language_loss": 0.87836796, "learning_rate": 3.1540107436799764e-06, "loss": 0.90005869, "num_input_tokens_seen": 116085350, "step": 5407, "time_per_iteration": 2.5519261360168457 }, { "auxiliary_loss_clip": 0.01112002, "auxiliary_loss_mlp": 0.01040482, "balance_loss_clip": 1.04575169, "balance_loss_mlp": 1.02506793, "epoch": 0.3251465504283782, "flos": 27819458040960.0, "grad_norm": 1.6044550363094983, "language_loss": 0.69804603, "learning_rate": 3.153692632731479e-06, "loss": 0.71957088, "num_input_tokens_seen": 116107560, "step": 5408, "time_per_iteration": 2.7141807079315186 }, { "auxiliary_loss_clip": 0.01131975, "auxiliary_loss_mlp": 0.01035871, "balance_loss_clip": 1.05021083, "balance_loss_mlp": 1.01977742, "epoch": 0.32520667368104617, "flos": 19063144172160.0, "grad_norm": 10.423580562540607, "language_loss": 0.77558911, "learning_rate": 3.153374478034841e-06, "loss": 0.79726762, "num_input_tokens_seen": 116125980, "step": 5409, "time_per_iteration": 2.644792318344116 }, { "auxiliary_loss_clip": 0.01079567, "auxiliary_loss_mlp": 0.01043858, "balance_loss_clip": 1.03893065, "balance_loss_mlp": 1.0280745, "epoch": 0.32526679693371413, "flos": 29382146208000.0, "grad_norm": 2.0524453166640146, "language_loss": 0.83282518, "learning_rate": 3.1530562796021285e-06, "loss": 0.85405946, "num_input_tokens_seen": 116146530, "step": 5410, "time_per_iteration": 2.846480131149292 }, { "auxiliary_loss_clip": 0.01086095, "auxiliary_loss_mlp": 0.01037636, "balance_loss_clip": 1.04789686, "balance_loss_mlp": 1.02272296, "epoch": 0.3253269201863821, "flos": 20704513080960.0, "grad_norm": 1.6475099523255856, "language_loss": 0.7081182, "learning_rate": 3.152738037445405e-06, "loss": 0.72935545, "num_input_tokens_seen": 116165695, "step": 5411, "time_per_iteration": 2.779330253601074 }, { "auxiliary_loss_clip": 0.0108148, "auxiliary_loss_mlp": 0.01041588, "balance_loss_clip": 1.04331398, "balance_loss_mlp": 1.02688956, "epoch": 0.32538704343905006, "flos": 29094142959360.0, "grad_norm": 1.6354124554173295, "language_loss": 0.82894456, "learning_rate": 3.1524197515767403e-06, "loss": 0.85017526, "num_input_tokens_seen": 116185375, "step": 5412, "time_per_iteration": 2.7841992378234863 }, { "auxiliary_loss_clip": 0.01106895, "auxiliary_loss_mlp": 0.01041599, "balance_loss_clip": 1.04730868, "balance_loss_mlp": 1.02430189, "epoch": 0.325447166691718, "flos": 24676124906880.0, "grad_norm": 1.867437266565155, "language_loss": 0.80913842, "learning_rate": 3.152101422008203e-06, "loss": 0.83062339, "num_input_tokens_seen": 116204335, "step": 5413, "time_per_iteration": 2.7533957958221436 }, { "auxiliary_loss_clip": 0.01115005, "auxiliary_loss_mlp": 0.0103855, "balance_loss_clip": 1.04923081, "balance_loss_mlp": 1.02155089, "epoch": 0.325507289944386, "flos": 21543134889600.0, "grad_norm": 3.355430774898342, "language_loss": 0.76891947, "learning_rate": 3.151783048751864e-06, "loss": 0.79045498, "num_input_tokens_seen": 116222840, "step": 5414, "time_per_iteration": 4.331217527389526 }, { "auxiliary_loss_clip": 0.01030644, "auxiliary_loss_mlp": 0.01012699, "balance_loss_clip": 1.02726388, "balance_loss_mlp": 1.01063681, "epoch": 0.32556741319705396, "flos": 71518722347520.0, "grad_norm": 0.9066964616955783, "language_loss": 0.63865513, "learning_rate": 3.1514646318197965e-06, "loss": 0.65908855, "num_input_tokens_seen": 116274940, "step": 5415, "time_per_iteration": 3.172816753387451 }, { "auxiliary_loss_clip": 0.01088465, "auxiliary_loss_mlp": 0.01038606, "balance_loss_clip": 1.04119301, "balance_loss_mlp": 1.02279866, "epoch": 0.3256275364497219, "flos": 23732428838400.0, "grad_norm": 1.52454367487569, "language_loss": 0.74014068, "learning_rate": 3.151146171224075e-06, "loss": 0.76141143, "num_input_tokens_seen": 116297300, "step": 5416, "time_per_iteration": 4.326166868209839 }, { "auxiliary_loss_clip": 0.01062287, "auxiliary_loss_mlp": 0.0100407, "balance_loss_clip": 1.03045964, "balance_loss_mlp": 1.00160217, "epoch": 0.3256876597023899, "flos": 67289199891840.0, "grad_norm": 0.7686966052914506, "language_loss": 0.57851374, "learning_rate": 3.1508276669767757e-06, "loss": 0.59917736, "num_input_tokens_seen": 116362370, "step": 5417, "time_per_iteration": 3.2102463245391846 }, { "auxiliary_loss_clip": 0.01040835, "auxiliary_loss_mlp": 0.01012103, "balance_loss_clip": 1.02768993, "balance_loss_mlp": 1.00975466, "epoch": 0.32574778295505785, "flos": 71282323964160.0, "grad_norm": 0.7997987203444133, "language_loss": 0.63392216, "learning_rate": 3.150509119089975e-06, "loss": 0.65445155, "num_input_tokens_seen": 116430365, "step": 5418, "time_per_iteration": 4.847350120544434 }, { "auxiliary_loss_clip": 0.01110249, "auxiliary_loss_mlp": 0.01043458, "balance_loss_clip": 1.05171919, "balance_loss_mlp": 1.02794838, "epoch": 0.3258079062077258, "flos": 20776370238720.0, "grad_norm": 2.0985111563442325, "language_loss": 0.69086784, "learning_rate": 3.1501905275757537e-06, "loss": 0.71240497, "num_input_tokens_seen": 116447525, "step": 5419, "time_per_iteration": 2.6837174892425537 }, { "auxiliary_loss_clip": 0.0112744, "auxiliary_loss_mlp": 0.01037157, "balance_loss_clip": 1.05152702, "balance_loss_mlp": 1.02099252, "epoch": 0.3258680294603938, "flos": 22235456603520.0, "grad_norm": 1.6553118170887535, "language_loss": 0.77041519, "learning_rate": 3.1498718924461926e-06, "loss": 0.79206121, "num_input_tokens_seen": 116466310, "step": 5420, "time_per_iteration": 2.690243721008301 }, { "auxiliary_loss_clip": 0.01124221, "auxiliary_loss_mlp": 0.00774579, "balance_loss_clip": 1.04583097, "balance_loss_mlp": 1.00118852, "epoch": 0.3259281527130618, "flos": 26979974305920.0, "grad_norm": 1.6758047570714483, "language_loss": 0.8033973, "learning_rate": 3.1495532137133736e-06, "loss": 0.82238531, "num_input_tokens_seen": 116487825, "step": 5421, "time_per_iteration": 4.346652984619141 }, { "auxiliary_loss_clip": 0.01133401, "auxiliary_loss_mlp": 0.0103494, "balance_loss_clip": 1.04982162, "balance_loss_mlp": 1.0212909, "epoch": 0.32598827596572977, "flos": 26214251149440.0, "grad_norm": 1.7368751669124027, "language_loss": 0.75101721, "learning_rate": 3.149234491389381e-06, "loss": 0.77270067, "num_input_tokens_seen": 116509950, "step": 5422, "time_per_iteration": 2.698486566543579 }, { "auxiliary_loss_clip": 0.01104722, "auxiliary_loss_mlp": 0.00773675, "balance_loss_clip": 1.04894829, "balance_loss_mlp": 1.00120938, "epoch": 0.32604839921839773, "flos": 17639752947840.0, "grad_norm": 2.1580318636917384, "language_loss": 0.63323581, "learning_rate": 3.1489157254863026e-06, "loss": 0.65201974, "num_input_tokens_seen": 116527695, "step": 5423, "time_per_iteration": 2.7364964485168457 }, { "auxiliary_loss_clip": 0.01098661, "auxiliary_loss_mlp": 0.01032454, "balance_loss_clip": 1.04357564, "balance_loss_mlp": 1.01884615, "epoch": 0.3261085224710657, "flos": 23622721724160.0, "grad_norm": 1.5676988826806029, "language_loss": 0.74530792, "learning_rate": 3.148596916016224e-06, "loss": 0.76661909, "num_input_tokens_seen": 116547800, "step": 5424, "time_per_iteration": 2.695530652999878 }, { "auxiliary_loss_clip": 0.0110482, "auxiliary_loss_mlp": 0.01035713, "balance_loss_clip": 1.04803681, "balance_loss_mlp": 1.02199221, "epoch": 0.32616864572373366, "flos": 23260455106560.0, "grad_norm": 1.6667522289255576, "language_loss": 0.77194774, "learning_rate": 3.1482780629912355e-06, "loss": 0.79335308, "num_input_tokens_seen": 116568460, "step": 5425, "time_per_iteration": 2.6649699211120605 }, { "auxiliary_loss_clip": 0.01106187, "auxiliary_loss_mlp": 0.01040306, "balance_loss_clip": 1.04740202, "balance_loss_mlp": 1.02368808, "epoch": 0.32622876897640163, "flos": 25593427457280.0, "grad_norm": 2.8883064562409744, "language_loss": 0.78262472, "learning_rate": 3.147959166423428e-06, "loss": 0.80408967, "num_input_tokens_seen": 116588705, "step": 5426, "time_per_iteration": 2.7820892333984375 }, { "auxiliary_loss_clip": 0.01088898, "auxiliary_loss_mlp": 0.01035243, "balance_loss_clip": 1.04331303, "balance_loss_mlp": 1.01889908, "epoch": 0.3262888922290696, "flos": 22418996123520.0, "grad_norm": 1.9267107865215556, "language_loss": 0.74485052, "learning_rate": 3.147640226324893e-06, "loss": 0.76609194, "num_input_tokens_seen": 116608845, "step": 5427, "time_per_iteration": 2.7831003665924072 }, { "auxiliary_loss_clip": 0.01103791, "auxiliary_loss_mlp": 0.01041786, "balance_loss_clip": 1.04539597, "balance_loss_mlp": 1.02549028, "epoch": 0.32634901548173756, "flos": 19718908819200.0, "grad_norm": 6.869638277775165, "language_loss": 0.79136658, "learning_rate": 3.1473212427077266e-06, "loss": 0.81282234, "num_input_tokens_seen": 116628145, "step": 5428, "time_per_iteration": 2.7186481952667236 }, { "auxiliary_loss_clip": 0.01121911, "auxiliary_loss_mlp": 0.01040908, "balance_loss_clip": 1.04629314, "balance_loss_mlp": 1.02576876, "epoch": 0.3264091387344055, "flos": 16142924367360.0, "grad_norm": 5.016107817785842, "language_loss": 0.71130025, "learning_rate": 3.147002215584023e-06, "loss": 0.7329284, "num_input_tokens_seen": 116646920, "step": 5429, "time_per_iteration": 2.6733968257904053 }, { "auxiliary_loss_clip": 0.01098408, "auxiliary_loss_mlp": 0.01035827, "balance_loss_clip": 1.04658663, "balance_loss_mlp": 1.0212121, "epoch": 0.3264692619870735, "flos": 16399075230720.0, "grad_norm": 1.7379615094125744, "language_loss": 0.78620625, "learning_rate": 3.146683144965881e-06, "loss": 0.80754858, "num_input_tokens_seen": 116665100, "step": 5430, "time_per_iteration": 2.7313849925994873 }, { "auxiliary_loss_clip": 0.01084979, "auxiliary_loss_mlp": 0.01043143, "balance_loss_clip": 1.04809749, "balance_loss_mlp": 1.02660871, "epoch": 0.32652938523974145, "flos": 22382331315840.0, "grad_norm": 3.4420441965814477, "language_loss": 0.84279943, "learning_rate": 3.146364030865399e-06, "loss": 0.86408061, "num_input_tokens_seen": 116682205, "step": 5431, "time_per_iteration": 2.720797300338745 }, { "auxiliary_loss_clip": 0.01117845, "auxiliary_loss_mlp": 0.01034908, "balance_loss_clip": 1.04730058, "balance_loss_mlp": 1.02067482, "epoch": 0.3265895084924094, "flos": 21908059113600.0, "grad_norm": 1.9482899767939774, "language_loss": 0.70736587, "learning_rate": 3.146044873294678e-06, "loss": 0.7288934, "num_input_tokens_seen": 116702575, "step": 5432, "time_per_iteration": 2.6805124282836914 }, { "auxiliary_loss_clip": 0.01073417, "auxiliary_loss_mlp": 0.01042634, "balance_loss_clip": 1.04051948, "balance_loss_mlp": 1.02625418, "epoch": 0.3266496317450774, "flos": 16067152627200.0, "grad_norm": 1.6263283854003907, "language_loss": 0.84160507, "learning_rate": 3.1457256722658203e-06, "loss": 0.86276555, "num_input_tokens_seen": 116720885, "step": 5433, "time_per_iteration": 2.733450174331665 }, { "auxiliary_loss_clip": 0.01110224, "auxiliary_loss_mlp": 0.01031776, "balance_loss_clip": 1.04831946, "balance_loss_mlp": 1.01733375, "epoch": 0.3267097549977454, "flos": 22528236360960.0, "grad_norm": 1.8752055231309104, "language_loss": 0.860237, "learning_rate": 3.145406427790931e-06, "loss": 0.881657, "num_input_tokens_seen": 116740395, "step": 5434, "time_per_iteration": 2.6711690425872803 }, { "auxiliary_loss_clip": 0.01115762, "auxiliary_loss_mlp": 0.0104022, "balance_loss_clip": 1.04894018, "balance_loss_mlp": 1.02460361, "epoch": 0.32676987825041337, "flos": 27270419679360.0, "grad_norm": 2.089345873834278, "language_loss": 0.87845808, "learning_rate": 3.1450871398821147e-06, "loss": 0.90001786, "num_input_tokens_seen": 116758870, "step": 5435, "time_per_iteration": 2.7342183589935303 }, { "auxiliary_loss_clip": 0.01137287, "auxiliary_loss_mlp": 0.01037617, "balance_loss_clip": 1.05190301, "balance_loss_mlp": 1.02256095, "epoch": 0.32683000150308134, "flos": 11508257433600.0, "grad_norm": 3.0926239838125595, "language_loss": 0.7645883, "learning_rate": 3.144767808551479e-06, "loss": 0.78633732, "num_input_tokens_seen": 116773440, "step": 5436, "time_per_iteration": 2.648062229156494 }, { "auxiliary_loss_clip": 0.01137346, "auxiliary_loss_mlp": 0.01034933, "balance_loss_clip": 1.0532552, "balance_loss_mlp": 1.02046728, "epoch": 0.3268901247557493, "flos": 25630200005760.0, "grad_norm": 1.7720337367532448, "language_loss": 0.71802473, "learning_rate": 3.144448433811134e-06, "loss": 0.73974752, "num_input_tokens_seen": 116794375, "step": 5437, "time_per_iteration": 2.680525541305542 }, { "auxiliary_loss_clip": 0.01095966, "auxiliary_loss_mlp": 0.0104222, "balance_loss_clip": 1.04542243, "balance_loss_mlp": 1.02445781, "epoch": 0.32695024800841727, "flos": 24860849575680.0, "grad_norm": 1.7134236857074348, "language_loss": 0.63728261, "learning_rate": 3.144129015673189e-06, "loss": 0.65866441, "num_input_tokens_seen": 116815095, "step": 5438, "time_per_iteration": 2.7343454360961914 }, { "auxiliary_loss_clip": 0.01128746, "auxiliary_loss_mlp": 0.01039734, "balance_loss_clip": 1.05383801, "balance_loss_mlp": 1.02468967, "epoch": 0.32701037126108523, "flos": 28839249072000.0, "grad_norm": 3.854723832885701, "language_loss": 0.74629039, "learning_rate": 3.1438095541497576e-06, "loss": 0.76797515, "num_input_tokens_seen": 116836630, "step": 5439, "time_per_iteration": 2.6859002113342285 }, { "auxiliary_loss_clip": 0.0113034, "auxiliary_loss_mlp": 0.0104413, "balance_loss_clip": 1.05407321, "balance_loss_mlp": 1.02773881, "epoch": 0.3270704945137532, "flos": 27965075777280.0, "grad_norm": 3.9922367032947634, "language_loss": 0.74743968, "learning_rate": 3.1434900492529527e-06, "loss": 0.76918435, "num_input_tokens_seen": 116856880, "step": 5440, "time_per_iteration": 2.6785733699798584 }, { "auxiliary_loss_clip": 0.01124529, "auxiliary_loss_mlp": 0.00773254, "balance_loss_clip": 1.05180979, "balance_loss_mlp": 1.00108397, "epoch": 0.32713061776642116, "flos": 23690700213120.0, "grad_norm": 2.2888111794693033, "language_loss": 0.84642965, "learning_rate": 3.1431705009948914e-06, "loss": 0.86540747, "num_input_tokens_seen": 116873770, "step": 5441, "time_per_iteration": 2.692375421524048 }, { "auxiliary_loss_clip": 0.01126517, "auxiliary_loss_mlp": 0.01042941, "balance_loss_clip": 1.05065203, "balance_loss_mlp": 1.02715778, "epoch": 0.3271907410190891, "flos": 22455625017600.0, "grad_norm": 3.048730330719705, "language_loss": 0.86782062, "learning_rate": 3.1428509093876897e-06, "loss": 0.88951516, "num_input_tokens_seen": 116891225, "step": 5442, "time_per_iteration": 2.6678872108459473 }, { "auxiliary_loss_clip": 0.01105154, "auxiliary_loss_mlp": 0.01041235, "balance_loss_clip": 1.05088091, "balance_loss_mlp": 1.02450991, "epoch": 0.3272508642717571, "flos": 22820118278400.0, "grad_norm": 2.240879974234663, "language_loss": 0.77471602, "learning_rate": 3.1425312744434668e-06, "loss": 0.79617989, "num_input_tokens_seen": 116912300, "step": 5443, "time_per_iteration": 2.715407133102417 }, { "auxiliary_loss_clip": 0.01109692, "auxiliary_loss_mlp": 0.00773391, "balance_loss_clip": 1.05144906, "balance_loss_mlp": 1.00102162, "epoch": 0.32731098752442506, "flos": 11801360413440.0, "grad_norm": 2.595112113661144, "language_loss": 0.81782895, "learning_rate": 3.142211596174343e-06, "loss": 0.83665979, "num_input_tokens_seen": 116929425, "step": 5444, "time_per_iteration": 2.7483620643615723 }, { "auxiliary_loss_clip": 0.0109768, "auxiliary_loss_mlp": 0.01042359, "balance_loss_clip": 1.05127132, "balance_loss_mlp": 1.02671897, "epoch": 0.327371110777093, "flos": 21027780506880.0, "grad_norm": 2.0540771727134786, "language_loss": 0.59668452, "learning_rate": 3.1418918745924423e-06, "loss": 0.61808491, "num_input_tokens_seen": 116948255, "step": 5445, "time_per_iteration": 2.7937049865722656 }, { "auxiliary_loss_clip": 0.01134371, "auxiliary_loss_mlp": 0.01045479, "balance_loss_clip": 1.05779314, "balance_loss_mlp": 1.02935553, "epoch": 0.327431234029761, "flos": 19062102677760.0, "grad_norm": 2.705344105300375, "language_loss": 0.88343978, "learning_rate": 3.1415721097098865e-06, "loss": 0.90523833, "num_input_tokens_seen": 116964905, "step": 5446, "time_per_iteration": 2.586451292037964 }, { "auxiliary_loss_clip": 0.01135097, "auxiliary_loss_mlp": 0.01041409, "balance_loss_clip": 1.0612191, "balance_loss_mlp": 1.02387285, "epoch": 0.32749135728242895, "flos": 25849219184640.0, "grad_norm": 2.2697780368090883, "language_loss": 0.79279661, "learning_rate": 3.141252301538802e-06, "loss": 0.81456167, "num_input_tokens_seen": 116983650, "step": 5447, "time_per_iteration": 2.744072198867798 }, { "auxiliary_loss_clip": 0.01107571, "auxiliary_loss_mlp": 0.00773964, "balance_loss_clip": 1.04747021, "balance_loss_mlp": 1.00110793, "epoch": 0.327551480535097, "flos": 20120533764480.0, "grad_norm": 1.8015667711206929, "language_loss": 0.73182315, "learning_rate": 3.1409324500913157e-06, "loss": 0.75063848, "num_input_tokens_seen": 117003265, "step": 5448, "time_per_iteration": 2.6825077533721924 }, { "auxiliary_loss_clip": 0.01142648, "auxiliary_loss_mlp": 0.01042295, "balance_loss_clip": 1.05620432, "balance_loss_mlp": 1.02694106, "epoch": 0.32761160378776494, "flos": 28803553931520.0, "grad_norm": 1.4660761852129829, "language_loss": 0.67103487, "learning_rate": 3.1406125553795567e-06, "loss": 0.69288433, "num_input_tokens_seen": 117025370, "step": 5449, "time_per_iteration": 2.682499885559082 }, { "auxiliary_loss_clip": 0.0110995, "auxiliary_loss_mlp": 0.010411, "balance_loss_clip": 1.0542469, "balance_loss_mlp": 1.02627623, "epoch": 0.3276717270404329, "flos": 26937778803840.0, "grad_norm": 3.4023702964270943, "language_loss": 0.65110958, "learning_rate": 3.1402926174156556e-06, "loss": 0.67262006, "num_input_tokens_seen": 117044350, "step": 5450, "time_per_iteration": 2.7582857608795166 }, { "auxiliary_loss_clip": 0.0113136, "auxiliary_loss_mlp": 0.01045713, "balance_loss_clip": 1.05517817, "balance_loss_mlp": 1.03021002, "epoch": 0.32773185029310087, "flos": 25338425829120.0, "grad_norm": 1.5880234750249043, "language_loss": 0.77630055, "learning_rate": 3.1399726362117437e-06, "loss": 0.79807132, "num_input_tokens_seen": 117064450, "step": 5451, "time_per_iteration": 2.6543071269989014 }, { "auxiliary_loss_clip": 0.01131184, "auxiliary_loss_mlp": 0.01044056, "balance_loss_clip": 1.05428064, "balance_loss_mlp": 1.02809358, "epoch": 0.32779197354576883, "flos": 26391721271040.0, "grad_norm": 1.913131066587778, "language_loss": 0.70510584, "learning_rate": 3.1396526117799555e-06, "loss": 0.7268582, "num_input_tokens_seen": 117083060, "step": 5452, "time_per_iteration": 2.6963608264923096 }, { "auxiliary_loss_clip": 0.01112229, "auxiliary_loss_mlp": 0.01036592, "balance_loss_clip": 1.048841, "balance_loss_mlp": 1.02223349, "epoch": 0.3278520967984368, "flos": 24899381890560.0, "grad_norm": 2.6287596248848013, "language_loss": 0.78730083, "learning_rate": 3.1393325441324256e-06, "loss": 0.80878907, "num_input_tokens_seen": 117101860, "step": 5453, "time_per_iteration": 4.197263479232788 }, { "auxiliary_loss_clip": 0.01130585, "auxiliary_loss_mlp": 0.01035536, "balance_loss_clip": 1.0526675, "balance_loss_mlp": 1.02026486, "epoch": 0.32791222005110476, "flos": 29752996176000.0, "grad_norm": 5.184832608635382, "language_loss": 0.75771177, "learning_rate": 3.1390124332812916e-06, "loss": 0.77937293, "num_input_tokens_seen": 117123100, "step": 5454, "time_per_iteration": 2.7643721103668213 }, { "auxiliary_loss_clip": 0.01070253, "auxiliary_loss_mlp": 0.01047697, "balance_loss_clip": 1.03818846, "balance_loss_mlp": 1.03363037, "epoch": 0.32797234330377273, "flos": 16508064072960.0, "grad_norm": 2.8017119157252703, "language_loss": 0.76891404, "learning_rate": 3.1386922792386924e-06, "loss": 0.79009354, "num_input_tokens_seen": 117140515, "step": 5455, "time_per_iteration": 4.402290105819702 }, { "auxiliary_loss_clip": 0.01131084, "auxiliary_loss_mlp": 0.01042542, "balance_loss_clip": 1.05241477, "balance_loss_mlp": 1.02624655, "epoch": 0.3280324665564407, "flos": 26577918397440.0, "grad_norm": 1.6426536912861747, "language_loss": 0.74021912, "learning_rate": 3.138372082016768e-06, "loss": 0.76195538, "num_input_tokens_seen": 117161485, "step": 5456, "time_per_iteration": 2.821965217590332 }, { "auxiliary_loss_clip": 0.01140062, "auxiliary_loss_mlp": 0.01047408, "balance_loss_clip": 1.05334985, "balance_loss_mlp": 1.03212523, "epoch": 0.32809258980910866, "flos": 22929969047040.0, "grad_norm": 1.7597936582740754, "language_loss": 0.78038168, "learning_rate": 3.1380518416276596e-06, "loss": 0.80225635, "num_input_tokens_seen": 117181870, "step": 5457, "time_per_iteration": 2.703756093978882 }, { "auxiliary_loss_clip": 0.01104649, "auxiliary_loss_mlp": 0.01042509, "balance_loss_clip": 1.04943132, "balance_loss_mlp": 1.02752471, "epoch": 0.3281527130617766, "flos": 22783848520320.0, "grad_norm": 5.102364490559591, "language_loss": 0.79493362, "learning_rate": 3.1377315580835115e-06, "loss": 0.81640518, "num_input_tokens_seen": 117201380, "step": 5458, "time_per_iteration": 4.307415962219238 }, { "auxiliary_loss_clip": 0.01124323, "auxiliary_loss_mlp": 0.01039216, "balance_loss_clip": 1.05467916, "balance_loss_mlp": 1.02362311, "epoch": 0.3282128363144446, "flos": 21250678354560.0, "grad_norm": 1.6160363150508943, "language_loss": 0.73029429, "learning_rate": 3.1374112313964686e-06, "loss": 0.7519297, "num_input_tokens_seen": 117221040, "step": 5459, "time_per_iteration": 2.678131341934204 }, { "auxiliary_loss_clip": 0.01118921, "auxiliary_loss_mlp": 0.01041188, "balance_loss_clip": 1.05190325, "balance_loss_mlp": 1.02591753, "epoch": 0.32827295956711255, "flos": 30843064166400.0, "grad_norm": 2.011905165126453, "language_loss": 0.84018445, "learning_rate": 3.1370908615786783e-06, "loss": 0.86178553, "num_input_tokens_seen": 117241395, "step": 5460, "time_per_iteration": 5.767046213150024 }, { "auxiliary_loss_clip": 0.01138817, "auxiliary_loss_mlp": 0.01035204, "balance_loss_clip": 1.05174541, "balance_loss_mlp": 1.02029121, "epoch": 0.3283330828197806, "flos": 25915006944000.0, "grad_norm": 1.9959413021835115, "language_loss": 0.76553524, "learning_rate": 3.136770448642288e-06, "loss": 0.78727543, "num_input_tokens_seen": 117259340, "step": 5461, "time_per_iteration": 2.673659086227417 }, { "auxiliary_loss_clip": 0.01121607, "auxiliary_loss_mlp": 0.01042243, "balance_loss_clip": 1.05065536, "balance_loss_mlp": 1.02489805, "epoch": 0.32839320607244854, "flos": 38582065042560.0, "grad_norm": 2.148112131584704, "language_loss": 0.62898672, "learning_rate": 3.1364499925994484e-06, "loss": 0.65062523, "num_input_tokens_seen": 117282375, "step": 5462, "time_per_iteration": 2.789217472076416 }, { "auxiliary_loss_clip": 0.01136727, "auxiliary_loss_mlp": 0.0077334, "balance_loss_clip": 1.05279326, "balance_loss_mlp": 1.00113511, "epoch": 0.3284533293251165, "flos": 26650888876800.0, "grad_norm": 2.4415591889879056, "language_loss": 0.7805075, "learning_rate": 3.1361294934623115e-06, "loss": 0.79960817, "num_input_tokens_seen": 117303830, "step": 5463, "time_per_iteration": 2.6797146797180176 }, { "auxiliary_loss_clip": 0.01109773, "auxiliary_loss_mlp": 0.01040868, "balance_loss_clip": 1.05036163, "balance_loss_mlp": 1.02523983, "epoch": 0.32851345257778447, "flos": 15304158904320.0, "grad_norm": 1.8407799027990368, "language_loss": 0.70095646, "learning_rate": 3.1358089512430303e-06, "loss": 0.72246289, "num_input_tokens_seen": 117320665, "step": 5464, "time_per_iteration": 2.7286477088928223 }, { "auxiliary_loss_clip": 0.01130175, "auxiliary_loss_mlp": 0.01038523, "balance_loss_clip": 1.05659711, "balance_loss_mlp": 1.02327609, "epoch": 0.32857357583045244, "flos": 23513732881920.0, "grad_norm": 1.976060055551124, "language_loss": 0.72474623, "learning_rate": 3.1354883659537594e-06, "loss": 0.74643314, "num_input_tokens_seen": 117339795, "step": 5465, "time_per_iteration": 2.6666364669799805 }, { "auxiliary_loss_clip": 0.01113042, "auxiliary_loss_mlp": 0.01049431, "balance_loss_clip": 1.05094242, "balance_loss_mlp": 1.03334332, "epoch": 0.3286336990831204, "flos": 20995209849600.0, "grad_norm": 1.953344541818443, "language_loss": 0.832214, "learning_rate": 3.1351677376066567e-06, "loss": 0.8538388, "num_input_tokens_seen": 117359525, "step": 5466, "time_per_iteration": 2.7432901859283447 }, { "auxiliary_loss_clip": 0.01113455, "auxiliary_loss_mlp": 0.01041029, "balance_loss_clip": 1.04729056, "balance_loss_mlp": 1.02577055, "epoch": 0.32869382233578837, "flos": 23658811914240.0, "grad_norm": 1.7893036060845653, "language_loss": 0.79221183, "learning_rate": 3.134847066213879e-06, "loss": 0.8137567, "num_input_tokens_seen": 117380320, "step": 5467, "time_per_iteration": 2.701490879058838 }, { "auxiliary_loss_clip": 0.0111678, "auxiliary_loss_mlp": 0.0103291, "balance_loss_clip": 1.05045676, "balance_loss_mlp": 1.01759124, "epoch": 0.32875394558845633, "flos": 25336522408320.0, "grad_norm": 1.5411251384559923, "language_loss": 0.74338531, "learning_rate": 3.134526351787587e-06, "loss": 0.76488233, "num_input_tokens_seen": 117400695, "step": 5468, "time_per_iteration": 2.6820507049560547 }, { "auxiliary_loss_clip": 0.0111552, "auxiliary_loss_mlp": 0.01042549, "balance_loss_clip": 1.05065966, "balance_loss_mlp": 1.02476263, "epoch": 0.3288140688411243, "flos": 14903108576640.0, "grad_norm": 1.9818058078172698, "language_loss": 0.7869612, "learning_rate": 3.134205594339942e-06, "loss": 0.80854189, "num_input_tokens_seen": 117418800, "step": 5469, "time_per_iteration": 2.6281590461730957 }, { "auxiliary_loss_clip": 0.01104752, "auxiliary_loss_mlp": 0.01033111, "balance_loss_clip": 1.04863441, "balance_loss_mlp": 1.01838851, "epoch": 0.32887419209379226, "flos": 18551345235840.0, "grad_norm": 1.9383846382167882, "language_loss": 0.81744516, "learning_rate": 3.133884793883107e-06, "loss": 0.8388238, "num_input_tokens_seen": 117438220, "step": 5470, "time_per_iteration": 2.8643784523010254 }, { "auxiliary_loss_clip": 0.01140563, "auxiliary_loss_mlp": 0.01045939, "balance_loss_clip": 1.05232358, "balance_loss_mlp": 1.03021562, "epoch": 0.3289343153464602, "flos": 48105610439040.0, "grad_norm": 2.0914054865715768, "language_loss": 0.67699564, "learning_rate": 3.1335639504292478e-06, "loss": 0.69886065, "num_input_tokens_seen": 117462560, "step": 5471, "time_per_iteration": 2.851717948913574 }, { "auxiliary_loss_clip": 0.01148136, "auxiliary_loss_mlp": 0.01043561, "balance_loss_clip": 1.05701339, "balance_loss_mlp": 1.02594161, "epoch": 0.3289944385991282, "flos": 27600295207680.0, "grad_norm": 2.097557855250848, "language_loss": 0.64926231, "learning_rate": 3.1332430639905288e-06, "loss": 0.67117929, "num_input_tokens_seen": 117483665, "step": 5472, "time_per_iteration": 2.6586108207702637 }, { "auxiliary_loss_clip": 0.01128351, "auxiliary_loss_mlp": 0.01045454, "balance_loss_clip": 1.05333138, "balance_loss_mlp": 1.02850199, "epoch": 0.32905456185179616, "flos": 20120318282880.0, "grad_norm": 3.4668570750263155, "language_loss": 0.88257217, "learning_rate": 3.13292213457912e-06, "loss": 0.90431023, "num_input_tokens_seen": 117503565, "step": 5473, "time_per_iteration": 2.6792144775390625 }, { "auxiliary_loss_clip": 0.01103479, "auxiliary_loss_mlp": 0.01038881, "balance_loss_clip": 1.04814398, "balance_loss_mlp": 1.02123809, "epoch": 0.3291146851044642, "flos": 23180230080000.0, "grad_norm": 1.8710184691373295, "language_loss": 0.78193343, "learning_rate": 3.1326011622071903e-06, "loss": 0.80335701, "num_input_tokens_seen": 117521460, "step": 5474, "time_per_iteration": 2.739057779312134 }, { "auxiliary_loss_clip": 0.01038022, "auxiliary_loss_mlp": 0.01029239, "balance_loss_clip": 1.02788568, "balance_loss_mlp": 1.02673554, "epoch": 0.32917480835713214, "flos": 67621912594560.0, "grad_norm": 0.8109823017171686, "language_loss": 0.6018818, "learning_rate": 3.132280146886911e-06, "loss": 0.62255442, "num_input_tokens_seen": 117580550, "step": 5475, "time_per_iteration": 3.196384906768799 }, { "auxiliary_loss_clip": 0.01091837, "auxiliary_loss_mlp": 0.01057279, "balance_loss_clip": 1.04454446, "balance_loss_mlp": 1.03726411, "epoch": 0.3292349316098001, "flos": 27964537073280.0, "grad_norm": 4.962450920257536, "language_loss": 0.76504046, "learning_rate": 3.131959088630455e-06, "loss": 0.78653169, "num_input_tokens_seen": 117600645, "step": 5476, "time_per_iteration": 2.7369961738586426 }, { "auxiliary_loss_clip": 0.01100541, "auxiliary_loss_mlp": 0.01044762, "balance_loss_clip": 1.04824603, "balance_loss_mlp": 1.02946782, "epoch": 0.3292950548624681, "flos": 20263673462400.0, "grad_norm": 2.5019671735892937, "language_loss": 0.74746907, "learning_rate": 3.131637987449997e-06, "loss": 0.76892209, "num_input_tokens_seen": 117618880, "step": 5477, "time_per_iteration": 2.814467430114746 }, { "auxiliary_loss_clip": 0.01135692, "auxiliary_loss_mlp": 0.01042652, "balance_loss_clip": 1.05235898, "balance_loss_mlp": 1.02838814, "epoch": 0.32935517811513604, "flos": 20812999132800.0, "grad_norm": 3.9065130557825234, "language_loss": 0.75539625, "learning_rate": 3.131316843357713e-06, "loss": 0.77717972, "num_input_tokens_seen": 117636445, "step": 5478, "time_per_iteration": 2.730445384979248 }, { "auxiliary_loss_clip": 0.0112467, "auxiliary_loss_mlp": 0.01042056, "balance_loss_clip": 1.04921985, "balance_loss_mlp": 1.02750051, "epoch": 0.329415301367804, "flos": 18441853603200.0, "grad_norm": 2.855777191383278, "language_loss": 0.80462509, "learning_rate": 3.1309956563657807e-06, "loss": 0.82629234, "num_input_tokens_seen": 117653105, "step": 5479, "time_per_iteration": 2.6443796157836914 }, { "auxiliary_loss_clip": 0.01037863, "auxiliary_loss_mlp": 0.01000413, "balance_loss_clip": 1.02671266, "balance_loss_mlp": 0.99823159, "epoch": 0.32947542462047197, "flos": 66323024887680.0, "grad_norm": 0.7530723778079996, "language_loss": 0.56519568, "learning_rate": 3.1306744264863804e-06, "loss": 0.58557844, "num_input_tokens_seen": 117719225, "step": 5480, "time_per_iteration": 3.213240146636963 }, { "auxiliary_loss_clip": 0.01124019, "auxiliary_loss_mlp": 0.00774449, "balance_loss_clip": 1.04898739, "balance_loss_mlp": 1.00116146, "epoch": 0.32953554787313993, "flos": 23221599569280.0, "grad_norm": 1.7923941739082951, "language_loss": 0.77444887, "learning_rate": 3.1303531537316915e-06, "loss": 0.79343355, "num_input_tokens_seen": 117738725, "step": 5481, "time_per_iteration": 2.6905598640441895 }, { "auxiliary_loss_clip": 0.01119194, "auxiliary_loss_mlp": 0.01050738, "balance_loss_clip": 1.05167091, "balance_loss_mlp": 1.03557408, "epoch": 0.3295956711258079, "flos": 27009492307200.0, "grad_norm": 1.5874205685036498, "language_loss": 0.78222132, "learning_rate": 3.130031838113899e-06, "loss": 0.80392069, "num_input_tokens_seen": 117757765, "step": 5482, "time_per_iteration": 2.765235424041748 }, { "auxiliary_loss_clip": 0.01130055, "auxiliary_loss_mlp": 0.01052605, "balance_loss_clip": 1.05121589, "balance_loss_mlp": 1.03674388, "epoch": 0.32965579437847586, "flos": 19171702051200.0, "grad_norm": 2.9405789595849385, "language_loss": 0.73674762, "learning_rate": 3.129710479645185e-06, "loss": 0.75857425, "num_input_tokens_seen": 117776810, "step": 5483, "time_per_iteration": 2.624969005584717 }, { "auxiliary_loss_clip": 0.01122896, "auxiliary_loss_mlp": 0.01054419, "balance_loss_clip": 1.05069685, "balance_loss_mlp": 1.03886831, "epoch": 0.32971591763114383, "flos": 30482521401600.0, "grad_norm": 1.8706124903497952, "language_loss": 0.75649381, "learning_rate": 3.1293890783377366e-06, "loss": 0.77826691, "num_input_tokens_seen": 117797730, "step": 5484, "time_per_iteration": 2.7650864124298096 }, { "auxiliary_loss_clip": 0.01141223, "auxiliary_loss_mlp": 0.01053478, "balance_loss_clip": 1.05515027, "balance_loss_mlp": 1.03807664, "epoch": 0.3297760408838118, "flos": 16289583598080.0, "grad_norm": 72.4202789440072, "language_loss": 0.71719176, "learning_rate": 3.129067634203742e-06, "loss": 0.73913872, "num_input_tokens_seen": 117815365, "step": 5485, "time_per_iteration": 2.603039264678955 }, { "auxiliary_loss_clip": 0.01081054, "auxiliary_loss_mlp": 0.01052335, "balance_loss_clip": 1.04921818, "balance_loss_mlp": 1.03822041, "epoch": 0.32983616413647976, "flos": 29530924341120.0, "grad_norm": 1.6108204077161399, "language_loss": 0.80275488, "learning_rate": 3.128746147255388e-06, "loss": 0.82408869, "num_input_tokens_seen": 117836095, "step": 5486, "time_per_iteration": 2.8364202976226807 }, { "auxiliary_loss_clip": 0.01106188, "auxiliary_loss_mlp": 0.01053006, "balance_loss_clip": 1.04739475, "balance_loss_mlp": 1.03650784, "epoch": 0.3298962873891478, "flos": 20631398947200.0, "grad_norm": 2.173231613182175, "language_loss": 0.84374005, "learning_rate": 3.1284246175048683e-06, "loss": 0.86533195, "num_input_tokens_seen": 117854655, "step": 5487, "time_per_iteration": 2.7796428203582764 }, { "auxiliary_loss_clip": 0.01087509, "auxiliary_loss_mlp": 0.01055173, "balance_loss_clip": 1.04317069, "balance_loss_mlp": 1.0379355, "epoch": 0.32995641064181574, "flos": 14976007228800.0, "grad_norm": 2.633362688401157, "language_loss": 0.74667275, "learning_rate": 3.1281030449643735e-06, "loss": 0.76809955, "num_input_tokens_seen": 117873300, "step": 5488, "time_per_iteration": 2.7173233032226562 }, { "auxiliary_loss_clip": 0.01143363, "auxiliary_loss_mlp": 0.01051325, "balance_loss_clip": 1.05679107, "balance_loss_mlp": 1.03563726, "epoch": 0.3300165338944837, "flos": 18661447399680.0, "grad_norm": 2.518818086418956, "language_loss": 0.71718305, "learning_rate": 3.127781429646098e-06, "loss": 0.7391299, "num_input_tokens_seen": 117891540, "step": 5489, "time_per_iteration": 2.6647188663482666 }, { "auxiliary_loss_clip": 0.01137372, "auxiliary_loss_mlp": 0.01044261, "balance_loss_clip": 1.05154073, "balance_loss_mlp": 1.02973497, "epoch": 0.3300766571471517, "flos": 25583730785280.0, "grad_norm": 6.067113992727344, "language_loss": 0.88346136, "learning_rate": 3.127459771562238e-06, "loss": 0.90527773, "num_input_tokens_seen": 117907690, "step": 5490, "time_per_iteration": 2.594193696975708 }, { "auxiliary_loss_clip": 0.01127009, "auxiliary_loss_mlp": 0.0103878, "balance_loss_clip": 1.05081856, "balance_loss_mlp": 1.02396214, "epoch": 0.33013678039981964, "flos": 11363501623680.0, "grad_norm": 5.091693260582257, "language_loss": 0.83396459, "learning_rate": 3.1271380707249907e-06, "loss": 0.85562241, "num_input_tokens_seen": 117925640, "step": 5491, "time_per_iteration": 2.6124439239501953 }, { "auxiliary_loss_clip": 0.01111643, "auxiliary_loss_mlp": 0.01048849, "balance_loss_clip": 1.05066538, "balance_loss_mlp": 1.03372788, "epoch": 0.3301969036524876, "flos": 24821203939200.0, "grad_norm": 1.9936853829327341, "language_loss": 0.77453989, "learning_rate": 3.126816327146554e-06, "loss": 0.79614484, "num_input_tokens_seen": 117944525, "step": 5492, "time_per_iteration": 4.26681923866272 }, { "auxiliary_loss_clip": 0.01144384, "auxiliary_loss_mlp": 0.01046422, "balance_loss_clip": 1.05559993, "balance_loss_mlp": 1.02987576, "epoch": 0.33025702690515557, "flos": 15961144613760.0, "grad_norm": 2.586093125227841, "language_loss": 0.74295127, "learning_rate": 3.12649454083913e-06, "loss": 0.76485932, "num_input_tokens_seen": 117962515, "step": 5493, "time_per_iteration": 2.572657585144043 }, { "auxiliary_loss_clip": 0.01007495, "auxiliary_loss_mlp": 0.01051184, "balance_loss_clip": 1.0238874, "balance_loss_mlp": 1.0491215, "epoch": 0.33031715015782354, "flos": 59416755989760.0, "grad_norm": 0.7952972655943692, "language_loss": 0.53981996, "learning_rate": 3.12617271181492e-06, "loss": 0.5604068, "num_input_tokens_seen": 118018780, "step": 5494, "time_per_iteration": 3.2123944759368896 }, { "auxiliary_loss_clip": 0.01114646, "auxiliary_loss_mlp": 0.0103786, "balance_loss_clip": 1.04879999, "balance_loss_mlp": 1.02241075, "epoch": 0.3303772734104915, "flos": 23184360144000.0, "grad_norm": 1.4867113292626302, "language_loss": 0.87236047, "learning_rate": 3.1258508400861276e-06, "loss": 0.89388549, "num_input_tokens_seen": 118038610, "step": 5495, "time_per_iteration": 4.180245637893677 }, { "auxiliary_loss_clip": 0.01104415, "auxiliary_loss_mlp": 0.0104461, "balance_loss_clip": 1.0520072, "balance_loss_mlp": 1.02813482, "epoch": 0.33043739666315947, "flos": 33071896010880.0, "grad_norm": 2.0634169818588157, "language_loss": 0.73468459, "learning_rate": 3.1255289256649587e-06, "loss": 0.7561748, "num_input_tokens_seen": 118055905, "step": 5496, "time_per_iteration": 2.816849946975708 }, { "auxiliary_loss_clip": 0.01107244, "auxiliary_loss_mlp": 0.01039897, "balance_loss_clip": 1.04852057, "balance_loss_mlp": 1.02469766, "epoch": 0.33049751991582743, "flos": 24895431394560.0, "grad_norm": 2.430684839051296, "language_loss": 0.72464252, "learning_rate": 3.1252069685636196e-06, "loss": 0.74611384, "num_input_tokens_seen": 118073695, "step": 5497, "time_per_iteration": 4.314718961715698 }, { "auxiliary_loss_clip": 0.01111966, "auxiliary_loss_mlp": 0.01038015, "balance_loss_clip": 1.05051875, "balance_loss_mlp": 1.02313733, "epoch": 0.3305576431684954, "flos": 29460575554560.0, "grad_norm": 1.9082848646705384, "language_loss": 0.804672, "learning_rate": 3.124884968794321e-06, "loss": 0.82617176, "num_input_tokens_seen": 118094030, "step": 5498, "time_per_iteration": 2.831347942352295 }, { "auxiliary_loss_clip": 0.01121599, "auxiliary_loss_mlp": 0.01041664, "balance_loss_clip": 1.04826963, "balance_loss_mlp": 1.02467656, "epoch": 0.33061776642116336, "flos": 22632305040000.0, "grad_norm": 2.0593804502858823, "language_loss": 0.75822198, "learning_rate": 3.12456292636927e-06, "loss": 0.77985466, "num_input_tokens_seen": 118111665, "step": 5499, "time_per_iteration": 4.880478858947754 }, { "auxiliary_loss_clip": 0.01119724, "auxiliary_loss_mlp": 0.01035684, "balance_loss_clip": 1.05307007, "balance_loss_mlp": 1.02016318, "epoch": 0.3306778896738313, "flos": 25776320532480.0, "grad_norm": 2.088317081581358, "language_loss": 0.78981787, "learning_rate": 3.124240841300681e-06, "loss": 0.81137192, "num_input_tokens_seen": 118132435, "step": 5500, "time_per_iteration": 2.7601048946380615 }, { "auxiliary_loss_clip": 0.01131843, "auxiliary_loss_mlp": 0.0103364, "balance_loss_clip": 1.0540576, "balance_loss_mlp": 1.01751041, "epoch": 0.33073801292649935, "flos": 36940552479360.0, "grad_norm": 8.499573931934933, "language_loss": 0.6655246, "learning_rate": 3.1239187136007665e-06, "loss": 0.68717939, "num_input_tokens_seen": 118155255, "step": 5501, "time_per_iteration": 2.7880568504333496 }, { "auxiliary_loss_clip": 0.01130024, "auxiliary_loss_mlp": 0.01044854, "balance_loss_clip": 1.05215073, "balance_loss_mlp": 1.02766418, "epoch": 0.3307981361791673, "flos": 12967738848000.0, "grad_norm": 2.417495150038941, "language_loss": 0.77221018, "learning_rate": 3.1235965432817417e-06, "loss": 0.79395902, "num_input_tokens_seen": 118169865, "step": 5502, "time_per_iteration": 2.621891736984253 }, { "auxiliary_loss_clip": 0.01120279, "auxiliary_loss_mlp": 0.01041312, "balance_loss_clip": 1.05816746, "balance_loss_mlp": 1.02508807, "epoch": 0.3308582594318353, "flos": 25374372364800.0, "grad_norm": 1.6870244228079128, "language_loss": 0.72882998, "learning_rate": 3.123274330355824e-06, "loss": 0.75044584, "num_input_tokens_seen": 118190760, "step": 5503, "time_per_iteration": 2.731391191482544 }, { "auxiliary_loss_clip": 0.01107126, "auxiliary_loss_mlp": 0.01042991, "balance_loss_clip": 1.04483843, "balance_loss_mlp": 1.02543116, "epoch": 0.33091838268450324, "flos": 26468570419200.0, "grad_norm": 1.6983408951831631, "language_loss": 0.75341403, "learning_rate": 3.12295207483523e-06, "loss": 0.77491516, "num_input_tokens_seen": 118213620, "step": 5504, "time_per_iteration": 2.734440565109253 }, { "auxiliary_loss_clip": 0.01116159, "auxiliary_loss_mlp": 0.01038384, "balance_loss_clip": 1.05076432, "balance_loss_mlp": 1.02267826, "epoch": 0.3309785059371712, "flos": 24971167221120.0, "grad_norm": 1.5921827086772462, "language_loss": 0.69537103, "learning_rate": 3.1226297767321816e-06, "loss": 0.71691644, "num_input_tokens_seen": 118235010, "step": 5505, "time_per_iteration": 2.7224769592285156 }, { "auxiliary_loss_clip": 0.0112242, "auxiliary_loss_mlp": 0.01050735, "balance_loss_clip": 1.04997373, "balance_loss_mlp": 1.03454661, "epoch": 0.3310386291898392, "flos": 20446710192000.0, "grad_norm": 1.6566524839278514, "language_loss": 0.81701219, "learning_rate": 3.122307436058899e-06, "loss": 0.83874375, "num_input_tokens_seen": 118255820, "step": 5506, "time_per_iteration": 2.6608633995056152 }, { "auxiliary_loss_clip": 0.01126393, "auxiliary_loss_mlp": 0.01036938, "balance_loss_clip": 1.05129898, "balance_loss_mlp": 1.02032042, "epoch": 0.33109875244250714, "flos": 23182672204800.0, "grad_norm": 2.1165262291534663, "language_loss": 0.7961843, "learning_rate": 3.121985052827606e-06, "loss": 0.81781757, "num_input_tokens_seen": 118274160, "step": 5507, "time_per_iteration": 2.6279826164245605 }, { "auxiliary_loss_clip": 0.01115407, "auxiliary_loss_mlp": 0.0104488, "balance_loss_clip": 1.04948068, "balance_loss_mlp": 1.02901316, "epoch": 0.3311588756951751, "flos": 24168384207360.0, "grad_norm": 1.8252383106416188, "language_loss": 0.71632457, "learning_rate": 3.1216626270505274e-06, "loss": 0.73792744, "num_input_tokens_seen": 118294385, "step": 5508, "time_per_iteration": 2.666274070739746 }, { "auxiliary_loss_clip": 0.01105407, "auxiliary_loss_mlp": 0.01035431, "balance_loss_clip": 1.04841506, "balance_loss_mlp": 1.02048194, "epoch": 0.33121899894784307, "flos": 28145742209280.0, "grad_norm": 2.0681023318662053, "language_loss": 0.71877921, "learning_rate": 3.12134015873989e-06, "loss": 0.74018759, "num_input_tokens_seen": 118313105, "step": 5509, "time_per_iteration": 2.9805185794830322 }, { "auxiliary_loss_clip": 0.01123913, "auxiliary_loss_mlp": 0.01035754, "balance_loss_clip": 1.05431342, "balance_loss_mlp": 1.02019095, "epoch": 0.33127912220051103, "flos": 29567660976000.0, "grad_norm": 1.690455092128618, "language_loss": 0.72850806, "learning_rate": 3.121017647907921e-06, "loss": 0.75010473, "num_input_tokens_seen": 118335250, "step": 5510, "time_per_iteration": 2.7012648582458496 }, { "auxiliary_loss_clip": 0.01097101, "auxiliary_loss_mlp": 0.01036395, "balance_loss_clip": 1.04754674, "balance_loss_mlp": 1.02099323, "epoch": 0.331339245453179, "flos": 14428836374400.0, "grad_norm": 2.529653220973509, "language_loss": 0.87842733, "learning_rate": 3.1206950945668508e-06, "loss": 0.89976227, "num_input_tokens_seen": 118351470, "step": 5511, "time_per_iteration": 2.699303150177002 }, { "auxiliary_loss_clip": 0.01077351, "auxiliary_loss_mlp": 0.0103825, "balance_loss_clip": 1.04569423, "balance_loss_mlp": 1.0232892, "epoch": 0.33139936870584696, "flos": 20887118847360.0, "grad_norm": 2.0800696693803404, "language_loss": 0.73301774, "learning_rate": 3.12037249872891e-06, "loss": 0.7541737, "num_input_tokens_seen": 118370970, "step": 5512, "time_per_iteration": 2.773071765899658 }, { "auxiliary_loss_clip": 0.01092657, "auxiliary_loss_mlp": 0.01037164, "balance_loss_clip": 1.04608238, "balance_loss_mlp": 1.02226281, "epoch": 0.33145949195851493, "flos": 36284356869120.0, "grad_norm": 28.686212163123738, "language_loss": 0.7188127, "learning_rate": 3.1200498604063317e-06, "loss": 0.74011087, "num_input_tokens_seen": 118393125, "step": 5513, "time_per_iteration": 2.832712411880493 }, { "auxiliary_loss_clip": 0.0110331, "auxiliary_loss_mlp": 0.01037016, "balance_loss_clip": 1.0480994, "balance_loss_mlp": 1.02052951, "epoch": 0.33151961521118295, "flos": 14279735018880.0, "grad_norm": 1.9100766123367274, "language_loss": 0.68260789, "learning_rate": 3.1197271796113507e-06, "loss": 0.70401114, "num_input_tokens_seen": 118410860, "step": 5514, "time_per_iteration": 2.62347674369812 }, { "auxiliary_loss_clip": 0.01111479, "auxiliary_loss_mlp": 0.01042546, "balance_loss_clip": 1.04936767, "balance_loss_mlp": 1.02481997, "epoch": 0.3315797384638509, "flos": 20774323163520.0, "grad_norm": 1.9179680687741931, "language_loss": 0.65994096, "learning_rate": 3.1194044563562026e-06, "loss": 0.68148118, "num_input_tokens_seen": 118429570, "step": 5515, "time_per_iteration": 2.6913952827453613 }, { "auxiliary_loss_clip": 0.01121539, "auxiliary_loss_mlp": 0.01039988, "balance_loss_clip": 1.04903245, "balance_loss_mlp": 1.02393019, "epoch": 0.3316398617165189, "flos": 24679464871680.0, "grad_norm": 1.8088538037879305, "language_loss": 0.69273043, "learning_rate": 3.1190816906531257e-06, "loss": 0.71434575, "num_input_tokens_seen": 118450285, "step": 5516, "time_per_iteration": 2.6469173431396484 }, { "auxiliary_loss_clip": 0.011287, "auxiliary_loss_mlp": 0.01039737, "balance_loss_clip": 1.05089724, "balance_loss_mlp": 1.02339315, "epoch": 0.33169998496918685, "flos": 18587974129920.0, "grad_norm": 3.871010712989623, "language_loss": 0.79914033, "learning_rate": 3.118758882514359e-06, "loss": 0.82082474, "num_input_tokens_seen": 118468270, "step": 5517, "time_per_iteration": 2.6387667655944824 }, { "auxiliary_loss_clip": 0.01113973, "auxiliary_loss_mlp": 0.01040442, "balance_loss_clip": 1.04587924, "balance_loss_mlp": 1.02412271, "epoch": 0.3317601082218548, "flos": 20193647898240.0, "grad_norm": 1.7856922866156533, "language_loss": 0.74043357, "learning_rate": 3.118436031952143e-06, "loss": 0.76197767, "num_input_tokens_seen": 118486615, "step": 5518, "time_per_iteration": 2.6136653423309326 }, { "auxiliary_loss_clip": 0.01035845, "auxiliary_loss_mlp": 0.0100663, "balance_loss_clip": 1.02549803, "balance_loss_mlp": 1.00447261, "epoch": 0.3318202314745228, "flos": 68974703637120.0, "grad_norm": 0.6165261089589951, "language_loss": 0.54330659, "learning_rate": 3.1181131389787206e-06, "loss": 0.56373143, "num_input_tokens_seen": 118553580, "step": 5519, "time_per_iteration": 3.3124027252197266 }, { "auxiliary_loss_clip": 0.01129225, "auxiliary_loss_mlp": 0.01042237, "balance_loss_clip": 1.05353975, "balance_loss_mlp": 1.02483273, "epoch": 0.33188035472719074, "flos": 21500113374720.0, "grad_norm": 2.4445902922344342, "language_loss": 0.78693354, "learning_rate": 3.117790203606336e-06, "loss": 0.80864823, "num_input_tokens_seen": 118570280, "step": 5520, "time_per_iteration": 2.680413246154785 }, { "auxiliary_loss_clip": 0.0111174, "auxiliary_loss_mlp": 0.01034453, "balance_loss_clip": 1.04981971, "balance_loss_mlp": 1.01946807, "epoch": 0.3319404779798587, "flos": 28870490926080.0, "grad_norm": 2.1205551001068645, "language_loss": 0.76597643, "learning_rate": 3.1174672258472344e-06, "loss": 0.78743839, "num_input_tokens_seen": 118590455, "step": 5521, "time_per_iteration": 2.7977516651153564 }, { "auxiliary_loss_clip": 0.01128356, "auxiliary_loss_mlp": 0.0104906, "balance_loss_clip": 1.0500772, "balance_loss_mlp": 1.0320611, "epoch": 0.33200060123252667, "flos": 23076915586560.0, "grad_norm": 5.546447388917159, "language_loss": 0.70404172, "learning_rate": 3.117144205713664e-06, "loss": 0.72581589, "num_input_tokens_seen": 118609495, "step": 5522, "time_per_iteration": 2.7343335151672363 }, { "auxiliary_loss_clip": 0.01112615, "auxiliary_loss_mlp": 0.01039333, "balance_loss_clip": 1.04872596, "balance_loss_mlp": 1.02413392, "epoch": 0.33206072448519464, "flos": 21142479611520.0, "grad_norm": 2.5717643633026133, "language_loss": 0.7406925, "learning_rate": 3.1168211432178735e-06, "loss": 0.76221192, "num_input_tokens_seen": 118628720, "step": 5523, "time_per_iteration": 2.6910529136657715 }, { "auxiliary_loss_clip": 0.01108522, "auxiliary_loss_mlp": 0.01039859, "balance_loss_clip": 1.04778576, "balance_loss_mlp": 1.02415287, "epoch": 0.3321208477378626, "flos": 13079097987840.0, "grad_norm": 1.7441145490896364, "language_loss": 0.82432246, "learning_rate": 3.116498038372114e-06, "loss": 0.8458063, "num_input_tokens_seen": 118645955, "step": 5524, "time_per_iteration": 2.747279405593872 }, { "auxiliary_loss_clip": 0.01094215, "auxiliary_loss_mlp": 0.00773366, "balance_loss_clip": 1.04763544, "balance_loss_mlp": 1.000983, "epoch": 0.33218097099053057, "flos": 21215414177280.0, "grad_norm": 1.8821817398487202, "language_loss": 0.83040905, "learning_rate": 3.116174891188636e-06, "loss": 0.84908485, "num_input_tokens_seen": 118665605, "step": 5525, "time_per_iteration": 2.7802865505218506 }, { "auxiliary_loss_clip": 0.01051991, "auxiliary_loss_mlp": 0.01009126, "balance_loss_clip": 1.02309918, "balance_loss_mlp": 1.00730228, "epoch": 0.33224109424319853, "flos": 64348979189760.0, "grad_norm": 0.7599038914172829, "language_loss": 0.52588648, "learning_rate": 3.1158517016796945e-06, "loss": 0.54649764, "num_input_tokens_seen": 118728155, "step": 5526, "time_per_iteration": 3.1430625915527344 }, { "auxiliary_loss_clip": 0.01100912, "auxiliary_loss_mlp": 0.00775153, "balance_loss_clip": 1.05235875, "balance_loss_mlp": 1.00101066, "epoch": 0.33230121749586655, "flos": 17346003523200.0, "grad_norm": 1.9434005693126541, "language_loss": 0.77540255, "learning_rate": 3.1155284698575445e-06, "loss": 0.79416323, "num_input_tokens_seen": 118743955, "step": 5527, "time_per_iteration": 2.779862403869629 }, { "auxiliary_loss_clip": 0.01095485, "auxiliary_loss_mlp": 0.01045396, "balance_loss_clip": 1.05338502, "balance_loss_mlp": 1.02997637, "epoch": 0.3323613407485345, "flos": 20997041443200.0, "grad_norm": 2.507974613956182, "language_loss": 0.7222321, "learning_rate": 3.1152051957344434e-06, "loss": 0.7436409, "num_input_tokens_seen": 118763275, "step": 5528, "time_per_iteration": 2.7340548038482666 }, { "auxiliary_loss_clip": 0.01112677, "auxiliary_loss_mlp": 0.01037789, "balance_loss_clip": 1.04796624, "balance_loss_mlp": 1.02333462, "epoch": 0.3324214640012025, "flos": 13152535344000.0, "grad_norm": 1.86583443755271, "language_loss": 0.82796729, "learning_rate": 3.1148818793226497e-06, "loss": 0.84947193, "num_input_tokens_seen": 118781110, "step": 5529, "time_per_iteration": 2.6532175540924072 }, { "auxiliary_loss_clip": 0.01113738, "auxiliary_loss_mlp": 0.00775289, "balance_loss_clip": 1.04990721, "balance_loss_mlp": 1.00095487, "epoch": 0.33248158725387045, "flos": 22273522041600.0, "grad_norm": 2.91854332756289, "language_loss": 0.69676769, "learning_rate": 3.114558520634423e-06, "loss": 0.71565795, "num_input_tokens_seen": 118800620, "step": 5530, "time_per_iteration": 2.708841323852539 }, { "auxiliary_loss_clip": 0.01126266, "auxiliary_loss_mlp": 0.01050268, "balance_loss_clip": 1.05040276, "balance_loss_mlp": 1.03394794, "epoch": 0.3325417105065384, "flos": 20740998320640.0, "grad_norm": 2.896961644373142, "language_loss": 0.75989115, "learning_rate": 3.1142351196820256e-06, "loss": 0.7816565, "num_input_tokens_seen": 118818725, "step": 5531, "time_per_iteration": 2.672736167907715 }, { "auxiliary_loss_clip": 0.01118495, "auxiliary_loss_mlp": 0.0104264, "balance_loss_clip": 1.05284333, "balance_loss_mlp": 1.0260222, "epoch": 0.3326018337592064, "flos": 24790536702720.0, "grad_norm": 2.0175366752259465, "language_loss": 0.73189509, "learning_rate": 3.1139116764777206e-06, "loss": 0.75350642, "num_input_tokens_seen": 118839390, "step": 5532, "time_per_iteration": 4.367426156997681 }, { "auxiliary_loss_clip": 0.0111545, "auxiliary_loss_mlp": 0.0103097, "balance_loss_clip": 1.0523479, "balance_loss_mlp": 1.01623583, "epoch": 0.33266195701187434, "flos": 14501699112960.0, "grad_norm": 2.031596721272471, "language_loss": 0.65847003, "learning_rate": 3.1135881910337735e-06, "loss": 0.67993426, "num_input_tokens_seen": 118856275, "step": 5533, "time_per_iteration": 2.66029691696167 }, { "auxiliary_loss_clip": 0.01080696, "auxiliary_loss_mlp": 0.01037858, "balance_loss_clip": 1.04513919, "balance_loss_mlp": 1.02147257, "epoch": 0.3327220802645423, "flos": 15304410299520.0, "grad_norm": 2.349847054242377, "language_loss": 0.71297956, "learning_rate": 3.113264663362451e-06, "loss": 0.73416501, "num_input_tokens_seen": 118873830, "step": 5534, "time_per_iteration": 4.27457070350647 }, { "auxiliary_loss_clip": 0.0109151, "auxiliary_loss_mlp": 0.01041219, "balance_loss_clip": 1.04982436, "balance_loss_mlp": 1.02534652, "epoch": 0.3327822035172103, "flos": 23477534951040.0, "grad_norm": 2.0777718313633997, "language_loss": 0.6718514, "learning_rate": 3.1129410934760204e-06, "loss": 0.69317865, "num_input_tokens_seen": 118891560, "step": 5535, "time_per_iteration": 2.774434804916382 }, { "auxiliary_loss_clip": 0.01126643, "auxiliary_loss_mlp": 0.00774026, "balance_loss_clip": 1.04974341, "balance_loss_mlp": 1.00099397, "epoch": 0.33284232676987824, "flos": 25374516019200.0, "grad_norm": 4.4518317449354905, "language_loss": 0.72757089, "learning_rate": 3.1126174813867517e-06, "loss": 0.74657756, "num_input_tokens_seen": 118910260, "step": 5536, "time_per_iteration": 4.211881399154663 }, { "auxiliary_loss_clip": 0.0112639, "auxiliary_loss_mlp": 0.01042922, "balance_loss_clip": 1.05097485, "balance_loss_mlp": 1.02740741, "epoch": 0.3329024500225462, "flos": 23694363400320.0, "grad_norm": 1.6494647990025764, "language_loss": 0.81951326, "learning_rate": 3.112293827106917e-06, "loss": 0.84120637, "num_input_tokens_seen": 118929985, "step": 5537, "time_per_iteration": 2.723938465118408 }, { "auxiliary_loss_clip": 0.01130953, "auxiliary_loss_mlp": 0.01041699, "balance_loss_clip": 1.05334187, "balance_loss_mlp": 1.02568924, "epoch": 0.33296257327521417, "flos": 31723163205120.0, "grad_norm": 2.0361349610506987, "language_loss": 0.71549797, "learning_rate": 3.111970130648789e-06, "loss": 0.73722446, "num_input_tokens_seen": 118951355, "step": 5538, "time_per_iteration": 4.913949489593506 }, { "auxiliary_loss_clip": 0.01120461, "auxiliary_loss_mlp": 0.01037376, "balance_loss_clip": 1.04746032, "balance_loss_mlp": 1.02189124, "epoch": 0.33302269652788213, "flos": 22744705674240.0, "grad_norm": 1.8849765474814903, "language_loss": 0.74648041, "learning_rate": 3.1116463920246424e-06, "loss": 0.76805872, "num_input_tokens_seen": 118970910, "step": 5539, "time_per_iteration": 2.7290310859680176 }, { "auxiliary_loss_clip": 0.01142521, "auxiliary_loss_mlp": 0.01045266, "balance_loss_clip": 1.05175686, "balance_loss_mlp": 1.02844524, "epoch": 0.33308281978055015, "flos": 11473747441920.0, "grad_norm": 1.7887365250144445, "language_loss": 0.71008205, "learning_rate": 3.1113226112467527e-06, "loss": 0.73195994, "num_input_tokens_seen": 118989200, "step": 5540, "time_per_iteration": 2.6340630054473877 }, { "auxiliary_loss_clip": 0.01121672, "auxiliary_loss_mlp": 0.01037813, "balance_loss_clip": 1.04614174, "balance_loss_mlp": 1.02212477, "epoch": 0.3331429430332181, "flos": 38213693112960.0, "grad_norm": 2.2050863595265535, "language_loss": 0.60332179, "learning_rate": 3.1109987883273983e-06, "loss": 0.62491661, "num_input_tokens_seen": 119011030, "step": 5541, "time_per_iteration": 2.9001681804656982 }, { "auxiliary_loss_clip": 0.01116142, "auxiliary_loss_mlp": 0.01045386, "balance_loss_clip": 1.04896498, "balance_loss_mlp": 1.02827907, "epoch": 0.3332030662858861, "flos": 22528667324160.0, "grad_norm": 1.8682676496278656, "language_loss": 0.68843257, "learning_rate": 3.1106749232788584e-06, "loss": 0.7100479, "num_input_tokens_seen": 119030620, "step": 5542, "time_per_iteration": 2.7336552143096924 }, { "auxiliary_loss_clip": 0.01125827, "auxiliary_loss_mlp": 0.01039479, "balance_loss_clip": 1.04983997, "balance_loss_mlp": 1.0241369, "epoch": 0.33326318953855405, "flos": 15997773507840.0, "grad_norm": 1.7424785130645766, "language_loss": 0.75545055, "learning_rate": 3.110351016113414e-06, "loss": 0.7771036, "num_input_tokens_seen": 119048015, "step": 5543, "time_per_iteration": 2.7098708152770996 }, { "auxiliary_loss_clip": 0.01059952, "auxiliary_loss_mlp": 0.01049723, "balance_loss_clip": 1.04679465, "balance_loss_mlp": 1.03153133, "epoch": 0.333323312791222, "flos": 25593535198080.0, "grad_norm": 1.720313350609618, "language_loss": 0.75207818, "learning_rate": 3.110027066843348e-06, "loss": 0.77317488, "num_input_tokens_seen": 119066280, "step": 5544, "time_per_iteration": 2.8580381870269775 }, { "auxiliary_loss_clip": 0.01131382, "auxiliary_loss_mlp": 0.01034467, "balance_loss_clip": 1.0470835, "balance_loss_mlp": 1.01900601, "epoch": 0.33338343604389, "flos": 25119550304640.0, "grad_norm": 1.8195187872515122, "language_loss": 0.70631826, "learning_rate": 3.1097030754809456e-06, "loss": 0.7279768, "num_input_tokens_seen": 119087680, "step": 5545, "time_per_iteration": 2.6675262451171875 }, { "auxiliary_loss_clip": 0.01090227, "auxiliary_loss_mlp": 0.01038197, "balance_loss_clip": 1.04591393, "balance_loss_mlp": 1.0225687, "epoch": 0.33344355929655795, "flos": 16947287579520.0, "grad_norm": 2.0475528286172615, "language_loss": 0.68962657, "learning_rate": 3.1093790420384894e-06, "loss": 0.7109108, "num_input_tokens_seen": 119105820, "step": 5546, "time_per_iteration": 2.6620733737945557 }, { "auxiliary_loss_clip": 0.01099462, "auxiliary_loss_mlp": 0.01039292, "balance_loss_clip": 1.04328573, "balance_loss_mlp": 1.02330589, "epoch": 0.3335036825492259, "flos": 27889591345920.0, "grad_norm": 1.6439201248410251, "language_loss": 0.64893299, "learning_rate": 3.1090549665282702e-06, "loss": 0.67032051, "num_input_tokens_seen": 119126630, "step": 5547, "time_per_iteration": 2.7897326946258545 }, { "auxiliary_loss_clip": 0.0111514, "auxiliary_loss_mlp": 0.0103407, "balance_loss_clip": 1.05108774, "balance_loss_mlp": 1.01957989, "epoch": 0.3335638058018939, "flos": 16179553261440.0, "grad_norm": 2.7266915889905765, "language_loss": 0.85475278, "learning_rate": 3.1087308489625742e-06, "loss": 0.8762449, "num_input_tokens_seen": 119143375, "step": 5548, "time_per_iteration": 2.691776990890503 }, { "auxiliary_loss_clip": 0.0112443, "auxiliary_loss_mlp": 0.01038689, "balance_loss_clip": 1.04759526, "balance_loss_mlp": 1.02190423, "epoch": 0.33362392905456184, "flos": 39896108288640.0, "grad_norm": 2.1593805374763466, "language_loss": 0.74996036, "learning_rate": 3.1084066893536945e-06, "loss": 0.77159154, "num_input_tokens_seen": 119166450, "step": 5549, "time_per_iteration": 2.778918743133545 }, { "auxiliary_loss_clip": 0.01129114, "auxiliary_loss_mlp": 0.01040153, "balance_loss_clip": 1.0509795, "balance_loss_mlp": 1.02330887, "epoch": 0.3336840523072298, "flos": 44271212567040.0, "grad_norm": 2.0942861782322577, "language_loss": 0.6826036, "learning_rate": 3.108082487713921e-06, "loss": 0.70429623, "num_input_tokens_seen": 119189645, "step": 5550, "time_per_iteration": 2.8417065143585205 }, { "auxiliary_loss_clip": 0.01094461, "auxiliary_loss_mlp": 0.01050862, "balance_loss_clip": 1.04752803, "balance_loss_mlp": 1.03398156, "epoch": 0.33374417555989777, "flos": 15085678429440.0, "grad_norm": 3.079168539029832, "language_loss": 0.60630679, "learning_rate": 3.1077582440555495e-06, "loss": 0.62776005, "num_input_tokens_seen": 119208045, "step": 5551, "time_per_iteration": 2.7206614017486572 }, { "auxiliary_loss_clip": 0.01096001, "auxiliary_loss_mlp": 0.01040976, "balance_loss_clip": 1.04871941, "balance_loss_mlp": 1.02429891, "epoch": 0.33380429881256574, "flos": 15849174942720.0, "grad_norm": 5.115117677651213, "language_loss": 0.70642906, "learning_rate": 3.1074339583908746e-06, "loss": 0.72779882, "num_input_tokens_seen": 119224910, "step": 5552, "time_per_iteration": 2.7452614307403564 }, { "auxiliary_loss_clip": 0.0109902, "auxiliary_loss_mlp": 0.01036983, "balance_loss_clip": 1.04360175, "balance_loss_mlp": 1.02150989, "epoch": 0.33386442206523376, "flos": 13480327883520.0, "grad_norm": 2.544991024269762, "language_loss": 0.82464319, "learning_rate": 3.107109630732192e-06, "loss": 0.84600323, "num_input_tokens_seen": 119243290, "step": 5553, "time_per_iteration": 2.755664110183716 }, { "auxiliary_loss_clip": 0.01115353, "auxiliary_loss_mlp": 0.00774656, "balance_loss_clip": 1.05034745, "balance_loss_mlp": 1.00092673, "epoch": 0.3339245453179017, "flos": 16690669839360.0, "grad_norm": 2.0139615227647343, "language_loss": 0.80920005, "learning_rate": 3.1067852610918017e-06, "loss": 0.82810014, "num_input_tokens_seen": 119261195, "step": 5554, "time_per_iteration": 2.701960563659668 }, { "auxiliary_loss_clip": 0.01127546, "auxiliary_loss_mlp": 0.01043388, "balance_loss_clip": 1.05171227, "balance_loss_mlp": 1.02820015, "epoch": 0.3339846685705697, "flos": 24610624456320.0, "grad_norm": 1.6473304910242343, "language_loss": 0.81187713, "learning_rate": 3.1064608494820032e-06, "loss": 0.83358645, "num_input_tokens_seen": 119282845, "step": 5555, "time_per_iteration": 2.697605609893799 }, { "auxiliary_loss_clip": 0.01120953, "auxiliary_loss_mlp": 0.01039289, "balance_loss_clip": 1.04721272, "balance_loss_mlp": 1.02425706, "epoch": 0.33404479182323765, "flos": 30953812775040.0, "grad_norm": 1.6543240081497628, "language_loss": 0.74369228, "learning_rate": 3.106136395915099e-06, "loss": 0.76529467, "num_input_tokens_seen": 119304430, "step": 5556, "time_per_iteration": 2.7341341972351074 }, { "auxiliary_loss_clip": 0.01124745, "auxiliary_loss_mlp": 0.0103615, "balance_loss_clip": 1.05016208, "balance_loss_mlp": 1.02102232, "epoch": 0.3341049150759056, "flos": 23513301918720.0, "grad_norm": 1.6367363007204896, "language_loss": 0.82058722, "learning_rate": 3.105811900403391e-06, "loss": 0.84219617, "num_input_tokens_seen": 119323830, "step": 5557, "time_per_iteration": 2.6798059940338135 }, { "auxiliary_loss_clip": 0.01115524, "auxiliary_loss_mlp": 0.01038861, "balance_loss_clip": 1.04990697, "balance_loss_mlp": 1.02333987, "epoch": 0.3341650383285736, "flos": 24026824707840.0, "grad_norm": 1.4529426900334401, "language_loss": 0.80220526, "learning_rate": 3.1054873629591855e-06, "loss": 0.82374907, "num_input_tokens_seen": 119346340, "step": 5558, "time_per_iteration": 2.760270118713379 }, { "auxiliary_loss_clip": 0.01108428, "auxiliary_loss_mlp": 0.01040994, "balance_loss_clip": 1.04822016, "balance_loss_mlp": 1.02628982, "epoch": 0.33422516158124155, "flos": 24901967669760.0, "grad_norm": 1.5625296304307381, "language_loss": 0.8137213, "learning_rate": 3.105162783594788e-06, "loss": 0.83521557, "num_input_tokens_seen": 119367285, "step": 5559, "time_per_iteration": 2.7685365676879883 }, { "auxiliary_loss_clip": 0.01096895, "auxiliary_loss_mlp": 0.01042951, "balance_loss_clip": 1.04609013, "balance_loss_mlp": 1.02726293, "epoch": 0.3342852848339095, "flos": 18333403464960.0, "grad_norm": 2.3834321283612003, "language_loss": 0.7164095, "learning_rate": 3.1048381623225074e-06, "loss": 0.73780799, "num_input_tokens_seen": 119385370, "step": 5560, "time_per_iteration": 2.721888780593872 }, { "auxiliary_loss_clip": 0.011201, "auxiliary_loss_mlp": 0.01043409, "balance_loss_clip": 1.05215085, "balance_loss_mlp": 1.02716064, "epoch": 0.3343454080865775, "flos": 30046530119040.0, "grad_norm": 2.1203222418546015, "language_loss": 0.75029516, "learning_rate": 3.1045134991546526e-06, "loss": 0.77193022, "num_input_tokens_seen": 119409150, "step": 5561, "time_per_iteration": 2.8445487022399902 }, { "auxiliary_loss_clip": 0.01115063, "auxiliary_loss_mlp": 0.01036711, "balance_loss_clip": 1.05170679, "balance_loss_mlp": 1.02177453, "epoch": 0.33440553133924544, "flos": 16398823835520.0, "grad_norm": 1.6036143049019338, "language_loss": 0.69467896, "learning_rate": 3.1041887941035355e-06, "loss": 0.71619672, "num_input_tokens_seen": 119426475, "step": 5562, "time_per_iteration": 2.664062023162842 }, { "auxiliary_loss_clip": 0.01125323, "auxiliary_loss_mlp": 0.01042082, "balance_loss_clip": 1.05125499, "balance_loss_mlp": 1.02763367, "epoch": 0.3344656545919134, "flos": 24242072958720.0, "grad_norm": 3.5139835262543504, "language_loss": 0.65094876, "learning_rate": 3.1038640471814685e-06, "loss": 0.67262286, "num_input_tokens_seen": 119446900, "step": 5563, "time_per_iteration": 2.70878529548645 }, { "auxiliary_loss_clip": 0.01078552, "auxiliary_loss_mlp": 0.01045974, "balance_loss_clip": 1.04751515, "balance_loss_mlp": 1.0296303, "epoch": 0.3345257778445814, "flos": 52118843149440.0, "grad_norm": 1.4983314251487456, "language_loss": 0.74106556, "learning_rate": 3.103539258400766e-06, "loss": 0.76231086, "num_input_tokens_seen": 119470945, "step": 5564, "time_per_iteration": 3.0751025676727295 }, { "auxiliary_loss_clip": 0.01035298, "auxiliary_loss_mlp": 0.01009529, "balance_loss_clip": 1.03294694, "balance_loss_mlp": 1.00762165, "epoch": 0.33458590109724934, "flos": 68048602254720.0, "grad_norm": 0.7758359845819034, "language_loss": 0.555296, "learning_rate": 3.103214427773745e-06, "loss": 0.57574433, "num_input_tokens_seen": 119529925, "step": 5565, "time_per_iteration": 3.2246947288513184 }, { "auxiliary_loss_clip": 0.01134316, "auxiliary_loss_mlp": 0.01036162, "balance_loss_clip": 1.05123055, "balance_loss_mlp": 1.02145183, "epoch": 0.3346460243499173, "flos": 37414788768000.0, "grad_norm": 2.332924120890769, "language_loss": 0.65000319, "learning_rate": 3.102889555312721e-06, "loss": 0.67170799, "num_input_tokens_seen": 119550700, "step": 5566, "time_per_iteration": 2.8920817375183105 }, { "auxiliary_loss_clip": 0.01115876, "auxiliary_loss_mlp": 0.0103757, "balance_loss_clip": 1.05134845, "balance_loss_mlp": 1.02252626, "epoch": 0.3347061476025853, "flos": 18697358021760.0, "grad_norm": 2.3005222539878436, "language_loss": 0.77525175, "learning_rate": 3.102564641030016e-06, "loss": 0.79678619, "num_input_tokens_seen": 119569295, "step": 5567, "time_per_iteration": 2.82244610786438 }, { "auxiliary_loss_clip": 0.01112911, "auxiliary_loss_mlp": 0.01037105, "balance_loss_clip": 1.0479182, "balance_loss_mlp": 1.02079725, "epoch": 0.3347662708552533, "flos": 13917827537280.0, "grad_norm": 1.7148039320536435, "language_loss": 0.76432139, "learning_rate": 3.102239684937949e-06, "loss": 0.78582156, "num_input_tokens_seen": 119587375, "step": 5568, "time_per_iteration": 2.689354181289673 }, { "auxiliary_loss_clip": 0.01099358, "auxiliary_loss_mlp": 0.01048314, "balance_loss_clip": 1.04898834, "balance_loss_mlp": 1.03163624, "epoch": 0.33482639410792125, "flos": 19750402068480.0, "grad_norm": 3.260707250765708, "language_loss": 0.70965171, "learning_rate": 3.101914687048842e-06, "loss": 0.73112851, "num_input_tokens_seen": 119604530, "step": 5569, "time_per_iteration": 2.747023344039917 }, { "auxiliary_loss_clip": 0.01099669, "auxiliary_loss_mlp": 0.01034787, "balance_loss_clip": 1.04569411, "balance_loss_mlp": 1.01819277, "epoch": 0.3348865173605892, "flos": 16102991422080.0, "grad_norm": 2.127450904564192, "language_loss": 0.89788258, "learning_rate": 3.10158964737502e-06, "loss": 0.91922712, "num_input_tokens_seen": 119621025, "step": 5570, "time_per_iteration": 2.810328960418701 }, { "auxiliary_loss_clip": 0.01098742, "auxiliary_loss_mlp": 0.01034906, "balance_loss_clip": 1.04593182, "balance_loss_mlp": 1.01970696, "epoch": 0.3349466406132572, "flos": 25008945350400.0, "grad_norm": 2.0196203016458245, "language_loss": 0.79848439, "learning_rate": 3.101264565928808e-06, "loss": 0.81982088, "num_input_tokens_seen": 119641725, "step": 5571, "time_per_iteration": 4.5300047397613525 }, { "auxiliary_loss_clip": 0.01052126, "auxiliary_loss_mlp": 0.00754598, "balance_loss_clip": 1.02251923, "balance_loss_mlp": 1.0014987, "epoch": 0.33500676386592515, "flos": 54319991564160.0, "grad_norm": 0.8956854098175919, "language_loss": 0.5596205, "learning_rate": 3.1009394427225335e-06, "loss": 0.57768774, "num_input_tokens_seen": 119693560, "step": 5572, "time_per_iteration": 3.0931503772735596 }, { "auxiliary_loss_clip": 0.01137277, "auxiliary_loss_mlp": 0.01047626, "balance_loss_clip": 1.05220318, "balance_loss_mlp": 1.03196192, "epoch": 0.3350668871185931, "flos": 26797332625920.0, "grad_norm": 2.019282888464976, "language_loss": 0.78090006, "learning_rate": 3.1006142777685257e-06, "loss": 0.8027491, "num_input_tokens_seen": 119712935, "step": 5573, "time_per_iteration": 2.710340738296509 }, { "auxiliary_loss_clip": 0.01105804, "auxiliary_loss_mlp": 0.01046551, "balance_loss_clip": 1.05004358, "balance_loss_mlp": 1.02974284, "epoch": 0.3351270103712611, "flos": 33510508986240.0, "grad_norm": 3.3664569303363834, "language_loss": 0.7253201, "learning_rate": 3.1002890710791133e-06, "loss": 0.74684364, "num_input_tokens_seen": 119731680, "step": 5574, "time_per_iteration": 4.390132427215576 }, { "auxiliary_loss_clip": 0.01119913, "auxiliary_loss_mlp": 0.01033586, "balance_loss_clip": 1.04622221, "balance_loss_mlp": 1.01882839, "epoch": 0.33518713362392905, "flos": 26506240807680.0, "grad_norm": 1.806126996337021, "language_loss": 0.87605375, "learning_rate": 3.0999638226666287e-06, "loss": 0.89758873, "num_input_tokens_seen": 119752155, "step": 5575, "time_per_iteration": 2.6650984287261963 }, { "auxiliary_loss_clip": 0.01119423, "auxiliary_loss_mlp": 0.01044892, "balance_loss_clip": 1.05073953, "balance_loss_mlp": 1.02783298, "epoch": 0.335247256876597, "flos": 17232345912960.0, "grad_norm": 2.5292682388354404, "language_loss": 0.82834053, "learning_rate": 3.0996385325434063e-06, "loss": 0.84998369, "num_input_tokens_seen": 119769195, "step": 5576, "time_per_iteration": 4.143759727478027 }, { "auxiliary_loss_clip": 0.01126035, "auxiliary_loss_mlp": 0.01042249, "balance_loss_clip": 1.04928613, "balance_loss_mlp": 1.02584612, "epoch": 0.335307380129265, "flos": 25629373992960.0, "grad_norm": 2.62081807641563, "language_loss": 0.72970062, "learning_rate": 3.0993132007217806e-06, "loss": 0.75138342, "num_input_tokens_seen": 119786810, "step": 5577, "time_per_iteration": 4.264250755310059 }, { "auxiliary_loss_clip": 0.01102749, "auxiliary_loss_mlp": 0.01040193, "balance_loss_clip": 1.05250812, "balance_loss_mlp": 1.02409935, "epoch": 0.33536750338193294, "flos": 19680089195520.0, "grad_norm": 2.2461501835528255, "language_loss": 0.8147049, "learning_rate": 3.0989878272140883e-06, "loss": 0.83613431, "num_input_tokens_seen": 119805395, "step": 5578, "time_per_iteration": 2.748187780380249 }, { "auxiliary_loss_clip": 0.01072311, "auxiliary_loss_mlp": 0.0077377, "balance_loss_clip": 1.04737353, "balance_loss_mlp": 1.00086129, "epoch": 0.3354276266346009, "flos": 18332613365760.0, "grad_norm": 2.081067644088489, "language_loss": 0.72135395, "learning_rate": 3.0986624120326676e-06, "loss": 0.73981476, "num_input_tokens_seen": 119823135, "step": 5579, "time_per_iteration": 2.797891616821289 }, { "auxiliary_loss_clip": 0.0108369, "auxiliary_loss_mlp": 0.01042635, "balance_loss_clip": 1.04664183, "balance_loss_mlp": 1.02608919, "epoch": 0.3354877498872689, "flos": 17858556645120.0, "grad_norm": 2.1516301629227255, "language_loss": 0.81264424, "learning_rate": 3.0983369551898573e-06, "loss": 0.83390749, "num_input_tokens_seen": 119842265, "step": 5580, "time_per_iteration": 2.76359224319458 }, { "auxiliary_loss_clip": 0.01112891, "auxiliary_loss_mlp": 0.01034758, "balance_loss_clip": 1.04777932, "balance_loss_mlp": 1.01918936, "epoch": 0.3355478731399369, "flos": 24717745791360.0, "grad_norm": 1.787418199208594, "language_loss": 0.78071463, "learning_rate": 3.0980114566980003e-06, "loss": 0.80219114, "num_input_tokens_seen": 119862500, "step": 5581, "time_per_iteration": 2.6893699169158936 }, { "auxiliary_loss_clip": 0.01102381, "auxiliary_loss_mlp": 0.01044533, "balance_loss_clip": 1.04555583, "balance_loss_mlp": 1.02674723, "epoch": 0.33560799639260486, "flos": 16873886136960.0, "grad_norm": 3.5541134032025528, "language_loss": 0.74734783, "learning_rate": 3.0976859165694384e-06, "loss": 0.76881701, "num_input_tokens_seen": 119880160, "step": 5582, "time_per_iteration": 2.750110149383545 }, { "auxiliary_loss_clip": 0.01109205, "auxiliary_loss_mlp": 0.0104468, "balance_loss_clip": 1.04334664, "balance_loss_mlp": 1.02793145, "epoch": 0.3356681196452728, "flos": 18333511205760.0, "grad_norm": 2.0738327777636574, "language_loss": 0.82039702, "learning_rate": 3.0973603348165166e-06, "loss": 0.84193587, "num_input_tokens_seen": 119899040, "step": 5583, "time_per_iteration": 2.629065990447998 }, { "auxiliary_loss_clip": 0.01113126, "auxiliary_loss_mlp": 0.01047702, "balance_loss_clip": 1.04719925, "balance_loss_mlp": 1.0322051, "epoch": 0.3357282428979408, "flos": 34750612085760.0, "grad_norm": 2.1437775006956814, "language_loss": 0.77524137, "learning_rate": 3.097034711451581e-06, "loss": 0.79684973, "num_input_tokens_seen": 119921120, "step": 5584, "time_per_iteration": 2.9303438663482666 }, { "auxiliary_loss_clip": 0.01115168, "auxiliary_loss_mlp": 0.01043431, "balance_loss_clip": 1.04803944, "balance_loss_mlp": 1.02755225, "epoch": 0.33578836615060875, "flos": 21580087006080.0, "grad_norm": 1.8068970963649096, "language_loss": 0.76473475, "learning_rate": 3.0967090464869795e-06, "loss": 0.78632081, "num_input_tokens_seen": 119940165, "step": 5585, "time_per_iteration": 2.7168867588043213 }, { "auxiliary_loss_clip": 0.01120824, "auxiliary_loss_mlp": 0.01040676, "balance_loss_clip": 1.04579937, "balance_loss_mlp": 1.02442741, "epoch": 0.3358484894032767, "flos": 24530291688960.0, "grad_norm": 1.8490215812193886, "language_loss": 0.77754235, "learning_rate": 3.0963833399350608e-06, "loss": 0.79915732, "num_input_tokens_seen": 119959730, "step": 5586, "time_per_iteration": 2.88452410697937 }, { "auxiliary_loss_clip": 0.01100333, "auxiliary_loss_mlp": 0.01057166, "balance_loss_clip": 1.0484302, "balance_loss_mlp": 1.03673398, "epoch": 0.3359086126559447, "flos": 22455589104000.0, "grad_norm": 1.6698470723885088, "language_loss": 0.810045, "learning_rate": 3.0960575918081756e-06, "loss": 0.8316201, "num_input_tokens_seen": 119979315, "step": 5587, "time_per_iteration": 2.7335522174835205 }, { "auxiliary_loss_clip": 0.01130777, "auxiliary_loss_mlp": 0.01042735, "balance_loss_clip": 1.04809558, "balance_loss_mlp": 1.02837586, "epoch": 0.33596873590861265, "flos": 16543687386240.0, "grad_norm": 1.8626695130182664, "language_loss": 0.67307252, "learning_rate": 3.095731802118677e-06, "loss": 0.69480765, "num_input_tokens_seen": 119996140, "step": 5588, "time_per_iteration": 2.5910611152648926 }, { "auxiliary_loss_clip": 0.01113468, "auxiliary_loss_mlp": 0.00774774, "balance_loss_clip": 1.04702032, "balance_loss_mlp": 1.0007664, "epoch": 0.3360288591612806, "flos": 31175812782720.0, "grad_norm": 2.758181662666948, "language_loss": 0.70459288, "learning_rate": 3.095405970878919e-06, "loss": 0.72347522, "num_input_tokens_seen": 120017720, "step": 5589, "time_per_iteration": 2.7966625690460205 }, { "auxiliary_loss_clip": 0.01110605, "auxiliary_loss_mlp": 0.01046945, "balance_loss_clip": 1.04478765, "balance_loss_mlp": 1.02951634, "epoch": 0.3360889824139486, "flos": 23696913265920.0, "grad_norm": 6.820816752821097, "language_loss": 0.6717155, "learning_rate": 3.0950800981012567e-06, "loss": 0.69329101, "num_input_tokens_seen": 120036335, "step": 5590, "time_per_iteration": 2.804384231567383 }, { "auxiliary_loss_clip": 0.01107091, "auxiliary_loss_mlp": 0.01044113, "balance_loss_clip": 1.05176187, "balance_loss_mlp": 1.02741194, "epoch": 0.33614910566661654, "flos": 19318109886720.0, "grad_norm": 2.108159500929249, "language_loss": 0.731767, "learning_rate": 3.094754183798047e-06, "loss": 0.75327909, "num_input_tokens_seen": 120056120, "step": 5591, "time_per_iteration": 2.7423245906829834 }, { "auxiliary_loss_clip": 0.01132777, "auxiliary_loss_mlp": 0.01043438, "balance_loss_clip": 1.04753232, "balance_loss_mlp": 1.02802432, "epoch": 0.3362092289192845, "flos": 16472261191680.0, "grad_norm": 2.4812698890164238, "language_loss": 0.6978277, "learning_rate": 3.0944282279816493e-06, "loss": 0.71958983, "num_input_tokens_seen": 120073650, "step": 5592, "time_per_iteration": 2.624565362930298 }, { "auxiliary_loss_clip": 0.01109265, "auxiliary_loss_mlp": 0.01035799, "balance_loss_clip": 1.0459764, "balance_loss_mlp": 1.02034986, "epoch": 0.33626935217195253, "flos": 24243581329920.0, "grad_norm": 2.2034044743639676, "language_loss": 0.76362681, "learning_rate": 3.094102230664423e-06, "loss": 0.78507739, "num_input_tokens_seen": 120093260, "step": 5593, "time_per_iteration": 2.7709946632385254 }, { "auxiliary_loss_clip": 0.01100555, "auxiliary_loss_mlp": 0.00775613, "balance_loss_clip": 1.04247713, "balance_loss_mlp": 1.00074506, "epoch": 0.3363294754246205, "flos": 19718765164800.0, "grad_norm": 2.2856177577930876, "language_loss": 0.7229932, "learning_rate": 3.093776191858731e-06, "loss": 0.74175489, "num_input_tokens_seen": 120111830, "step": 5594, "time_per_iteration": 2.7880120277404785 }, { "auxiliary_loss_clip": 0.01079557, "auxiliary_loss_mlp": 0.00778898, "balance_loss_clip": 1.04157269, "balance_loss_mlp": 1.00079668, "epoch": 0.33638959867728846, "flos": 22596286677120.0, "grad_norm": 3.2295215673950293, "language_loss": 0.79940557, "learning_rate": 3.0934501115769363e-06, "loss": 0.81799006, "num_input_tokens_seen": 120130470, "step": 5595, "time_per_iteration": 2.8623924255371094 }, { "auxiliary_loss_clip": 0.01111225, "auxiliary_loss_mlp": 0.01039348, "balance_loss_clip": 1.04694319, "balance_loss_mlp": 1.02456045, "epoch": 0.3364497219299564, "flos": 20994742972800.0, "grad_norm": 3.201033356603963, "language_loss": 0.81473815, "learning_rate": 3.0931239898314037e-06, "loss": 0.83624387, "num_input_tokens_seen": 120150735, "step": 5596, "time_per_iteration": 2.900319814682007 }, { "auxiliary_loss_clip": 0.01113286, "auxiliary_loss_mlp": 0.01044516, "balance_loss_clip": 1.04682481, "balance_loss_mlp": 1.02877986, "epoch": 0.3365098451826244, "flos": 25228610974080.0, "grad_norm": 1.642499178477658, "language_loss": 0.75647599, "learning_rate": 3.0927978266344995e-06, "loss": 0.778054, "num_input_tokens_seen": 120173230, "step": 5597, "time_per_iteration": 2.8402984142303467 }, { "auxiliary_loss_clip": 0.0112326, "auxiliary_loss_mlp": 0.01034747, "balance_loss_clip": 1.04734445, "balance_loss_mlp": 1.01902318, "epoch": 0.33656996843529235, "flos": 24571697091840.0, "grad_norm": 1.910742765655482, "language_loss": 0.78611934, "learning_rate": 3.0924716219985916e-06, "loss": 0.80769938, "num_input_tokens_seen": 120191860, "step": 5598, "time_per_iteration": 2.7380945682525635 }, { "auxiliary_loss_clip": 0.01141013, "auxiliary_loss_mlp": 0.01041333, "balance_loss_clip": 1.04969454, "balance_loss_mlp": 1.0235827, "epoch": 0.3366300916879603, "flos": 44091120752640.0, "grad_norm": 1.511676842650176, "language_loss": 0.6446076, "learning_rate": 3.0921453759360514e-06, "loss": 0.66643113, "num_input_tokens_seen": 120219195, "step": 5599, "time_per_iteration": 2.845017433166504 }, { "auxiliary_loss_clip": 0.01103042, "auxiliary_loss_mlp": 0.01054079, "balance_loss_clip": 1.04571164, "balance_loss_mlp": 1.03408813, "epoch": 0.3366902149406283, "flos": 13879869840000.0, "grad_norm": 3.0475721260430486, "language_loss": 0.8262403, "learning_rate": 3.091819088459249e-06, "loss": 0.84781146, "num_input_tokens_seen": 120232950, "step": 5600, "time_per_iteration": 2.690335512161255 }, { "auxiliary_loss_clip": 0.01128117, "auxiliary_loss_mlp": 0.01045257, "balance_loss_clip": 1.04780042, "balance_loss_mlp": 1.02822232, "epoch": 0.33675033819329625, "flos": 16253098358400.0, "grad_norm": 2.4530209101601037, "language_loss": 0.83457136, "learning_rate": 3.0914927595805573e-06, "loss": 0.856305, "num_input_tokens_seen": 120248865, "step": 5601, "time_per_iteration": 2.760735034942627 }, { "auxiliary_loss_clip": 0.01122256, "auxiliary_loss_mlp": 0.0103673, "balance_loss_clip": 1.04873729, "balance_loss_mlp": 1.02092862, "epoch": 0.3368104614459642, "flos": 17055809544960.0, "grad_norm": 2.1704904083215903, "language_loss": 0.83173311, "learning_rate": 3.0911663893123507e-06, "loss": 0.85332292, "num_input_tokens_seen": 120267820, "step": 5602, "time_per_iteration": 2.6818981170654297 }, { "auxiliary_loss_clip": 0.0113558, "auxiliary_loss_mlp": 0.01053921, "balance_loss_clip": 1.04765427, "balance_loss_mlp": 1.03756535, "epoch": 0.3368705846986322, "flos": 17858628472320.0, "grad_norm": 3.8525391607572477, "language_loss": 0.69046748, "learning_rate": 3.0908399776670048e-06, "loss": 0.71236247, "num_input_tokens_seen": 120286540, "step": 5603, "time_per_iteration": 2.6086158752441406 }, { "auxiliary_loss_clip": 0.01116527, "auxiliary_loss_mlp": 0.01042678, "balance_loss_clip": 1.04876411, "balance_loss_mlp": 1.02617979, "epoch": 0.33693070795130015, "flos": 22929502170240.0, "grad_norm": 1.5388557517073465, "language_loss": 0.83146536, "learning_rate": 3.090513524656898e-06, "loss": 0.85305738, "num_input_tokens_seen": 120307305, "step": 5604, "time_per_iteration": 2.7269375324249268 }, { "auxiliary_loss_clip": 0.01095396, "auxiliary_loss_mlp": 0.01043597, "balance_loss_clip": 1.04384422, "balance_loss_mlp": 1.02708673, "epoch": 0.3369908312039681, "flos": 22017443005440.0, "grad_norm": 1.634462052702842, "language_loss": 0.73473096, "learning_rate": 3.090187030294409e-06, "loss": 0.75612092, "num_input_tokens_seen": 120327845, "step": 5605, "time_per_iteration": 2.712197780609131 }, { "auxiliary_loss_clip": 0.0111786, "auxiliary_loss_mlp": 0.01038834, "balance_loss_clip": 1.04761815, "balance_loss_mlp": 1.02235925, "epoch": 0.33705095445663613, "flos": 11801970944640.0, "grad_norm": 3.8834830456250913, "language_loss": 0.83444858, "learning_rate": 3.089860494591919e-06, "loss": 0.85601556, "num_input_tokens_seen": 120343255, "step": 5606, "time_per_iteration": 2.6680989265441895 }, { "auxiliary_loss_clip": 0.01108557, "auxiliary_loss_mlp": 0.01039061, "balance_loss_clip": 1.04293787, "balance_loss_mlp": 1.02370059, "epoch": 0.3371110777093041, "flos": 25046400257280.0, "grad_norm": 2.0409696956182946, "language_loss": 0.67694759, "learning_rate": 3.089533917561809e-06, "loss": 0.69842374, "num_input_tokens_seen": 120361745, "step": 5607, "time_per_iteration": 2.8172407150268555 }, { "auxiliary_loss_clip": 0.01121964, "auxiliary_loss_mlp": 0.01053243, "balance_loss_clip": 1.04604626, "balance_loss_mlp": 1.03458667, "epoch": 0.33717120096197206, "flos": 26579031719040.0, "grad_norm": 1.9822534609557965, "language_loss": 0.70618403, "learning_rate": 3.089207299216464e-06, "loss": 0.72793615, "num_input_tokens_seen": 120380565, "step": 5608, "time_per_iteration": 2.669027090072632 }, { "auxiliary_loss_clip": 0.01055328, "auxiliary_loss_mlp": 0.01040575, "balance_loss_clip": 1.03931713, "balance_loss_mlp": 1.02449393, "epoch": 0.33723132421464, "flos": 15158541168000.0, "grad_norm": 1.931960515128334, "language_loss": 0.79290974, "learning_rate": 3.088880639568269e-06, "loss": 0.81386876, "num_input_tokens_seen": 120399235, "step": 5609, "time_per_iteration": 2.7859673500061035 }, { "auxiliary_loss_clip": 0.01124996, "auxiliary_loss_mlp": 0.01041459, "balance_loss_clip": 1.04914641, "balance_loss_mlp": 1.02387619, "epoch": 0.337291447467308, "flos": 23436093634560.0, "grad_norm": 1.7580059679361764, "language_loss": 0.82490408, "learning_rate": 3.0885539386296114e-06, "loss": 0.8465687, "num_input_tokens_seen": 120420095, "step": 5610, "time_per_iteration": 4.319208145141602 }, { "auxiliary_loss_clip": 0.01123032, "auxiliary_loss_mlp": 0.0104256, "balance_loss_clip": 1.0486002, "balance_loss_mlp": 1.02448845, "epoch": 0.33735157071997596, "flos": 17238163916160.0, "grad_norm": 2.0228863025134824, "language_loss": 0.82122159, "learning_rate": 3.088227196412879e-06, "loss": 0.84287751, "num_input_tokens_seen": 120437690, "step": 5611, "time_per_iteration": 2.6127841472625732 }, { "auxiliary_loss_clip": 0.01116485, "auxiliary_loss_mlp": 0.01045036, "balance_loss_clip": 1.04920387, "balance_loss_mlp": 1.02683246, "epoch": 0.3374116939726439, "flos": 28257388657920.0, "grad_norm": 2.0856936331065037, "language_loss": 0.79704899, "learning_rate": 3.0879004129304626e-06, "loss": 0.81866419, "num_input_tokens_seen": 120459240, "step": 5612, "time_per_iteration": 2.7237493991851807 }, { "auxiliary_loss_clip": 0.01076712, "auxiliary_loss_mlp": 0.01040315, "balance_loss_clip": 1.04079247, "balance_loss_mlp": 1.02410221, "epoch": 0.3374718172253119, "flos": 35919396731520.0, "grad_norm": 2.390785367991082, "language_loss": 0.70200634, "learning_rate": 3.087573588194753e-06, "loss": 0.7231766, "num_input_tokens_seen": 120481090, "step": 5613, "time_per_iteration": 4.43415379524231 }, { "auxiliary_loss_clip": 0.01118495, "auxiliary_loss_mlp": 0.01037291, "balance_loss_clip": 1.04903054, "balance_loss_mlp": 1.02097178, "epoch": 0.33753194047797985, "flos": 18186672407040.0, "grad_norm": 2.1929626699857585, "language_loss": 0.79407388, "learning_rate": 3.087246722218144e-06, "loss": 0.81563175, "num_input_tokens_seen": 120500045, "step": 5614, "time_per_iteration": 2.6484436988830566 }, { "auxiliary_loss_clip": 0.01105902, "auxiliary_loss_mlp": 0.01046863, "balance_loss_clip": 1.04512811, "balance_loss_mlp": 1.02796841, "epoch": 0.3375920637306478, "flos": 23148916398720.0, "grad_norm": 1.967540834348034, "language_loss": 0.91201901, "learning_rate": 3.086919815013031e-06, "loss": 0.93354666, "num_input_tokens_seen": 120521125, "step": 5615, "time_per_iteration": 4.486853361129761 }, { "auxiliary_loss_clip": 0.01119294, "auxiliary_loss_mlp": 0.01042109, "balance_loss_clip": 1.04542458, "balance_loss_mlp": 1.0265168, "epoch": 0.3376521869833158, "flos": 23112215677440.0, "grad_norm": 2.688104519924193, "language_loss": 0.80865037, "learning_rate": 3.086592866591809e-06, "loss": 0.83026439, "num_input_tokens_seen": 120539180, "step": 5616, "time_per_iteration": 2.693419933319092 }, { "auxiliary_loss_clip": 0.01132102, "auxiliary_loss_mlp": 0.00776249, "balance_loss_clip": 1.04987526, "balance_loss_mlp": 1.00074387, "epoch": 0.33771231023598375, "flos": 19274585581440.0, "grad_norm": 5.641479508637021, "language_loss": 0.83967853, "learning_rate": 3.0862658769668774e-06, "loss": 0.85876203, "num_input_tokens_seen": 120556280, "step": 5617, "time_per_iteration": 4.261611461639404 }, { "auxiliary_loss_clip": 0.01065047, "auxiliary_loss_mlp": 0.01048039, "balance_loss_clip": 1.0423851, "balance_loss_mlp": 1.030074, "epoch": 0.3377724334886517, "flos": 18150187167360.0, "grad_norm": 2.2609860925126117, "language_loss": 0.80159199, "learning_rate": 3.0859388461506343e-06, "loss": 0.82272285, "num_input_tokens_seen": 120575395, "step": 5618, "time_per_iteration": 2.8115389347076416 }, { "auxiliary_loss_clip": 0.01092947, "auxiliary_loss_mlp": 0.01037796, "balance_loss_clip": 1.04605365, "balance_loss_mlp": 1.02121353, "epoch": 0.3378325567413197, "flos": 25775997310080.0, "grad_norm": 1.9598490702889584, "language_loss": 0.7111814, "learning_rate": 3.085611774155481e-06, "loss": 0.73248887, "num_input_tokens_seen": 120596075, "step": 5619, "time_per_iteration": 2.86958909034729 }, { "auxiliary_loss_clip": 0.01116213, "auxiliary_loss_mlp": 0.01047745, "balance_loss_clip": 1.04749656, "balance_loss_mlp": 1.03167593, "epoch": 0.3378926799939877, "flos": 21317112558720.0, "grad_norm": 2.630730252639156, "language_loss": 0.70144761, "learning_rate": 3.085284660993821e-06, "loss": 0.72308713, "num_input_tokens_seen": 120614195, "step": 5620, "time_per_iteration": 2.6953368186950684 }, { "auxiliary_loss_clip": 0.01136416, "auxiliary_loss_mlp": 0.01047216, "balance_loss_clip": 1.05076015, "balance_loss_mlp": 1.03201699, "epoch": 0.33795280324665566, "flos": 24900028335360.0, "grad_norm": 1.8373178803043773, "language_loss": 0.67899036, "learning_rate": 3.084957506678058e-06, "loss": 0.70082676, "num_input_tokens_seen": 120634475, "step": 5621, "time_per_iteration": 2.6531872749328613 }, { "auxiliary_loss_clip": 0.0110792, "auxiliary_loss_mlp": 0.01044445, "balance_loss_clip": 1.04716897, "balance_loss_mlp": 1.02814865, "epoch": 0.33801292649932363, "flos": 24753943722240.0, "grad_norm": 1.7693089540657438, "language_loss": 0.82862681, "learning_rate": 3.0846303112205975e-06, "loss": 0.85015041, "num_input_tokens_seen": 120654980, "step": 5622, "time_per_iteration": 2.7764267921447754 }, { "auxiliary_loss_clip": 0.01097036, "auxiliary_loss_mlp": 0.01041227, "balance_loss_clip": 1.043239, "balance_loss_mlp": 1.02565813, "epoch": 0.3380730497519916, "flos": 26723967096960.0, "grad_norm": 7.015051283901371, "language_loss": 0.73815429, "learning_rate": 3.0843030746338464e-06, "loss": 0.75953692, "num_input_tokens_seen": 120676245, "step": 5623, "time_per_iteration": 2.7962961196899414 }, { "auxiliary_loss_clip": 0.0104645, "auxiliary_loss_mlp": 0.01031816, "balance_loss_clip": 1.03514934, "balance_loss_mlp": 1.0298605, "epoch": 0.33813317300465956, "flos": 70035756416640.0, "grad_norm": 0.757644747116446, "language_loss": 0.55002284, "learning_rate": 3.083975796930215e-06, "loss": 0.57080543, "num_input_tokens_seen": 120741965, "step": 5624, "time_per_iteration": 3.3495559692382812 }, { "auxiliary_loss_clip": 0.01091887, "auxiliary_loss_mlp": 0.01055525, "balance_loss_clip": 1.04508519, "balance_loss_mlp": 1.03704786, "epoch": 0.3381932962573275, "flos": 24097317148800.0, "grad_norm": 3.1490866232839876, "language_loss": 0.73299229, "learning_rate": 3.083648478122111e-06, "loss": 0.75446641, "num_input_tokens_seen": 120760410, "step": 5625, "time_per_iteration": 2.7474253177642822 }, { "auxiliary_loss_clip": 0.01127839, "auxiliary_loss_mlp": 0.01045252, "balance_loss_clip": 1.04838002, "balance_loss_mlp": 1.02828884, "epoch": 0.3382534195099955, "flos": 19278248768640.0, "grad_norm": 5.828984180477566, "language_loss": 0.70578009, "learning_rate": 3.0833211182219497e-06, "loss": 0.72751105, "num_input_tokens_seen": 120777705, "step": 5626, "time_per_iteration": 2.6597115993499756 }, { "auxiliary_loss_clip": 0.01108172, "auxiliary_loss_mlp": 0.01041744, "balance_loss_clip": 1.04509664, "balance_loss_mlp": 1.02605569, "epoch": 0.33831354276266346, "flos": 25226240676480.0, "grad_norm": 3.2927176036830574, "language_loss": 0.80853224, "learning_rate": 3.0829937172421425e-06, "loss": 0.83003139, "num_input_tokens_seen": 120798660, "step": 5627, "time_per_iteration": 2.730774402618408 }, { "auxiliary_loss_clip": 0.01131612, "auxiliary_loss_mlp": 0.0077564, "balance_loss_clip": 1.05286694, "balance_loss_mlp": 1.00064421, "epoch": 0.3383736660153314, "flos": 23112000195840.0, "grad_norm": 2.306116347111899, "language_loss": 0.80454439, "learning_rate": 3.0826662751951055e-06, "loss": 0.82361686, "num_input_tokens_seen": 120816705, "step": 5628, "time_per_iteration": 2.691471576690674 }, { "auxiliary_loss_clip": 0.01080566, "auxiliary_loss_mlp": 0.01046147, "balance_loss_clip": 1.04250276, "balance_loss_mlp": 1.02787185, "epoch": 0.3384337892679994, "flos": 23477139901440.0, "grad_norm": 3.64262689820424, "language_loss": 0.77174091, "learning_rate": 3.082338792093254e-06, "loss": 0.79300809, "num_input_tokens_seen": 120835375, "step": 5629, "time_per_iteration": 2.7564992904663086 }, { "auxiliary_loss_clip": 0.01116368, "auxiliary_loss_mlp": 0.01046104, "balance_loss_clip": 1.04699719, "balance_loss_mlp": 1.02819836, "epoch": 0.33849391252066735, "flos": 19425805839360.0, "grad_norm": 4.669184863549949, "language_loss": 0.84738326, "learning_rate": 3.0820112679490074e-06, "loss": 0.86900795, "num_input_tokens_seen": 120854260, "step": 5630, "time_per_iteration": 2.7284910678863525 }, { "auxiliary_loss_clip": 0.0108732, "auxiliary_loss_mlp": 0.01055965, "balance_loss_clip": 1.04692125, "balance_loss_mlp": 1.03889382, "epoch": 0.3385540357733353, "flos": 21064840364160.0, "grad_norm": 2.0951078731071204, "language_loss": 0.71627271, "learning_rate": 3.0816837027747857e-06, "loss": 0.73770559, "num_input_tokens_seen": 120871590, "step": 5631, "time_per_iteration": 2.7423501014709473 }, { "auxiliary_loss_clip": 0.01036653, "auxiliary_loss_mlp": 0.01008716, "balance_loss_clip": 1.02691352, "balance_loss_mlp": 1.00683236, "epoch": 0.3386141590260033, "flos": 69208013450880.0, "grad_norm": 0.8383263502294551, "language_loss": 0.56103444, "learning_rate": 3.0813560965830084e-06, "loss": 0.58148813, "num_input_tokens_seen": 120925550, "step": 5632, "time_per_iteration": 3.24780535697937 }, { "auxiliary_loss_clip": 0.01122742, "auxiliary_loss_mlp": 0.01038822, "balance_loss_clip": 1.05064476, "balance_loss_mlp": 1.02198935, "epoch": 0.3386742822786713, "flos": 25519487310720.0, "grad_norm": 1.5341010429525646, "language_loss": 0.80410492, "learning_rate": 3.0810284493861005e-06, "loss": 0.82572055, "num_input_tokens_seen": 120947620, "step": 5633, "time_per_iteration": 2.6492738723754883 }, { "auxiliary_loss_clip": 0.01099799, "auxiliary_loss_mlp": 0.01044702, "balance_loss_clip": 1.04435778, "balance_loss_mlp": 1.02854943, "epoch": 0.33873440553133927, "flos": 23623116773760.0, "grad_norm": 2.1401050060877997, "language_loss": 0.59013391, "learning_rate": 3.0807007611964855e-06, "loss": 0.61157894, "num_input_tokens_seen": 120965205, "step": 5634, "time_per_iteration": 2.7261369228363037 }, { "auxiliary_loss_clip": 0.01106157, "auxiliary_loss_mlp": 0.01040516, "balance_loss_clip": 1.04877985, "balance_loss_mlp": 1.02482784, "epoch": 0.33879452878400723, "flos": 17088882992640.0, "grad_norm": 1.8243057386875807, "language_loss": 0.92440355, "learning_rate": 3.080373032026589e-06, "loss": 0.94587028, "num_input_tokens_seen": 120983560, "step": 5635, "time_per_iteration": 2.627788782119751 }, { "auxiliary_loss_clip": 0.01091476, "auxiliary_loss_mlp": 0.01039192, "balance_loss_clip": 1.05005646, "balance_loss_mlp": 1.02288401, "epoch": 0.3388546520366752, "flos": 15742053607680.0, "grad_norm": 2.00681285666687, "language_loss": 0.75539577, "learning_rate": 3.0800452618888386e-06, "loss": 0.7767024, "num_input_tokens_seen": 121001400, "step": 5636, "time_per_iteration": 2.706772565841675 }, { "auxiliary_loss_clip": 0.0112617, "auxiliary_loss_mlp": 0.01044921, "balance_loss_clip": 1.05089188, "balance_loss_mlp": 1.02866137, "epoch": 0.33891477528934316, "flos": 22418744728320.0, "grad_norm": 1.7127540900641318, "language_loss": 0.83448696, "learning_rate": 3.0797174507956637e-06, "loss": 0.85619783, "num_input_tokens_seen": 121021760, "step": 5637, "time_per_iteration": 2.6864166259765625 }, { "auxiliary_loss_clip": 0.0109052, "auxiliary_loss_mlp": 0.01051499, "balance_loss_clip": 1.04899251, "balance_loss_mlp": 1.03193665, "epoch": 0.3389748985420111, "flos": 17274828723840.0, "grad_norm": 1.650296659926583, "language_loss": 0.70123053, "learning_rate": 3.079389598759495e-06, "loss": 0.72265071, "num_input_tokens_seen": 121041070, "step": 5638, "time_per_iteration": 2.7513418197631836 }, { "auxiliary_loss_clip": 0.01107421, "auxiliary_loss_mlp": 0.01049541, "balance_loss_clip": 1.0486834, "balance_loss_mlp": 1.0325892, "epoch": 0.3390350217946791, "flos": 27744979190400.0, "grad_norm": 3.471125425253904, "language_loss": 0.80819786, "learning_rate": 3.079061705792765e-06, "loss": 0.82976747, "num_input_tokens_seen": 121060890, "step": 5639, "time_per_iteration": 2.8025810718536377 }, { "auxiliary_loss_clip": 0.01143398, "auxiliary_loss_mlp": 0.01048836, "balance_loss_clip": 1.0533762, "balance_loss_mlp": 1.03158689, "epoch": 0.33909514504734706, "flos": 20339804338560.0, "grad_norm": 8.162571098362656, "language_loss": 0.67619336, "learning_rate": 3.078733771907907e-06, "loss": 0.69811565, "num_input_tokens_seen": 121079135, "step": 5640, "time_per_iteration": 2.662127733230591 }, { "auxiliary_loss_clip": 0.01114186, "auxiliary_loss_mlp": 0.01038526, "balance_loss_clip": 1.04930854, "balance_loss_mlp": 1.02196789, "epoch": 0.339155268300015, "flos": 14830030356480.0, "grad_norm": 1.6687164879604648, "language_loss": 0.69589841, "learning_rate": 3.0784057971173554e-06, "loss": 0.71742553, "num_input_tokens_seen": 121097685, "step": 5641, "time_per_iteration": 2.6596109867095947 }, { "auxiliary_loss_clip": 0.01142481, "auxiliary_loss_mlp": 0.0104296, "balance_loss_clip": 1.05451512, "balance_loss_mlp": 1.02698565, "epoch": 0.339215391552683, "flos": 26067951054720.0, "grad_norm": 2.4357287647671266, "language_loss": 0.87591994, "learning_rate": 3.0780777814335483e-06, "loss": 0.89777428, "num_input_tokens_seen": 121115640, "step": 5642, "time_per_iteration": 2.6347198486328125 }, { "auxiliary_loss_clip": 0.01117312, "auxiliary_loss_mlp": 0.01034931, "balance_loss_clip": 1.04759669, "balance_loss_mlp": 1.02112639, "epoch": 0.33927551480535095, "flos": 14574705505920.0, "grad_norm": 1.860184080586481, "language_loss": 0.83900917, "learning_rate": 3.077749724868924e-06, "loss": 0.86053157, "num_input_tokens_seen": 121132485, "step": 5643, "time_per_iteration": 2.678086042404175 }, { "auxiliary_loss_clip": 0.01107188, "auxiliary_loss_mlp": 0.01049417, "balance_loss_clip": 1.04616475, "balance_loss_mlp": 1.03295422, "epoch": 0.3393356380580189, "flos": 23805578885760.0, "grad_norm": 4.293096130940915, "language_loss": 0.76897138, "learning_rate": 3.077421627435922e-06, "loss": 0.79053748, "num_input_tokens_seen": 121152935, "step": 5644, "time_per_iteration": 2.6681976318359375 }, { "auxiliary_loss_clip": 0.01123, "auxiliary_loss_mlp": 0.01046638, "balance_loss_clip": 1.05055666, "balance_loss_mlp": 1.02978194, "epoch": 0.3393957613106869, "flos": 17347871030400.0, "grad_norm": 8.889141309374795, "language_loss": 0.62855232, "learning_rate": 3.0770934891469832e-06, "loss": 0.65024871, "num_input_tokens_seen": 121169835, "step": 5645, "time_per_iteration": 2.5976576805114746 }, { "auxiliary_loss_clip": 0.01123901, "auxiliary_loss_mlp": 0.01042398, "balance_loss_clip": 1.04963613, "balance_loss_mlp": 1.0272944, "epoch": 0.3394558845633549, "flos": 28433960939520.0, "grad_norm": 1.8158202042065192, "language_loss": 0.76223624, "learning_rate": 3.076765310014552e-06, "loss": 0.78389925, "num_input_tokens_seen": 121190290, "step": 5646, "time_per_iteration": 2.674058437347412 }, { "auxiliary_loss_clip": 0.01128511, "auxiliary_loss_mlp": 0.01049927, "balance_loss_clip": 1.05314088, "balance_loss_mlp": 1.03245091, "epoch": 0.33951600781602287, "flos": 22086929865600.0, "grad_norm": 2.6597837481337256, "language_loss": 0.78888249, "learning_rate": 3.0764370900510727e-06, "loss": 0.81066692, "num_input_tokens_seen": 121209060, "step": 5647, "time_per_iteration": 2.636462688446045 }, { "auxiliary_loss_clip": 0.01113432, "auxiliary_loss_mlp": 0.0077397, "balance_loss_clip": 1.05254745, "balance_loss_mlp": 1.00053275, "epoch": 0.33957613106869083, "flos": 23878262056320.0, "grad_norm": 2.0563114900155037, "language_loss": 0.77694631, "learning_rate": 3.0761088292689904e-06, "loss": 0.7958203, "num_input_tokens_seen": 121227480, "step": 5648, "time_per_iteration": 2.704535484313965 }, { "auxiliary_loss_clip": 0.00999132, "auxiliary_loss_mlp": 0.01023587, "balance_loss_clip": 1.03748918, "balance_loss_mlp": 1.02168012, "epoch": 0.3396362543213588, "flos": 71242642414080.0, "grad_norm": 0.7822172669689142, "language_loss": 0.56281364, "learning_rate": 3.075780527680754e-06, "loss": 0.58304083, "num_input_tokens_seen": 121291305, "step": 5649, "time_per_iteration": 3.6428561210632324 }, { "auxiliary_loss_clip": 0.01109513, "auxiliary_loss_mlp": 0.00776659, "balance_loss_clip": 1.04886901, "balance_loss_mlp": 1.00053644, "epoch": 0.33969637757402676, "flos": 25921615046400.0, "grad_norm": 1.4990429944851429, "language_loss": 0.85522908, "learning_rate": 3.0754521852988117e-06, "loss": 0.87409085, "num_input_tokens_seen": 121312740, "step": 5650, "time_per_iteration": 4.6250996589660645 }, { "auxiliary_loss_clip": 0.01125063, "auxiliary_loss_mlp": 0.01029114, "balance_loss_clip": 1.04845572, "balance_loss_mlp": 1.01392674, "epoch": 0.33975650082669473, "flos": 35261728663680.0, "grad_norm": 1.7009103293103713, "language_loss": 0.70462626, "learning_rate": 3.0751238021356152e-06, "loss": 0.7261681, "num_input_tokens_seen": 121334220, "step": 5651, "time_per_iteration": 3.0873425006866455 }, { "auxiliary_loss_clip": 0.01088353, "auxiliary_loss_mlp": 0.01041459, "balance_loss_clip": 1.04718101, "balance_loss_mlp": 1.02539587, "epoch": 0.3398166240793627, "flos": 16647001879680.0, "grad_norm": 2.657059560006321, "language_loss": 0.80932343, "learning_rate": 3.074795378203616e-06, "loss": 0.83062148, "num_input_tokens_seen": 121351870, "step": 5652, "time_per_iteration": 2.957105875015259 }, { "auxiliary_loss_clip": 0.01143187, "auxiliary_loss_mlp": 0.0104477, "balance_loss_clip": 1.05543184, "balance_loss_mlp": 1.0275445, "epoch": 0.33987674733203066, "flos": 24062196625920.0, "grad_norm": 2.181969038816262, "language_loss": 0.76847494, "learning_rate": 3.0744669135152685e-06, "loss": 0.79035449, "num_input_tokens_seen": 121373400, "step": 5653, "time_per_iteration": 4.277743816375732 }, { "auxiliary_loss_clip": 0.01117346, "auxiliary_loss_mlp": 0.01041107, "balance_loss_clip": 1.04708898, "balance_loss_mlp": 1.02475142, "epoch": 0.3399368705846986, "flos": 13250678279040.0, "grad_norm": 2.9108557214850217, "language_loss": 0.85412633, "learning_rate": 3.0741384080830278e-06, "loss": 0.8757109, "num_input_tokens_seen": 121385225, "step": 5654, "time_per_iteration": 4.243285179138184 }, { "auxiliary_loss_clip": 0.01118111, "auxiliary_loss_mlp": 0.01041226, "balance_loss_clip": 1.04521537, "balance_loss_mlp": 1.02490664, "epoch": 0.3399969938373666, "flos": 27012832272000.0, "grad_norm": 5.5024852924346765, "language_loss": 0.64919531, "learning_rate": 3.073809861919351e-06, "loss": 0.67078876, "num_input_tokens_seen": 121404735, "step": 5655, "time_per_iteration": 2.793121576309204 }, { "auxiliary_loss_clip": 0.01129599, "auxiliary_loss_mlp": 0.01043607, "balance_loss_clip": 1.05404055, "balance_loss_mlp": 1.02828872, "epoch": 0.34005711709003456, "flos": 28550096588160.0, "grad_norm": 1.7231624830718477, "language_loss": 0.7624622, "learning_rate": 3.073481275036697e-06, "loss": 0.78419423, "num_input_tokens_seen": 121426780, "step": 5656, "time_per_iteration": 2.739227056503296 }, { "auxiliary_loss_clip": 0.01102847, "auxiliary_loss_mlp": 0.01040319, "balance_loss_clip": 1.0458467, "balance_loss_mlp": 1.02364159, "epoch": 0.3401172403427025, "flos": 21617003208960.0, "grad_norm": 8.964185236965056, "language_loss": 0.82842731, "learning_rate": 3.073152647447525e-06, "loss": 0.849859, "num_input_tokens_seen": 121447245, "step": 5657, "time_per_iteration": 5.179774761199951 }, { "auxiliary_loss_clip": 0.01113742, "auxiliary_loss_mlp": 0.01048481, "balance_loss_clip": 1.05169284, "balance_loss_mlp": 1.03313899, "epoch": 0.3401773635953705, "flos": 25885776251520.0, "grad_norm": 1.8385093437954252, "language_loss": 0.85050905, "learning_rate": 3.0728239791642976e-06, "loss": 0.87213123, "num_input_tokens_seen": 121468165, "step": 5658, "time_per_iteration": 2.776137351989746 }, { "auxiliary_loss_clip": 0.01053106, "auxiliary_loss_mlp": 0.01016184, "balance_loss_clip": 1.03449082, "balance_loss_mlp": 1.01424086, "epoch": 0.3402374868480385, "flos": 65507995336320.0, "grad_norm": 0.825209949556337, "language_loss": 0.59988189, "learning_rate": 3.072495270199477e-06, "loss": 0.62057471, "num_input_tokens_seen": 121523795, "step": 5659, "time_per_iteration": 3.272684335708618 }, { "auxiliary_loss_clip": 0.01137862, "auxiliary_loss_mlp": 0.01036085, "balance_loss_clip": 1.05531621, "balance_loss_mlp": 1.02102888, "epoch": 0.34029761010070647, "flos": 24060580513920.0, "grad_norm": 2.521681543348545, "language_loss": 0.67763948, "learning_rate": 3.0721665205655284e-06, "loss": 0.69937897, "num_input_tokens_seen": 121542950, "step": 5660, "time_per_iteration": 2.699267864227295 }, { "auxiliary_loss_clip": 0.01142235, "auxiliary_loss_mlp": 0.010443, "balance_loss_clip": 1.05695057, "balance_loss_mlp": 1.02787328, "epoch": 0.34035773335337444, "flos": 27599720590080.0, "grad_norm": 1.9299535220965447, "language_loss": 0.67668259, "learning_rate": 3.071837730274918e-06, "loss": 0.69854796, "num_input_tokens_seen": 121562765, "step": 5661, "time_per_iteration": 2.647101402282715 }, { "auxiliary_loss_clip": 0.01119112, "auxiliary_loss_mlp": 0.01041902, "balance_loss_clip": 1.05479288, "balance_loss_mlp": 1.02634561, "epoch": 0.3404178566060424, "flos": 20812783651200.0, "grad_norm": 2.0521689983251954, "language_loss": 0.78806192, "learning_rate": 3.071508899340113e-06, "loss": 0.80967206, "num_input_tokens_seen": 121581610, "step": 5662, "time_per_iteration": 2.847168207168579 }, { "auxiliary_loss_clip": 0.01103563, "auxiliary_loss_mlp": 0.01041962, "balance_loss_clip": 1.05163002, "balance_loss_mlp": 1.02498698, "epoch": 0.34047797985871037, "flos": 26833566470400.0, "grad_norm": 2.226848836482441, "language_loss": 0.73531127, "learning_rate": 3.0711800277735833e-06, "loss": 0.75676656, "num_input_tokens_seen": 121601885, "step": 5663, "time_per_iteration": 2.8581340312957764 }, { "auxiliary_loss_clip": 0.01090462, "auxiliary_loss_mlp": 0.01035271, "balance_loss_clip": 1.04631042, "balance_loss_mlp": 1.02079868, "epoch": 0.34053810311137833, "flos": 19682639061120.0, "grad_norm": 1.7108226041633658, "language_loss": 0.86297357, "learning_rate": 3.0708511155877997e-06, "loss": 0.88423085, "num_input_tokens_seen": 121621335, "step": 5664, "time_per_iteration": 2.778038501739502 }, { "auxiliary_loss_clip": 0.01139377, "auxiliary_loss_mlp": 0.0103938, "balance_loss_clip": 1.05399597, "balance_loss_mlp": 1.0245564, "epoch": 0.3405982263640463, "flos": 21725740656000.0, "grad_norm": 2.2398696420560675, "language_loss": 0.68712831, "learning_rate": 3.070522162795235e-06, "loss": 0.70891583, "num_input_tokens_seen": 121641310, "step": 5665, "time_per_iteration": 2.688643217086792 }, { "auxiliary_loss_clip": 0.01138662, "auxiliary_loss_mlp": 0.01039766, "balance_loss_clip": 1.05278993, "balance_loss_mlp": 1.0229218, "epoch": 0.34065834961671426, "flos": 18041629288320.0, "grad_norm": 2.716291820837314, "language_loss": 0.73084486, "learning_rate": 3.0701931694083626e-06, "loss": 0.7526291, "num_input_tokens_seen": 121659625, "step": 5666, "time_per_iteration": 2.7325544357299805 }, { "auxiliary_loss_clip": 0.01128915, "auxiliary_loss_mlp": 0.01039671, "balance_loss_clip": 1.05135012, "balance_loss_mlp": 1.0244832, "epoch": 0.3407184728693822, "flos": 21397337585280.0, "grad_norm": 2.363121461769924, "language_loss": 0.72947341, "learning_rate": 3.0698641354396576e-06, "loss": 0.75115931, "num_input_tokens_seen": 121679205, "step": 5667, "time_per_iteration": 2.7143874168395996 }, { "auxiliary_loss_clip": 0.01042137, "auxiliary_loss_mlp": 0.01008076, "balance_loss_clip": 1.02401757, "balance_loss_mlp": 1.00638342, "epoch": 0.3407785961220502, "flos": 68688101018880.0, "grad_norm": 0.8313790259289849, "language_loss": 0.63259363, "learning_rate": 3.069535060901597e-06, "loss": 0.65309572, "num_input_tokens_seen": 121751085, "step": 5668, "time_per_iteration": 3.3907217979431152 }, { "auxiliary_loss_clip": 0.01036989, "auxiliary_loss_mlp": 0.01045108, "balance_loss_clip": 1.03961444, "balance_loss_mlp": 1.02808475, "epoch": 0.34083871937471816, "flos": 14064379027200.0, "grad_norm": 2.2447075161594365, "language_loss": 0.71795446, "learning_rate": 3.0692059458066596e-06, "loss": 0.73877549, "num_input_tokens_seen": 121768565, "step": 5669, "time_per_iteration": 2.941349983215332 }, { "auxiliary_loss_clip": 0.0110323, "auxiliary_loss_mlp": 0.00773367, "balance_loss_clip": 1.04966998, "balance_loss_mlp": 1.00054646, "epoch": 0.3408988426273861, "flos": 17085435287040.0, "grad_norm": 1.973306725053756, "language_loss": 0.80678529, "learning_rate": 3.0688767901673265e-06, "loss": 0.82555127, "num_input_tokens_seen": 121784925, "step": 5670, "time_per_iteration": 2.8877930641174316 }, { "auxiliary_loss_clip": 0.01088488, "auxiliary_loss_mlp": 0.01037182, "balance_loss_clip": 1.04484558, "balance_loss_mlp": 1.02111244, "epoch": 0.3409589658800541, "flos": 24024562151040.0, "grad_norm": 1.926244069219147, "language_loss": 0.77521646, "learning_rate": 3.068547593996078e-06, "loss": 0.79647315, "num_input_tokens_seen": 121804425, "step": 5671, "time_per_iteration": 2.886425256729126 }, { "auxiliary_loss_clip": 0.01138739, "auxiliary_loss_mlp": 0.0077388, "balance_loss_clip": 1.05301285, "balance_loss_mlp": 1.00052333, "epoch": 0.34101908913272205, "flos": 21142012734720.0, "grad_norm": 3.7152219569219427, "language_loss": 0.74220848, "learning_rate": 3.0682183573053974e-06, "loss": 0.76133466, "num_input_tokens_seen": 121825145, "step": 5672, "time_per_iteration": 2.751692056655884 }, { "auxiliary_loss_clip": 0.01121109, "auxiliary_loss_mlp": 0.01047405, "balance_loss_clip": 1.04886246, "balance_loss_mlp": 1.03089476, "epoch": 0.3410792123853901, "flos": 15702012921600.0, "grad_norm": 1.8011032028958165, "language_loss": 0.73721337, "learning_rate": 3.06788908010777e-06, "loss": 0.7588985, "num_input_tokens_seen": 121842185, "step": 5673, "time_per_iteration": 2.6628050804138184 }, { "auxiliary_loss_clip": 0.01126244, "auxiliary_loss_mlp": 0.01038975, "balance_loss_clip": 1.05143654, "balance_loss_mlp": 1.02362132, "epoch": 0.34113933563805804, "flos": 23036012974080.0, "grad_norm": 1.7591090628800392, "language_loss": 0.79972708, "learning_rate": 3.067559762415682e-06, "loss": 0.8213793, "num_input_tokens_seen": 121862260, "step": 5674, "time_per_iteration": 2.6803476810455322 }, { "auxiliary_loss_clip": 0.01054856, "auxiliary_loss_mlp": 0.01001466, "balance_loss_clip": 1.0258925, "balance_loss_mlp": 0.9994635, "epoch": 0.341199458890726, "flos": 69614235336960.0, "grad_norm": 0.7875282266281167, "language_loss": 0.56080592, "learning_rate": 3.0672304042416198e-06, "loss": 0.5813691, "num_input_tokens_seen": 121923560, "step": 5675, "time_per_iteration": 3.3068313598632812 }, { "auxiliary_loss_clip": 0.01115956, "auxiliary_loss_mlp": 0.00773448, "balance_loss_clip": 1.052145, "balance_loss_mlp": 1.0006851, "epoch": 0.34125958214339397, "flos": 22346348866560.0, "grad_norm": 1.6444328441844458, "language_loss": 0.78795338, "learning_rate": 3.0669010055980734e-06, "loss": 0.80684733, "num_input_tokens_seen": 121943515, "step": 5676, "time_per_iteration": 2.7983739376068115 }, { "auxiliary_loss_clip": 0.01120251, "auxiliary_loss_mlp": 0.01036846, "balance_loss_clip": 1.04593658, "balance_loss_mlp": 1.02024043, "epoch": 0.34131970539606193, "flos": 21871933009920.0, "grad_norm": 1.8897537275348075, "language_loss": 0.85468972, "learning_rate": 3.0665715664975357e-06, "loss": 0.8762607, "num_input_tokens_seen": 121962540, "step": 5677, "time_per_iteration": 2.698751449584961 }, { "auxiliary_loss_clip": 0.01109896, "auxiliary_loss_mlp": 0.01042182, "balance_loss_clip": 1.04772925, "balance_loss_mlp": 1.02586842, "epoch": 0.3413798286487299, "flos": 24935723475840.0, "grad_norm": 1.7514589696636707, "language_loss": 0.79352021, "learning_rate": 3.0662420869524966e-06, "loss": 0.81504107, "num_input_tokens_seen": 121979830, "step": 5678, "time_per_iteration": 2.731834650039673 }, { "auxiliary_loss_clip": 0.01123477, "auxiliary_loss_mlp": 0.01033453, "balance_loss_clip": 1.04799783, "balance_loss_mlp": 1.01833677, "epoch": 0.34143995190139786, "flos": 25374372364800.0, "grad_norm": 1.8765190883227818, "language_loss": 0.74821675, "learning_rate": 3.0659125669754506e-06, "loss": 0.76978606, "num_input_tokens_seen": 121999055, "step": 5679, "time_per_iteration": 2.7362489700317383 }, { "auxiliary_loss_clip": 0.01044772, "auxiliary_loss_mlp": 0.01004164, "balance_loss_clip": 1.02617037, "balance_loss_mlp": 1.00210214, "epoch": 0.34150007515406583, "flos": 67782578129280.0, "grad_norm": 0.716476818724812, "language_loss": 0.59445524, "learning_rate": 3.0655830065788923e-06, "loss": 0.61494464, "num_input_tokens_seen": 122067015, "step": 5680, "time_per_iteration": 3.241750955581665 }, { "auxiliary_loss_clip": 0.01108333, "auxiliary_loss_mlp": 0.01032851, "balance_loss_clip": 1.04563892, "balance_loss_mlp": 1.01804543, "epoch": 0.3415601984067338, "flos": 20302421258880.0, "grad_norm": 1.760771174406363, "language_loss": 0.72054088, "learning_rate": 3.0652534057753206e-06, "loss": 0.74195278, "num_input_tokens_seen": 122085295, "step": 5681, "time_per_iteration": 2.7306556701660156 }, { "auxiliary_loss_clip": 0.01109003, "auxiliary_loss_mlp": 0.0104301, "balance_loss_clip": 1.0462265, "balance_loss_mlp": 1.02786994, "epoch": 0.34162032165940176, "flos": 26031178506240.0, "grad_norm": 2.2327180896030443, "language_loss": 0.71463466, "learning_rate": 3.064923764577233e-06, "loss": 0.73615474, "num_input_tokens_seen": 122104020, "step": 5682, "time_per_iteration": 2.825296640396118 }, { "auxiliary_loss_clip": 0.01132395, "auxiliary_loss_mlp": 0.0104079, "balance_loss_clip": 1.04721618, "balance_loss_mlp": 1.02507806, "epoch": 0.3416804449120697, "flos": 28803338449920.0, "grad_norm": 1.5426603390069147, "language_loss": 0.84101224, "learning_rate": 3.0645940829971295e-06, "loss": 0.86274409, "num_input_tokens_seen": 122125080, "step": 5683, "time_per_iteration": 2.6654412746429443 }, { "auxiliary_loss_clip": 0.01112942, "auxiliary_loss_mlp": 0.01047099, "balance_loss_clip": 1.04768562, "balance_loss_mlp": 1.03113699, "epoch": 0.3417405681647377, "flos": 22601601889920.0, "grad_norm": 4.046428716645244, "language_loss": 0.70964772, "learning_rate": 3.0642643610475116e-06, "loss": 0.73124808, "num_input_tokens_seen": 122146350, "step": 5684, "time_per_iteration": 2.724592924118042 }, { "auxiliary_loss_clip": 0.01132202, "auxiliary_loss_mlp": 0.01038054, "balance_loss_clip": 1.04905093, "balance_loss_mlp": 1.02367699, "epoch": 0.34180069141740566, "flos": 24716237420160.0, "grad_norm": 1.9204482618269598, "language_loss": 0.74832582, "learning_rate": 3.0639345987408823e-06, "loss": 0.77002841, "num_input_tokens_seen": 122168085, "step": 5685, "time_per_iteration": 2.7046890258789062 }, { "auxiliary_loss_clip": 0.01114777, "auxiliary_loss_mlp": 0.0104831, "balance_loss_clip": 1.04522872, "balance_loss_mlp": 1.03261042, "epoch": 0.3418608146700737, "flos": 30518755246080.0, "grad_norm": 1.9200820074556442, "language_loss": 0.70611888, "learning_rate": 3.0636047960897468e-06, "loss": 0.72774971, "num_input_tokens_seen": 122191040, "step": 5686, "time_per_iteration": 2.7390410900115967 }, { "auxiliary_loss_clip": 0.01123208, "auxiliary_loss_mlp": 0.01044107, "balance_loss_clip": 1.04809284, "balance_loss_mlp": 1.02819252, "epoch": 0.34192093792274164, "flos": 15122343237120.0, "grad_norm": 2.0197354521106563, "language_loss": 0.77240539, "learning_rate": 3.06327495310661e-06, "loss": 0.79407853, "num_input_tokens_seen": 122209225, "step": 5687, "time_per_iteration": 2.6381263732910156 }, { "auxiliary_loss_clip": 0.01106353, "auxiliary_loss_mlp": 0.01040255, "balance_loss_clip": 1.04849195, "balance_loss_mlp": 1.02412593, "epoch": 0.3419810611754096, "flos": 13187799521280.0, "grad_norm": 3.7332163528162385, "language_loss": 0.8676976, "learning_rate": 3.062945069803981e-06, "loss": 0.88916373, "num_input_tokens_seen": 122226160, "step": 5688, "time_per_iteration": 2.647320508956909 }, { "auxiliary_loss_clip": 0.01119843, "auxiliary_loss_mlp": 0.01042145, "balance_loss_clip": 1.04928863, "balance_loss_mlp": 1.0255394, "epoch": 0.34204118442807757, "flos": 19536267139200.0, "grad_norm": 1.870477619822585, "language_loss": 0.79564822, "learning_rate": 3.0626151461943684e-06, "loss": 0.81726807, "num_input_tokens_seen": 122243115, "step": 5689, "time_per_iteration": 4.1660990715026855 }, { "auxiliary_loss_clip": 0.0112576, "auxiliary_loss_mlp": 0.01042306, "balance_loss_clip": 1.04875994, "balance_loss_mlp": 1.02580786, "epoch": 0.34210130768074554, "flos": 15194846839680.0, "grad_norm": 1.7530560995380315, "language_loss": 0.73215616, "learning_rate": 3.0622851822902834e-06, "loss": 0.75383675, "num_input_tokens_seen": 122261105, "step": 5690, "time_per_iteration": 2.699846029281616 }, { "auxiliary_loss_clip": 0.01115188, "auxiliary_loss_mlp": 0.01047594, "balance_loss_clip": 1.04381919, "balance_loss_mlp": 1.03121471, "epoch": 0.3421614309334135, "flos": 24936226266240.0, "grad_norm": 2.1339055209058184, "language_loss": 0.76036334, "learning_rate": 3.061955178104237e-06, "loss": 0.78199112, "num_input_tokens_seen": 122279995, "step": 5691, "time_per_iteration": 2.707598924636841 }, { "auxiliary_loss_clip": 0.01119412, "auxiliary_loss_mlp": 0.01042889, "balance_loss_clip": 1.04769242, "balance_loss_mlp": 1.02878046, "epoch": 0.34222155418608147, "flos": 21908633731200.0, "grad_norm": 1.9419180569645556, "language_loss": 0.68321705, "learning_rate": 3.0616251336487447e-06, "loss": 0.70484006, "num_input_tokens_seen": 122299070, "step": 5692, "time_per_iteration": 2.6876816749572754 }, { "auxiliary_loss_clip": 0.01123804, "auxiliary_loss_mlp": 0.01042902, "balance_loss_clip": 1.0481621, "balance_loss_mlp": 1.02660608, "epoch": 0.34228167743874943, "flos": 18114061063680.0, "grad_norm": 2.8342834288415504, "language_loss": 0.72458065, "learning_rate": 3.06129504893632e-06, "loss": 0.74624765, "num_input_tokens_seen": 122316800, "step": 5693, "time_per_iteration": 5.672837018966675 }, { "auxiliary_loss_clip": 0.01090312, "auxiliary_loss_mlp": 0.01043466, "balance_loss_clip": 1.0433774, "balance_loss_mlp": 1.02832651, "epoch": 0.3423418006914174, "flos": 21288600138240.0, "grad_norm": 1.9009541760697364, "language_loss": 0.75556326, "learning_rate": 3.0609649239794813e-06, "loss": 0.77690107, "num_input_tokens_seen": 122335275, "step": 5694, "time_per_iteration": 2.713236093521118 }, { "auxiliary_loss_clip": 0.01093804, "auxiliary_loss_mlp": 0.01036832, "balance_loss_clip": 1.04769742, "balance_loss_mlp": 1.02205038, "epoch": 0.34240192394408536, "flos": 19823480288640.0, "grad_norm": 2.1810058063417608, "language_loss": 0.79590774, "learning_rate": 3.060634758790747e-06, "loss": 0.81721413, "num_input_tokens_seen": 122353215, "step": 5695, "time_per_iteration": 2.7206506729125977 }, { "auxiliary_loss_clip": 0.01077977, "auxiliary_loss_mlp": 0.01043311, "balance_loss_clip": 1.04183137, "balance_loss_mlp": 1.02764642, "epoch": 0.3424620471967533, "flos": 24535535074560.0, "grad_norm": 1.8643380844369803, "language_loss": 0.73428202, "learning_rate": 3.060304553382635e-06, "loss": 0.75549489, "num_input_tokens_seen": 122372495, "step": 5696, "time_per_iteration": 4.777001857757568 }, { "auxiliary_loss_clip": 0.01088152, "auxiliary_loss_mlp": 0.01052674, "balance_loss_clip": 1.0424118, "balance_loss_mlp": 1.03569841, "epoch": 0.3425221704494213, "flos": 25848895962240.0, "grad_norm": 5.815439398629578, "language_loss": 0.71460104, "learning_rate": 3.0599743077676685e-06, "loss": 0.73600936, "num_input_tokens_seen": 122394600, "step": 5697, "time_per_iteration": 2.7620668411254883 }, { "auxiliary_loss_clip": 0.01108783, "auxiliary_loss_mlp": 0.01032533, "balance_loss_clip": 1.04925871, "balance_loss_mlp": 1.01740503, "epoch": 0.34258229370208926, "flos": 21540513196800.0, "grad_norm": 2.6993537181180316, "language_loss": 0.82170486, "learning_rate": 3.05964402195837e-06, "loss": 0.84311801, "num_input_tokens_seen": 122414700, "step": 5698, "time_per_iteration": 2.6930580139160156 }, { "auxiliary_loss_clip": 0.01077965, "auxiliary_loss_mlp": 0.01049711, "balance_loss_clip": 1.0451839, "balance_loss_mlp": 1.03073311, "epoch": 0.3426424169547573, "flos": 23652778429440.0, "grad_norm": 2.492082875954734, "language_loss": 0.68941295, "learning_rate": 3.0593136959672645e-06, "loss": 0.71068972, "num_input_tokens_seen": 122432760, "step": 5699, "time_per_iteration": 2.8604705333709717 }, { "auxiliary_loss_clip": 0.01113381, "auxiliary_loss_mlp": 0.01042187, "balance_loss_clip": 1.05009818, "balance_loss_mlp": 1.02698755, "epoch": 0.34270254020742524, "flos": 24644883052800.0, "grad_norm": 2.4799642493365046, "language_loss": 0.72708368, "learning_rate": 3.058983329806877e-06, "loss": 0.74863935, "num_input_tokens_seen": 122449105, "step": 5700, "time_per_iteration": 2.721219301223755 }, { "auxiliary_loss_clip": 0.01107869, "auxiliary_loss_mlp": 0.01033632, "balance_loss_clip": 1.05173492, "balance_loss_mlp": 1.01942825, "epoch": 0.3427626634600932, "flos": 20996754134400.0, "grad_norm": 1.8907099352771195, "language_loss": 0.81771016, "learning_rate": 3.0586529234897354e-06, "loss": 0.83912516, "num_input_tokens_seen": 122468700, "step": 5701, "time_per_iteration": 2.668776273727417 }, { "auxiliary_loss_clip": 0.01122749, "auxiliary_loss_mlp": 0.01036444, "balance_loss_clip": 1.05318427, "balance_loss_mlp": 1.02137566, "epoch": 0.3428227867127612, "flos": 21433786911360.0, "grad_norm": 1.8540703451937275, "language_loss": 0.71611702, "learning_rate": 3.0583224770283694e-06, "loss": 0.73770893, "num_input_tokens_seen": 122488160, "step": 5702, "time_per_iteration": 2.7413434982299805 }, { "auxiliary_loss_clip": 0.01034072, "auxiliary_loss_mlp": 0.0102117, "balance_loss_clip": 1.02648544, "balance_loss_mlp": 1.01936996, "epoch": 0.34288290996542914, "flos": 55731782695680.0, "grad_norm": 0.8291151185510042, "language_loss": 0.57455015, "learning_rate": 3.057991990435309e-06, "loss": 0.59510255, "num_input_tokens_seen": 122542890, "step": 5703, "time_per_iteration": 3.123619318008423 }, { "auxiliary_loss_clip": 0.01125899, "auxiliary_loss_mlp": 0.01044546, "balance_loss_clip": 1.05167961, "balance_loss_mlp": 1.02754664, "epoch": 0.3429430332180971, "flos": 20156803522560.0, "grad_norm": 2.054859273280662, "language_loss": 0.75049305, "learning_rate": 3.057661463723086e-06, "loss": 0.77219748, "num_input_tokens_seen": 122561770, "step": 5704, "time_per_iteration": 2.786344051361084 }, { "auxiliary_loss_clip": 0.01103715, "auxiliary_loss_mlp": 0.01039493, "balance_loss_clip": 1.05234969, "balance_loss_mlp": 1.02506232, "epoch": 0.34300315647076507, "flos": 17965857548160.0, "grad_norm": 1.921400910299184, "language_loss": 0.72367042, "learning_rate": 3.0573308969042346e-06, "loss": 0.74510252, "num_input_tokens_seen": 122580580, "step": 5705, "time_per_iteration": 2.7464826107025146 }, { "auxiliary_loss_clip": 0.01099266, "auxiliary_loss_mlp": 0.01035276, "balance_loss_clip": 1.05201912, "balance_loss_mlp": 1.01980281, "epoch": 0.34306327972343303, "flos": 22086822124800.0, "grad_norm": 2.585473080189318, "language_loss": 0.80016834, "learning_rate": 3.057000289991289e-06, "loss": 0.82151377, "num_input_tokens_seen": 122599810, "step": 5706, "time_per_iteration": 2.83493971824646 }, { "auxiliary_loss_clip": 0.01126183, "auxiliary_loss_mlp": 0.01037399, "balance_loss_clip": 1.05822873, "balance_loss_mlp": 1.02111542, "epoch": 0.343123402976101, "flos": 18442679616000.0, "grad_norm": 2.833985332828215, "language_loss": 0.83001584, "learning_rate": 3.056669642996787e-06, "loss": 0.85165167, "num_input_tokens_seen": 122616035, "step": 5707, "time_per_iteration": 2.6888725757598877 }, { "auxiliary_loss_clip": 0.01130807, "auxiliary_loss_mlp": 0.01038349, "balance_loss_clip": 1.05664158, "balance_loss_mlp": 1.02264881, "epoch": 0.34318352622876896, "flos": 17163685065600.0, "grad_norm": 1.6733576562987098, "language_loss": 0.75313264, "learning_rate": 3.056338955933266e-06, "loss": 0.7748242, "num_input_tokens_seen": 122633785, "step": 5708, "time_per_iteration": 2.655061960220337 }, { "auxiliary_loss_clip": 0.01105586, "auxiliary_loss_mlp": 0.01039807, "balance_loss_clip": 1.05063939, "balance_loss_mlp": 1.02357078, "epoch": 0.34324364948143693, "flos": 26688164215680.0, "grad_norm": 1.6008558791331946, "language_loss": 0.81187862, "learning_rate": 3.0560082288132662e-06, "loss": 0.83333254, "num_input_tokens_seen": 122652100, "step": 5709, "time_per_iteration": 2.7354934215545654 }, { "auxiliary_loss_clip": 0.01119071, "auxiliary_loss_mlp": 0.01043385, "balance_loss_clip": 1.0550828, "balance_loss_mlp": 1.02581382, "epoch": 0.3433037727341049, "flos": 21251576194560.0, "grad_norm": 2.1605529243452297, "language_loss": 0.79441178, "learning_rate": 3.055677461649329e-06, "loss": 0.81603634, "num_input_tokens_seen": 122669720, "step": 5710, "time_per_iteration": 2.757321834564209 }, { "auxiliary_loss_clip": 0.01130524, "auxiliary_loss_mlp": 0.01039861, "balance_loss_clip": 1.05363941, "balance_loss_mlp": 1.02329111, "epoch": 0.34336389598677286, "flos": 20629423699200.0, "grad_norm": 1.8403881586839854, "language_loss": 0.70303786, "learning_rate": 3.055346654453996e-06, "loss": 0.7247417, "num_input_tokens_seen": 122688715, "step": 5711, "time_per_iteration": 2.6535775661468506 }, { "auxiliary_loss_clip": 0.01106817, "auxiliary_loss_mlp": 0.00774858, "balance_loss_clip": 1.05299044, "balance_loss_mlp": 1.00072622, "epoch": 0.3434240192394409, "flos": 14538579402240.0, "grad_norm": 1.8401630077009354, "language_loss": 0.67124939, "learning_rate": 3.055015807239812e-06, "loss": 0.69006616, "num_input_tokens_seen": 122706970, "step": 5712, "time_per_iteration": 2.7115519046783447 }, { "auxiliary_loss_clip": 0.01051163, "auxiliary_loss_mlp": 0.01005713, "balance_loss_clip": 1.0511148, "balance_loss_mlp": 1.00409162, "epoch": 0.34348414249210885, "flos": 58051538841600.0, "grad_norm": 0.846630151399307, "language_loss": 0.58072996, "learning_rate": 3.0546849200193226e-06, "loss": 0.60129869, "num_input_tokens_seen": 122758095, "step": 5713, "time_per_iteration": 3.3988189697265625 }, { "auxiliary_loss_clip": 0.01142007, "auxiliary_loss_mlp": 0.01043862, "balance_loss_clip": 1.05782688, "balance_loss_mlp": 1.02813852, "epoch": 0.3435442657447768, "flos": 20704441253760.0, "grad_norm": 1.6506449407169241, "language_loss": 0.8079257, "learning_rate": 3.054353992805076e-06, "loss": 0.82978439, "num_input_tokens_seen": 122777815, "step": 5714, "time_per_iteration": 2.682537078857422 }, { "auxiliary_loss_clip": 0.01142274, "auxiliary_loss_mlp": 0.01042249, "balance_loss_clip": 1.0581255, "balance_loss_mlp": 1.02628696, "epoch": 0.3436043889974448, "flos": 22930256355840.0, "grad_norm": 2.1462767477025055, "language_loss": 0.72059911, "learning_rate": 3.05402302560962e-06, "loss": 0.74244434, "num_input_tokens_seen": 122797555, "step": 5715, "time_per_iteration": 2.6535134315490723 }, { "auxiliary_loss_clip": 0.01070037, "auxiliary_loss_mlp": 0.01002865, "balance_loss_clip": 1.0577507, "balance_loss_mlp": 1.00051689, "epoch": 0.34366451225011274, "flos": 58403285752320.0, "grad_norm": 0.9103705044251069, "language_loss": 0.65885556, "learning_rate": 3.053692018445505e-06, "loss": 0.67958462, "num_input_tokens_seen": 122863955, "step": 5716, "time_per_iteration": 3.205113172531128 }, { "auxiliary_loss_clip": 0.01124236, "auxiliary_loss_mlp": 0.0104266, "balance_loss_clip": 1.05416417, "balance_loss_mlp": 1.02718663, "epoch": 0.3437246355027807, "flos": 15596292216960.0, "grad_norm": 2.101112668121384, "language_loss": 0.74272031, "learning_rate": 3.0533609713252838e-06, "loss": 0.76438928, "num_input_tokens_seen": 122883000, "step": 5717, "time_per_iteration": 2.60300350189209 }, { "auxiliary_loss_clip": 0.01084832, "auxiliary_loss_mlp": 0.01039269, "balance_loss_clip": 1.05195725, "balance_loss_mlp": 1.02437937, "epoch": 0.34378475875544867, "flos": 27672260106240.0, "grad_norm": 1.8405555467441777, "language_loss": 0.75446129, "learning_rate": 3.0530298842615077e-06, "loss": 0.7757023, "num_input_tokens_seen": 122903265, "step": 5718, "time_per_iteration": 2.787687301635742 }, { "auxiliary_loss_clip": 0.01097103, "auxiliary_loss_mlp": 0.01043125, "balance_loss_clip": 1.04837775, "balance_loss_mlp": 1.02739501, "epoch": 0.34384488200811664, "flos": 31431496769280.0, "grad_norm": 1.9369525419747404, "language_loss": 0.63647246, "learning_rate": 3.052698757266734e-06, "loss": 0.65787476, "num_input_tokens_seen": 122923860, "step": 5719, "time_per_iteration": 2.8138949871063232 }, { "auxiliary_loss_clip": 0.01098152, "auxiliary_loss_mlp": 0.01040429, "balance_loss_clip": 1.05234158, "balance_loss_mlp": 1.02310777, "epoch": 0.3439050052607846, "flos": 24899920594560.0, "grad_norm": 1.8182809721987367, "language_loss": 0.73785692, "learning_rate": 3.0523675903535183e-06, "loss": 0.75924277, "num_input_tokens_seen": 122945305, "step": 5720, "time_per_iteration": 2.761371612548828 }, { "auxiliary_loss_clip": 0.01127909, "auxiliary_loss_mlp": 0.01052147, "balance_loss_clip": 1.056463, "balance_loss_mlp": 1.03434944, "epoch": 0.34396512851345257, "flos": 18150079426560.0, "grad_norm": 2.2267988645125896, "language_loss": 0.74087942, "learning_rate": 3.0520363835344173e-06, "loss": 0.76267999, "num_input_tokens_seen": 122962535, "step": 5721, "time_per_iteration": 2.6139280796051025 }, { "auxiliary_loss_clip": 0.0111919, "auxiliary_loss_mlp": 0.0077563, "balance_loss_clip": 1.05647993, "balance_loss_mlp": 1.00063252, "epoch": 0.34402525176612053, "flos": 16034438315520.0, "grad_norm": 2.313932715754647, "language_loss": 0.80464351, "learning_rate": 3.051705136821992e-06, "loss": 0.82359171, "num_input_tokens_seen": 122979750, "step": 5722, "time_per_iteration": 2.6886982917785645 }, { "auxiliary_loss_clip": 0.01092207, "auxiliary_loss_mlp": 0.01038868, "balance_loss_clip": 1.05326557, "balance_loss_mlp": 1.02348995, "epoch": 0.3440853750187885, "flos": 21178641628800.0, "grad_norm": 2.5095280683984984, "language_loss": 0.81647789, "learning_rate": 3.051373850228801e-06, "loss": 0.83778864, "num_input_tokens_seen": 122998955, "step": 5723, "time_per_iteration": 2.7464921474456787 }, { "auxiliary_loss_clip": 0.01099736, "auxiliary_loss_mlp": 0.0105726, "balance_loss_clip": 1.0488528, "balance_loss_mlp": 1.04023743, "epoch": 0.34414549827145646, "flos": 12677868092160.0, "grad_norm": 1.9897062128640133, "language_loss": 0.81431544, "learning_rate": 3.0510425237674096e-06, "loss": 0.83588541, "num_input_tokens_seen": 123016165, "step": 5724, "time_per_iteration": 2.7447471618652344 }, { "auxiliary_loss_clip": 0.01112954, "auxiliary_loss_mlp": 0.01047765, "balance_loss_clip": 1.05231178, "balance_loss_mlp": 1.03056324, "epoch": 0.3442056215241244, "flos": 31284514316160.0, "grad_norm": 1.858960952495153, "language_loss": 0.68913317, "learning_rate": 3.05071115745038e-06, "loss": 0.71074033, "num_input_tokens_seen": 123036900, "step": 5725, "time_per_iteration": 2.798987627029419 }, { "auxiliary_loss_clip": 0.01132971, "auxiliary_loss_mlp": 0.0105182, "balance_loss_clip": 1.05775714, "balance_loss_mlp": 1.03379524, "epoch": 0.34426574477679245, "flos": 23367289132800.0, "grad_norm": 1.4701315954442116, "language_loss": 0.6946882, "learning_rate": 3.0503797512902773e-06, "loss": 0.71653616, "num_input_tokens_seen": 123057480, "step": 5726, "time_per_iteration": 2.663766622543335 }, { "auxiliary_loss_clip": 0.01111868, "auxiliary_loss_mlp": 0.01038496, "balance_loss_clip": 1.05667615, "balance_loss_mlp": 1.02374983, "epoch": 0.3443258680294604, "flos": 24535427333760.0, "grad_norm": 2.4860883718983873, "language_loss": 0.73317868, "learning_rate": 3.0500483052996703e-06, "loss": 0.7546823, "num_input_tokens_seen": 123076890, "step": 5727, "time_per_iteration": 2.8002336025238037 }, { "auxiliary_loss_clip": 0.01097058, "auxiliary_loss_mlp": 0.01052204, "balance_loss_clip": 1.05053401, "balance_loss_mlp": 1.03590822, "epoch": 0.3443859912821284, "flos": 20230133137920.0, "grad_norm": 2.2067060616784815, "language_loss": 0.88451493, "learning_rate": 3.0497168194911257e-06, "loss": 0.90600753, "num_input_tokens_seen": 123092530, "step": 5728, "time_per_iteration": 2.703842878341675 }, { "auxiliary_loss_clip": 0.01089582, "auxiliary_loss_mlp": 0.01048379, "balance_loss_clip": 1.04858351, "balance_loss_mlp": 1.03266144, "epoch": 0.34444611453479634, "flos": 24316515895680.0, "grad_norm": 2.2135571419735904, "language_loss": 0.70018214, "learning_rate": 3.0493852938772143e-06, "loss": 0.72156173, "num_input_tokens_seen": 123110560, "step": 5729, "time_per_iteration": 4.360877275466919 }, { "auxiliary_loss_clip": 0.01124088, "auxiliary_loss_mlp": 0.01037772, "balance_loss_clip": 1.0525502, "balance_loss_mlp": 1.02208424, "epoch": 0.3445062377874643, "flos": 16983413683200.0, "grad_norm": 1.9483871766944658, "language_loss": 0.7435137, "learning_rate": 3.0490537284705078e-06, "loss": 0.76513231, "num_input_tokens_seen": 123128655, "step": 5730, "time_per_iteration": 2.6021499633789062 }, { "auxiliary_loss_clip": 0.01099617, "auxiliary_loss_mlp": 0.0105823, "balance_loss_clip": 1.04880106, "balance_loss_mlp": 1.04053974, "epoch": 0.3445663610401323, "flos": 20302708567680.0, "grad_norm": 2.1142556114368314, "language_loss": 0.7952323, "learning_rate": 3.048722123283578e-06, "loss": 0.81681079, "num_input_tokens_seen": 123145130, "step": 5731, "time_per_iteration": 4.273399114608765 }, { "auxiliary_loss_clip": 0.01130567, "auxiliary_loss_mlp": 0.01043537, "balance_loss_clip": 1.05617356, "balance_loss_mlp": 1.02793896, "epoch": 0.34462648429280024, "flos": 15888102307200.0, "grad_norm": 2.0299111477971334, "language_loss": 0.78609502, "learning_rate": 3.0483904783290006e-06, "loss": 0.80783606, "num_input_tokens_seen": 123162265, "step": 5732, "time_per_iteration": 4.672218322753906 }, { "auxiliary_loss_clip": 0.01037769, "auxiliary_loss_mlp": 0.0101237, "balance_loss_clip": 1.03788018, "balance_loss_mlp": 1.0106411, "epoch": 0.3446866075454682, "flos": 59311035285120.0, "grad_norm": 0.7456337544046427, "language_loss": 0.53537595, "learning_rate": 3.0480587936193505e-06, "loss": 0.55587733, "num_input_tokens_seen": 123218620, "step": 5733, "time_per_iteration": 3.322802782058716 }, { "auxiliary_loss_clip": 0.01122514, "auxiliary_loss_mlp": 0.01042066, "balance_loss_clip": 1.05675018, "balance_loss_mlp": 1.02577019, "epoch": 0.34474673079813617, "flos": 22343799000960.0, "grad_norm": 1.936820728476944, "language_loss": 0.832178, "learning_rate": 3.047727069167207e-06, "loss": 0.85382378, "num_input_tokens_seen": 123237325, "step": 5734, "time_per_iteration": 2.7426953315734863 }, { "auxiliary_loss_clip": 0.01120142, "auxiliary_loss_mlp": 0.0103601, "balance_loss_clip": 1.05517805, "balance_loss_mlp": 1.01988125, "epoch": 0.34480685405080413, "flos": 27670141203840.0, "grad_norm": 2.7764640699074077, "language_loss": 0.92655241, "learning_rate": 3.0473953049851478e-06, "loss": 0.94811392, "num_input_tokens_seen": 123258650, "step": 5735, "time_per_iteration": 4.536838054656982 }, { "auxiliary_loss_clip": 0.0110302, "auxiliary_loss_mlp": 0.01041265, "balance_loss_clip": 1.05774188, "balance_loss_mlp": 1.02492189, "epoch": 0.3448669773034721, "flos": 22456020067200.0, "grad_norm": 1.7508294751665012, "language_loss": 0.76571405, "learning_rate": 3.0470635010857533e-06, "loss": 0.78715694, "num_input_tokens_seen": 123277155, "step": 5736, "time_per_iteration": 2.784958600997925 }, { "auxiliary_loss_clip": 0.01122912, "auxiliary_loss_mlp": 0.0104053, "balance_loss_clip": 1.05683184, "balance_loss_mlp": 1.02396011, "epoch": 0.34492710055614006, "flos": 24936190352640.0, "grad_norm": 1.7983696926456887, "language_loss": 0.78327668, "learning_rate": 3.0467316574816064e-06, "loss": 0.80491114, "num_input_tokens_seen": 123297640, "step": 5737, "time_per_iteration": 2.709786891937256 }, { "auxiliary_loss_clip": 0.01083721, "auxiliary_loss_mlp": 0.0104406, "balance_loss_clip": 1.04379368, "balance_loss_mlp": 1.02520096, "epoch": 0.34498722380880803, "flos": 20120821073280.0, "grad_norm": 2.0055780284948375, "language_loss": 0.71544027, "learning_rate": 3.0463997741852893e-06, "loss": 0.73671806, "num_input_tokens_seen": 123314370, "step": 5738, "time_per_iteration": 2.779651165008545 }, { "auxiliary_loss_clip": 0.0110112, "auxiliary_loss_mlp": 0.01042892, "balance_loss_clip": 1.04991913, "balance_loss_mlp": 1.02520132, "epoch": 0.34504734706147605, "flos": 28438126917120.0, "grad_norm": 2.7751951344870562, "language_loss": 0.82324719, "learning_rate": 3.046067851209389e-06, "loss": 0.84468728, "num_input_tokens_seen": 123336085, "step": 5739, "time_per_iteration": 2.7953522205352783 }, { "auxiliary_loss_clip": 0.01104482, "auxiliary_loss_mlp": 0.01037335, "balance_loss_clip": 1.05071819, "balance_loss_mlp": 1.02132511, "epoch": 0.345107470314144, "flos": 22674464628480.0, "grad_norm": 1.8186717226973075, "language_loss": 0.83071041, "learning_rate": 3.0457358885664898e-06, "loss": 0.85212862, "num_input_tokens_seen": 123354460, "step": 5740, "time_per_iteration": 2.7530486583709717 }, { "auxiliary_loss_clip": 0.01130478, "auxiliary_loss_mlp": 0.01035685, "balance_loss_clip": 1.05699897, "balance_loss_mlp": 1.01901984, "epoch": 0.345167593566812, "flos": 20630716588800.0, "grad_norm": 2.1971165557092656, "language_loss": 0.7704618, "learning_rate": 3.045403886269181e-06, "loss": 0.79212344, "num_input_tokens_seen": 123373420, "step": 5741, "time_per_iteration": 2.6488983631134033 }, { "auxiliary_loss_clip": 0.01116686, "auxiliary_loss_mlp": 0.01038328, "balance_loss_clip": 1.05202794, "balance_loss_mlp": 1.02271724, "epoch": 0.34522771681947995, "flos": 26214358890240.0, "grad_norm": 1.629760829576741, "language_loss": 0.76972193, "learning_rate": 3.045071844330053e-06, "loss": 0.7912721, "num_input_tokens_seen": 123394730, "step": 5742, "time_per_iteration": 2.7333807945251465 }, { "auxiliary_loss_clip": 0.01133631, "auxiliary_loss_mlp": 0.01040013, "balance_loss_clip": 1.05862427, "balance_loss_mlp": 1.02371693, "epoch": 0.3452878400721479, "flos": 19062354072960.0, "grad_norm": 2.2460068376984523, "language_loss": 0.76135588, "learning_rate": 3.0447397627616955e-06, "loss": 0.78309238, "num_input_tokens_seen": 123412895, "step": 5743, "time_per_iteration": 2.677682638168335 }, { "auxiliary_loss_clip": 0.01128893, "auxiliary_loss_mlp": 0.01037178, "balance_loss_clip": 1.05570602, "balance_loss_mlp": 1.02171636, "epoch": 0.3453479633248159, "flos": 27929739772800.0, "grad_norm": 2.0501405423310097, "language_loss": 0.70481914, "learning_rate": 3.0444076415767016e-06, "loss": 0.72647989, "num_input_tokens_seen": 123432320, "step": 5744, "time_per_iteration": 2.7430574893951416 }, { "auxiliary_loss_clip": 0.01140382, "auxiliary_loss_mlp": 0.01036281, "balance_loss_clip": 1.05727339, "balance_loss_mlp": 1.01959133, "epoch": 0.34540808657748384, "flos": 19606113135360.0, "grad_norm": 2.271690731291802, "language_loss": 0.79658759, "learning_rate": 3.044075480787665e-06, "loss": 0.81835419, "num_input_tokens_seen": 123450980, "step": 5745, "time_per_iteration": 2.6587865352630615 }, { "auxiliary_loss_clip": 0.01092128, "auxiliary_loss_mlp": 0.01041398, "balance_loss_clip": 1.0486573, "balance_loss_mlp": 1.02435148, "epoch": 0.3454682098301518, "flos": 20411661496320.0, "grad_norm": 1.8194779915280654, "language_loss": 0.89049339, "learning_rate": 3.043743280407182e-06, "loss": 0.91182864, "num_input_tokens_seen": 123469365, "step": 5746, "time_per_iteration": 2.7314908504486084 }, { "auxiliary_loss_clip": 0.01133638, "auxiliary_loss_mlp": 0.01038455, "balance_loss_clip": 1.05554819, "balance_loss_mlp": 1.02101421, "epoch": 0.34552833308281977, "flos": 21325121291520.0, "grad_norm": 2.5554958969654136, "language_loss": 0.64851058, "learning_rate": 3.043411040447849e-06, "loss": 0.67023152, "num_input_tokens_seen": 123489425, "step": 5747, "time_per_iteration": 2.6858277320861816 }, { "auxiliary_loss_clip": 0.01119459, "auxiliary_loss_mlp": 0.01035118, "balance_loss_clip": 1.05213308, "balance_loss_mlp": 1.01928735, "epoch": 0.34558845633548774, "flos": 36243633824640.0, "grad_norm": 1.5633023430662023, "language_loss": 0.72855747, "learning_rate": 3.043078760922264e-06, "loss": 0.75010324, "num_input_tokens_seen": 123509970, "step": 5748, "time_per_iteration": 2.805250406265259 }, { "auxiliary_loss_clip": 0.01084714, "auxiliary_loss_mlp": 0.01032651, "balance_loss_clip": 1.05246413, "balance_loss_mlp": 1.01832819, "epoch": 0.3456485795881557, "flos": 22450561200000.0, "grad_norm": 1.6861475272665256, "language_loss": 0.7584126, "learning_rate": 3.042746441843029e-06, "loss": 0.7795862, "num_input_tokens_seen": 123531055, "step": 5749, "time_per_iteration": 2.8886258602142334 }, { "auxiliary_loss_clip": 0.01061531, "auxiliary_loss_mlp": 0.01002064, "balance_loss_clip": 1.05058503, "balance_loss_mlp": 1.00045478, "epoch": 0.34570870284082367, "flos": 62004299005440.0, "grad_norm": 0.8852783380527953, "language_loss": 0.62715566, "learning_rate": 3.0424140832227437e-06, "loss": 0.64779162, "num_input_tokens_seen": 123584720, "step": 5750, "time_per_iteration": 3.1283066272735596 }, { "auxiliary_loss_clip": 0.01110881, "auxiliary_loss_mlp": 0.01037788, "balance_loss_clip": 1.05210388, "balance_loss_mlp": 1.02242184, "epoch": 0.34576882609349163, "flos": 22782196494720.0, "grad_norm": 2.239830827663745, "language_loss": 0.80332017, "learning_rate": 3.042081685074012e-06, "loss": 0.82480681, "num_input_tokens_seen": 123604465, "step": 5751, "time_per_iteration": 2.721344470977783 }, { "auxiliary_loss_clip": 0.01135561, "auxiliary_loss_mlp": 0.01045926, "balance_loss_clip": 1.0536952, "balance_loss_mlp": 1.03101254, "epoch": 0.34582894934615965, "flos": 12348818576640.0, "grad_norm": 2.3847713847020744, "language_loss": 0.84148252, "learning_rate": 3.041749247409439e-06, "loss": 0.86329746, "num_input_tokens_seen": 123622320, "step": 5752, "time_per_iteration": 2.578984260559082 }, { "auxiliary_loss_clip": 0.01047286, "auxiliary_loss_mlp": 0.00754976, "balance_loss_clip": 1.0380801, "balance_loss_mlp": 1.00148225, "epoch": 0.3458890725988276, "flos": 70167691071360.0, "grad_norm": 0.7284359747550926, "language_loss": 0.6310631, "learning_rate": 3.0414167702416296e-06, "loss": 0.64908576, "num_input_tokens_seen": 123678010, "step": 5753, "time_per_iteration": 3.0907819271087646 }, { "auxiliary_loss_clip": 0.01112695, "auxiliary_loss_mlp": 0.01035981, "balance_loss_clip": 1.05358505, "balance_loss_mlp": 1.01956582, "epoch": 0.3459491958514956, "flos": 17092582093440.0, "grad_norm": 1.9590865283999213, "language_loss": 0.71000856, "learning_rate": 3.0410842535831914e-06, "loss": 0.73149538, "num_input_tokens_seen": 123696830, "step": 5754, "time_per_iteration": 2.7031564712524414 }, { "auxiliary_loss_clip": 0.01127989, "auxiliary_loss_mlp": 0.01038041, "balance_loss_clip": 1.05300486, "balance_loss_mlp": 1.02251959, "epoch": 0.34600931910416355, "flos": 16650952375680.0, "grad_norm": 2.56305874029915, "language_loss": 0.73286581, "learning_rate": 3.0407516974467343e-06, "loss": 0.75452608, "num_input_tokens_seen": 123714360, "step": 5755, "time_per_iteration": 2.656804084777832 }, { "auxiliary_loss_clip": 0.01122508, "auxiliary_loss_mlp": 0.01033304, "balance_loss_clip": 1.0504849, "balance_loss_mlp": 1.01791406, "epoch": 0.3460694423568315, "flos": 38546190334080.0, "grad_norm": 1.7746130503339408, "language_loss": 0.7232182, "learning_rate": 3.040419101844869e-06, "loss": 0.74477637, "num_input_tokens_seen": 123739250, "step": 5756, "time_per_iteration": 2.8805603981018066 }, { "auxiliary_loss_clip": 0.01055943, "auxiliary_loss_mlp": 0.01012753, "balance_loss_clip": 1.03647125, "balance_loss_mlp": 1.01088166, "epoch": 0.3461295656094995, "flos": 72081479704320.0, "grad_norm": 0.7176054236110851, "language_loss": 0.62659568, "learning_rate": 3.040086466790207e-06, "loss": 0.64728266, "num_input_tokens_seen": 123802845, "step": 5757, "time_per_iteration": 3.21248197555542 }, { "auxiliary_loss_clip": 0.0103445, "auxiliary_loss_mlp": 0.00755471, "balance_loss_clip": 1.03495657, "balance_loss_mlp": 1.0016396, "epoch": 0.34618968886216744, "flos": 65460089571840.0, "grad_norm": 0.8171010225304897, "language_loss": 0.59206927, "learning_rate": 3.039753792295362e-06, "loss": 0.60996854, "num_input_tokens_seen": 123861805, "step": 5758, "time_per_iteration": 3.2514266967773438 }, { "auxiliary_loss_clip": 0.01122832, "auxiliary_loss_mlp": 0.01042223, "balance_loss_clip": 1.05849838, "balance_loss_mlp": 1.02783418, "epoch": 0.3462498121148354, "flos": 23472542960640.0, "grad_norm": 1.8827972101732287, "language_loss": 0.71806967, "learning_rate": 3.0394210783729487e-06, "loss": 0.73972023, "num_input_tokens_seen": 123881820, "step": 5759, "time_per_iteration": 2.943061351776123 }, { "auxiliary_loss_clip": 0.0108272, "auxiliary_loss_mlp": 0.01061154, "balance_loss_clip": 1.0455631, "balance_loss_mlp": 1.04352307, "epoch": 0.3463099353675034, "flos": 24170790418560.0, "grad_norm": 1.9206924983950955, "language_loss": 0.83097923, "learning_rate": 3.0390883250355836e-06, "loss": 0.85241801, "num_input_tokens_seen": 123903700, "step": 5760, "time_per_iteration": 2.8922929763793945 }, { "auxiliary_loss_clip": 0.01029416, "auxiliary_loss_mlp": 0.01010127, "balance_loss_clip": 1.02909803, "balance_loss_mlp": 1.00855386, "epoch": 0.34637005862017134, "flos": 63700609766400.0, "grad_norm": 0.8149802448400086, "language_loss": 0.56472003, "learning_rate": 3.0387555322958865e-06, "loss": 0.58511543, "num_input_tokens_seen": 123960075, "step": 5761, "time_per_iteration": 3.274470567703247 }, { "auxiliary_loss_clip": 0.01122229, "auxiliary_loss_mlp": 0.00773416, "balance_loss_clip": 1.04931128, "balance_loss_mlp": 1.00069964, "epoch": 0.3464301818728393, "flos": 13145532192000.0, "grad_norm": 2.486389460519204, "language_loss": 0.94996566, "learning_rate": 3.038422700166474e-06, "loss": 0.96892214, "num_input_tokens_seen": 123975805, "step": 5762, "time_per_iteration": 2.636906623840332 }, { "auxiliary_loss_clip": 0.01106692, "auxiliary_loss_mlp": 0.0104127, "balance_loss_clip": 1.04844642, "balance_loss_mlp": 1.02467608, "epoch": 0.34649030512550727, "flos": 29315173299840.0, "grad_norm": 1.8335548533403485, "language_loss": 0.69540495, "learning_rate": 3.0380898286599692e-06, "loss": 0.71688455, "num_input_tokens_seen": 123997530, "step": 5763, "time_per_iteration": 2.8476505279541016 }, { "auxiliary_loss_clip": 0.01125911, "auxiliary_loss_mlp": 0.01051478, "balance_loss_clip": 1.04963946, "balance_loss_mlp": 1.03319085, "epoch": 0.34655042837817523, "flos": 23730884553600.0, "grad_norm": 2.0043623648961195, "language_loss": 0.83985734, "learning_rate": 3.0377569177889945e-06, "loss": 0.86163127, "num_input_tokens_seen": 124016375, "step": 5764, "time_per_iteration": 2.693847417831421 }, { "auxiliary_loss_clip": 0.01103367, "auxiliary_loss_mlp": 0.01039514, "balance_loss_clip": 1.04989028, "balance_loss_mlp": 1.02363563, "epoch": 0.34661055163084326, "flos": 22054215553920.0, "grad_norm": 2.2905956292147045, "language_loss": 0.6769501, "learning_rate": 3.0374239675661722e-06, "loss": 0.69837892, "num_input_tokens_seen": 124033975, "step": 5765, "time_per_iteration": 2.7656123638153076 }, { "auxiliary_loss_clip": 0.01108658, "auxiliary_loss_mlp": 0.01045242, "balance_loss_clip": 1.05017447, "balance_loss_mlp": 1.0279808, "epoch": 0.3466706748835112, "flos": 21799213925760.0, "grad_norm": 2.7236728572511653, "language_loss": 0.77394044, "learning_rate": 3.03709097800413e-06, "loss": 0.79547942, "num_input_tokens_seen": 124051930, "step": 5766, "time_per_iteration": 2.7095906734466553 }, { "auxiliary_loss_clip": 0.01078684, "auxiliary_loss_mlp": 0.01035923, "balance_loss_clip": 1.04552221, "balance_loss_mlp": 1.02113521, "epoch": 0.3467307981361792, "flos": 19461680547840.0, "grad_norm": 1.6543575607114767, "language_loss": 0.73547316, "learning_rate": 3.0367579491154943e-06, "loss": 0.75661922, "num_input_tokens_seen": 124071220, "step": 5767, "time_per_iteration": 2.8161730766296387 }, { "auxiliary_loss_clip": 0.01111822, "auxiliary_loss_mlp": 0.01043875, "balance_loss_clip": 1.05307102, "balance_loss_mlp": 1.02734113, "epoch": 0.34679092138884715, "flos": 24827452905600.0, "grad_norm": 2.2530154082607776, "language_loss": 0.7832194, "learning_rate": 3.036424880912893e-06, "loss": 0.80477637, "num_input_tokens_seen": 124090140, "step": 5768, "time_per_iteration": 4.265673875808716 }, { "auxiliary_loss_clip": 0.01050543, "auxiliary_loss_mlp": 0.01012109, "balance_loss_clip": 1.0320363, "balance_loss_mlp": 1.0104636, "epoch": 0.3468510446415151, "flos": 63236070149760.0, "grad_norm": 0.7741250202123364, "language_loss": 0.57502627, "learning_rate": 3.036091773408956e-06, "loss": 0.59565282, "num_input_tokens_seen": 124152025, "step": 5769, "time_per_iteration": 3.2264139652252197 }, { "auxiliary_loss_clip": 0.01107195, "auxiliary_loss_mlp": 0.01044629, "balance_loss_clip": 1.04818511, "balance_loss_mlp": 1.02630615, "epoch": 0.3469111678941831, "flos": 12120713256960.0, "grad_norm": 2.34841523993127, "language_loss": 0.85575318, "learning_rate": 3.0357586266163154e-06, "loss": 0.87727135, "num_input_tokens_seen": 124165795, "step": 5770, "time_per_iteration": 2.7029645442962646 }, { "auxiliary_loss_clip": 0.01034922, "auxiliary_loss_mlp": 0.01007496, "balance_loss_clip": 1.02998519, "balance_loss_mlp": 1.00527906, "epoch": 0.34697129114685105, "flos": 65934110378880.0, "grad_norm": 0.7677707974310557, "language_loss": 0.59758615, "learning_rate": 3.0354254405476036e-06, "loss": 0.6180104, "num_input_tokens_seen": 124222925, "step": 5771, "time_per_iteration": 4.5523951053619385 }, { "auxiliary_loss_clip": 0.01127175, "auxiliary_loss_mlp": 0.01049141, "balance_loss_clip": 1.05249262, "balance_loss_mlp": 1.03320241, "epoch": 0.347031414399519, "flos": 34454205054720.0, "grad_norm": 1.9048919633537342, "language_loss": 0.71560407, "learning_rate": 3.0350922152154557e-06, "loss": 0.73736715, "num_input_tokens_seen": 124240915, "step": 5772, "time_per_iteration": 2.8108439445495605 }, { "auxiliary_loss_clip": 0.01108886, "auxiliary_loss_mlp": 0.0077423, "balance_loss_clip": 1.05118012, "balance_loss_mlp": 1.00077164, "epoch": 0.347091537652187, "flos": 26944135511040.0, "grad_norm": 1.679823492532721, "language_loss": 0.764898, "learning_rate": 3.034758950632507e-06, "loss": 0.78372908, "num_input_tokens_seen": 124262770, "step": 5773, "time_per_iteration": 2.813775062561035 }, { "auxiliary_loss_clip": 0.01128178, "auxiliary_loss_mlp": 0.01043067, "balance_loss_clip": 1.05019748, "balance_loss_mlp": 1.02674699, "epoch": 0.34715166090485494, "flos": 21142228216320.0, "grad_norm": 5.389351496516036, "language_loss": 0.70094979, "learning_rate": 3.034425646811396e-06, "loss": 0.72266221, "num_input_tokens_seen": 124280950, "step": 5774, "time_per_iteration": 4.167816162109375 }, { "auxiliary_loss_clip": 0.01113209, "auxiliary_loss_mlp": 0.00774032, "balance_loss_clip": 1.05024052, "balance_loss_mlp": 1.00071549, "epoch": 0.3472117841575229, "flos": 23478001827840.0, "grad_norm": 1.6687380405540382, "language_loss": 0.76013231, "learning_rate": 3.0340923037647602e-06, "loss": 0.77900469, "num_input_tokens_seen": 124299540, "step": 5775, "time_per_iteration": 2.739729404449463 }, { "auxiliary_loss_clip": 0.01114926, "auxiliary_loss_mlp": 0.01046919, "balance_loss_clip": 1.0480268, "balance_loss_mlp": 1.02965736, "epoch": 0.34727190741019087, "flos": 17492806408320.0, "grad_norm": 2.598065011523741, "language_loss": 0.77565503, "learning_rate": 3.0337589215052404e-06, "loss": 0.79727352, "num_input_tokens_seen": 124316285, "step": 5776, "time_per_iteration": 2.7339272499084473 }, { "auxiliary_loss_clip": 0.01036494, "auxiliary_loss_mlp": 0.01014475, "balance_loss_clip": 1.02741766, "balance_loss_mlp": 1.01280594, "epoch": 0.34733203066285884, "flos": 65265491640960.0, "grad_norm": 0.8358378555600092, "language_loss": 0.63272905, "learning_rate": 3.033425500045478e-06, "loss": 0.65323877, "num_input_tokens_seen": 124376650, "step": 5777, "time_per_iteration": 3.257993459701538 }, { "auxiliary_loss_clip": 0.01098381, "auxiliary_loss_mlp": 0.01045801, "balance_loss_clip": 1.04933393, "balance_loss_mlp": 1.02975535, "epoch": 0.3473921539155268, "flos": 28658726294400.0, "grad_norm": 3.5330364681008755, "language_loss": 0.6504612, "learning_rate": 3.033092039398119e-06, "loss": 0.67190301, "num_input_tokens_seen": 124396475, "step": 5778, "time_per_iteration": 2.775846481323242 }, { "auxiliary_loss_clip": 0.01113961, "auxiliary_loss_mlp": 0.01054607, "balance_loss_clip": 1.04786038, "balance_loss_mlp": 1.03903246, "epoch": 0.3474522771681948, "flos": 40836895355520.0, "grad_norm": 2.3967507755094064, "language_loss": 0.71278334, "learning_rate": 3.0327585395758046e-06, "loss": 0.73446906, "num_input_tokens_seen": 124416480, "step": 5779, "time_per_iteration": 2.7915873527526855 }, { "auxiliary_loss_clip": 0.01142932, "auxiliary_loss_mlp": 0.01053692, "balance_loss_clip": 1.05395269, "balance_loss_mlp": 1.03762269, "epoch": 0.3475124004208628, "flos": 24608577381120.0, "grad_norm": 2.0452202029673043, "language_loss": 0.62873107, "learning_rate": 3.0324250005911837e-06, "loss": 0.65069735, "num_input_tokens_seen": 124435950, "step": 5780, "time_per_iteration": 2.6743876934051514 }, { "auxiliary_loss_clip": 0.01095736, "auxiliary_loss_mlp": 0.01050069, "balance_loss_clip": 1.04648292, "balance_loss_mlp": 1.03446484, "epoch": 0.34757252367353075, "flos": 22711309004160.0, "grad_norm": 1.6009150193459345, "language_loss": 0.72167897, "learning_rate": 3.0320914224569033e-06, "loss": 0.743137, "num_input_tokens_seen": 124455410, "step": 5781, "time_per_iteration": 2.749302625656128 }, { "auxiliary_loss_clip": 0.01073898, "auxiliary_loss_mlp": 0.01052117, "balance_loss_clip": 1.040519, "balance_loss_mlp": 1.03405714, "epoch": 0.3476326469261987, "flos": 19828184970240.0, "grad_norm": 2.5507599846278644, "language_loss": 0.76966107, "learning_rate": 3.031757805185612e-06, "loss": 0.79092121, "num_input_tokens_seen": 124474870, "step": 5782, "time_per_iteration": 2.801867723464966 }, { "auxiliary_loss_clip": 0.01108825, "auxiliary_loss_mlp": 0.01037018, "balance_loss_clip": 1.05032897, "balance_loss_mlp": 1.02193785, "epoch": 0.3476927701788667, "flos": 19938107566080.0, "grad_norm": 2.367934041085959, "language_loss": 0.62506068, "learning_rate": 3.0314241487899622e-06, "loss": 0.64651906, "num_input_tokens_seen": 124494105, "step": 5783, "time_per_iteration": 2.709778070449829 }, { "auxiliary_loss_clip": 0.01092863, "auxiliary_loss_mlp": 0.01031024, "balance_loss_clip": 1.04997683, "balance_loss_mlp": 1.0163672, "epoch": 0.34775289343153465, "flos": 20735108490240.0, "grad_norm": 1.7498214415914104, "language_loss": 0.88513505, "learning_rate": 3.031090453282605e-06, "loss": 0.90637398, "num_input_tokens_seen": 124512030, "step": 5784, "time_per_iteration": 2.769317150115967 }, { "auxiliary_loss_clip": 0.01089006, "auxiliary_loss_mlp": 0.01036783, "balance_loss_clip": 1.05206084, "balance_loss_mlp": 1.02097547, "epoch": 0.3478130166842026, "flos": 19354846521600.0, "grad_norm": 1.703369857104052, "language_loss": 0.81740022, "learning_rate": 3.0307567186761946e-06, "loss": 0.83865809, "num_input_tokens_seen": 124530980, "step": 5785, "time_per_iteration": 2.791860818862915 }, { "auxiliary_loss_clip": 0.01106676, "auxiliary_loss_mlp": 0.01040592, "balance_loss_clip": 1.04747128, "balance_loss_mlp": 1.02563095, "epoch": 0.3478731399368706, "flos": 22051198811520.0, "grad_norm": 1.689422515624071, "language_loss": 0.80540836, "learning_rate": 3.0304229449833862e-06, "loss": 0.82688099, "num_input_tokens_seen": 124549330, "step": 5786, "time_per_iteration": 2.7547576427459717 }, { "auxiliary_loss_clip": 0.0113505, "auxiliary_loss_mlp": 0.00773369, "balance_loss_clip": 1.05242872, "balance_loss_mlp": 1.00073981, "epoch": 0.34793326318953854, "flos": 18041449720320.0, "grad_norm": 2.7072955912962686, "language_loss": 0.74945676, "learning_rate": 3.030089132216836e-06, "loss": 0.76854098, "num_input_tokens_seen": 124567200, "step": 5787, "time_per_iteration": 2.592688798904419 }, { "auxiliary_loss_clip": 0.01102822, "auxiliary_loss_mlp": 0.00773627, "balance_loss_clip": 1.04294109, "balance_loss_mlp": 1.00074553, "epoch": 0.3479933864422065, "flos": 29314670509440.0, "grad_norm": 1.9068485918966191, "language_loss": 0.81542754, "learning_rate": 3.029755280389203e-06, "loss": 0.83419204, "num_input_tokens_seen": 124587025, "step": 5788, "time_per_iteration": 2.84395694732666 }, { "auxiliary_loss_clip": 0.01144785, "auxiliary_loss_mlp": 0.01037478, "balance_loss_clip": 1.0562067, "balance_loss_mlp": 1.02140832, "epoch": 0.3480535096948745, "flos": 20120713332480.0, "grad_norm": 2.2432452775203964, "language_loss": 0.85701168, "learning_rate": 3.029421389513147e-06, "loss": 0.87883425, "num_input_tokens_seen": 124605860, "step": 5789, "time_per_iteration": 2.630535125732422 }, { "auxiliary_loss_clip": 0.01130136, "auxiliary_loss_mlp": 0.01056162, "balance_loss_clip": 1.05231345, "balance_loss_mlp": 1.04007459, "epoch": 0.34811363294754244, "flos": 18548974938240.0, "grad_norm": 5.008598067350991, "language_loss": 0.8502599, "learning_rate": 3.029087459601328e-06, "loss": 0.87212288, "num_input_tokens_seen": 124624270, "step": 5790, "time_per_iteration": 2.6052823066711426 }, { "auxiliary_loss_clip": 0.01130643, "auxiliary_loss_mlp": 0.01044731, "balance_loss_clip": 1.05373776, "balance_loss_mlp": 1.02904904, "epoch": 0.3481737562002104, "flos": 26870303105280.0, "grad_norm": 1.9264082121319324, "language_loss": 0.80832046, "learning_rate": 3.0287534906664097e-06, "loss": 0.83007419, "num_input_tokens_seen": 124644005, "step": 5791, "time_per_iteration": 2.7190260887145996 }, { "auxiliary_loss_clip": 0.01125872, "auxiliary_loss_mlp": 0.0104286, "balance_loss_clip": 1.04968619, "balance_loss_mlp": 1.02690983, "epoch": 0.3482338794528784, "flos": 28908664104960.0, "grad_norm": 2.4373031068755022, "language_loss": 0.77855796, "learning_rate": 3.028419482721056e-06, "loss": 0.80024529, "num_input_tokens_seen": 124663020, "step": 5792, "time_per_iteration": 2.7223403453826904 }, { "auxiliary_loss_clip": 0.01108923, "auxiliary_loss_mlp": 0.01034893, "balance_loss_clip": 1.04401517, "balance_loss_mlp": 1.01922882, "epoch": 0.3482940027055464, "flos": 22200767043840.0, "grad_norm": 1.6684091148270528, "language_loss": 0.81824791, "learning_rate": 3.0280854357779325e-06, "loss": 0.8396861, "num_input_tokens_seen": 124682975, "step": 5793, "time_per_iteration": 2.84191632270813 }, { "auxiliary_loss_clip": 0.01124823, "auxiliary_loss_mlp": 0.01055766, "balance_loss_clip": 1.05077863, "balance_loss_mlp": 1.0392313, "epoch": 0.34835412595821436, "flos": 20302708567680.0, "grad_norm": 1.8786694421525794, "language_loss": 0.7607373, "learning_rate": 3.027751349849706e-06, "loss": 0.78254318, "num_input_tokens_seen": 124701340, "step": 5794, "time_per_iteration": 2.707648515701294 }, { "auxiliary_loss_clip": 0.01123664, "auxiliary_loss_mlp": 0.01044013, "balance_loss_clip": 1.04820764, "balance_loss_mlp": 1.02735913, "epoch": 0.3484142492108823, "flos": 20449691020800.0, "grad_norm": 2.79979085265216, "language_loss": 0.57190084, "learning_rate": 3.0274172249490456e-06, "loss": 0.59357756, "num_input_tokens_seen": 124719165, "step": 5795, "time_per_iteration": 2.6533401012420654 }, { "auxiliary_loss_clip": 0.01106011, "auxiliary_loss_mlp": 0.0103693, "balance_loss_clip": 1.04720807, "balance_loss_mlp": 1.02177811, "epoch": 0.3484743724635503, "flos": 24352929308160.0, "grad_norm": 2.0564463844351546, "language_loss": 0.82218957, "learning_rate": 3.0270830610886213e-06, "loss": 0.84361899, "num_input_tokens_seen": 124738670, "step": 5796, "time_per_iteration": 2.6823246479034424 }, { "auxiliary_loss_clip": 0.01120404, "auxiliary_loss_mlp": 0.01034067, "balance_loss_clip": 1.04927754, "balance_loss_mlp": 1.0192616, "epoch": 0.34853449571621825, "flos": 24353001135360.0, "grad_norm": 1.9927036097023587, "language_loss": 0.83429003, "learning_rate": 3.0267488582811033e-06, "loss": 0.85583472, "num_input_tokens_seen": 124758760, "step": 5797, "time_per_iteration": 2.7048346996307373 }, { "auxiliary_loss_clip": 0.01132676, "auxiliary_loss_mlp": 0.01037057, "balance_loss_clip": 1.05049801, "balance_loss_mlp": 1.02151191, "epoch": 0.3485946189688862, "flos": 27267690245760.0, "grad_norm": 1.9361964581914621, "language_loss": 0.73449033, "learning_rate": 3.026414616539167e-06, "loss": 0.75618768, "num_input_tokens_seen": 124777765, "step": 5798, "time_per_iteration": 2.6807782649993896 }, { "auxiliary_loss_clip": 0.01135458, "auxiliary_loss_mlp": 0.01044729, "balance_loss_clip": 1.04995012, "balance_loss_mlp": 1.02815914, "epoch": 0.3486547422215542, "flos": 20156695781760.0, "grad_norm": 2.5738259800272725, "language_loss": 0.76111758, "learning_rate": 3.026080335875485e-06, "loss": 0.78291941, "num_input_tokens_seen": 124796775, "step": 5799, "time_per_iteration": 2.629671096801758 }, { "auxiliary_loss_clip": 0.01073192, "auxiliary_loss_mlp": 0.01035978, "balance_loss_clip": 1.05208993, "balance_loss_mlp": 1.02083826, "epoch": 0.34871486547422215, "flos": 20230348619520.0, "grad_norm": 2.242229362705527, "language_loss": 0.75801086, "learning_rate": 3.025746016302734e-06, "loss": 0.77910256, "num_input_tokens_seen": 124815825, "step": 5800, "time_per_iteration": 3.047725200653076 }, { "auxiliary_loss_clip": 0.01112927, "auxiliary_loss_mlp": 0.00774006, "balance_loss_clip": 1.04720354, "balance_loss_mlp": 1.00079536, "epoch": 0.3487749887268901, "flos": 44053234882560.0, "grad_norm": 2.6257316922509286, "language_loss": 0.67468953, "learning_rate": 3.025411657833591e-06, "loss": 0.69355887, "num_input_tokens_seen": 124838420, "step": 5801, "time_per_iteration": 3.2364816665649414 }, { "auxiliary_loss_clip": 0.01103773, "auxiliary_loss_mlp": 0.010448, "balance_loss_clip": 1.04506934, "balance_loss_mlp": 1.028754, "epoch": 0.3488351119795581, "flos": 23295144666240.0, "grad_norm": 1.8428676315803219, "language_loss": 0.76738638, "learning_rate": 3.025077260480735e-06, "loss": 0.78887206, "num_input_tokens_seen": 124857320, "step": 5802, "time_per_iteration": 2.7959024906158447 }, { "auxiliary_loss_clip": 0.01053855, "auxiliary_loss_mlp": 0.01037371, "balance_loss_clip": 1.03989601, "balance_loss_mlp": 1.02219605, "epoch": 0.34889523523222604, "flos": 19934839428480.0, "grad_norm": 1.7816673584343024, "language_loss": 0.78991377, "learning_rate": 3.0247428242568474e-06, "loss": 0.81082606, "num_input_tokens_seen": 124875685, "step": 5803, "time_per_iteration": 2.8440747261047363 }, { "auxiliary_loss_clip": 0.01111548, "auxiliary_loss_mlp": 0.00774436, "balance_loss_clip": 1.04601288, "balance_loss_mlp": 1.00073576, "epoch": 0.348955358484894, "flos": 30446179816320.0, "grad_norm": 6.169621760932873, "language_loss": 0.67899323, "learning_rate": 3.0244083491746085e-06, "loss": 0.69785309, "num_input_tokens_seen": 124895960, "step": 5804, "time_per_iteration": 2.8011341094970703 }, { "auxiliary_loss_clip": 0.01109039, "auxiliary_loss_mlp": 0.01046207, "balance_loss_clip": 1.05153811, "balance_loss_mlp": 1.0306263, "epoch": 0.349015481737562, "flos": 17999972490240.0, "grad_norm": 1.9366950093174176, "language_loss": 0.75972986, "learning_rate": 3.024073835246702e-06, "loss": 0.78128237, "num_input_tokens_seen": 124914140, "step": 5805, "time_per_iteration": 2.735410213470459 }, { "auxiliary_loss_clip": 0.01085261, "auxiliary_loss_mlp": 0.0103851, "balance_loss_clip": 1.040416, "balance_loss_mlp": 1.0230304, "epoch": 0.34907560499023, "flos": 27198490694400.0, "grad_norm": 2.3089286954803194, "language_loss": 0.67154014, "learning_rate": 3.023739282485814e-06, "loss": 0.69277781, "num_input_tokens_seen": 124934180, "step": 5806, "time_per_iteration": 2.793893575668335 }, { "auxiliary_loss_clip": 0.01122813, "auxiliary_loss_mlp": 0.0104012, "balance_loss_clip": 1.05324221, "balance_loss_mlp": 1.02445614, "epoch": 0.34913572824289796, "flos": 30226873328640.0, "grad_norm": 1.5212397526739, "language_loss": 0.71703929, "learning_rate": 3.023404690904629e-06, "loss": 0.73866862, "num_input_tokens_seen": 124956060, "step": 5807, "time_per_iteration": 2.7225730419158936 }, { "auxiliary_loss_clip": 0.01135343, "auxiliary_loss_mlp": 0.0103686, "balance_loss_clip": 1.04923332, "balance_loss_mlp": 1.02102923, "epoch": 0.3491958514955659, "flos": 29971907614080.0, "grad_norm": 2.9062872704377125, "language_loss": 0.7383548, "learning_rate": 3.0230700605158364e-06, "loss": 0.76007676, "num_input_tokens_seen": 124976070, "step": 5808, "time_per_iteration": 4.38737154006958 }, { "auxiliary_loss_clip": 0.01133483, "auxiliary_loss_mlp": 0.01047071, "balance_loss_clip": 1.05228174, "balance_loss_mlp": 1.03241384, "epoch": 0.3492559747482339, "flos": 22783273902720.0, "grad_norm": 1.513097370663534, "language_loss": 0.84501046, "learning_rate": 3.0227353913321238e-06, "loss": 0.86681598, "num_input_tokens_seen": 124996995, "step": 5809, "time_per_iteration": 2.629246711730957 }, { "auxiliary_loss_clip": 0.01106316, "auxiliary_loss_mlp": 0.01034055, "balance_loss_clip": 1.04668331, "balance_loss_mlp": 1.01995289, "epoch": 0.34931609800090185, "flos": 26068022881920.0, "grad_norm": 2.856878325415132, "language_loss": 0.80759805, "learning_rate": 3.0224006833661835e-06, "loss": 0.82900178, "num_input_tokens_seen": 125015600, "step": 5810, "time_per_iteration": 2.815232276916504 }, { "auxiliary_loss_clip": 0.01134295, "auxiliary_loss_mlp": 0.01039591, "balance_loss_clip": 1.05105019, "balance_loss_mlp": 1.02539277, "epoch": 0.3493762212535698, "flos": 29242023252480.0, "grad_norm": 1.9587859815348794, "language_loss": 0.75694251, "learning_rate": 3.0220659366307057e-06, "loss": 0.7786814, "num_input_tokens_seen": 125035290, "step": 5811, "time_per_iteration": 4.295617580413818 }, { "auxiliary_loss_clip": 0.0111498, "auxiliary_loss_mlp": 0.01040701, "balance_loss_clip": 1.04791081, "balance_loss_mlp": 1.02616942, "epoch": 0.3494363445062378, "flos": 27126058919040.0, "grad_norm": 1.5951936061604581, "language_loss": 0.80199474, "learning_rate": 3.021731151138386e-06, "loss": 0.82355154, "num_input_tokens_seen": 125057130, "step": 5812, "time_per_iteration": 2.8571486473083496 }, { "auxiliary_loss_clip": 0.0106966, "auxiliary_loss_mlp": 0.01038506, "balance_loss_clip": 1.04193187, "balance_loss_mlp": 1.02299738, "epoch": 0.34949646775890575, "flos": 12276207233280.0, "grad_norm": 1.932575417997546, "language_loss": 0.69221139, "learning_rate": 3.021396326901918e-06, "loss": 0.71329308, "num_input_tokens_seen": 125073720, "step": 5813, "time_per_iteration": 4.446147441864014 }, { "auxiliary_loss_clip": 0.01101223, "auxiliary_loss_mlp": 0.00772918, "balance_loss_clip": 1.04168797, "balance_loss_mlp": 1.00074911, "epoch": 0.3495565910115737, "flos": 17165516659200.0, "grad_norm": 2.168508070197816, "language_loss": 0.76586467, "learning_rate": 3.0210614639339998e-06, "loss": 0.7846061, "num_input_tokens_seen": 125090635, "step": 5814, "time_per_iteration": 2.698594331741333 }, { "auxiliary_loss_clip": 0.01114737, "auxiliary_loss_mlp": 0.00773337, "balance_loss_clip": 1.05010188, "balance_loss_mlp": 1.00060046, "epoch": 0.3496167142642417, "flos": 26465661417600.0, "grad_norm": 1.9777422761312171, "language_loss": 0.84760284, "learning_rate": 3.020726562247328e-06, "loss": 0.86648357, "num_input_tokens_seen": 125110070, "step": 5815, "time_per_iteration": 2.7839486598968506 }, { "auxiliary_loss_clip": 0.01117022, "auxiliary_loss_mlp": 0.01031007, "balance_loss_clip": 1.04850423, "balance_loss_mlp": 1.01695168, "epoch": 0.34967683751690964, "flos": 17414843938560.0, "grad_norm": 2.1137892099104674, "language_loss": 0.77541941, "learning_rate": 3.0203916218546024e-06, "loss": 0.79689968, "num_input_tokens_seen": 125125730, "step": 5816, "time_per_iteration": 2.6244633197784424 }, { "auxiliary_loss_clip": 0.01122041, "auxiliary_loss_mlp": 0.01042966, "balance_loss_clip": 1.05198002, "balance_loss_mlp": 1.0282141, "epoch": 0.3497369607695776, "flos": 22600021691520.0, "grad_norm": 2.2643435778821246, "language_loss": 0.5898062, "learning_rate": 3.0200566427685246e-06, "loss": 0.61145627, "num_input_tokens_seen": 125146195, "step": 5817, "time_per_iteration": 2.676058530807495 }, { "auxiliary_loss_clip": 0.01065616, "auxiliary_loss_mlp": 0.01004328, "balance_loss_clip": 1.03704262, "balance_loss_mlp": 1.00290895, "epoch": 0.34979708402224563, "flos": 68529374818560.0, "grad_norm": 0.8661744616347857, "language_loss": 0.59915632, "learning_rate": 3.0197216250017975e-06, "loss": 0.61985576, "num_input_tokens_seen": 125207790, "step": 5818, "time_per_iteration": 3.2298331260681152 }, { "auxiliary_loss_clip": 0.0109396, "auxiliary_loss_mlp": 0.01044055, "balance_loss_clip": 1.04599476, "balance_loss_mlp": 1.02892733, "epoch": 0.3498572072749136, "flos": 18989634988800.0, "grad_norm": 2.0582091611638713, "language_loss": 0.83473527, "learning_rate": 3.019386568567123e-06, "loss": 0.85611546, "num_input_tokens_seen": 125226220, "step": 5819, "time_per_iteration": 2.6558237075805664 }, { "auxiliary_loss_clip": 0.01106439, "auxiliary_loss_mlp": 0.01034351, "balance_loss_clip": 1.04502416, "balance_loss_mlp": 1.01987886, "epoch": 0.34991733052758156, "flos": 27818883423360.0, "grad_norm": 1.848700539441483, "language_loss": 0.7078613, "learning_rate": 3.0190514734772083e-06, "loss": 0.72926915, "num_input_tokens_seen": 125247485, "step": 5820, "time_per_iteration": 2.703023672103882 }, { "auxiliary_loss_clip": 0.01122902, "auxiliary_loss_mlp": 0.01036767, "balance_loss_clip": 1.04821718, "balance_loss_mlp": 1.02288496, "epoch": 0.3499774537802495, "flos": 33584197737600.0, "grad_norm": 1.691680241057735, "language_loss": 0.70418453, "learning_rate": 3.018716339744759e-06, "loss": 0.7257812, "num_input_tokens_seen": 125268625, "step": 5821, "time_per_iteration": 2.7258172035217285 }, { "auxiliary_loss_clip": 0.01128016, "auxiliary_loss_mlp": 0.01045237, "balance_loss_clip": 1.05040097, "balance_loss_mlp": 1.02945328, "epoch": 0.3500375770329175, "flos": 23476744851840.0, "grad_norm": 3.022669367007059, "language_loss": 0.73552108, "learning_rate": 3.0183811673824842e-06, "loss": 0.75725359, "num_input_tokens_seen": 125287530, "step": 5822, "time_per_iteration": 2.6288442611694336 }, { "auxiliary_loss_clip": 0.01111612, "auxiliary_loss_mlp": 0.01034787, "balance_loss_clip": 1.04867673, "balance_loss_mlp": 1.0193131, "epoch": 0.35009770028558546, "flos": 19026048401280.0, "grad_norm": 13.86145468617928, "language_loss": 0.78286207, "learning_rate": 3.018045956403094e-06, "loss": 0.80432606, "num_input_tokens_seen": 125307020, "step": 5823, "time_per_iteration": 2.585644245147705 }, { "auxiliary_loss_clip": 0.01050549, "auxiliary_loss_mlp": 0.01002993, "balance_loss_clip": 1.03169346, "balance_loss_mlp": 1.00141954, "epoch": 0.3501578235382534, "flos": 68351868783360.0, "grad_norm": 0.7268668465066358, "language_loss": 0.59232962, "learning_rate": 3.017710706819298e-06, "loss": 0.61286497, "num_input_tokens_seen": 125370445, "step": 5824, "time_per_iteration": 3.2155251502990723 }, { "auxiliary_loss_clip": 0.01110681, "auxiliary_loss_mlp": 0.01041197, "balance_loss_clip": 1.04737854, "balance_loss_mlp": 1.02561092, "epoch": 0.3502179467909214, "flos": 21250893836160.0, "grad_norm": 3.9873136748139126, "language_loss": 0.84533477, "learning_rate": 3.017375418643811e-06, "loss": 0.86685359, "num_input_tokens_seen": 125388900, "step": 5825, "time_per_iteration": 2.687849998474121 }, { "auxiliary_loss_clip": 0.01123129, "auxiliary_loss_mlp": 0.00772852, "balance_loss_clip": 1.04982102, "balance_loss_mlp": 1.00084817, "epoch": 0.35027807004358935, "flos": 11942955826560.0, "grad_norm": 3.7970216760931654, "language_loss": 0.83272213, "learning_rate": 3.0170400918893464e-06, "loss": 0.85168195, "num_input_tokens_seen": 125402675, "step": 5826, "time_per_iteration": 2.623713970184326 }, { "auxiliary_loss_clip": 0.01108751, "auxiliary_loss_mlp": 0.01045941, "balance_loss_clip": 1.04680669, "balance_loss_mlp": 1.0308249, "epoch": 0.3503381932962573, "flos": 21470918595840.0, "grad_norm": 1.799644232020304, "language_loss": 0.8068707, "learning_rate": 3.0167047265686186e-06, "loss": 0.82841766, "num_input_tokens_seen": 125421360, "step": 5827, "time_per_iteration": 2.7149739265441895 }, { "auxiliary_loss_clip": 0.01080927, "auxiliary_loss_mlp": 0.01041383, "balance_loss_clip": 1.04276204, "balance_loss_mlp": 1.02641606, "epoch": 0.3503983165489253, "flos": 21251109317760.0, "grad_norm": 3.105536532024743, "language_loss": 0.71077561, "learning_rate": 3.0163693226943467e-06, "loss": 0.73199868, "num_input_tokens_seen": 125440000, "step": 5828, "time_per_iteration": 2.7468550205230713 }, { "auxiliary_loss_clip": 0.01126682, "auxiliary_loss_mlp": 0.01050267, "balance_loss_clip": 1.05060673, "balance_loss_mlp": 1.0323143, "epoch": 0.35045843980159325, "flos": 27815723026560.0, "grad_norm": 2.750124615693701, "language_loss": 0.79695857, "learning_rate": 3.016033880279248e-06, "loss": 0.81872809, "num_input_tokens_seen": 125460390, "step": 5829, "time_per_iteration": 2.6937646865844727 }, { "auxiliary_loss_clip": 0.01096574, "auxiliary_loss_mlp": 0.01044418, "balance_loss_clip": 1.0481379, "balance_loss_mlp": 1.02766919, "epoch": 0.3505185630542612, "flos": 25921148169600.0, "grad_norm": 1.9090298023730403, "language_loss": 0.72606629, "learning_rate": 3.0156983993360417e-06, "loss": 0.74747616, "num_input_tokens_seen": 125478410, "step": 5830, "time_per_iteration": 2.7369346618652344 }, { "auxiliary_loss_clip": 0.01090166, "auxiliary_loss_mlp": 0.01037306, "balance_loss_clip": 1.04190445, "balance_loss_mlp": 1.02131414, "epoch": 0.35057868630692923, "flos": 20521763660160.0, "grad_norm": 2.5268343856675437, "language_loss": 0.88473773, "learning_rate": 3.0153628798774513e-06, "loss": 0.90601242, "num_input_tokens_seen": 125495975, "step": 5831, "time_per_iteration": 2.716801166534424 }, { "auxiliary_loss_clip": 0.01076431, "auxiliary_loss_mlp": 0.01046131, "balance_loss_clip": 1.04348278, "balance_loss_mlp": 1.03036547, "epoch": 0.3506388095595972, "flos": 20448649526400.0, "grad_norm": 2.8335622037275052, "language_loss": 0.78706706, "learning_rate": 3.0150273219161985e-06, "loss": 0.80829263, "num_input_tokens_seen": 125515035, "step": 5832, "time_per_iteration": 2.719874143600464 }, { "auxiliary_loss_clip": 0.01096023, "auxiliary_loss_mlp": 0.01049214, "balance_loss_clip": 1.04483593, "balance_loss_mlp": 1.0303669, "epoch": 0.35069893281226516, "flos": 23109665811840.0, "grad_norm": 2.771771323399588, "language_loss": 0.71084702, "learning_rate": 3.014691725465008e-06, "loss": 0.73229945, "num_input_tokens_seen": 125535555, "step": 5833, "time_per_iteration": 2.729029655456543 }, { "auxiliary_loss_clip": 0.0111933, "auxiliary_loss_mlp": 0.01035784, "balance_loss_clip": 1.04690456, "balance_loss_mlp": 1.02119827, "epoch": 0.35075905606493313, "flos": 27271999877760.0, "grad_norm": 1.4652984704802052, "language_loss": 0.80866987, "learning_rate": 3.014356090536606e-06, "loss": 0.830221, "num_input_tokens_seen": 125558195, "step": 5834, "time_per_iteration": 2.6999855041503906 }, { "auxiliary_loss_clip": 0.01086162, "auxiliary_loss_mlp": 0.01041057, "balance_loss_clip": 1.05142856, "balance_loss_mlp": 1.02516639, "epoch": 0.3508191793176011, "flos": 19128608709120.0, "grad_norm": 2.24398587431922, "language_loss": 0.84067535, "learning_rate": 3.0140204171437183e-06, "loss": 0.86194754, "num_input_tokens_seen": 125575375, "step": 5835, "time_per_iteration": 2.7401607036590576 }, { "auxiliary_loss_clip": 0.01072219, "auxiliary_loss_mlp": 0.0104369, "balance_loss_clip": 1.04324877, "balance_loss_mlp": 1.02816927, "epoch": 0.35087930257026906, "flos": 25557588662400.0, "grad_norm": 1.6286460178957367, "language_loss": 0.76643491, "learning_rate": 3.0136847052990754e-06, "loss": 0.78759408, "num_input_tokens_seen": 125596745, "step": 5836, "time_per_iteration": 2.767824649810791 }, { "auxiliary_loss_clip": 0.01095252, "auxiliary_loss_mlp": 0.01044499, "balance_loss_clip": 1.04785156, "balance_loss_mlp": 1.02751756, "epoch": 0.350939425822937, "flos": 18004246208640.0, "grad_norm": 2.0145924652365945, "language_loss": 0.77402902, "learning_rate": 3.0133489550154074e-06, "loss": 0.79542655, "num_input_tokens_seen": 125613980, "step": 5837, "time_per_iteration": 2.684300661087036 }, { "auxiliary_loss_clip": 0.01122261, "auxiliary_loss_mlp": 0.01044889, "balance_loss_clip": 1.04895687, "balance_loss_mlp": 1.02941537, "epoch": 0.350999549075605, "flos": 22273198819200.0, "grad_norm": 2.68275803808264, "language_loss": 0.67695981, "learning_rate": 3.0130131663054442e-06, "loss": 0.69863135, "num_input_tokens_seen": 125632100, "step": 5838, "time_per_iteration": 2.6679129600524902 }, { "auxiliary_loss_clip": 0.01133084, "auxiliary_loss_mlp": 0.01041419, "balance_loss_clip": 1.04808521, "balance_loss_mlp": 1.02538526, "epoch": 0.35105967232827295, "flos": 14392279307520.0, "grad_norm": 2.478699358378921, "language_loss": 0.83575064, "learning_rate": 3.0126773391819215e-06, "loss": 0.85749567, "num_input_tokens_seen": 125649190, "step": 5839, "time_per_iteration": 2.7186849117279053 }, { "auxiliary_loss_clip": 0.01125827, "auxiliary_loss_mlp": 0.01045138, "balance_loss_clip": 1.0484879, "balance_loss_mlp": 1.02930689, "epoch": 0.3511197955809409, "flos": 25082346792960.0, "grad_norm": 2.56286420283892, "language_loss": 0.58882701, "learning_rate": 3.012341473657572e-06, "loss": 0.61053669, "num_input_tokens_seen": 125668680, "step": 5840, "time_per_iteration": 2.7048165798187256 }, { "auxiliary_loss_clip": 0.01093858, "auxiliary_loss_mlp": 0.01043209, "balance_loss_clip": 1.0449121, "balance_loss_mlp": 1.02719963, "epoch": 0.3511799188336089, "flos": 25884160139520.0, "grad_norm": 2.762376787670534, "language_loss": 0.87442869, "learning_rate": 3.0120055697451322e-06, "loss": 0.89579934, "num_input_tokens_seen": 125686935, "step": 5841, "time_per_iteration": 2.763007402420044 }, { "auxiliary_loss_clip": 0.01116677, "auxiliary_loss_mlp": 0.01038697, "balance_loss_clip": 1.04990196, "balance_loss_mlp": 1.02083993, "epoch": 0.35124004208627685, "flos": 20083725302400.0, "grad_norm": 1.9868500880648916, "language_loss": 0.75116056, "learning_rate": 3.0116696274573406e-06, "loss": 0.77271438, "num_input_tokens_seen": 125707180, "step": 5842, "time_per_iteration": 2.703010082244873 }, { "auxiliary_loss_clip": 0.01124735, "auxiliary_loss_mlp": 0.01045785, "balance_loss_clip": 1.04863322, "balance_loss_mlp": 1.0302043, "epoch": 0.3513001653389448, "flos": 17783431349760.0, "grad_norm": 2.134458584945634, "language_loss": 0.68687361, "learning_rate": 3.0113336468069346e-06, "loss": 0.70857882, "num_input_tokens_seen": 125722780, "step": 5843, "time_per_iteration": 2.6459767818450928 }, { "auxiliary_loss_clip": 0.01135637, "auxiliary_loss_mlp": 0.01046534, "balance_loss_clip": 1.05054379, "balance_loss_mlp": 1.0305481, "epoch": 0.3513602885916128, "flos": 29387138198400.0, "grad_norm": 2.0610262324560984, "language_loss": 0.65392244, "learning_rate": 3.010997627806655e-06, "loss": 0.67574418, "num_input_tokens_seen": 125742110, "step": 5844, "time_per_iteration": 2.6542131900787354 }, { "auxiliary_loss_clip": 0.01119986, "auxiliary_loss_mlp": 0.01042575, "balance_loss_clip": 1.04791713, "balance_loss_mlp": 1.02620745, "epoch": 0.3514204118442808, "flos": 16179876483840.0, "grad_norm": 2.0120705985466394, "language_loss": 0.75180912, "learning_rate": 3.010661570469245e-06, "loss": 0.77343476, "num_input_tokens_seen": 125759980, "step": 5845, "time_per_iteration": 2.686753511428833 }, { "auxiliary_loss_clip": 0.01122626, "auxiliary_loss_mlp": 0.01043989, "balance_loss_clip": 1.0485301, "balance_loss_mlp": 1.02835488, "epoch": 0.35148053509694877, "flos": 23834665923840.0, "grad_norm": 4.021226487899694, "language_loss": 0.73548663, "learning_rate": 3.0103254748074465e-06, "loss": 0.7571528, "num_input_tokens_seen": 125772660, "step": 5846, "time_per_iteration": 2.67868971824646 }, { "auxiliary_loss_clip": 0.01094187, "auxiliary_loss_mlp": 0.01044379, "balance_loss_clip": 1.04565465, "balance_loss_mlp": 1.02834511, "epoch": 0.35154065834961673, "flos": 20991295267200.0, "grad_norm": 1.687499817432144, "language_loss": 0.756024, "learning_rate": 3.0099893408340046e-06, "loss": 0.77740967, "num_input_tokens_seen": 125791935, "step": 5847, "time_per_iteration": 2.749495267868042 }, { "auxiliary_loss_clip": 0.011087, "auxiliary_loss_mlp": 0.01034036, "balance_loss_clip": 1.04465413, "balance_loss_mlp": 1.01871789, "epoch": 0.3516007816022847, "flos": 33255471444480.0, "grad_norm": 2.8847551511625675, "language_loss": 0.71752924, "learning_rate": 3.009653168561666e-06, "loss": 0.73895657, "num_input_tokens_seen": 125813455, "step": 5848, "time_per_iteration": 4.367843151092529 }, { "auxiliary_loss_clip": 0.0111724, "auxiliary_loss_mlp": 0.01051356, "balance_loss_clip": 1.04754996, "balance_loss_mlp": 1.03528619, "epoch": 0.35166090485495266, "flos": 11726953390080.0, "grad_norm": 2.1303857634409455, "language_loss": 0.89211285, "learning_rate": 3.009316958003178e-06, "loss": 0.91379881, "num_input_tokens_seen": 125827660, "step": 5849, "time_per_iteration": 2.720156192779541 }, { "auxiliary_loss_clip": 0.01112345, "auxiliary_loss_mlp": 0.01035199, "balance_loss_clip": 1.04670548, "balance_loss_mlp": 1.01948714, "epoch": 0.3517210281076206, "flos": 22638446265600.0, "grad_norm": 5.671837642447228, "language_loss": 0.74645329, "learning_rate": 3.0089807091712897e-06, "loss": 0.76792872, "num_input_tokens_seen": 125846655, "step": 5850, "time_per_iteration": 5.769666910171509 }, { "auxiliary_loss_clip": 0.01124277, "auxiliary_loss_mlp": 0.01039165, "balance_loss_clip": 1.05061293, "balance_loss_mlp": 1.02304828, "epoch": 0.3517811513602886, "flos": 21322750993920.0, "grad_norm": 4.453824391316201, "language_loss": 0.75497609, "learning_rate": 3.0086444220787515e-06, "loss": 0.77661049, "num_input_tokens_seen": 125866290, "step": 5851, "time_per_iteration": 2.6903436183929443 }, { "auxiliary_loss_clip": 0.01109028, "auxiliary_loss_mlp": 0.01043585, "balance_loss_clip": 1.047647, "balance_loss_mlp": 1.02581048, "epoch": 0.35184127461295656, "flos": 21032880238080.0, "grad_norm": 2.6842208339362714, "language_loss": 0.8711859, "learning_rate": 3.0083080967383165e-06, "loss": 0.892712, "num_input_tokens_seen": 125884620, "step": 5852, "time_per_iteration": 4.37211275100708 }, { "auxiliary_loss_clip": 0.01134086, "auxiliary_loss_mlp": 0.01034974, "balance_loss_clip": 1.05088282, "balance_loss_mlp": 1.02020407, "epoch": 0.3519013978656245, "flos": 22455265881600.0, "grad_norm": 4.894656899057391, "language_loss": 0.67756367, "learning_rate": 3.007971733162737e-06, "loss": 0.69925427, "num_input_tokens_seen": 125902430, "step": 5853, "time_per_iteration": 2.6657445430755615 }, { "auxiliary_loss_clip": 0.0110992, "auxiliary_loss_mlp": 0.01035315, "balance_loss_clip": 1.04499912, "balance_loss_mlp": 1.01943672, "epoch": 0.3519615211182925, "flos": 13115295918720.0, "grad_norm": 1.9396695842158058, "language_loss": 0.80834955, "learning_rate": 3.0076353313647686e-06, "loss": 0.82980192, "num_input_tokens_seen": 125920570, "step": 5854, "time_per_iteration": 2.741804361343384 }, { "auxiliary_loss_clip": 0.0111683, "auxiliary_loss_mlp": 0.01035573, "balance_loss_clip": 1.05230534, "balance_loss_mlp": 1.02117872, "epoch": 0.35202164437096045, "flos": 19135144984320.0, "grad_norm": 2.236186864476635, "language_loss": 0.73234653, "learning_rate": 3.0072988913571666e-06, "loss": 0.75387061, "num_input_tokens_seen": 125939800, "step": 5855, "time_per_iteration": 2.730731725692749 }, { "auxiliary_loss_clip": 0.0113392, "auxiliary_loss_mlp": 0.01038425, "balance_loss_clip": 1.05024409, "balance_loss_mlp": 1.02407861, "epoch": 0.3520817676236284, "flos": 26542187343360.0, "grad_norm": 2.4482136775911427, "language_loss": 0.71000826, "learning_rate": 3.006962413152691e-06, "loss": 0.73173165, "num_input_tokens_seen": 125958720, "step": 5856, "time_per_iteration": 2.632906436920166 }, { "auxiliary_loss_clip": 0.01121339, "auxiliary_loss_mlp": 0.01047265, "balance_loss_clip": 1.0479008, "balance_loss_mlp": 1.03056359, "epoch": 0.3521418908762964, "flos": 44893472803200.0, "grad_norm": 1.9582827204032656, "language_loss": 0.61505377, "learning_rate": 3.0066258967640987e-06, "loss": 0.63673985, "num_input_tokens_seen": 125984310, "step": 5857, "time_per_iteration": 2.8992249965667725 }, { "auxiliary_loss_clip": 0.01126198, "auxiliary_loss_mlp": 0.0103782, "balance_loss_clip": 1.05141187, "balance_loss_mlp": 1.02197754, "epoch": 0.3522020141289644, "flos": 20187398931840.0, "grad_norm": 2.047463358229584, "language_loss": 0.73246485, "learning_rate": 3.006289342204152e-06, "loss": 0.75410509, "num_input_tokens_seen": 126002410, "step": 5858, "time_per_iteration": 2.6754567623138428 }, { "auxiliary_loss_clip": 0.01139705, "auxiliary_loss_mlp": 0.01044718, "balance_loss_clip": 1.05193448, "balance_loss_mlp": 1.028947, "epoch": 0.35226213738163237, "flos": 27563917708800.0, "grad_norm": 1.8174320112537778, "language_loss": 0.7662344, "learning_rate": 3.0059527494856126e-06, "loss": 0.78807867, "num_input_tokens_seen": 126022490, "step": 5859, "time_per_iteration": 2.6464414596557617 }, { "auxiliary_loss_clip": 0.01123734, "auxiliary_loss_mlp": 0.0104748, "balance_loss_clip": 1.05600715, "balance_loss_mlp": 1.03037381, "epoch": 0.35232226063430033, "flos": 22966310632320.0, "grad_norm": 2.0728265984729974, "language_loss": 0.71452159, "learning_rate": 3.0056161186212435e-06, "loss": 0.73623371, "num_input_tokens_seen": 126042895, "step": 5860, "time_per_iteration": 2.7567954063415527 }, { "auxiliary_loss_clip": 0.01107752, "auxiliary_loss_mlp": 0.01042463, "balance_loss_clip": 1.04505348, "balance_loss_mlp": 1.02517724, "epoch": 0.3523823838869683, "flos": 19168290259200.0, "grad_norm": 2.4820154826508896, "language_loss": 0.66456246, "learning_rate": 3.005279449623811e-06, "loss": 0.6860646, "num_input_tokens_seen": 126060130, "step": 5861, "time_per_iteration": 2.6954853534698486 }, { "auxiliary_loss_clip": 0.01114832, "auxiliary_loss_mlp": 0.01037396, "balance_loss_clip": 1.05085611, "balance_loss_mlp": 1.0220778, "epoch": 0.35244250713963626, "flos": 17930988420480.0, "grad_norm": 2.552495084661914, "language_loss": 0.66833258, "learning_rate": 3.0049427425060815e-06, "loss": 0.68985492, "num_input_tokens_seen": 126077850, "step": 5862, "time_per_iteration": 2.758626699447632 }, { "auxiliary_loss_clip": 0.01111543, "auxiliary_loss_mlp": 0.01046885, "balance_loss_clip": 1.04932082, "balance_loss_mlp": 1.02999306, "epoch": 0.35250263039230423, "flos": 21432529935360.0, "grad_norm": 2.001922070828984, "language_loss": 0.77027225, "learning_rate": 3.0046059972808215e-06, "loss": 0.79185653, "num_input_tokens_seen": 126095985, "step": 5863, "time_per_iteration": 2.692974328994751 }, { "auxiliary_loss_clip": 0.01124448, "auxiliary_loss_mlp": 0.01041257, "balance_loss_clip": 1.05029762, "balance_loss_mlp": 1.02602828, "epoch": 0.3525627536449722, "flos": 27416863428480.0, "grad_norm": 2.204178263750967, "language_loss": 0.75406265, "learning_rate": 3.0042692139608024e-06, "loss": 0.77571976, "num_input_tokens_seen": 126116070, "step": 5864, "time_per_iteration": 2.7303273677825928 }, { "auxiliary_loss_clip": 0.01124417, "auxiliary_loss_mlp": 0.01048097, "balance_loss_clip": 1.04847336, "balance_loss_mlp": 1.03237331, "epoch": 0.35262287689764016, "flos": 24789818430720.0, "grad_norm": 2.3571129928423713, "language_loss": 0.79312253, "learning_rate": 3.003932392558793e-06, "loss": 0.81484771, "num_input_tokens_seen": 126135205, "step": 5865, "time_per_iteration": 2.6439075469970703 }, { "auxiliary_loss_clip": 0.01136688, "auxiliary_loss_mlp": 0.01047929, "balance_loss_clip": 1.05626893, "balance_loss_mlp": 1.03143001, "epoch": 0.3526830001503081, "flos": 17821604528640.0, "grad_norm": 2.261768767041389, "language_loss": 0.81215894, "learning_rate": 3.0035955330875677e-06, "loss": 0.83400512, "num_input_tokens_seen": 126151895, "step": 5866, "time_per_iteration": 2.649991035461426 }, { "auxiliary_loss_clip": 0.01095064, "auxiliary_loss_mlp": 0.01040513, "balance_loss_clip": 1.04940605, "balance_loss_mlp": 1.0227983, "epoch": 0.3527431234029761, "flos": 18078114528000.0, "grad_norm": 2.4092573216113182, "language_loss": 0.84224141, "learning_rate": 3.0032586355598986e-06, "loss": 0.86359721, "num_input_tokens_seen": 126168515, "step": 5867, "time_per_iteration": 2.7634172439575195 }, { "auxiliary_loss_clip": 0.01142449, "auxiliary_loss_mlp": 0.01051484, "balance_loss_clip": 1.05421114, "balance_loss_mlp": 1.03525996, "epoch": 0.35280324665564405, "flos": 19427350124160.0, "grad_norm": 1.8115003163784764, "language_loss": 0.74367464, "learning_rate": 3.0029216999885613e-06, "loss": 0.76561391, "num_input_tokens_seen": 126186460, "step": 5868, "time_per_iteration": 2.5986721515655518 }, { "auxiliary_loss_clip": 0.01131163, "auxiliary_loss_mlp": 0.01040977, "balance_loss_clip": 1.05391645, "balance_loss_mlp": 1.02457356, "epoch": 0.352863369908312, "flos": 21504027957120.0, "grad_norm": 1.9536193185751474, "language_loss": 0.6105355, "learning_rate": 3.0025847263863327e-06, "loss": 0.63225693, "num_input_tokens_seen": 126206170, "step": 5869, "time_per_iteration": 2.6737887859344482 }, { "auxiliary_loss_clip": 0.0112854, "auxiliary_loss_mlp": 0.01048512, "balance_loss_clip": 1.05128717, "balance_loss_mlp": 1.03254998, "epoch": 0.35292349316098, "flos": 22309504490880.0, "grad_norm": 2.4234624332717347, "language_loss": 0.74279565, "learning_rate": 3.0022477147659917e-06, "loss": 0.76456618, "num_input_tokens_seen": 126225605, "step": 5870, "time_per_iteration": 2.6921114921569824 }, { "auxiliary_loss_clip": 0.01126478, "auxiliary_loss_mlp": 0.01039703, "balance_loss_clip": 1.05037582, "balance_loss_mlp": 1.02376485, "epoch": 0.352983616413648, "flos": 33109745967360.0, "grad_norm": 1.6641276231491144, "language_loss": 0.71796882, "learning_rate": 3.001910665140316e-06, "loss": 0.73963058, "num_input_tokens_seen": 126250230, "step": 5871, "time_per_iteration": 2.8457682132720947 }, { "auxiliary_loss_clip": 0.01120204, "auxiliary_loss_mlp": 0.01040363, "balance_loss_clip": 1.04829907, "balance_loss_mlp": 1.02547359, "epoch": 0.35304373966631597, "flos": 18696603836160.0, "grad_norm": 2.0001362497177233, "language_loss": 0.73279023, "learning_rate": 3.0015735775220873e-06, "loss": 0.75439584, "num_input_tokens_seen": 126268315, "step": 5872, "time_per_iteration": 2.6763055324554443 }, { "auxiliary_loss_clip": 0.01114426, "auxiliary_loss_mlp": 0.0077352, "balance_loss_clip": 1.04808497, "balance_loss_mlp": 1.00056779, "epoch": 0.35310386291898394, "flos": 23364954748800.0, "grad_norm": 1.9067005964756008, "language_loss": 0.82472706, "learning_rate": 3.001236451924089e-06, "loss": 0.84360659, "num_input_tokens_seen": 126288390, "step": 5873, "time_per_iteration": 2.7487120628356934 }, { "auxiliary_loss_clip": 0.0111852, "auxiliary_loss_mlp": 0.01055173, "balance_loss_clip": 1.04805684, "balance_loss_mlp": 1.03743458, "epoch": 0.3531639861716519, "flos": 24461954064000.0, "grad_norm": 2.0747562837168956, "language_loss": 0.65867126, "learning_rate": 3.000899288359104e-06, "loss": 0.68040824, "num_input_tokens_seen": 126305750, "step": 5874, "time_per_iteration": 2.717100143432617 }, { "auxiliary_loss_clip": 0.01065517, "auxiliary_loss_mlp": 0.01018804, "balance_loss_clip": 1.04397154, "balance_loss_mlp": 1.01712346, "epoch": 0.35322410942431987, "flos": 70312446881280.0, "grad_norm": 0.7718710282270123, "language_loss": 0.61513722, "learning_rate": 3.000562086839917e-06, "loss": 0.63598049, "num_input_tokens_seen": 126362495, "step": 5875, "time_per_iteration": 3.1768009662628174 }, { "auxiliary_loss_clip": 0.0106968, "auxiliary_loss_mlp": 0.01053019, "balance_loss_clip": 1.04069328, "balance_loss_mlp": 1.03722405, "epoch": 0.35328423267698783, "flos": 19820894509440.0, "grad_norm": 1.9274751499515825, "language_loss": 0.79748046, "learning_rate": 3.0002248473793163e-06, "loss": 0.81870747, "num_input_tokens_seen": 126378320, "step": 5876, "time_per_iteration": 2.7911314964294434 }, { "auxiliary_loss_clip": 0.01038976, "auxiliary_loss_mlp": 0.00753375, "balance_loss_clip": 1.03853297, "balance_loss_mlp": 1.00146759, "epoch": 0.3533443559296558, "flos": 60826356391680.0, "grad_norm": 0.6715924709851474, "language_loss": 0.56771934, "learning_rate": 2.999887569990088e-06, "loss": 0.58564281, "num_input_tokens_seen": 126442735, "step": 5877, "time_per_iteration": 3.3190126419067383 }, { "auxiliary_loss_clip": 0.01106988, "auxiliary_loss_mlp": 0.0103768, "balance_loss_clip": 1.04755747, "balance_loss_mlp": 1.02150357, "epoch": 0.35340447918232376, "flos": 24755775315840.0, "grad_norm": 2.262624772342981, "language_loss": 0.72041059, "learning_rate": 2.999550254685024e-06, "loss": 0.74185729, "num_input_tokens_seen": 126463090, "step": 5878, "time_per_iteration": 2.769482135772705 }, { "auxiliary_loss_clip": 0.01111223, "auxiliary_loss_mlp": 0.01039233, "balance_loss_clip": 1.0494144, "balance_loss_mlp": 1.02333045, "epoch": 0.3534646024349917, "flos": 21796304924160.0, "grad_norm": 1.9529875004972157, "language_loss": 0.78282005, "learning_rate": 2.9992129014769136e-06, "loss": 0.80432463, "num_input_tokens_seen": 126482105, "step": 5879, "time_per_iteration": 2.7066614627838135 }, { "auxiliary_loss_clip": 0.01111375, "auxiliary_loss_mlp": 0.01046843, "balance_loss_clip": 1.05344558, "balance_loss_mlp": 1.0287354, "epoch": 0.3535247256876597, "flos": 20012119539840.0, "grad_norm": 2.4774809869114547, "language_loss": 0.63312674, "learning_rate": 2.9988755103785493e-06, "loss": 0.65470898, "num_input_tokens_seen": 126502125, "step": 5880, "time_per_iteration": 2.87187123298645 }, { "auxiliary_loss_clip": 0.01116729, "auxiliary_loss_mlp": 0.01037267, "balance_loss_clip": 1.05014002, "balance_loss_mlp": 1.02067327, "epoch": 0.35358484894032766, "flos": 18187929383040.0, "grad_norm": 2.079670586085082, "language_loss": 0.65503716, "learning_rate": 2.998538081402727e-06, "loss": 0.67657715, "num_input_tokens_seen": 126521950, "step": 5881, "time_per_iteration": 2.701570510864258 }, { "auxiliary_loss_clip": 0.01119778, "auxiliary_loss_mlp": 0.01035576, "balance_loss_clip": 1.05182576, "balance_loss_mlp": 1.02047253, "epoch": 0.3536449721929956, "flos": 22820369673600.0, "grad_norm": 1.437925300063569, "language_loss": 0.75797737, "learning_rate": 2.998200614562239e-06, "loss": 0.77953088, "num_input_tokens_seen": 126542445, "step": 5882, "time_per_iteration": 2.713350772857666 }, { "auxiliary_loss_clip": 0.01112568, "auxiliary_loss_mlp": 0.01044857, "balance_loss_clip": 1.0485872, "balance_loss_mlp": 1.02591491, "epoch": 0.3537050954456636, "flos": 26432336574720.0, "grad_norm": 2.160470372067537, "language_loss": 0.70095098, "learning_rate": 2.9978631098698847e-06, "loss": 0.72252524, "num_input_tokens_seen": 126560690, "step": 5883, "time_per_iteration": 2.77695631980896 }, { "auxiliary_loss_clip": 0.01107169, "auxiliary_loss_mlp": 0.01040706, "balance_loss_clip": 1.04937398, "balance_loss_mlp": 1.02364671, "epoch": 0.3537652186983316, "flos": 17197153562880.0, "grad_norm": 3.3935912100169117, "language_loss": 0.78052664, "learning_rate": 2.9975255673384614e-06, "loss": 0.80200535, "num_input_tokens_seen": 126577620, "step": 5884, "time_per_iteration": 2.8704800605773926 }, { "auxiliary_loss_clip": 0.0111409, "auxiliary_loss_mlp": 0.01036742, "balance_loss_clip": 1.05093837, "balance_loss_mlp": 1.02157819, "epoch": 0.3538253419509996, "flos": 19536769929600.0, "grad_norm": 1.9052381201351025, "language_loss": 0.7519542, "learning_rate": 2.9971879869807673e-06, "loss": 0.77346253, "num_input_tokens_seen": 126596235, "step": 5885, "time_per_iteration": 2.74930477142334 }, { "auxiliary_loss_clip": 0.01088229, "auxiliary_loss_mlp": 0.01040915, "balance_loss_clip": 1.04355764, "balance_loss_mlp": 1.02321255, "epoch": 0.35388546520366754, "flos": 12128578335360.0, "grad_norm": 3.360136520151105, "language_loss": 0.83904099, "learning_rate": 2.996850368809606e-06, "loss": 0.86033243, "num_input_tokens_seen": 126612830, "step": 5886, "time_per_iteration": 2.9362361431121826 }, { "auxiliary_loss_clip": 0.01139122, "auxiliary_loss_mlp": 0.01039479, "balance_loss_clip": 1.05223978, "balance_loss_mlp": 1.02178788, "epoch": 0.3539455884563355, "flos": 19678149861120.0, "grad_norm": 2.3342407880968765, "language_loss": 0.78239143, "learning_rate": 2.9965127128377787e-06, "loss": 0.8041774, "num_input_tokens_seen": 126630910, "step": 5887, "time_per_iteration": 4.157519340515137 }, { "auxiliary_loss_clip": 0.01079386, "auxiliary_loss_mlp": 0.01047635, "balance_loss_clip": 1.04380405, "balance_loss_mlp": 1.03155398, "epoch": 0.35400571170900347, "flos": 18072045129600.0, "grad_norm": 3.4693260211189614, "language_loss": 0.65532601, "learning_rate": 2.996175019078089e-06, "loss": 0.67659628, "num_input_tokens_seen": 126648365, "step": 5888, "time_per_iteration": 2.7693519592285156 }, { "auxiliary_loss_clip": 0.01108859, "auxiliary_loss_mlp": 0.01038745, "balance_loss_clip": 1.04853678, "balance_loss_mlp": 1.02278328, "epoch": 0.35406583496167143, "flos": 26068058795520.0, "grad_norm": 2.324375134725136, "language_loss": 0.77100271, "learning_rate": 2.9958372875433437e-06, "loss": 0.7924788, "num_input_tokens_seen": 126667500, "step": 5889, "time_per_iteration": 4.211338996887207 }, { "auxiliary_loss_clip": 0.0110217, "auxiliary_loss_mlp": 0.01041504, "balance_loss_clip": 1.05017257, "balance_loss_mlp": 1.0262332, "epoch": 0.3541259582143394, "flos": 19792453916160.0, "grad_norm": 2.074151752869495, "language_loss": 0.81132901, "learning_rate": 2.9954995182463478e-06, "loss": 0.83276576, "num_input_tokens_seen": 126686820, "step": 5890, "time_per_iteration": 4.248823642730713 }, { "auxiliary_loss_clip": 0.01112591, "auxiliary_loss_mlp": 0.01034659, "balance_loss_clip": 1.04692972, "balance_loss_mlp": 1.01979923, "epoch": 0.35418608146700736, "flos": 24022084112640.0, "grad_norm": 1.8036187380252735, "language_loss": 0.79384875, "learning_rate": 2.99516171119991e-06, "loss": 0.81532121, "num_input_tokens_seen": 126706965, "step": 5891, "time_per_iteration": 4.335815668106079 }, { "auxiliary_loss_clip": 0.01099264, "auxiliary_loss_mlp": 0.01046084, "balance_loss_clip": 1.04669261, "balance_loss_mlp": 1.0285244, "epoch": 0.35424620471967533, "flos": 12385770693120.0, "grad_norm": 2.015603194975926, "language_loss": 0.73404211, "learning_rate": 2.9948238664168415e-06, "loss": 0.75549555, "num_input_tokens_seen": 126724015, "step": 5892, "time_per_iteration": 2.760498046875 }, { "auxiliary_loss_clip": 0.01112321, "auxiliary_loss_mlp": 0.01041472, "balance_loss_clip": 1.04650092, "balance_loss_mlp": 1.02434158, "epoch": 0.3543063279723433, "flos": 19673624747520.0, "grad_norm": 2.094655212929219, "language_loss": 0.6720162, "learning_rate": 2.9944859839099518e-06, "loss": 0.6935541, "num_input_tokens_seen": 126737565, "step": 5893, "time_per_iteration": 2.671706199645996 }, { "auxiliary_loss_clip": 0.01084647, "auxiliary_loss_mlp": 0.01041527, "balance_loss_clip": 1.04317796, "balance_loss_mlp": 1.02440834, "epoch": 0.35436645122501126, "flos": 21909208348800.0, "grad_norm": 1.9115541405313234, "language_loss": 0.69860309, "learning_rate": 2.9941480636920533e-06, "loss": 0.71986485, "num_input_tokens_seen": 126756095, "step": 5894, "time_per_iteration": 2.720066785812378 }, { "auxiliary_loss_clip": 0.01111006, "auxiliary_loss_mlp": 0.00773076, "balance_loss_clip": 1.04764175, "balance_loss_mlp": 1.00055242, "epoch": 0.3544265744776792, "flos": 21719527603200.0, "grad_norm": 1.7998653616668008, "language_loss": 0.74833035, "learning_rate": 2.9938101057759615e-06, "loss": 0.76717114, "num_input_tokens_seen": 126775455, "step": 5895, "time_per_iteration": 2.8295304775238037 }, { "auxiliary_loss_clip": 0.011052, "auxiliary_loss_mlp": 0.01040742, "balance_loss_clip": 1.04288006, "balance_loss_mlp": 1.02485108, "epoch": 0.3544866977303472, "flos": 21213223447680.0, "grad_norm": 2.053997857318945, "language_loss": 0.83762395, "learning_rate": 2.993472110174491e-06, "loss": 0.85908329, "num_input_tokens_seen": 126792320, "step": 5896, "time_per_iteration": 2.723158836364746 }, { "auxiliary_loss_clip": 0.01111237, "auxiliary_loss_mlp": 0.00773671, "balance_loss_clip": 1.04756641, "balance_loss_mlp": 1.0005331, "epoch": 0.35454682098301515, "flos": 29311402371840.0, "grad_norm": 1.7709518935889355, "language_loss": 0.70033729, "learning_rate": 2.9931340769004576e-06, "loss": 0.71918637, "num_input_tokens_seen": 126813680, "step": 5897, "time_per_iteration": 2.744617223739624 }, { "auxiliary_loss_clip": 0.01111293, "auxiliary_loss_mlp": 0.01046033, "balance_loss_clip": 1.04829669, "balance_loss_mlp": 1.02830625, "epoch": 0.3546069442356832, "flos": 24316587722880.0, "grad_norm": 3.0934933528513344, "language_loss": 0.81546402, "learning_rate": 2.9927960059666816e-06, "loss": 0.83703721, "num_input_tokens_seen": 126834395, "step": 5898, "time_per_iteration": 2.77911376953125 }, { "auxiliary_loss_clip": 0.0113395, "auxiliary_loss_mlp": 0.01037456, "balance_loss_clip": 1.04943967, "balance_loss_mlp": 1.02232838, "epoch": 0.35466706748835114, "flos": 22857285876480.0, "grad_norm": 5.100417261000322, "language_loss": 0.73975331, "learning_rate": 2.9924578973859804e-06, "loss": 0.7614674, "num_input_tokens_seen": 126855145, "step": 5899, "time_per_iteration": 2.6566851139068604 }, { "auxiliary_loss_clip": 0.0113747, "auxiliary_loss_mlp": 0.00772565, "balance_loss_clip": 1.04971743, "balance_loss_mlp": 1.00056052, "epoch": 0.3547271907410191, "flos": 28330107742080.0, "grad_norm": 1.7615083390778834, "language_loss": 0.79458243, "learning_rate": 2.9921197511711763e-06, "loss": 0.81368273, "num_input_tokens_seen": 126873790, "step": 5900, "time_per_iteration": 2.6658642292022705 }, { "auxiliary_loss_clip": 0.0111331, "auxiliary_loss_mlp": 0.01044824, "balance_loss_clip": 1.04659319, "balance_loss_mlp": 1.0288384, "epoch": 0.35478731399368707, "flos": 23514092017920.0, "grad_norm": 2.160550694830747, "language_loss": 0.81303531, "learning_rate": 2.991781567335093e-06, "loss": 0.83461666, "num_input_tokens_seen": 126892865, "step": 5901, "time_per_iteration": 2.711568593978882 }, { "auxiliary_loss_clip": 0.01125037, "auxiliary_loss_mlp": 0.00772744, "balance_loss_clip": 1.05092883, "balance_loss_mlp": 1.00049663, "epoch": 0.35484743724635504, "flos": 18624315715200.0, "grad_norm": 2.0558354102165373, "language_loss": 0.75869077, "learning_rate": 2.9914433458905525e-06, "loss": 0.7776686, "num_input_tokens_seen": 126911935, "step": 5902, "time_per_iteration": 2.6833012104034424 }, { "auxiliary_loss_clip": 0.01123978, "auxiliary_loss_mlp": 0.01036322, "balance_loss_clip": 1.04852581, "balance_loss_mlp": 1.02142096, "epoch": 0.354907560499023, "flos": 17384499924480.0, "grad_norm": 2.534328384273088, "language_loss": 0.70550704, "learning_rate": 2.991105086850381e-06, "loss": 0.72711003, "num_input_tokens_seen": 126930040, "step": 5903, "time_per_iteration": 2.689303159713745 }, { "auxiliary_loss_clip": 0.01128401, "auxiliary_loss_mlp": 0.01036477, "balance_loss_clip": 1.05025887, "balance_loss_mlp": 1.02051437, "epoch": 0.35496768375169097, "flos": 19208546426880.0, "grad_norm": 3.3775979872187203, "language_loss": 0.7448622, "learning_rate": 2.9907667902274053e-06, "loss": 0.76651096, "num_input_tokens_seen": 126948390, "step": 5904, "time_per_iteration": 2.6360747814178467 }, { "auxiliary_loss_clip": 0.01113034, "auxiliary_loss_mlp": 0.00772738, "balance_loss_clip": 1.04721618, "balance_loss_mlp": 1.000543, "epoch": 0.35502780700435893, "flos": 18332792933760.0, "grad_norm": 3.051840518778985, "language_loss": 0.78653091, "learning_rate": 2.9904284560344536e-06, "loss": 0.80538863, "num_input_tokens_seen": 126964905, "step": 5905, "time_per_iteration": 2.8539419174194336 }, { "auxiliary_loss_clip": 0.01101916, "auxiliary_loss_mlp": 0.01038927, "balance_loss_clip": 1.04842138, "balance_loss_mlp": 1.02486014, "epoch": 0.3550879302570269, "flos": 15448555578240.0, "grad_norm": 18.846860460510154, "language_loss": 0.72740704, "learning_rate": 2.990090084284356e-06, "loss": 0.74881542, "num_input_tokens_seen": 126982000, "step": 5906, "time_per_iteration": 2.7013392448425293 }, { "auxiliary_loss_clip": 0.01109726, "auxiliary_loss_mlp": 0.01039804, "balance_loss_clip": 1.04908431, "balance_loss_mlp": 1.02265012, "epoch": 0.35514805350969486, "flos": 21979197999360.0, "grad_norm": 1.821131131528883, "language_loss": 0.74746358, "learning_rate": 2.9897516749899426e-06, "loss": 0.76895893, "num_input_tokens_seen": 126998390, "step": 5907, "time_per_iteration": 2.7603847980499268 }, { "auxiliary_loss_clip": 0.01062812, "auxiliary_loss_mlp": 0.01042872, "balance_loss_clip": 1.03682017, "balance_loss_mlp": 1.02463293, "epoch": 0.3552081767623628, "flos": 29861949104640.0, "grad_norm": 3.0473905008627775, "language_loss": 0.7563526, "learning_rate": 2.989413228164047e-06, "loss": 0.77740943, "num_input_tokens_seen": 127020220, "step": 5908, "time_per_iteration": 2.8653454780578613 }, { "auxiliary_loss_clip": 0.01114185, "auxiliary_loss_mlp": 0.01042445, "balance_loss_clip": 1.05034626, "balance_loss_mlp": 1.02736473, "epoch": 0.3552683000150308, "flos": 26432264747520.0, "grad_norm": 2.926995842336842, "language_loss": 0.68243527, "learning_rate": 2.989074743819502e-06, "loss": 0.70400161, "num_input_tokens_seen": 127038585, "step": 5909, "time_per_iteration": 2.6967928409576416 }, { "auxiliary_loss_clip": 0.01120713, "auxiliary_loss_mlp": 0.01037454, "balance_loss_clip": 1.0503571, "balance_loss_mlp": 1.02271986, "epoch": 0.35532842326769876, "flos": 19785989468160.0, "grad_norm": 2.2169711344959864, "language_loss": 0.78605235, "learning_rate": 2.988736221969144e-06, "loss": 0.807634, "num_input_tokens_seen": 127056215, "step": 5910, "time_per_iteration": 2.65592885017395 }, { "auxiliary_loss_clip": 0.01111825, "auxiliary_loss_mlp": 0.01044022, "balance_loss_clip": 1.04383612, "balance_loss_mlp": 1.02745175, "epoch": 0.3553885465203668, "flos": 17239277237760.0, "grad_norm": 4.097628076705993, "language_loss": 0.71322721, "learning_rate": 2.98839766262581e-06, "loss": 0.73478568, "num_input_tokens_seen": 127075825, "step": 5911, "time_per_iteration": 2.6958134174346924 }, { "auxiliary_loss_clip": 0.01122761, "auxiliary_loss_mlp": 0.01041881, "balance_loss_clip": 1.04820287, "balance_loss_mlp": 1.02711153, "epoch": 0.35544866977303474, "flos": 14934350430720.0, "grad_norm": 2.592685980990988, "language_loss": 0.86703777, "learning_rate": 2.9880590658023366e-06, "loss": 0.88868415, "num_input_tokens_seen": 127091205, "step": 5912, "time_per_iteration": 2.615788221359253 }, { "auxiliary_loss_clip": 0.01113661, "auxiliary_loss_mlp": 0.01038659, "balance_loss_clip": 1.04849911, "balance_loss_mlp": 1.02413917, "epoch": 0.3555087930257027, "flos": 19756040503680.0, "grad_norm": 1.9602305341473392, "language_loss": 0.76948488, "learning_rate": 2.9877204315115646e-06, "loss": 0.79100811, "num_input_tokens_seen": 127109210, "step": 5913, "time_per_iteration": 2.7827799320220947 }, { "auxiliary_loss_clip": 0.01098195, "auxiliary_loss_mlp": 0.01036489, "balance_loss_clip": 1.04796672, "balance_loss_mlp": 1.02183783, "epoch": 0.3555689162783707, "flos": 21068252156160.0, "grad_norm": 1.6272917241322848, "language_loss": 0.82545209, "learning_rate": 2.9873817597663353e-06, "loss": 0.8467989, "num_input_tokens_seen": 127128400, "step": 5914, "time_per_iteration": 2.7242603302001953 }, { "auxiliary_loss_clip": 0.01137835, "auxiliary_loss_mlp": 0.01037677, "balance_loss_clip": 1.05178475, "balance_loss_mlp": 1.02247739, "epoch": 0.35562903953103864, "flos": 33069633454080.0, "grad_norm": 2.9034799926536, "language_loss": 0.70664769, "learning_rate": 2.98704305057949e-06, "loss": 0.72840279, "num_input_tokens_seen": 127149965, "step": 5915, "time_per_iteration": 2.6785290241241455 }, { "auxiliary_loss_clip": 0.01124956, "auxiliary_loss_mlp": 0.01042738, "balance_loss_clip": 1.04884696, "balance_loss_mlp": 1.02823067, "epoch": 0.3556891627837066, "flos": 20557853850240.0, "grad_norm": 1.7433450554379117, "language_loss": 0.76387751, "learning_rate": 2.9867043039638737e-06, "loss": 0.78555447, "num_input_tokens_seen": 127169865, "step": 5916, "time_per_iteration": 2.646141529083252 }, { "auxiliary_loss_clip": 0.01103991, "auxiliary_loss_mlp": 0.01039438, "balance_loss_clip": 1.04549897, "balance_loss_mlp": 1.02451277, "epoch": 0.35574928603637457, "flos": 20703327932160.0, "grad_norm": 1.7213233773991115, "language_loss": 0.88551259, "learning_rate": 2.986365519932332e-06, "loss": 0.9069469, "num_input_tokens_seen": 127188075, "step": 5917, "time_per_iteration": 2.735424757003784 }, { "auxiliary_loss_clip": 0.01057648, "auxiliary_loss_mlp": 0.01050179, "balance_loss_clip": 1.03888357, "balance_loss_mlp": 1.03190458, "epoch": 0.35580940928904253, "flos": 15194595444480.0, "grad_norm": 2.1986231946039916, "language_loss": 0.74800515, "learning_rate": 2.98602669849771e-06, "loss": 0.76908338, "num_input_tokens_seen": 127206065, "step": 5918, "time_per_iteration": 2.759612798690796 }, { "auxiliary_loss_clip": 0.01046226, "auxiliary_loss_mlp": 0.01004318, "balance_loss_clip": 1.03416467, "balance_loss_mlp": 1.00212467, "epoch": 0.3558695325417105, "flos": 58639145431680.0, "grad_norm": 0.9523078238877629, "language_loss": 0.63871694, "learning_rate": 2.985687839672857e-06, "loss": 0.65922242, "num_input_tokens_seen": 127257885, "step": 5919, "time_per_iteration": 2.974400281906128 }, { "auxiliary_loss_clip": 0.01125949, "auxiliary_loss_mlp": 0.01037737, "balance_loss_clip": 1.05126309, "balance_loss_mlp": 1.02168, "epoch": 0.35592965579437846, "flos": 22018233104640.0, "grad_norm": 2.3466450300124952, "language_loss": 0.73515332, "learning_rate": 2.9853489434706223e-06, "loss": 0.75679016, "num_input_tokens_seen": 127275550, "step": 5920, "time_per_iteration": 2.6402368545532227 }, { "auxiliary_loss_clip": 0.01092607, "auxiliary_loss_mlp": 0.01035798, "balance_loss_clip": 1.0452888, "balance_loss_mlp": 1.02082539, "epoch": 0.35598977904704643, "flos": 23367684182400.0, "grad_norm": 2.020155019062759, "language_loss": 0.76745147, "learning_rate": 2.985010009903857e-06, "loss": 0.78873557, "num_input_tokens_seen": 127295110, "step": 5921, "time_per_iteration": 2.7224855422973633 }, { "auxiliary_loss_clip": 0.01112186, "auxiliary_loss_mlp": 0.01038012, "balance_loss_clip": 1.04887438, "balance_loss_mlp": 1.0231111, "epoch": 0.3560499022997144, "flos": 17785334770560.0, "grad_norm": 2.0978128065546717, "language_loss": 0.68095905, "learning_rate": 2.9846710389854133e-06, "loss": 0.702461, "num_input_tokens_seen": 127312865, "step": 5922, "time_per_iteration": 2.6849706172943115 }, { "auxiliary_loss_clip": 0.01120912, "auxiliary_loss_mlp": 0.01035687, "balance_loss_clip": 1.04752564, "balance_loss_mlp": 1.02032125, "epoch": 0.35611002555238236, "flos": 20740459616640.0, "grad_norm": 3.470851899346702, "language_loss": 0.79121947, "learning_rate": 2.9843320307281454e-06, "loss": 0.81278539, "num_input_tokens_seen": 127331710, "step": 5923, "time_per_iteration": 2.659977436065674 }, { "auxiliary_loss_clip": 0.01118161, "auxiliary_loss_mlp": 0.01042419, "balance_loss_clip": 1.0530231, "balance_loss_mlp": 1.02770221, "epoch": 0.3561701488050504, "flos": 19462219251840.0, "grad_norm": 2.2084385051152946, "language_loss": 0.85266459, "learning_rate": 2.983992985144908e-06, "loss": 0.87427044, "num_input_tokens_seen": 127350950, "step": 5924, "time_per_iteration": 2.680994987487793 }, { "auxiliary_loss_clip": 0.01109604, "auxiliary_loss_mlp": 0.01046078, "balance_loss_clip": 1.04669881, "balance_loss_mlp": 1.02974653, "epoch": 0.35623027205771834, "flos": 30774942023040.0, "grad_norm": 3.12021389910605, "language_loss": 0.77619767, "learning_rate": 2.9836539022485578e-06, "loss": 0.79775453, "num_input_tokens_seen": 127369385, "step": 5925, "time_per_iteration": 2.854043960571289 }, { "auxiliary_loss_clip": 0.01078608, "auxiliary_loss_mlp": 0.01047631, "balance_loss_clip": 1.04546142, "balance_loss_mlp": 1.03274155, "epoch": 0.3562903953103863, "flos": 16981079299200.0, "grad_norm": 2.0406100546628108, "language_loss": 0.75402963, "learning_rate": 2.9833147820519535e-06, "loss": 0.77529198, "num_input_tokens_seen": 127386965, "step": 5926, "time_per_iteration": 4.347430467605591 }, { "auxiliary_loss_clip": 0.01110536, "auxiliary_loss_mlp": 0.00773423, "balance_loss_clip": 1.04907203, "balance_loss_mlp": 1.00041842, "epoch": 0.3563505185630543, "flos": 23839837482240.0, "grad_norm": 2.7011184644215254, "language_loss": 0.69563019, "learning_rate": 2.9829756245679544e-06, "loss": 0.71446979, "num_input_tokens_seen": 127406075, "step": 5927, "time_per_iteration": 2.8237216472625732 }, { "auxiliary_loss_clip": 0.01136293, "auxiliary_loss_mlp": 0.01040585, "balance_loss_clip": 1.05083871, "balance_loss_mlp": 1.0256958, "epoch": 0.35641064181572224, "flos": 22273450214400.0, "grad_norm": 2.594343371199836, "language_loss": 0.79681075, "learning_rate": 2.9826364298094212e-06, "loss": 0.81857955, "num_input_tokens_seen": 127425350, "step": 5928, "time_per_iteration": 4.171353340148926 }, { "auxiliary_loss_clip": 0.01139765, "auxiliary_loss_mlp": 0.01040338, "balance_loss_clip": 1.05304861, "balance_loss_mlp": 1.02473354, "epoch": 0.3564707650683902, "flos": 23001251587200.0, "grad_norm": 1.4355701611092584, "language_loss": 0.81758744, "learning_rate": 2.982297197789215e-06, "loss": 0.83938849, "num_input_tokens_seen": 127446335, "step": 5929, "time_per_iteration": 4.3162572383880615 }, { "auxiliary_loss_clip": 0.01120871, "auxiliary_loss_mlp": 0.01037566, "balance_loss_clip": 1.04776335, "balance_loss_mlp": 1.02304602, "epoch": 0.35653088832105817, "flos": 14684268965760.0, "grad_norm": 1.9323399136404307, "language_loss": 0.70277226, "learning_rate": 2.981957928520201e-06, "loss": 0.72435665, "num_input_tokens_seen": 127462795, "step": 5930, "time_per_iteration": 2.6527109146118164 }, { "auxiliary_loss_clip": 0.01131875, "auxiliary_loss_mlp": 0.01045641, "balance_loss_clip": 1.05533779, "balance_loss_mlp": 1.02960742, "epoch": 0.35659101157372614, "flos": 23477068074240.0, "grad_norm": 2.2535070260025147, "language_loss": 0.6758765, "learning_rate": 2.981618622015244e-06, "loss": 0.69765162, "num_input_tokens_seen": 127482675, "step": 5931, "time_per_iteration": 4.3453147411346436 }, { "auxiliary_loss_clip": 0.0112554, "auxiliary_loss_mlp": 0.01040124, "balance_loss_clip": 1.04992425, "balance_loss_mlp": 1.02531803, "epoch": 0.3566511348263941, "flos": 26578672583040.0, "grad_norm": 1.9436277425022137, "language_loss": 0.67792088, "learning_rate": 2.981279278287211e-06, "loss": 0.69957745, "num_input_tokens_seen": 127502275, "step": 5932, "time_per_iteration": 2.700096368789673 }, { "auxiliary_loss_clip": 0.01082532, "auxiliary_loss_mlp": 0.01033095, "balance_loss_clip": 1.04578543, "balance_loss_mlp": 1.01849222, "epoch": 0.35671125807906207, "flos": 13115008609920.0, "grad_norm": 5.160615382495107, "language_loss": 0.78454852, "learning_rate": 2.980939897348969e-06, "loss": 0.80570471, "num_input_tokens_seen": 127520195, "step": 5933, "time_per_iteration": 2.6900391578674316 }, { "auxiliary_loss_clip": 0.01121777, "auxiliary_loss_mlp": 0.01052933, "balance_loss_clip": 1.0480361, "balance_loss_mlp": 1.03600574, "epoch": 0.35677138133173003, "flos": 33000577557120.0, "grad_norm": 1.6861574442761758, "language_loss": 0.69256425, "learning_rate": 2.980600479213388e-06, "loss": 0.7143113, "num_input_tokens_seen": 127544495, "step": 5934, "time_per_iteration": 2.7415738105773926 }, { "auxiliary_loss_clip": 0.01117054, "auxiliary_loss_mlp": 0.0077763, "balance_loss_clip": 1.05076528, "balance_loss_mlp": 1.00057197, "epoch": 0.356831504584398, "flos": 20777842696320.0, "grad_norm": 1.9577931058258786, "language_loss": 0.70848507, "learning_rate": 2.9802610238933384e-06, "loss": 0.72743189, "num_input_tokens_seen": 127563810, "step": 5935, "time_per_iteration": 2.689974069595337 }, { "auxiliary_loss_clip": 0.01105553, "auxiliary_loss_mlp": 0.01040367, "balance_loss_clip": 1.04790044, "balance_loss_mlp": 1.02414298, "epoch": 0.35689162783706596, "flos": 12165566365440.0, "grad_norm": 2.8406009493899567, "language_loss": 0.7755211, "learning_rate": 2.979921531401692e-06, "loss": 0.79698032, "num_input_tokens_seen": 127579065, "step": 5936, "time_per_iteration": 2.741913318634033 }, { "auxiliary_loss_clip": 0.0112859, "auxiliary_loss_mlp": 0.00773213, "balance_loss_clip": 1.05281317, "balance_loss_mlp": 1.00073922, "epoch": 0.356951751089734, "flos": 23841489507840.0, "grad_norm": 1.4219917851433757, "language_loss": 0.64282179, "learning_rate": 2.9795820017513242e-06, "loss": 0.66183978, "num_input_tokens_seen": 127599105, "step": 5937, "time_per_iteration": 2.698432207107544 }, { "auxiliary_loss_clip": 0.011437, "auxiliary_loss_mlp": 0.00773044, "balance_loss_clip": 1.05475211, "balance_loss_mlp": 1.00064254, "epoch": 0.35701187434240195, "flos": 11722176881280.0, "grad_norm": 3.0634993604384744, "language_loss": 0.78483748, "learning_rate": 2.9792424349551073e-06, "loss": 0.80400497, "num_input_tokens_seen": 127614940, "step": 5938, "time_per_iteration": 2.617074489593506 }, { "auxiliary_loss_clip": 0.01104152, "auxiliary_loss_mlp": 0.01042471, "balance_loss_clip": 1.05522823, "balance_loss_mlp": 1.0276773, "epoch": 0.3570719975950699, "flos": 24898879100160.0, "grad_norm": 1.4921508018011957, "language_loss": 0.8058449, "learning_rate": 2.9789028310259202e-06, "loss": 0.82731104, "num_input_tokens_seen": 127634960, "step": 5939, "time_per_iteration": 2.805285930633545 }, { "auxiliary_loss_clip": 0.01119857, "auxiliary_loss_mlp": 0.01039048, "balance_loss_clip": 1.05386829, "balance_loss_mlp": 1.02343178, "epoch": 0.3571321208477379, "flos": 25994836920960.0, "grad_norm": 2.412769849050775, "language_loss": 0.79263425, "learning_rate": 2.9785631899766395e-06, "loss": 0.81422341, "num_input_tokens_seen": 127654545, "step": 5940, "time_per_iteration": 2.729759693145752 }, { "auxiliary_loss_clip": 0.01122797, "auxiliary_loss_mlp": 0.0103573, "balance_loss_clip": 1.05434561, "balance_loss_mlp": 1.01836729, "epoch": 0.35719224410040584, "flos": 14501663199360.0, "grad_norm": 2.99992676537861, "language_loss": 0.72561693, "learning_rate": 2.9782235118201443e-06, "loss": 0.74720228, "num_input_tokens_seen": 127672320, "step": 5941, "time_per_iteration": 2.7407357692718506 }, { "auxiliary_loss_clip": 0.01131761, "auxiliary_loss_mlp": 0.01043456, "balance_loss_clip": 1.0537883, "balance_loss_mlp": 1.02636182, "epoch": 0.3572523673530738, "flos": 31175453646720.0, "grad_norm": 4.524453853263744, "language_loss": 0.64234614, "learning_rate": 2.9778837965693154e-06, "loss": 0.66409832, "num_input_tokens_seen": 127693315, "step": 5942, "time_per_iteration": 2.693835735321045 }, { "auxiliary_loss_clip": 0.01125006, "auxiliary_loss_mlp": 0.0104058, "balance_loss_clip": 1.05074191, "balance_loss_mlp": 1.02442718, "epoch": 0.3573124906057418, "flos": 15851976203520.0, "grad_norm": 1.88999720959261, "language_loss": 0.7433207, "learning_rate": 2.9775440442370354e-06, "loss": 0.76497656, "num_input_tokens_seen": 127711570, "step": 5943, "time_per_iteration": 2.6655383110046387 }, { "auxiliary_loss_clip": 0.0107084, "auxiliary_loss_mlp": 0.01002098, "balance_loss_clip": 1.04128122, "balance_loss_mlp": 1.000512, "epoch": 0.35737261385840974, "flos": 60822729118080.0, "grad_norm": 0.7930578325967097, "language_loss": 0.60739905, "learning_rate": 2.9772042548361867e-06, "loss": 0.62812841, "num_input_tokens_seen": 127772475, "step": 5944, "time_per_iteration": 3.257052421569824 }, { "auxiliary_loss_clip": 0.01113544, "auxiliary_loss_mlp": 0.01038819, "balance_loss_clip": 1.05017304, "balance_loss_mlp": 1.02329779, "epoch": 0.3574327371110777, "flos": 18843765857280.0, "grad_norm": 2.0176419730945554, "language_loss": 0.72310007, "learning_rate": 2.976864428379655e-06, "loss": 0.74462366, "num_input_tokens_seen": 127790940, "step": 5945, "time_per_iteration": 2.6320457458496094 }, { "auxiliary_loss_clip": 0.01113199, "auxiliary_loss_mlp": 0.00773448, "balance_loss_clip": 1.04710388, "balance_loss_mlp": 1.00053716, "epoch": 0.35749286036374567, "flos": 23549679417600.0, "grad_norm": 2.1873404124300655, "language_loss": 0.81147355, "learning_rate": 2.976524564880326e-06, "loss": 0.83034003, "num_input_tokens_seen": 127808275, "step": 5946, "time_per_iteration": 2.7045581340789795 }, { "auxiliary_loss_clip": 0.01142015, "auxiliary_loss_mlp": 0.01041839, "balance_loss_clip": 1.05382085, "balance_loss_mlp": 1.02568626, "epoch": 0.35755298361641363, "flos": 21105491581440.0, "grad_norm": 1.5286248167474699, "language_loss": 0.68842459, "learning_rate": 2.9761846643510882e-06, "loss": 0.71026313, "num_input_tokens_seen": 127828840, "step": 5947, "time_per_iteration": 2.6360325813293457 }, { "auxiliary_loss_clip": 0.01107164, "auxiliary_loss_mlp": 0.01039633, "balance_loss_clip": 1.04598188, "balance_loss_mlp": 1.02426696, "epoch": 0.3576131068690816, "flos": 19245031666560.0, "grad_norm": 4.061535671212192, "language_loss": 0.76024956, "learning_rate": 2.9758447268048297e-06, "loss": 0.78171754, "num_input_tokens_seen": 127846240, "step": 5948, "time_per_iteration": 2.6968884468078613 }, { "auxiliary_loss_clip": 0.01081903, "auxiliary_loss_mlp": 0.01043894, "balance_loss_clip": 1.04692364, "balance_loss_mlp": 1.0291121, "epoch": 0.35767323012174956, "flos": 28654703971200.0, "grad_norm": 1.8353415788349725, "language_loss": 0.70553362, "learning_rate": 2.9755047522544415e-06, "loss": 0.72679162, "num_input_tokens_seen": 127866880, "step": 5949, "time_per_iteration": 2.8849079608917236 }, { "auxiliary_loss_clip": 0.01113321, "auxiliary_loss_mlp": 0.01041031, "balance_loss_clip": 1.04892492, "balance_loss_mlp": 1.02688098, "epoch": 0.35773335337441753, "flos": 17085363459840.0, "grad_norm": 2.820547719587591, "language_loss": 0.77489066, "learning_rate": 2.9751647407128154e-06, "loss": 0.79643422, "num_input_tokens_seen": 127883560, "step": 5950, "time_per_iteration": 2.6595206260681152 }, { "auxiliary_loss_clip": 0.0112732, "auxiliary_loss_mlp": 0.01041981, "balance_loss_clip": 1.04834211, "balance_loss_mlp": 1.02592397, "epoch": 0.35779347662708555, "flos": 15888605097600.0, "grad_norm": 1.7233867228761917, "language_loss": 0.72746027, "learning_rate": 2.9748246921928445e-06, "loss": 0.74915326, "num_input_tokens_seen": 127902330, "step": 5951, "time_per_iteration": 2.6544554233551025 }, { "auxiliary_loss_clip": 0.01129333, "auxiliary_loss_mlp": 0.01041471, "balance_loss_clip": 1.05047357, "balance_loss_mlp": 1.0256753, "epoch": 0.3578535998797535, "flos": 28658834035200.0, "grad_norm": 2.2344429074284693, "language_loss": 0.69326741, "learning_rate": 2.9744846067074236e-06, "loss": 0.71497542, "num_input_tokens_seen": 127922325, "step": 5952, "time_per_iteration": 2.7666146755218506 }, { "auxiliary_loss_clip": 0.01080716, "auxiliary_loss_mlp": 0.01049645, "balance_loss_clip": 1.04122877, "balance_loss_mlp": 1.03411233, "epoch": 0.3579137231324215, "flos": 37852432076160.0, "grad_norm": 4.791743787800428, "language_loss": 0.69651616, "learning_rate": 2.974144484269449e-06, "loss": 0.71781975, "num_input_tokens_seen": 127942635, "step": 5953, "time_per_iteration": 2.900196075439453 }, { "auxiliary_loss_clip": 0.01113192, "auxiliary_loss_mlp": 0.01034652, "balance_loss_clip": 1.0476222, "balance_loss_mlp": 1.0198822, "epoch": 0.35797384638508944, "flos": 22346851656960.0, "grad_norm": 2.3015234956442394, "language_loss": 0.6670965, "learning_rate": 2.9738043248918175e-06, "loss": 0.68857497, "num_input_tokens_seen": 127962520, "step": 5954, "time_per_iteration": 2.7609100341796875 }, { "auxiliary_loss_clip": 0.011102, "auxiliary_loss_mlp": 0.01040434, "balance_loss_clip": 1.04845512, "balance_loss_mlp": 1.02633798, "epoch": 0.3580339696377574, "flos": 13589711775360.0, "grad_norm": 1.9332002852280215, "language_loss": 0.74798024, "learning_rate": 2.9734641285874282e-06, "loss": 0.76948655, "num_input_tokens_seen": 127981180, "step": 5955, "time_per_iteration": 2.727787733078003 }, { "auxiliary_loss_clip": 0.01114534, "auxiliary_loss_mlp": 0.01039755, "balance_loss_clip": 1.04827058, "balance_loss_mlp": 1.02546179, "epoch": 0.3580940928904254, "flos": 23768231719680.0, "grad_norm": 1.745052650810224, "language_loss": 0.75871193, "learning_rate": 2.973123895369182e-06, "loss": 0.78025484, "num_input_tokens_seen": 127999725, "step": 5956, "time_per_iteration": 2.685006856918335 }, { "auxiliary_loss_clip": 0.01133387, "auxiliary_loss_mlp": 0.01035002, "balance_loss_clip": 1.05088747, "balance_loss_mlp": 1.0211376, "epoch": 0.35815421614309334, "flos": 19463871277440.0, "grad_norm": 4.15447674959345, "language_loss": 0.73543882, "learning_rate": 2.9727836252499805e-06, "loss": 0.75712276, "num_input_tokens_seen": 128018885, "step": 5957, "time_per_iteration": 2.6640098094940186 }, { "auxiliary_loss_clip": 0.01113163, "auxiliary_loss_mlp": 0.01037962, "balance_loss_clip": 1.04958355, "balance_loss_mlp": 1.02395511, "epoch": 0.3582143393957613, "flos": 23368186972800.0, "grad_norm": 3.3283201757671037, "language_loss": 0.70960939, "learning_rate": 2.972443318242726e-06, "loss": 0.73112065, "num_input_tokens_seen": 128037875, "step": 5958, "time_per_iteration": 2.6962838172912598 }, { "auxiliary_loss_clip": 0.01093969, "auxiliary_loss_mlp": 0.01038485, "balance_loss_clip": 1.04454029, "balance_loss_mlp": 1.02435875, "epoch": 0.35827446264842927, "flos": 26323275905280.0, "grad_norm": 2.5438119471533494, "language_loss": 0.88630176, "learning_rate": 2.972102974360324e-06, "loss": 0.90762633, "num_input_tokens_seen": 128056045, "step": 5959, "time_per_iteration": 2.713508129119873 }, { "auxiliary_loss_clip": 0.0113447, "auxiliary_loss_mlp": 0.010399, "balance_loss_clip": 1.05009389, "balance_loss_mlp": 1.02511787, "epoch": 0.35833458590109724, "flos": 30446610779520.0, "grad_norm": 2.2010810744211486, "language_loss": 0.58033586, "learning_rate": 2.971762593615679e-06, "loss": 0.60207957, "num_input_tokens_seen": 128077815, "step": 5960, "time_per_iteration": 2.685009479522705 }, { "auxiliary_loss_clip": 0.0113445, "auxiliary_loss_mlp": 0.01041748, "balance_loss_clip": 1.04900908, "balance_loss_mlp": 1.0255897, "epoch": 0.3583947091537652, "flos": 14829886702080.0, "grad_norm": 2.9088839798225035, "language_loss": 0.75860739, "learning_rate": 2.9714221760216993e-06, "loss": 0.7803694, "num_input_tokens_seen": 128095460, "step": 5961, "time_per_iteration": 2.591665506362915 }, { "auxiliary_loss_clip": 0.01103629, "auxiliary_loss_mlp": 0.01037452, "balance_loss_clip": 1.04985154, "balance_loss_mlp": 1.022223, "epoch": 0.35845483240643317, "flos": 34240644743040.0, "grad_norm": 1.7962139278871543, "language_loss": 0.70392656, "learning_rate": 2.971081721591294e-06, "loss": 0.72533739, "num_input_tokens_seen": 128118605, "step": 5962, "time_per_iteration": 2.78696346282959 }, { "auxiliary_loss_clip": 0.01116632, "auxiliary_loss_mlp": 0.01038106, "balance_loss_clip": 1.0513072, "balance_loss_mlp": 1.02532077, "epoch": 0.35851495565910113, "flos": 20960089326720.0, "grad_norm": 3.937600501619356, "language_loss": 0.75052911, "learning_rate": 2.9707412303373716e-06, "loss": 0.77207649, "num_input_tokens_seen": 128139205, "step": 5963, "time_per_iteration": 2.779210090637207 }, { "auxiliary_loss_clip": 0.01136067, "auxiliary_loss_mlp": 0.01044967, "balance_loss_clip": 1.05189323, "balance_loss_mlp": 1.03017306, "epoch": 0.35857507891176915, "flos": 22309863626880.0, "grad_norm": 3.7087256254692305, "language_loss": 0.78717148, "learning_rate": 2.9704007022728447e-06, "loss": 0.80898178, "num_input_tokens_seen": 128158765, "step": 5964, "time_per_iteration": 2.598621368408203 }, { "auxiliary_loss_clip": 0.01112011, "auxiliary_loss_mlp": 0.01041333, "balance_loss_clip": 1.05019569, "balance_loss_mlp": 1.02534723, "epoch": 0.3586352021644371, "flos": 23367863750400.0, "grad_norm": 2.0226045347569857, "language_loss": 0.66572571, "learning_rate": 2.970060137410626e-06, "loss": 0.6872592, "num_input_tokens_seen": 128177850, "step": 5965, "time_per_iteration": 2.684847116470337 }, { "auxiliary_loss_clip": 0.01132652, "auxiliary_loss_mlp": 0.0077213, "balance_loss_clip": 1.04819942, "balance_loss_mlp": 1.00052619, "epoch": 0.3586953254171051, "flos": 27849227437440.0, "grad_norm": 2.180178648475794, "language_loss": 0.79150963, "learning_rate": 2.9697195357636294e-06, "loss": 0.81055743, "num_input_tokens_seen": 128196925, "step": 5966, "time_per_iteration": 4.321925163269043 }, { "auxiliary_loss_clip": 0.01076497, "auxiliary_loss_mlp": 0.01042048, "balance_loss_clip": 1.04272628, "balance_loss_mlp": 1.02573991, "epoch": 0.35875544866977305, "flos": 19500500171520.0, "grad_norm": 2.3639555115609663, "language_loss": 0.91201752, "learning_rate": 2.9693788973447715e-06, "loss": 0.93320298, "num_input_tokens_seen": 128213955, "step": 5967, "time_per_iteration": 2.7455573081970215 }, { "auxiliary_loss_clip": 0.01101026, "auxiliary_loss_mlp": 0.01053293, "balance_loss_clip": 1.04794097, "balance_loss_mlp": 1.03494644, "epoch": 0.358815571922441, "flos": 21471134077440.0, "grad_norm": 5.4514250686274695, "language_loss": 0.80356693, "learning_rate": 2.9690382221669682e-06, "loss": 0.82511014, "num_input_tokens_seen": 128232980, "step": 5968, "time_per_iteration": 4.176758766174316 }, { "auxiliary_loss_clip": 0.01109306, "auxiliary_loss_mlp": 0.01052187, "balance_loss_clip": 1.04507756, "balance_loss_mlp": 1.03602266, "epoch": 0.358875695175109, "flos": 21835411856640.0, "grad_norm": 2.18425096992674, "language_loss": 0.8341769, "learning_rate": 2.9686975102431384e-06, "loss": 0.85579193, "num_input_tokens_seen": 128252795, "step": 5969, "time_per_iteration": 4.278231382369995 }, { "auxiliary_loss_clip": 0.01089525, "auxiliary_loss_mlp": 0.01034474, "balance_loss_clip": 1.04389262, "balance_loss_mlp": 1.0201571, "epoch": 0.35893581842777694, "flos": 32011633330560.0, "grad_norm": 2.040075228447558, "language_loss": 0.72608048, "learning_rate": 2.968356761586202e-06, "loss": 0.74732047, "num_input_tokens_seen": 128273115, "step": 5970, "time_per_iteration": 2.7784154415130615 }, { "auxiliary_loss_clip": 0.01110616, "auxiliary_loss_mlp": 0.01033542, "balance_loss_clip": 1.04673791, "balance_loss_mlp": 1.01868832, "epoch": 0.3589959416804449, "flos": 20485817124480.0, "grad_norm": 1.7975318028216438, "language_loss": 0.79562962, "learning_rate": 2.9680159762090805e-06, "loss": 0.8170712, "num_input_tokens_seen": 128292220, "step": 5971, "time_per_iteration": 4.519066333770752 }, { "auxiliary_loss_clip": 0.01098267, "auxiliary_loss_mlp": 0.01043063, "balance_loss_clip": 1.04956031, "balance_loss_mlp": 1.02766144, "epoch": 0.3590560649331129, "flos": 16180666583040.0, "grad_norm": 1.754965992567408, "language_loss": 0.78217793, "learning_rate": 2.967675154124696e-06, "loss": 0.80359125, "num_input_tokens_seen": 128310305, "step": 5972, "time_per_iteration": 2.7724227905273438 }, { "auxiliary_loss_clip": 0.01092509, "auxiliary_loss_mlp": 0.01035503, "balance_loss_clip": 1.04198921, "balance_loss_mlp": 1.02043509, "epoch": 0.35911618818578084, "flos": 20375391738240.0, "grad_norm": 2.4812117519320287, "language_loss": 0.8120966, "learning_rate": 2.9673342953459722e-06, "loss": 0.83337677, "num_input_tokens_seen": 128328305, "step": 5973, "time_per_iteration": 2.8266379833221436 }, { "auxiliary_loss_clip": 0.01042329, "auxiliary_loss_mlp": 0.01005341, "balance_loss_clip": 1.03088689, "balance_loss_mlp": 1.0036602, "epoch": 0.3591763114384488, "flos": 41236691685120.0, "grad_norm": 0.9056618080123127, "language_loss": 0.56743383, "learning_rate": 2.9669933998858355e-06, "loss": 0.58791053, "num_input_tokens_seen": 128378380, "step": 5974, "time_per_iteration": 3.0758044719696045 }, { "auxiliary_loss_clip": 0.01126274, "auxiliary_loss_mlp": 0.01037404, "balance_loss_clip": 1.04946434, "balance_loss_mlp": 1.02339661, "epoch": 0.35923643469111677, "flos": 18695454600960.0, "grad_norm": 2.5569125412900022, "language_loss": 0.68787563, "learning_rate": 2.9666524677572114e-06, "loss": 0.70951241, "num_input_tokens_seen": 128394315, "step": 5975, "time_per_iteration": 2.657576084136963 }, { "auxiliary_loss_clip": 0.01134392, "auxiliary_loss_mlp": 0.01038612, "balance_loss_clip": 1.04914975, "balance_loss_mlp": 1.02426553, "epoch": 0.35929655794378473, "flos": 25009950931200.0, "grad_norm": 1.804443520579843, "language_loss": 0.79982442, "learning_rate": 2.96631149897303e-06, "loss": 0.82155442, "num_input_tokens_seen": 128414515, "step": 5976, "time_per_iteration": 2.6197311878204346 }, { "auxiliary_loss_clip": 0.01074524, "auxiliary_loss_mlp": 0.01040105, "balance_loss_clip": 1.04337287, "balance_loss_mlp": 1.02404785, "epoch": 0.35935668119645275, "flos": 14975576265600.0, "grad_norm": 1.9714674470262432, "language_loss": 0.78818405, "learning_rate": 2.9659704935462194e-06, "loss": 0.8093304, "num_input_tokens_seen": 128430615, "step": 5977, "time_per_iteration": 2.735844612121582 }, { "auxiliary_loss_clip": 0.01094647, "auxiliary_loss_mlp": 0.01041851, "balance_loss_clip": 1.04511654, "balance_loss_mlp": 1.02789736, "epoch": 0.3594168044491207, "flos": 21178138838400.0, "grad_norm": 2.560014574379112, "language_loss": 0.79859221, "learning_rate": 2.9656294514897102e-06, "loss": 0.8199572, "num_input_tokens_seen": 128449480, "step": 5978, "time_per_iteration": 2.704134941101074 }, { "auxiliary_loss_clip": 0.01135434, "auxiliary_loss_mlp": 0.00773692, "balance_loss_clip": 1.04890609, "balance_loss_mlp": 1.00073409, "epoch": 0.3594769277017887, "flos": 27672152365440.0, "grad_norm": 4.868201977342703, "language_loss": 0.68310702, "learning_rate": 2.965288372816436e-06, "loss": 0.70219827, "num_input_tokens_seen": 128471465, "step": 5979, "time_per_iteration": 2.667222499847412 }, { "auxiliary_loss_clip": 0.01105596, "auxiliary_loss_mlp": 0.01033841, "balance_loss_clip": 1.04548645, "balance_loss_mlp": 1.01876652, "epoch": 0.35953705095445665, "flos": 23002328995200.0, "grad_norm": 6.298210491387724, "language_loss": 0.67445302, "learning_rate": 2.9649472575393296e-06, "loss": 0.69584739, "num_input_tokens_seen": 128490645, "step": 5980, "time_per_iteration": 2.6262974739074707 }, { "auxiliary_loss_clip": 0.01113802, "auxiliary_loss_mlp": 0.01040029, "balance_loss_clip": 1.04725266, "balance_loss_mlp": 1.02324414, "epoch": 0.3595971742071246, "flos": 25513992529920.0, "grad_norm": 1.8251567017824133, "language_loss": 0.71328801, "learning_rate": 2.964606105671327e-06, "loss": 0.73482633, "num_input_tokens_seen": 128510225, "step": 5981, "time_per_iteration": 2.696676254272461 }, { "auxiliary_loss_clip": 0.01109039, "auxiliary_loss_mlp": 0.01041685, "balance_loss_clip": 1.04872131, "balance_loss_mlp": 1.02498353, "epoch": 0.3596572974597926, "flos": 29862559635840.0, "grad_norm": 2.0089481436352767, "language_loss": 0.71294796, "learning_rate": 2.9642649172253635e-06, "loss": 0.73445523, "num_input_tokens_seen": 128530195, "step": 5982, "time_per_iteration": 2.7264244556427 }, { "auxiliary_loss_clip": 0.01114107, "auxiliary_loss_mlp": 0.01046667, "balance_loss_clip": 1.04542398, "balance_loss_mlp": 1.03115773, "epoch": 0.35971742071246054, "flos": 23112538899840.0, "grad_norm": 1.8520970942870048, "language_loss": 0.75614822, "learning_rate": 2.9639236922143786e-06, "loss": 0.77775598, "num_input_tokens_seen": 128549990, "step": 5983, "time_per_iteration": 2.6827449798583984 }, { "auxiliary_loss_clip": 0.01140239, "auxiliary_loss_mlp": 0.01042697, "balance_loss_clip": 1.0510025, "balance_loss_mlp": 1.02626991, "epoch": 0.3597775439651285, "flos": 16725359399040.0, "grad_norm": 17.088734777986428, "language_loss": 0.76256114, "learning_rate": 2.96358243065131e-06, "loss": 0.78439057, "num_input_tokens_seen": 128567925, "step": 5984, "time_per_iteration": 2.695389747619629 }, { "auxiliary_loss_clip": 0.01117847, "auxiliary_loss_mlp": 0.00772256, "balance_loss_clip": 1.04583967, "balance_loss_mlp": 1.00047541, "epoch": 0.3598376672177965, "flos": 19719483436800.0, "grad_norm": 1.8513392555770956, "language_loss": 0.86111921, "learning_rate": 2.9632411325490993e-06, "loss": 0.88002026, "num_input_tokens_seen": 128585655, "step": 5985, "time_per_iteration": 2.6440985202789307 }, { "auxiliary_loss_clip": 0.01117958, "auxiliary_loss_mlp": 0.01045892, "balance_loss_clip": 1.04564977, "balance_loss_mlp": 1.03012037, "epoch": 0.35989779047046444, "flos": 17311529445120.0, "grad_norm": 2.5721307867834406, "language_loss": 0.72770452, "learning_rate": 2.9628997979206884e-06, "loss": 0.74934304, "num_input_tokens_seen": 128604820, "step": 5986, "time_per_iteration": 2.6169698238372803 }, { "auxiliary_loss_clip": 0.01100506, "auxiliary_loss_mlp": 0.01039862, "balance_loss_clip": 1.04264784, "balance_loss_mlp": 1.02473474, "epoch": 0.3599579137231324, "flos": 22711237176960.0, "grad_norm": 2.1943162754876497, "language_loss": 0.73883474, "learning_rate": 2.9625584267790204e-06, "loss": 0.76023847, "num_input_tokens_seen": 128623070, "step": 5987, "time_per_iteration": 2.72385573387146 }, { "auxiliary_loss_clip": 0.0114047, "auxiliary_loss_mlp": 0.01040262, "balance_loss_clip": 1.05135727, "balance_loss_mlp": 1.02456188, "epoch": 0.36001803697580037, "flos": 20959873845120.0, "grad_norm": 2.225645474388546, "language_loss": 0.69665354, "learning_rate": 2.9622170191370404e-06, "loss": 0.71846086, "num_input_tokens_seen": 128642430, "step": 5988, "time_per_iteration": 2.6040101051330566 }, { "auxiliary_loss_clip": 0.01127132, "auxiliary_loss_mlp": 0.01043358, "balance_loss_clip": 1.04819822, "balance_loss_mlp": 1.0278132, "epoch": 0.36007816022846834, "flos": 20485565729280.0, "grad_norm": 2.281223653114012, "language_loss": 0.73300481, "learning_rate": 2.9618755750076953e-06, "loss": 0.75470972, "num_input_tokens_seen": 128661285, "step": 5989, "time_per_iteration": 2.6532981395721436 }, { "auxiliary_loss_clip": 0.01089891, "auxiliary_loss_mlp": 0.01037817, "balance_loss_clip": 1.04161119, "balance_loss_mlp": 1.02237916, "epoch": 0.36013828348113636, "flos": 28001237794560.0, "grad_norm": 3.1935134184936156, "language_loss": 0.79950285, "learning_rate": 2.961534094403931e-06, "loss": 0.82077992, "num_input_tokens_seen": 128682210, "step": 5990, "time_per_iteration": 2.785142421722412 }, { "auxiliary_loss_clip": 0.01123339, "auxiliary_loss_mlp": 0.0103344, "balance_loss_clip": 1.04714704, "balance_loss_mlp": 1.01775789, "epoch": 0.3601984067338043, "flos": 20082181017600.0, "grad_norm": 2.506195073342272, "language_loss": 0.83875644, "learning_rate": 2.961192577338698e-06, "loss": 0.86032414, "num_input_tokens_seen": 128700445, "step": 5991, "time_per_iteration": 2.6310808658599854 }, { "auxiliary_loss_clip": 0.01111044, "auxiliary_loss_mlp": 0.01045829, "balance_loss_clip": 1.04896092, "balance_loss_mlp": 1.03068912, "epoch": 0.3602585299864723, "flos": 18617599872000.0, "grad_norm": 2.314320245159203, "language_loss": 0.75628942, "learning_rate": 2.9608510238249463e-06, "loss": 0.77785814, "num_input_tokens_seen": 128716855, "step": 5992, "time_per_iteration": 2.6698272228240967 }, { "auxiliary_loss_clip": 0.01134951, "auxiliary_loss_mlp": 0.01039412, "balance_loss_clip": 1.04993188, "balance_loss_mlp": 1.02385557, "epoch": 0.36031865323914025, "flos": 19573003774080.0, "grad_norm": 2.1820524355734072, "language_loss": 0.76886415, "learning_rate": 2.960509433875627e-06, "loss": 0.79060775, "num_input_tokens_seen": 128735835, "step": 5993, "time_per_iteration": 2.5999341011047363 }, { "auxiliary_loss_clip": 0.01111748, "auxiliary_loss_mlp": 0.01054388, "balance_loss_clip": 1.04750419, "balance_loss_mlp": 1.03762674, "epoch": 0.3603787764918082, "flos": 17490615678720.0, "grad_norm": 1.8546706349055275, "language_loss": 0.74672681, "learning_rate": 2.9601678075036943e-06, "loss": 0.76838815, "num_input_tokens_seen": 128752465, "step": 5994, "time_per_iteration": 2.6691155433654785 }, { "auxiliary_loss_clip": 0.01095118, "auxiliary_loss_mlp": 0.01038312, "balance_loss_clip": 1.0480628, "balance_loss_mlp": 1.02331567, "epoch": 0.3604388997444762, "flos": 15523393564800.0, "grad_norm": 2.7696142346579666, "language_loss": 0.68887782, "learning_rate": 2.9598261447221024e-06, "loss": 0.71021217, "num_input_tokens_seen": 128770865, "step": 5995, "time_per_iteration": 2.7497267723083496 }, { "auxiliary_loss_clip": 0.01104395, "auxiliary_loss_mlp": 0.01046311, "balance_loss_clip": 1.04338932, "balance_loss_mlp": 1.03031349, "epoch": 0.36049902299714415, "flos": 17310883000320.0, "grad_norm": 2.2305093143222248, "language_loss": 0.82564914, "learning_rate": 2.9594844455438057e-06, "loss": 0.84715617, "num_input_tokens_seen": 128789730, "step": 5996, "time_per_iteration": 2.7227983474731445 }, { "auxiliary_loss_clip": 0.01135369, "auxiliary_loss_mlp": 0.0103828, "balance_loss_clip": 1.04974842, "balance_loss_mlp": 1.02300954, "epoch": 0.3605591462498121, "flos": 17056025026560.0, "grad_norm": 2.068995609090248, "language_loss": 0.73795009, "learning_rate": 2.959142709981763e-06, "loss": 0.75968659, "num_input_tokens_seen": 128806610, "step": 5997, "time_per_iteration": 2.572842836380005 }, { "auxiliary_loss_clip": 0.01121916, "auxiliary_loss_mlp": 0.01036628, "balance_loss_clip": 1.0482775, "balance_loss_mlp": 1.0226686, "epoch": 0.3606192695024801, "flos": 16836862193280.0, "grad_norm": 2.7116535757300215, "language_loss": 0.69209671, "learning_rate": 2.9588009380489337e-06, "loss": 0.71368217, "num_input_tokens_seen": 128824830, "step": 5998, "time_per_iteration": 2.604459047317505 }, { "auxiliary_loss_clip": 0.01085406, "auxiliary_loss_mlp": 0.01041904, "balance_loss_clip": 1.04395008, "balance_loss_mlp": 1.02565587, "epoch": 0.36067939275514804, "flos": 12129655743360.0, "grad_norm": 2.6293691676304745, "language_loss": 0.76580822, "learning_rate": 2.9584591297582758e-06, "loss": 0.78708136, "num_input_tokens_seen": 128838170, "step": 5999, "time_per_iteration": 2.6671667098999023 }, { "auxiliary_loss_clip": 0.01098137, "auxiliary_loss_mlp": 0.01040783, "balance_loss_clip": 1.04674315, "balance_loss_mlp": 1.02590609, "epoch": 0.360739516007816, "flos": 18041449720320.0, "grad_norm": 1.8157116334206203, "language_loss": 0.78264523, "learning_rate": 2.9581172851227516e-06, "loss": 0.80403441, "num_input_tokens_seen": 128855625, "step": 6000, "time_per_iteration": 2.743117332458496 }, { "auxiliary_loss_clip": 0.01095162, "auxiliary_loss_mlp": 0.01036289, "balance_loss_clip": 1.04705954, "balance_loss_mlp": 1.02203155, "epoch": 0.360799639260484, "flos": 18549800951040.0, "grad_norm": 1.8701006971713747, "language_loss": 0.78316295, "learning_rate": 2.9577754041553243e-06, "loss": 0.80447751, "num_input_tokens_seen": 128873540, "step": 6001, "time_per_iteration": 2.7342417240142822 }, { "auxiliary_loss_clip": 0.01130356, "auxiliary_loss_mlp": 0.0077146, "balance_loss_clip": 1.04727733, "balance_loss_mlp": 1.00072694, "epoch": 0.36085976251315194, "flos": 19682028529920.0, "grad_norm": 3.3927220139250056, "language_loss": 0.83151853, "learning_rate": 2.9574334868689575e-06, "loss": 0.8505367, "num_input_tokens_seen": 128889925, "step": 6002, "time_per_iteration": 2.6884238719940186 }, { "auxiliary_loss_clip": 0.01101804, "auxiliary_loss_mlp": 0.01033284, "balance_loss_clip": 1.04249346, "balance_loss_mlp": 1.02011156, "epoch": 0.3609198857658199, "flos": 24198943703040.0, "grad_norm": 2.135208430409031, "language_loss": 0.90677911, "learning_rate": 2.9570915332766165e-06, "loss": 0.92812997, "num_input_tokens_seen": 128906890, "step": 6003, "time_per_iteration": 2.666738986968994 }, { "auxiliary_loss_clip": 0.01036783, "auxiliary_loss_mlp": 0.0101378, "balance_loss_clip": 1.03707922, "balance_loss_mlp": 1.01194429, "epoch": 0.3609800090184879, "flos": 57115995160320.0, "grad_norm": 0.8844533830179444, "language_loss": 0.53396428, "learning_rate": 2.9567495433912693e-06, "loss": 0.55446988, "num_input_tokens_seen": 128965940, "step": 6004, "time_per_iteration": 3.1421444416046143 }, { "auxiliary_loss_clip": 0.01112391, "auxiliary_loss_mlp": 0.00772771, "balance_loss_clip": 1.04665363, "balance_loss_mlp": 1.00050342, "epoch": 0.3610401322711559, "flos": 20811239366400.0, "grad_norm": 2.085214899207264, "language_loss": 0.77743608, "learning_rate": 2.956407517225883e-06, "loss": 0.79628766, "num_input_tokens_seen": 128985835, "step": 6005, "time_per_iteration": 4.196998596191406 }, { "auxiliary_loss_clip": 0.01114373, "auxiliary_loss_mlp": 0.01043264, "balance_loss_clip": 1.04545391, "balance_loss_mlp": 1.02866125, "epoch": 0.36110025552382385, "flos": 13699167494400.0, "grad_norm": 1.984756598411705, "language_loss": 0.78795588, "learning_rate": 2.956065454793429e-06, "loss": 0.80953228, "num_input_tokens_seen": 129003120, "step": 6006, "time_per_iteration": 2.642446517944336 }, { "auxiliary_loss_clip": 0.01135515, "auxiliary_loss_mlp": 0.01037404, "balance_loss_clip": 1.04913247, "balance_loss_mlp": 1.02116823, "epoch": 0.3611603787764918, "flos": 22455014486400.0, "grad_norm": 3.6522767524231248, "language_loss": 0.84766537, "learning_rate": 2.955723356106876e-06, "loss": 0.86939454, "num_input_tokens_seen": 129021645, "step": 6007, "time_per_iteration": 4.38408637046814 }, { "auxiliary_loss_clip": 0.01120706, "auxiliary_loss_mlp": 0.01035853, "balance_loss_clip": 1.05059266, "balance_loss_mlp": 1.01940203, "epoch": 0.3612205020291598, "flos": 20886651970560.0, "grad_norm": 2.20663208121776, "language_loss": 0.72179425, "learning_rate": 2.955381221179198e-06, "loss": 0.7433598, "num_input_tokens_seen": 129038375, "step": 6008, "time_per_iteration": 4.262283802032471 }, { "auxiliary_loss_clip": 0.01118211, "auxiliary_loss_mlp": 0.0103587, "balance_loss_clip": 1.04345882, "balance_loss_mlp": 1.02150559, "epoch": 0.36128062528182775, "flos": 15741981780480.0, "grad_norm": 7.815944525258205, "language_loss": 0.83056295, "learning_rate": 2.955039050023368e-06, "loss": 0.85210377, "num_input_tokens_seen": 129056235, "step": 6009, "time_per_iteration": 2.643824577331543 }, { "auxiliary_loss_clip": 0.01105662, "auxiliary_loss_mlp": 0.01045676, "balance_loss_clip": 1.04862237, "balance_loss_mlp": 1.03013086, "epoch": 0.3613407485344957, "flos": 16764502245120.0, "grad_norm": 2.1132167438001166, "language_loss": 0.7616573, "learning_rate": 2.954696842652362e-06, "loss": 0.7831707, "num_input_tokens_seen": 129072405, "step": 6010, "time_per_iteration": 4.361377000808716 }, { "auxiliary_loss_clip": 0.01104786, "auxiliary_loss_mlp": 0.01035576, "balance_loss_clip": 1.04665053, "balance_loss_mlp": 1.02091312, "epoch": 0.3614008717871637, "flos": 20371189847040.0, "grad_norm": 1.759609272436165, "language_loss": 0.83214396, "learning_rate": 2.9543545990791554e-06, "loss": 0.85354757, "num_input_tokens_seen": 129090225, "step": 6011, "time_per_iteration": 2.679145574569702 }, { "auxiliary_loss_clip": 0.01141696, "auxiliary_loss_mlp": 0.01041601, "balance_loss_clip": 1.05070031, "balance_loss_mlp": 1.02562666, "epoch": 0.36146099503983165, "flos": 22776665800320.0, "grad_norm": 2.194420173883677, "language_loss": 0.62446111, "learning_rate": 2.954012319316727e-06, "loss": 0.64629406, "num_input_tokens_seen": 129107685, "step": 6012, "time_per_iteration": 2.6012516021728516 }, { "auxiliary_loss_clip": 0.01106556, "auxiliary_loss_mlp": 0.01038245, "balance_loss_clip": 1.04518831, "balance_loss_mlp": 1.02368951, "epoch": 0.3615211182924996, "flos": 22996654646400.0, "grad_norm": 1.831524666449312, "language_loss": 0.8381623, "learning_rate": 2.9536700033780565e-06, "loss": 0.85961026, "num_input_tokens_seen": 129125315, "step": 6013, "time_per_iteration": 2.7191901206970215 }, { "auxiliary_loss_clip": 0.01131608, "auxiliary_loss_mlp": 0.01040321, "balance_loss_clip": 1.04590511, "balance_loss_mlp": 1.02466893, "epoch": 0.3615812415451676, "flos": 16648079287680.0, "grad_norm": 3.6755742539930285, "language_loss": 0.91541535, "learning_rate": 2.9533276512761228e-06, "loss": 0.93713462, "num_input_tokens_seen": 129141600, "step": 6014, "time_per_iteration": 2.714121103286743 }, { "auxiliary_loss_clip": 0.01131507, "auxiliary_loss_mlp": 0.01042414, "balance_loss_clip": 1.0463829, "balance_loss_mlp": 1.0268693, "epoch": 0.36164136479783554, "flos": 21320093387520.0, "grad_norm": 2.2181121985150094, "language_loss": 0.73578274, "learning_rate": 2.95298526302391e-06, "loss": 0.75752199, "num_input_tokens_seen": 129160665, "step": 6015, "time_per_iteration": 2.668600082397461 }, { "auxiliary_loss_clip": 0.0105036, "auxiliary_loss_mlp": 0.01047702, "balance_loss_clip": 1.03610015, "balance_loss_mlp": 1.02980912, "epoch": 0.3617014880505035, "flos": 24169569356160.0, "grad_norm": 2.2662955263586158, "language_loss": 0.64756966, "learning_rate": 2.9526428386344e-06, "loss": 0.66855025, "num_input_tokens_seen": 129179220, "step": 6016, "time_per_iteration": 2.8753597736358643 }, { "auxiliary_loss_clip": 0.01127577, "auxiliary_loss_mlp": 0.01039172, "balance_loss_clip": 1.05000329, "balance_loss_mlp": 1.02170801, "epoch": 0.3617616113031715, "flos": 39014824101120.0, "grad_norm": 2.0483319793753343, "language_loss": 0.71927178, "learning_rate": 2.9523003781205785e-06, "loss": 0.74093938, "num_input_tokens_seen": 129200385, "step": 6017, "time_per_iteration": 2.8195903301239014 }, { "auxiliary_loss_clip": 0.01123165, "auxiliary_loss_mlp": 0.01043013, "balance_loss_clip": 1.04506993, "balance_loss_mlp": 1.02724147, "epoch": 0.3618217345558395, "flos": 12130840892160.0, "grad_norm": 2.196881428409859, "language_loss": 0.73543239, "learning_rate": 2.9519578814954307e-06, "loss": 0.7570942, "num_input_tokens_seen": 129217395, "step": 6018, "time_per_iteration": 2.6454639434814453 }, { "auxiliary_loss_clip": 0.01088616, "auxiliary_loss_mlp": 0.01036025, "balance_loss_clip": 1.0470562, "balance_loss_mlp": 1.02079058, "epoch": 0.36188185780850746, "flos": 24935005203840.0, "grad_norm": 2.8373114264415222, "language_loss": 0.69157374, "learning_rate": 2.9516153487719448e-06, "loss": 0.71282017, "num_input_tokens_seen": 129238940, "step": 6019, "time_per_iteration": 2.824361801147461 }, { "auxiliary_loss_clip": 0.0111438, "auxiliary_loss_mlp": 0.0103897, "balance_loss_clip": 1.04542887, "balance_loss_mlp": 1.02275765, "epoch": 0.3619419810611754, "flos": 20958832350720.0, "grad_norm": 3.405770043894724, "language_loss": 0.76428473, "learning_rate": 2.95127277996311e-06, "loss": 0.78581828, "num_input_tokens_seen": 129258240, "step": 6020, "time_per_iteration": 2.6757993698120117 }, { "auxiliary_loss_clip": 0.01124662, "auxiliary_loss_mlp": 0.01041506, "balance_loss_clip": 1.04899478, "balance_loss_mlp": 1.02512705, "epoch": 0.3620021043138434, "flos": 22528882805760.0, "grad_norm": 2.1413312386751606, "language_loss": 0.73802006, "learning_rate": 2.9509301750819156e-06, "loss": 0.7596817, "num_input_tokens_seen": 129279040, "step": 6021, "time_per_iteration": 2.6422386169433594 }, { "auxiliary_loss_clip": 0.01094575, "auxiliary_loss_mlp": 0.01036086, "balance_loss_clip": 1.04502845, "balance_loss_mlp": 1.02170944, "epoch": 0.36206222756651135, "flos": 15596687266560.0, "grad_norm": 8.65046906858069, "language_loss": 0.80683851, "learning_rate": 2.9505875341413533e-06, "loss": 0.82814515, "num_input_tokens_seen": 129295415, "step": 6022, "time_per_iteration": 2.7069809436798096 }, { "auxiliary_loss_clip": 0.0112144, "auxiliary_loss_mlp": 0.01034482, "balance_loss_clip": 1.04967427, "balance_loss_mlp": 1.02036762, "epoch": 0.3621223508191793, "flos": 23587170238080.0, "grad_norm": 1.6359940708258738, "language_loss": 0.81630391, "learning_rate": 2.950244857154417e-06, "loss": 0.83786309, "num_input_tokens_seen": 129312620, "step": 6023, "time_per_iteration": 2.676196575164795 }, { "auxiliary_loss_clip": 0.01115391, "auxiliary_loss_mlp": 0.01037931, "balance_loss_clip": 1.04994166, "balance_loss_mlp": 1.02266037, "epoch": 0.3621824740718473, "flos": 22309899540480.0, "grad_norm": 2.238629896510925, "language_loss": 0.79401833, "learning_rate": 2.9499021441341e-06, "loss": 0.81555158, "num_input_tokens_seen": 129331825, "step": 6024, "time_per_iteration": 2.6479294300079346 }, { "auxiliary_loss_clip": 0.01098352, "auxiliary_loss_mlp": 0.01041698, "balance_loss_clip": 1.04168642, "balance_loss_mlp": 1.02567625, "epoch": 0.36224259732451525, "flos": 16763640318720.0, "grad_norm": 2.1016508822119517, "language_loss": 0.74409318, "learning_rate": 2.9495593950933997e-06, "loss": 0.76549369, "num_input_tokens_seen": 129350400, "step": 6025, "time_per_iteration": 2.720113515853882 }, { "auxiliary_loss_clip": 0.01121634, "auxiliary_loss_mlp": 0.00772492, "balance_loss_clip": 1.04758501, "balance_loss_mlp": 1.00045466, "epoch": 0.3623027205771832, "flos": 23149742411520.0, "grad_norm": 1.7192758683210898, "language_loss": 0.72363192, "learning_rate": 2.9492166100453107e-06, "loss": 0.74257314, "num_input_tokens_seen": 129371155, "step": 6026, "time_per_iteration": 2.647515296936035 }, { "auxiliary_loss_clip": 0.01130763, "auxiliary_loss_mlp": 0.01045791, "balance_loss_clip": 1.05090141, "balance_loss_mlp": 1.0300554, "epoch": 0.3623628438298512, "flos": 28549162834560.0, "grad_norm": 3.1509295844270166, "language_loss": 0.79584157, "learning_rate": 2.948873789002833e-06, "loss": 0.81760705, "num_input_tokens_seen": 129391230, "step": 6027, "time_per_iteration": 2.666778802871704 }, { "auxiliary_loss_clip": 0.01112806, "auxiliary_loss_mlp": 0.01044567, "balance_loss_clip": 1.04690945, "balance_loss_mlp": 1.02730584, "epoch": 0.36242296708251914, "flos": 25484941405440.0, "grad_norm": 2.036912075012155, "language_loss": 0.67857373, "learning_rate": 2.9485309319789667e-06, "loss": 0.70014751, "num_input_tokens_seen": 129410065, "step": 6028, "time_per_iteration": 2.721635103225708 }, { "auxiliary_loss_clip": 0.01093428, "auxiliary_loss_mlp": 0.01039806, "balance_loss_clip": 1.04534137, "balance_loss_mlp": 1.02493429, "epoch": 0.3624830903351871, "flos": 16290373697280.0, "grad_norm": 2.040296243102333, "language_loss": 0.85588348, "learning_rate": 2.9481880389867117e-06, "loss": 0.87721586, "num_input_tokens_seen": 129428655, "step": 6029, "time_per_iteration": 2.768638849258423 }, { "auxiliary_loss_clip": 0.01097178, "auxiliary_loss_mlp": 0.01040472, "balance_loss_clip": 1.04583371, "balance_loss_mlp": 1.02534389, "epoch": 0.36254321358785513, "flos": 18296307694080.0, "grad_norm": 1.826841085229912, "language_loss": 0.72638077, "learning_rate": 2.9478451100390714e-06, "loss": 0.74775726, "num_input_tokens_seen": 129447845, "step": 6030, "time_per_iteration": 2.6222145557403564 }, { "auxiliary_loss_clip": 0.01111443, "auxiliary_loss_mlp": 0.0104401, "balance_loss_clip": 1.0471518, "balance_loss_mlp": 1.02635479, "epoch": 0.3626033368405231, "flos": 14865294533760.0, "grad_norm": 2.682823168265615, "language_loss": 0.74219912, "learning_rate": 2.94750214514905e-06, "loss": 0.76375365, "num_input_tokens_seen": 129463275, "step": 6031, "time_per_iteration": 2.62003493309021 }, { "auxiliary_loss_clip": 0.01090216, "auxiliary_loss_mlp": 0.01046109, "balance_loss_clip": 1.04174352, "balance_loss_mlp": 1.03031349, "epoch": 0.36266346009319106, "flos": 22306595489280.0, "grad_norm": 2.122404426395552, "language_loss": 0.72930032, "learning_rate": 2.9471591443296516e-06, "loss": 0.75066358, "num_input_tokens_seen": 129483205, "step": 6032, "time_per_iteration": 2.7382266521453857 }, { "auxiliary_loss_clip": 0.01089342, "auxiliary_loss_mlp": 0.0104871, "balance_loss_clip": 1.0457828, "balance_loss_mlp": 1.03320134, "epoch": 0.362723583345859, "flos": 18222331633920.0, "grad_norm": 2.0052695882675895, "language_loss": 0.77577424, "learning_rate": 2.946816107593884e-06, "loss": 0.79715478, "num_input_tokens_seen": 129499885, "step": 6033, "time_per_iteration": 2.712574005126953 }, { "auxiliary_loss_clip": 0.01011518, "auxiliary_loss_mlp": 0.01010455, "balance_loss_clip": 1.02346182, "balance_loss_mlp": 1.00881004, "epoch": 0.362783706598527, "flos": 68499174458880.0, "grad_norm": 0.775881514372135, "language_loss": 0.6472615, "learning_rate": 2.9464730349547547e-06, "loss": 0.66748118, "num_input_tokens_seen": 129561885, "step": 6034, "time_per_iteration": 3.33389949798584 }, { "auxiliary_loss_clip": 0.0111586, "auxiliary_loss_mlp": 0.01039589, "balance_loss_clip": 1.04362679, "balance_loss_mlp": 1.02373409, "epoch": 0.36284382985119495, "flos": 26576589594240.0, "grad_norm": 2.348469757016237, "language_loss": 0.89869213, "learning_rate": 2.946129926425273e-06, "loss": 0.9202466, "num_input_tokens_seen": 129582325, "step": 6035, "time_per_iteration": 2.661137580871582 }, { "auxiliary_loss_clip": 0.01112128, "auxiliary_loss_mlp": 0.01040682, "balance_loss_clip": 1.04810882, "balance_loss_mlp": 1.02445734, "epoch": 0.3629039531038629, "flos": 20156767608960.0, "grad_norm": 1.7965494412259506, "language_loss": 0.73480749, "learning_rate": 2.9457867820184496e-06, "loss": 0.75633562, "num_input_tokens_seen": 129600350, "step": 6036, "time_per_iteration": 2.627746105194092 }, { "auxiliary_loss_clip": 0.01118939, "auxiliary_loss_mlp": 0.01034203, "balance_loss_clip": 1.0476563, "balance_loss_mlp": 1.01825309, "epoch": 0.3629640763565309, "flos": 18625716345600.0, "grad_norm": 2.247638401714898, "language_loss": 0.75895989, "learning_rate": 2.945443601747297e-06, "loss": 0.78049135, "num_input_tokens_seen": 129618425, "step": 6037, "time_per_iteration": 2.6763134002685547 }, { "auxiliary_loss_clip": 0.01117432, "auxiliary_loss_mlp": 0.0105958, "balance_loss_clip": 1.04722893, "balance_loss_mlp": 1.04149556, "epoch": 0.36302419960919885, "flos": 19571459489280.0, "grad_norm": 1.7641921793444904, "language_loss": 0.78425813, "learning_rate": 2.945100385624828e-06, "loss": 0.80602825, "num_input_tokens_seen": 129636750, "step": 6038, "time_per_iteration": 2.6576154232025146 }, { "auxiliary_loss_clip": 0.01042272, "auxiliary_loss_mlp": 0.01000075, "balance_loss_clip": 1.02576721, "balance_loss_mlp": 0.99842948, "epoch": 0.3630843228618668, "flos": 63797606444160.0, "grad_norm": 0.8328343708327894, "language_loss": 0.63371962, "learning_rate": 2.9447571336640573e-06, "loss": 0.6541431, "num_input_tokens_seen": 129699030, "step": 6039, "time_per_iteration": 3.268035650253296 }, { "auxiliary_loss_clip": 0.01108663, "auxiliary_loss_mlp": 0.01052032, "balance_loss_clip": 1.04687905, "balance_loss_mlp": 1.03485394, "epoch": 0.3631444461145348, "flos": 21835160461440.0, "grad_norm": 2.83972356132426, "language_loss": 0.71349055, "learning_rate": 2.944413845878002e-06, "loss": 0.73509747, "num_input_tokens_seen": 129717135, "step": 6040, "time_per_iteration": 2.7468066215515137 }, { "auxiliary_loss_clip": 0.01129452, "auxiliary_loss_mlp": 0.01039721, "balance_loss_clip": 1.05027485, "balance_loss_mlp": 1.02372289, "epoch": 0.36320456936720275, "flos": 21722041555200.0, "grad_norm": 1.6017927687359714, "language_loss": 0.81615877, "learning_rate": 2.9440705222796783e-06, "loss": 0.83785057, "num_input_tokens_seen": 129735940, "step": 6041, "time_per_iteration": 2.6624767780303955 }, { "auxiliary_loss_clip": 0.01116373, "auxiliary_loss_mlp": 0.01037475, "balance_loss_clip": 1.04789138, "balance_loss_mlp": 1.02039289, "epoch": 0.3632646926198707, "flos": 17019072910080.0, "grad_norm": 6.335898198250863, "language_loss": 0.83848882, "learning_rate": 2.943727162882107e-06, "loss": 0.86002731, "num_input_tokens_seen": 129752790, "step": 6042, "time_per_iteration": 2.6279616355895996 }, { "auxiliary_loss_clip": 0.01113831, "auxiliary_loss_mlp": 0.01045895, "balance_loss_clip": 1.04817295, "balance_loss_mlp": 1.03020668, "epoch": 0.36332481587253873, "flos": 23331163029120.0, "grad_norm": 1.8194124872693949, "language_loss": 0.78401059, "learning_rate": 2.9433837676983064e-06, "loss": 0.80560786, "num_input_tokens_seen": 129773655, "step": 6043, "time_per_iteration": 4.221862077713013 }, { "auxiliary_loss_clip": 0.01111193, "auxiliary_loss_mlp": 0.01036813, "balance_loss_clip": 1.05454051, "balance_loss_mlp": 1.02078581, "epoch": 0.3633849391252067, "flos": 10743539857920.0, "grad_norm": 2.743973887678544, "language_loss": 0.65664518, "learning_rate": 2.943040336741298e-06, "loss": 0.67812526, "num_input_tokens_seen": 129791605, "step": 6044, "time_per_iteration": 2.7301173210144043 }, { "auxiliary_loss_clip": 0.01109397, "auxiliary_loss_mlp": 0.01034976, "balance_loss_clip": 1.04838157, "balance_loss_mlp": 1.02035475, "epoch": 0.36344506237787466, "flos": 25849147357440.0, "grad_norm": 2.5365479968338187, "language_loss": 0.81149542, "learning_rate": 2.9426968700241066e-06, "loss": 0.83293915, "num_input_tokens_seen": 129811075, "step": 6045, "time_per_iteration": 2.6896753311157227 }, { "auxiliary_loss_clip": 0.0110304, "auxiliary_loss_mlp": 0.01045503, "balance_loss_clip": 1.04706383, "balance_loss_mlp": 1.02923083, "epoch": 0.3635051856305426, "flos": 30154046503680.0, "grad_norm": 2.400629400498793, "language_loss": 0.65010375, "learning_rate": 2.942353367559755e-06, "loss": 0.67158914, "num_input_tokens_seen": 129833755, "step": 6046, "time_per_iteration": 2.800321578979492 }, { "auxiliary_loss_clip": 0.01102544, "auxiliary_loss_mlp": 0.01038937, "balance_loss_clip": 1.0467155, "balance_loss_mlp": 1.02399993, "epoch": 0.3635653088832106, "flos": 22198396746240.0, "grad_norm": 2.172977049503826, "language_loss": 0.77142686, "learning_rate": 2.9420098293612692e-06, "loss": 0.79284167, "num_input_tokens_seen": 129854475, "step": 6047, "time_per_iteration": 4.274283170700073 }, { "auxiliary_loss_clip": 0.01137356, "auxiliary_loss_mlp": 0.01047564, "balance_loss_clip": 1.05142486, "balance_loss_mlp": 1.02983761, "epoch": 0.36362543213587856, "flos": 24787053083520.0, "grad_norm": 1.922622021112015, "language_loss": 0.79610157, "learning_rate": 2.9416662554416767e-06, "loss": 0.81795079, "num_input_tokens_seen": 129873530, "step": 6048, "time_per_iteration": 4.283480644226074 }, { "auxiliary_loss_clip": 0.01037942, "auxiliary_loss_mlp": 0.01005664, "balance_loss_clip": 1.01860034, "balance_loss_mlp": 1.00387573, "epoch": 0.3636855553885465, "flos": 62526369231360.0, "grad_norm": 0.749844121463454, "language_loss": 0.52550006, "learning_rate": 2.9413226458140054e-06, "loss": 0.54593611, "num_input_tokens_seen": 129940400, "step": 6049, "time_per_iteration": 3.2647299766540527 }, { "auxiliary_loss_clip": 0.01105759, "auxiliary_loss_mlp": 0.01042028, "balance_loss_clip": 1.04831481, "balance_loss_mlp": 1.02467084, "epoch": 0.3637456786412145, "flos": 24060652341120.0, "grad_norm": 9.722138117523357, "language_loss": 0.8628068, "learning_rate": 2.9409790004912845e-06, "loss": 0.88428462, "num_input_tokens_seen": 129958635, "step": 6050, "time_per_iteration": 2.744236469268799 }, { "auxiliary_loss_clip": 0.01120328, "auxiliary_loss_mlp": 0.00772785, "balance_loss_clip": 1.04944158, "balance_loss_mlp": 1.0004611, "epoch": 0.36380580189388245, "flos": 16691495852160.0, "grad_norm": 3.109361789309709, "language_loss": 0.78116536, "learning_rate": 2.940635319486546e-06, "loss": 0.80009651, "num_input_tokens_seen": 129977685, "step": 6051, "time_per_iteration": 2.6305320262908936 }, { "auxiliary_loss_clip": 0.01127196, "auxiliary_loss_mlp": 0.01040856, "balance_loss_clip": 1.04900503, "balance_loss_mlp": 1.02559745, "epoch": 0.3638659251465504, "flos": 25114091437440.0, "grad_norm": 1.9275322741448784, "language_loss": 0.82526582, "learning_rate": 2.940291602812822e-06, "loss": 0.84694636, "num_input_tokens_seen": 129997530, "step": 6052, "time_per_iteration": 2.711794853210449 }, { "auxiliary_loss_clip": 0.01100415, "auxiliary_loss_mlp": 0.01036967, "balance_loss_clip": 1.04675376, "balance_loss_mlp": 1.02270949, "epoch": 0.3639260483992184, "flos": 23003011353600.0, "grad_norm": 1.7820298413079305, "language_loss": 0.72085792, "learning_rate": 2.939947850483145e-06, "loss": 0.74223173, "num_input_tokens_seen": 130017955, "step": 6053, "time_per_iteration": 2.725600481033325 }, { "auxiliary_loss_clip": 0.01015406, "auxiliary_loss_mlp": 0.01003631, "balance_loss_clip": 1.0300014, "balance_loss_mlp": 1.00155663, "epoch": 0.36398617165188635, "flos": 70716011160960.0, "grad_norm": 0.7712310074836012, "language_loss": 0.61214095, "learning_rate": 2.9396040625105532e-06, "loss": 0.63233131, "num_input_tokens_seen": 130074275, "step": 6054, "time_per_iteration": 3.3252007961273193 }, { "auxiliary_loss_clip": 0.0111079, "auxiliary_loss_mlp": 0.01038999, "balance_loss_clip": 1.04735899, "balance_loss_mlp": 1.02214301, "epoch": 0.3640462949045543, "flos": 22235456603520.0, "grad_norm": 2.93078334140581, "language_loss": 0.75820959, "learning_rate": 2.9392602389080802e-06, "loss": 0.77970749, "num_input_tokens_seen": 130091375, "step": 6055, "time_per_iteration": 2.656001091003418 }, { "auxiliary_loss_clip": 0.0113529, "auxiliary_loss_mlp": 0.01041525, "balance_loss_clip": 1.04910016, "balance_loss_mlp": 1.02581286, "epoch": 0.3641064181572223, "flos": 21543529939200.0, "grad_norm": 1.6734377169093124, "language_loss": 0.7533145, "learning_rate": 2.938916379688765e-06, "loss": 0.77508265, "num_input_tokens_seen": 130111595, "step": 6056, "time_per_iteration": 2.654418468475342 }, { "auxiliary_loss_clip": 0.01121707, "auxiliary_loss_mlp": 0.01038714, "balance_loss_clip": 1.055071, "balance_loss_mlp": 1.02337217, "epoch": 0.3641665414098903, "flos": 22273306560000.0, "grad_norm": 2.035168503846255, "language_loss": 0.80473512, "learning_rate": 2.9385724848656468e-06, "loss": 0.82633936, "num_input_tokens_seen": 130131440, "step": 6057, "time_per_iteration": 2.7347753047943115 }, { "auxiliary_loss_clip": 0.01107128, "auxiliary_loss_mlp": 0.01039802, "balance_loss_clip": 1.04495037, "balance_loss_mlp": 1.02438855, "epoch": 0.36422666466255826, "flos": 28329676778880.0, "grad_norm": 2.043030499006847, "language_loss": 0.80264485, "learning_rate": 2.9382285544517647e-06, "loss": 0.8241142, "num_input_tokens_seen": 130151375, "step": 6058, "time_per_iteration": 2.695674180984497 }, { "auxiliary_loss_clip": 0.01102831, "auxiliary_loss_mlp": 0.00772601, "balance_loss_clip": 1.04357934, "balance_loss_mlp": 1.00046432, "epoch": 0.36428678791522623, "flos": 24170503109760.0, "grad_norm": 2.032310914115462, "language_loss": 0.84994543, "learning_rate": 2.9378845884601636e-06, "loss": 0.86869979, "num_input_tokens_seen": 130169960, "step": 6059, "time_per_iteration": 2.6912410259246826 }, { "auxiliary_loss_clip": 0.01093721, "auxiliary_loss_mlp": 0.01039242, "balance_loss_clip": 1.04318213, "balance_loss_mlp": 1.02287483, "epoch": 0.3643469111678942, "flos": 22528451842560.0, "grad_norm": 5.903326132338396, "language_loss": 0.87806225, "learning_rate": 2.937540586903884e-06, "loss": 0.89939183, "num_input_tokens_seen": 130189800, "step": 6060, "time_per_iteration": 2.713115692138672 }, { "auxiliary_loss_clip": 0.01125791, "auxiliary_loss_mlp": 0.01040312, "balance_loss_clip": 1.0498302, "balance_loss_mlp": 1.02388453, "epoch": 0.36440703442056216, "flos": 19426595938560.0, "grad_norm": 2.3521788015610805, "language_loss": 0.66954017, "learning_rate": 2.937196549795971e-06, "loss": 0.69120121, "num_input_tokens_seen": 130206370, "step": 6061, "time_per_iteration": 2.8435866832733154 }, { "auxiliary_loss_clip": 0.0111942, "auxiliary_loss_mlp": 0.01038694, "balance_loss_clip": 1.05207086, "balance_loss_mlp": 1.02260041, "epoch": 0.3644671576732301, "flos": 18040515966720.0, "grad_norm": 2.5119296796020354, "language_loss": 0.75012159, "learning_rate": 2.9368524771494718e-06, "loss": 0.77170277, "num_input_tokens_seen": 130224445, "step": 6062, "time_per_iteration": 2.659853935241699 }, { "auxiliary_loss_clip": 0.01108402, "auxiliary_loss_mlp": 0.01034157, "balance_loss_clip": 1.04851866, "balance_loss_mlp": 1.01628149, "epoch": 0.3645272809258981, "flos": 21542811667200.0, "grad_norm": 2.568706719167558, "language_loss": 0.72070628, "learning_rate": 2.936508368977432e-06, "loss": 0.74213189, "num_input_tokens_seen": 130245380, "step": 6063, "time_per_iteration": 2.7098159790039062 }, { "auxiliary_loss_clip": 0.01118768, "auxiliary_loss_mlp": 0.010373, "balance_loss_clip": 1.04472148, "balance_loss_mlp": 1.02187479, "epoch": 0.36458740417856605, "flos": 22746860490240.0, "grad_norm": 2.3511982692020936, "language_loss": 0.68179435, "learning_rate": 2.936164225292901e-06, "loss": 0.70335501, "num_input_tokens_seen": 130265575, "step": 6064, "time_per_iteration": 2.6513044834136963 }, { "auxiliary_loss_clip": 0.01116627, "auxiliary_loss_mlp": 0.01045789, "balance_loss_clip": 1.04925466, "balance_loss_mlp": 1.02988076, "epoch": 0.364647527431234, "flos": 26140670138880.0, "grad_norm": 1.9840367281230236, "language_loss": 0.74147421, "learning_rate": 2.9358200461089297e-06, "loss": 0.76309836, "num_input_tokens_seen": 130286195, "step": 6065, "time_per_iteration": 2.764556407928467 }, { "auxiliary_loss_clip": 0.0111688, "auxiliary_loss_mlp": 0.01040465, "balance_loss_clip": 1.04924774, "balance_loss_mlp": 1.02306008, "epoch": 0.364707650683902, "flos": 31029907737600.0, "grad_norm": 2.0108238901766042, "language_loss": 0.75444913, "learning_rate": 2.9354758314385676e-06, "loss": 0.77602255, "num_input_tokens_seen": 130306095, "step": 6066, "time_per_iteration": 2.749293088912964 }, { "auxiliary_loss_clip": 0.01121102, "auxiliary_loss_mlp": 0.01034674, "balance_loss_clip": 1.04859555, "balance_loss_mlp": 1.02010643, "epoch": 0.36476777393656995, "flos": 19572896033280.0, "grad_norm": 2.8385875288429587, "language_loss": 0.76480901, "learning_rate": 2.9351315812948684e-06, "loss": 0.78636676, "num_input_tokens_seen": 130324685, "step": 6067, "time_per_iteration": 2.619833469390869 }, { "auxiliary_loss_clip": 0.01135088, "auxiliary_loss_mlp": 0.0103807, "balance_loss_clip": 1.05067635, "balance_loss_mlp": 1.02401567, "epoch": 0.3648278971892379, "flos": 17748849530880.0, "grad_norm": 2.2214902441228563, "language_loss": 0.71036232, "learning_rate": 2.934787295690886e-06, "loss": 0.73209393, "num_input_tokens_seen": 130343855, "step": 6068, "time_per_iteration": 2.633678674697876 }, { "auxiliary_loss_clip": 0.01119276, "auxiliary_loss_mlp": 0.01039471, "balance_loss_clip": 1.0432384, "balance_loss_mlp": 1.02402711, "epoch": 0.3648880204419059, "flos": 17931167988480.0, "grad_norm": 2.184109901605664, "language_loss": 0.74421692, "learning_rate": 2.9344429746396755e-06, "loss": 0.76580441, "num_input_tokens_seen": 130362320, "step": 6069, "time_per_iteration": 2.6463425159454346 }, { "auxiliary_loss_clip": 0.01115147, "auxiliary_loss_mlp": 0.0103807, "balance_loss_clip": 1.04814148, "balance_loss_mlp": 1.02237022, "epoch": 0.3649481436945739, "flos": 22638266697600.0, "grad_norm": 1.8874088651190308, "language_loss": 0.66247845, "learning_rate": 2.9340986181542945e-06, "loss": 0.68401062, "num_input_tokens_seen": 130383165, "step": 6070, "time_per_iteration": 2.70835280418396 }, { "auxiliary_loss_clip": 0.01118852, "auxiliary_loss_mlp": 0.01036547, "balance_loss_clip": 1.04837227, "balance_loss_mlp": 1.02161574, "epoch": 0.36500826694724187, "flos": 21579656042880.0, "grad_norm": 1.882521473859371, "language_loss": 0.74406028, "learning_rate": 2.9337542262477994e-06, "loss": 0.76561427, "num_input_tokens_seen": 130402425, "step": 6071, "time_per_iteration": 2.6479921340942383 }, { "auxiliary_loss_clip": 0.0112348, "auxiliary_loss_mlp": 0.01037332, "balance_loss_clip": 1.04683149, "balance_loss_mlp": 1.02142978, "epoch": 0.36506839019990983, "flos": 13772533023360.0, "grad_norm": 1.9443656652026238, "language_loss": 0.88592315, "learning_rate": 2.9334097989332506e-06, "loss": 0.9075312, "num_input_tokens_seen": 130419440, "step": 6072, "time_per_iteration": 2.641340732574463 }, { "auxiliary_loss_clip": 0.01122637, "auxiliary_loss_mlp": 0.01036427, "balance_loss_clip": 1.0495832, "balance_loss_mlp": 1.02225924, "epoch": 0.3651285134525778, "flos": 17274972378240.0, "grad_norm": 2.382408041683643, "language_loss": 0.72436309, "learning_rate": 2.9330653362237094e-06, "loss": 0.7459538, "num_input_tokens_seen": 130438495, "step": 6073, "time_per_iteration": 2.6814513206481934 }, { "auxiliary_loss_clip": 0.01067321, "auxiliary_loss_mlp": 0.01042007, "balance_loss_clip": 1.04483008, "balance_loss_mlp": 1.0249722, "epoch": 0.36518863670524576, "flos": 21907987286400.0, "grad_norm": 3.1332797030940913, "language_loss": 0.66850221, "learning_rate": 2.932720838132236e-06, "loss": 0.68959546, "num_input_tokens_seen": 130455575, "step": 6074, "time_per_iteration": 2.7943460941314697 }, { "auxiliary_loss_clip": 0.01103652, "auxiliary_loss_mlp": 0.01037343, "balance_loss_clip": 1.04833269, "balance_loss_mlp": 1.02238262, "epoch": 0.3652487599579137, "flos": 27122180250240.0, "grad_norm": 1.5371260958261816, "language_loss": 0.72812623, "learning_rate": 2.9323763046718954e-06, "loss": 0.74953616, "num_input_tokens_seen": 130476385, "step": 6075, "time_per_iteration": 2.7581374645233154 }, { "auxiliary_loss_clip": 0.01100578, "auxiliary_loss_mlp": 0.01046604, "balance_loss_clip": 1.04679585, "balance_loss_mlp": 1.03011715, "epoch": 0.3653088832105817, "flos": 19755573626880.0, "grad_norm": 2.1248471900324186, "language_loss": 0.89377797, "learning_rate": 2.9320317358557524e-06, "loss": 0.91524976, "num_input_tokens_seen": 130493630, "step": 6076, "time_per_iteration": 2.7085182666778564 }, { "auxiliary_loss_clip": 0.01125287, "auxiliary_loss_mlp": 0.01043945, "balance_loss_clip": 1.0504595, "balance_loss_mlp": 1.02784586, "epoch": 0.36536900646324966, "flos": 13115008609920.0, "grad_norm": 2.218138292044272, "language_loss": 0.69377828, "learning_rate": 2.931687131696872e-06, "loss": 0.71547067, "num_input_tokens_seen": 130510735, "step": 6077, "time_per_iteration": 2.6516926288604736 }, { "auxiliary_loss_clip": 0.01063406, "auxiliary_loss_mlp": 0.01003112, "balance_loss_clip": 1.03200221, "balance_loss_mlp": 1.00121677, "epoch": 0.3654291297159176, "flos": 71100472383360.0, "grad_norm": 0.7484778409156561, "language_loss": 0.61802375, "learning_rate": 2.9313424922083224e-06, "loss": 0.63868892, "num_input_tokens_seen": 130577050, "step": 6078, "time_per_iteration": 3.2192225456237793 }, { "auxiliary_loss_clip": 0.01105852, "auxiliary_loss_mlp": 0.01053011, "balance_loss_clip": 1.04234397, "balance_loss_mlp": 1.03565383, "epoch": 0.3654892529685856, "flos": 23617478338560.0, "grad_norm": 2.6620805395927283, "language_loss": 0.78445792, "learning_rate": 2.930997817403173e-06, "loss": 0.80604661, "num_input_tokens_seen": 130593780, "step": 6079, "time_per_iteration": 2.6616902351379395 }, { "auxiliary_loss_clip": 0.01129934, "auxiliary_loss_mlp": 0.01040158, "balance_loss_clip": 1.05226243, "balance_loss_mlp": 1.02386224, "epoch": 0.36554937622125355, "flos": 43470799850880.0, "grad_norm": 2.4767906644356037, "language_loss": 0.62662333, "learning_rate": 2.9306531072944913e-06, "loss": 0.64832425, "num_input_tokens_seen": 130615510, "step": 6080, "time_per_iteration": 2.8651509284973145 }, { "auxiliary_loss_clip": 0.01108292, "auxiliary_loss_mlp": 0.01042236, "balance_loss_clip": 1.04737091, "balance_loss_mlp": 1.02529645, "epoch": 0.3656094994739215, "flos": 23294641875840.0, "grad_norm": 3.1314387429818327, "language_loss": 0.67686033, "learning_rate": 2.930308361895352e-06, "loss": 0.69836557, "num_input_tokens_seen": 130635410, "step": 6081, "time_per_iteration": 2.707031011581421 }, { "auxiliary_loss_clip": 0.01112746, "auxiliary_loss_mlp": 0.00773158, "balance_loss_clip": 1.04989302, "balance_loss_mlp": 1.00033236, "epoch": 0.3656696227265895, "flos": 24571984400640.0, "grad_norm": 1.5854068035466964, "language_loss": 0.74755692, "learning_rate": 2.9299635812188257e-06, "loss": 0.76641595, "num_input_tokens_seen": 130657725, "step": 6082, "time_per_iteration": 2.7261881828308105 }, { "auxiliary_loss_clip": 0.01072732, "auxiliary_loss_mlp": 0.00772597, "balance_loss_clip": 1.04222691, "balance_loss_mlp": 1.00042963, "epoch": 0.3657297459792575, "flos": 27928375056000.0, "grad_norm": 2.051480252043875, "language_loss": 0.82956016, "learning_rate": 2.929618765277987e-06, "loss": 0.8480134, "num_input_tokens_seen": 130678360, "step": 6083, "time_per_iteration": 4.360748529434204 }, { "auxiliary_loss_clip": 0.01041394, "auxiliary_loss_mlp": 0.01001412, "balance_loss_clip": 1.02900386, "balance_loss_mlp": 0.99936181, "epoch": 0.36578986923192547, "flos": 67392622126080.0, "grad_norm": 0.8163771270511553, "language_loss": 0.59314513, "learning_rate": 2.9292739140859125e-06, "loss": 0.61357319, "num_input_tokens_seen": 130742110, "step": 6084, "time_per_iteration": 3.3273561000823975 }, { "auxiliary_loss_clip": 0.0109183, "auxiliary_loss_mlp": 0.0104143, "balance_loss_clip": 1.04496968, "balance_loss_mlp": 1.02570593, "epoch": 0.36584999248459343, "flos": 20227511445120.0, "grad_norm": 3.4329037043843478, "language_loss": 0.72791892, "learning_rate": 2.9289290276556767e-06, "loss": 0.74925154, "num_input_tokens_seen": 130759870, "step": 6085, "time_per_iteration": 2.7221856117248535 }, { "auxiliary_loss_clip": 0.01101549, "auxiliary_loss_mlp": 0.01038512, "balance_loss_clip": 1.04982924, "balance_loss_mlp": 1.02383745, "epoch": 0.3659101157372614, "flos": 19062461813760.0, "grad_norm": 2.636651052815632, "language_loss": 0.77860379, "learning_rate": 2.9285841060003604e-06, "loss": 0.80000436, "num_input_tokens_seen": 130778510, "step": 6086, "time_per_iteration": 4.265977621078491 }, { "auxiliary_loss_clip": 0.0111591, "auxiliary_loss_mlp": 0.01032554, "balance_loss_clip": 1.04616153, "balance_loss_mlp": 1.01771855, "epoch": 0.36597023898992936, "flos": 30810708990720.0, "grad_norm": 1.8562986050024126, "language_loss": 0.76759315, "learning_rate": 2.9282391491330416e-06, "loss": 0.78907776, "num_input_tokens_seen": 130798535, "step": 6087, "time_per_iteration": 4.227373123168945 }, { "auxiliary_loss_clip": 0.01081855, "auxiliary_loss_mlp": 0.01042282, "balance_loss_clip": 1.04556108, "balance_loss_mlp": 1.02589023, "epoch": 0.36603036224259733, "flos": 20521799573760.0, "grad_norm": 2.2476274891892474, "language_loss": 0.71063232, "learning_rate": 2.9278941570668002e-06, "loss": 0.73187363, "num_input_tokens_seen": 130816655, "step": 6088, "time_per_iteration": 4.3080058097839355 }, { "auxiliary_loss_clip": 0.01136094, "auxiliary_loss_mlp": 0.01039702, "balance_loss_clip": 1.05314517, "balance_loss_mlp": 1.02267289, "epoch": 0.3660904854952653, "flos": 38329397798400.0, "grad_norm": 1.6318023186273214, "language_loss": 0.79717827, "learning_rate": 2.92754912981472e-06, "loss": 0.81893623, "num_input_tokens_seen": 130841225, "step": 6089, "time_per_iteration": 2.782954216003418 }, { "auxiliary_loss_clip": 0.01099767, "auxiliary_loss_mlp": 0.01036428, "balance_loss_clip": 1.04514015, "balance_loss_mlp": 1.02220643, "epoch": 0.36615060874793326, "flos": 21835555511040.0, "grad_norm": 2.0312735397290043, "language_loss": 0.71617413, "learning_rate": 2.927204067389884e-06, "loss": 0.73753607, "num_input_tokens_seen": 130861050, "step": 6090, "time_per_iteration": 2.7414958477020264 }, { "auxiliary_loss_clip": 0.01105933, "auxiliary_loss_mlp": 0.01047805, "balance_loss_clip": 1.05133104, "balance_loss_mlp": 1.03305852, "epoch": 0.3662107320006012, "flos": 16581537342720.0, "grad_norm": 2.037307676788604, "language_loss": 0.74434924, "learning_rate": 2.9268589698053763e-06, "loss": 0.7658866, "num_input_tokens_seen": 130879775, "step": 6091, "time_per_iteration": 2.628554344177246 }, { "auxiliary_loss_clip": 0.01076087, "auxiliary_loss_mlp": 0.01042935, "balance_loss_clip": 1.04836047, "balance_loss_mlp": 1.02728868, "epoch": 0.3662708552532692, "flos": 20958365473920.0, "grad_norm": 2.1960531931019682, "language_loss": 0.73387206, "learning_rate": 2.926513837074284e-06, "loss": 0.75506234, "num_input_tokens_seen": 130898070, "step": 6092, "time_per_iteration": 2.7320556640625 }, { "auxiliary_loss_clip": 0.01127006, "auxiliary_loss_mlp": 0.01044139, "balance_loss_clip": 1.04809344, "balance_loss_mlp": 1.02796876, "epoch": 0.36633097850593715, "flos": 21902707987200.0, "grad_norm": 1.9967925590844784, "language_loss": 0.77662504, "learning_rate": 2.9261686692096942e-06, "loss": 0.79833645, "num_input_tokens_seen": 130915250, "step": 6093, "time_per_iteration": 2.721311092376709 }, { "auxiliary_loss_clip": 0.01124005, "auxiliary_loss_mlp": 0.01042053, "balance_loss_clip": 1.04696584, "balance_loss_mlp": 1.02686548, "epoch": 0.3663911017586051, "flos": 32854133808000.0, "grad_norm": 1.926436620767835, "language_loss": 0.7455743, "learning_rate": 2.925823466224696e-06, "loss": 0.76723486, "num_input_tokens_seen": 130936995, "step": 6094, "time_per_iteration": 2.767188310623169 }, { "auxiliary_loss_clip": 0.01142303, "auxiliary_loss_mlp": 0.01055832, "balance_loss_clip": 1.05334711, "balance_loss_mlp": 1.03969133, "epoch": 0.3664512250112731, "flos": 27271748482560.0, "grad_norm": 1.743331442809004, "language_loss": 0.79444361, "learning_rate": 2.9254782281323785e-06, "loss": 0.81642497, "num_input_tokens_seen": 130957970, "step": 6095, "time_per_iteration": 2.718632459640503 }, { "auxiliary_loss_clip": 0.01118218, "auxiliary_loss_mlp": 0.00774719, "balance_loss_clip": 1.05141842, "balance_loss_mlp": 1.00037265, "epoch": 0.3665113482639411, "flos": 17784436930560.0, "grad_norm": 3.4988865885900178, "language_loss": 0.73592722, "learning_rate": 2.925132954945834e-06, "loss": 0.75485659, "num_input_tokens_seen": 130974915, "step": 6096, "time_per_iteration": 2.674382448196411 }, { "auxiliary_loss_clip": 0.01099743, "auxiliary_loss_mlp": 0.01038971, "balance_loss_clip": 1.04458702, "balance_loss_mlp": 1.02355742, "epoch": 0.36657147151660907, "flos": 27854614477440.0, "grad_norm": 2.41624095312735, "language_loss": 0.67081815, "learning_rate": 2.924787646678155e-06, "loss": 0.69220531, "num_input_tokens_seen": 130995745, "step": 6097, "time_per_iteration": 2.789118766784668 }, { "auxiliary_loss_clip": 0.01077673, "auxiliary_loss_mlp": 0.01038362, "balance_loss_clip": 1.04489172, "balance_loss_mlp": 1.02268624, "epoch": 0.36663159476927704, "flos": 25374013228800.0, "grad_norm": 1.4796406838499911, "language_loss": 0.77679402, "learning_rate": 2.9244423033424365e-06, "loss": 0.79795432, "num_input_tokens_seen": 131015545, "step": 6098, "time_per_iteration": 2.7803733348846436 }, { "auxiliary_loss_clip": 0.01122346, "auxiliary_loss_mlp": 0.01045291, "balance_loss_clip": 1.04734826, "balance_loss_mlp": 1.02987766, "epoch": 0.366691718021945, "flos": 21357225072000.0, "grad_norm": 1.744595499322522, "language_loss": 0.73707491, "learning_rate": 2.9240969249517723e-06, "loss": 0.75875127, "num_input_tokens_seen": 131033990, "step": 6099, "time_per_iteration": 2.6809163093566895 }, { "auxiliary_loss_clip": 0.01111202, "auxiliary_loss_mlp": 0.01044256, "balance_loss_clip": 1.04759586, "balance_loss_mlp": 1.02931285, "epoch": 0.36675184127461297, "flos": 16800376953600.0, "grad_norm": 1.8475933970370078, "language_loss": 0.84773195, "learning_rate": 2.9237515115192602e-06, "loss": 0.86928654, "num_input_tokens_seen": 131050710, "step": 6100, "time_per_iteration": 2.6730356216430664 }, { "auxiliary_loss_clip": 0.01102438, "auxiliary_loss_mlp": 0.01037575, "balance_loss_clip": 1.04448223, "balance_loss_mlp": 1.02181566, "epoch": 0.36681196452728093, "flos": 21906514828800.0, "grad_norm": 3.9532097547953104, "language_loss": 0.70893979, "learning_rate": 2.9234060630579992e-06, "loss": 0.73033994, "num_input_tokens_seen": 131071435, "step": 6101, "time_per_iteration": 2.7369589805603027 }, { "auxiliary_loss_clip": 0.01111262, "auxiliary_loss_mlp": 0.01052791, "balance_loss_clip": 1.05096185, "balance_loss_mlp": 1.0361371, "epoch": 0.3668720877799489, "flos": 17712436118400.0, "grad_norm": 2.286737474315047, "language_loss": 0.76634502, "learning_rate": 2.9230605795810865e-06, "loss": 0.7879855, "num_input_tokens_seen": 131088775, "step": 6102, "time_per_iteration": 2.7081708908081055 }, { "auxiliary_loss_clip": 0.01131629, "auxiliary_loss_mlp": 0.01037373, "balance_loss_clip": 1.0524683, "balance_loss_mlp": 1.02050483, "epoch": 0.36693221103261686, "flos": 47045455499520.0, "grad_norm": 4.369253140908342, "language_loss": 0.70019859, "learning_rate": 2.922715061101625e-06, "loss": 0.72188866, "num_input_tokens_seen": 131112800, "step": 6103, "time_per_iteration": 2.8610281944274902 }, { "auxiliary_loss_clip": 0.01093091, "auxiliary_loss_mlp": 0.0103895, "balance_loss_clip": 1.04730344, "balance_loss_mlp": 1.02283263, "epoch": 0.3669923342852848, "flos": 15960929132160.0, "grad_norm": 3.0883152470965842, "language_loss": 0.72272754, "learning_rate": 2.922369507632716e-06, "loss": 0.744048, "num_input_tokens_seen": 131131150, "step": 6104, "time_per_iteration": 2.7520432472229004 }, { "auxiliary_loss_clip": 0.01127975, "auxiliary_loss_mlp": 0.01036046, "balance_loss_clip": 1.05017686, "balance_loss_mlp": 1.01940393, "epoch": 0.3670524575379528, "flos": 19974485064960.0, "grad_norm": 2.1608886453477947, "language_loss": 0.81461251, "learning_rate": 2.9220239191874617e-06, "loss": 0.83625269, "num_input_tokens_seen": 131150365, "step": 6105, "time_per_iteration": 2.7565362453460693 }, { "auxiliary_loss_clip": 0.0114363, "auxiliary_loss_mlp": 0.01041522, "balance_loss_clip": 1.05170739, "balance_loss_mlp": 1.02526236, "epoch": 0.36711258079062076, "flos": 25702955003520.0, "grad_norm": 1.7202629897198451, "language_loss": 0.81035495, "learning_rate": 2.9216782957789692e-06, "loss": 0.83220649, "num_input_tokens_seen": 131169310, "step": 6106, "time_per_iteration": 2.73502779006958 }, { "auxiliary_loss_clip": 0.01035121, "auxiliary_loss_mlp": 0.00753905, "balance_loss_clip": 1.03131676, "balance_loss_mlp": 1.00104892, "epoch": 0.3671727040432887, "flos": 60772743342720.0, "grad_norm": 0.6921927745874564, "language_loss": 0.59176284, "learning_rate": 2.9213326374203426e-06, "loss": 0.60965312, "num_input_tokens_seen": 131232900, "step": 6107, "time_per_iteration": 3.2754647731781006 }, { "auxiliary_loss_clip": 0.01111272, "auxiliary_loss_mlp": 0.01035704, "balance_loss_clip": 1.04770529, "balance_loss_mlp": 1.02058864, "epoch": 0.3672328272959567, "flos": 18661303745280.0, "grad_norm": 1.8102661289525128, "language_loss": 0.74492711, "learning_rate": 2.92098694412469e-06, "loss": 0.76639688, "num_input_tokens_seen": 131250920, "step": 6108, "time_per_iteration": 2.730562448501587 }, { "auxiliary_loss_clip": 0.01129123, "auxiliary_loss_mlp": 0.01037704, "balance_loss_clip": 1.04957151, "balance_loss_mlp": 1.02196801, "epoch": 0.3672929505486247, "flos": 15049049535360.0, "grad_norm": 2.04949693656995, "language_loss": 0.72790694, "learning_rate": 2.9206412159051213e-06, "loss": 0.7495752, "num_input_tokens_seen": 131267910, "step": 6109, "time_per_iteration": 2.6488542556762695 }, { "auxiliary_loss_clip": 0.01065451, "auxiliary_loss_mlp": 0.01040533, "balance_loss_clip": 1.04156637, "balance_loss_mlp": 1.02426052, "epoch": 0.3673530738012927, "flos": 20589347099520.0, "grad_norm": 4.856830375229604, "language_loss": 0.53295934, "learning_rate": 2.920295452774744e-06, "loss": 0.55401909, "num_input_tokens_seen": 131287150, "step": 6110, "time_per_iteration": 2.8366596698760986 }, { "auxiliary_loss_clip": 0.01123878, "auxiliary_loss_mlp": 0.01039006, "balance_loss_clip": 1.04783487, "balance_loss_mlp": 1.02253747, "epoch": 0.36741319705396064, "flos": 21689830033920.0, "grad_norm": 1.6516494205850427, "language_loss": 0.80507129, "learning_rate": 2.919949654746672e-06, "loss": 0.82670015, "num_input_tokens_seen": 131308225, "step": 6111, "time_per_iteration": 2.7537708282470703 }, { "auxiliary_loss_clip": 0.01083524, "auxiliary_loss_mlp": 0.01044306, "balance_loss_clip": 1.04381704, "balance_loss_mlp": 1.02897525, "epoch": 0.3674733203066286, "flos": 29862200499840.0, "grad_norm": 1.7980410764958656, "language_loss": 0.72401643, "learning_rate": 2.9196038218340163e-06, "loss": 0.74529469, "num_input_tokens_seen": 131332115, "step": 6112, "time_per_iteration": 2.80513858795166 }, { "auxiliary_loss_clip": 0.0112775, "auxiliary_loss_mlp": 0.01046215, "balance_loss_clip": 1.05025816, "balance_loss_mlp": 1.03102732, "epoch": 0.36753344355929657, "flos": 18257021193600.0, "grad_norm": 1.6179233760027578, "language_loss": 0.8539387, "learning_rate": 2.919257954049892e-06, "loss": 0.8756783, "num_input_tokens_seen": 131351885, "step": 6113, "time_per_iteration": 2.6997315883636475 }, { "auxiliary_loss_clip": 0.01128342, "auxiliary_loss_mlp": 0.01041644, "balance_loss_clip": 1.04813516, "balance_loss_mlp": 1.02512193, "epoch": 0.36759356681196453, "flos": 25301150490240.0, "grad_norm": 2.2420277636185872, "language_loss": 0.78542709, "learning_rate": 2.918912051407413e-06, "loss": 0.807127, "num_input_tokens_seen": 131370245, "step": 6114, "time_per_iteration": 2.694831609725952 }, { "auxiliary_loss_clip": 0.01133627, "auxiliary_loss_mlp": 0.01044455, "balance_loss_clip": 1.05145383, "balance_loss_mlp": 1.02612031, "epoch": 0.3676536900646325, "flos": 21032952065280.0, "grad_norm": 1.6750895304816946, "language_loss": 0.67368686, "learning_rate": 2.918566113919698e-06, "loss": 0.69546771, "num_input_tokens_seen": 131388115, "step": 6115, "time_per_iteration": 2.6966724395751953 }, { "auxiliary_loss_clip": 0.01104674, "auxiliary_loss_mlp": 0.01037383, "balance_loss_clip": 1.04332471, "balance_loss_mlp": 1.02229142, "epoch": 0.36771381331730046, "flos": 16288506190080.0, "grad_norm": 3.500949938115168, "language_loss": 0.76685899, "learning_rate": 2.9182201415998636e-06, "loss": 0.78827953, "num_input_tokens_seen": 131404595, "step": 6116, "time_per_iteration": 2.6796109676361084 }, { "auxiliary_loss_clip": 0.01088778, "auxiliary_loss_mlp": 0.01043047, "balance_loss_clip": 1.04433835, "balance_loss_mlp": 1.02729988, "epoch": 0.36777393656996843, "flos": 22309971367680.0, "grad_norm": 1.7533988300226562, "language_loss": 0.62997502, "learning_rate": 2.9178741344610286e-06, "loss": 0.65129328, "num_input_tokens_seen": 131423760, "step": 6117, "time_per_iteration": 2.7784011363983154 }, { "auxiliary_loss_clip": 0.01103848, "auxiliary_loss_mlp": 0.01037351, "balance_loss_clip": 1.04275632, "balance_loss_mlp": 1.0210557, "epoch": 0.3678340598226364, "flos": 26834069260800.0, "grad_norm": 1.9867834860772036, "language_loss": 0.73087811, "learning_rate": 2.9175280925163156e-06, "loss": 0.75229007, "num_input_tokens_seen": 131444955, "step": 6118, "time_per_iteration": 2.734731674194336 }, { "auxiliary_loss_clip": 0.01132746, "auxiliary_loss_mlp": 0.01043898, "balance_loss_clip": 1.05198336, "balance_loss_mlp": 1.0266242, "epoch": 0.36789418307530436, "flos": 21761723105280.0, "grad_norm": 2.319960114880422, "language_loss": 0.72638988, "learning_rate": 2.9171820157788445e-06, "loss": 0.74815631, "num_input_tokens_seen": 131465720, "step": 6119, "time_per_iteration": 2.7073371410369873 }, { "auxiliary_loss_clip": 0.0111183, "auxiliary_loss_mlp": 0.01037904, "balance_loss_clip": 1.04830384, "balance_loss_mlp": 1.02101171, "epoch": 0.3679543063279723, "flos": 15924192497280.0, "grad_norm": 1.9587818101138383, "language_loss": 0.80524689, "learning_rate": 2.9168359042617404e-06, "loss": 0.8267442, "num_input_tokens_seen": 131483080, "step": 6120, "time_per_iteration": 2.679933547973633 }, { "auxiliary_loss_clip": 0.01093981, "auxiliary_loss_mlp": 0.0104441, "balance_loss_clip": 1.04785204, "balance_loss_mlp": 1.02894819, "epoch": 0.3680144295806403, "flos": 24275541456000.0, "grad_norm": 2.4092121945194496, "language_loss": 0.64745319, "learning_rate": 2.916489757978126e-06, "loss": 0.66883707, "num_input_tokens_seen": 131502545, "step": 6121, "time_per_iteration": 2.7067880630493164 }, { "auxiliary_loss_clip": 0.01126101, "auxiliary_loss_mlp": 0.01043212, "balance_loss_clip": 1.05021691, "balance_loss_mlp": 1.02735114, "epoch": 0.36807455283330826, "flos": 26104148985600.0, "grad_norm": 1.774708172393826, "language_loss": 0.71686751, "learning_rate": 2.9161435769411286e-06, "loss": 0.73856068, "num_input_tokens_seen": 131522155, "step": 6122, "time_per_iteration": 4.026647329330444 }, { "auxiliary_loss_clip": 0.01106964, "auxiliary_loss_mlp": 0.01043545, "balance_loss_clip": 1.04859734, "balance_loss_mlp": 1.0265938, "epoch": 0.3681346760859763, "flos": 24644990793600.0, "grad_norm": 5.6855406070233245, "language_loss": 0.69653022, "learning_rate": 2.915797361163875e-06, "loss": 0.71803534, "num_input_tokens_seen": 131543865, "step": 6123, "time_per_iteration": 2.7548627853393555 }, { "auxiliary_loss_clip": 0.01128204, "auxiliary_loss_mlp": 0.01040578, "balance_loss_clip": 1.04822993, "balance_loss_mlp": 1.02251744, "epoch": 0.36819479933864424, "flos": 23878369797120.0, "grad_norm": 7.022932421262019, "language_loss": 0.73640841, "learning_rate": 2.9154511106594933e-06, "loss": 0.75809622, "num_input_tokens_seen": 131562155, "step": 6124, "time_per_iteration": 2.6710870265960693 }, { "auxiliary_loss_clip": 0.01116833, "auxiliary_loss_mlp": 0.01045789, "balance_loss_clip": 1.04977059, "balance_loss_mlp": 1.02809882, "epoch": 0.3682549225913122, "flos": 25553997302400.0, "grad_norm": 1.931714997280456, "language_loss": 0.74334198, "learning_rate": 2.915104825441114e-06, "loss": 0.76496822, "num_input_tokens_seen": 131581695, "step": 6125, "time_per_iteration": 4.175686359405518 }, { "auxiliary_loss_clip": 0.01132649, "auxiliary_loss_mlp": 0.01053205, "balance_loss_clip": 1.05193818, "balance_loss_mlp": 1.03514445, "epoch": 0.36831504584398017, "flos": 16946605221120.0, "grad_norm": 1.8318884745506827, "language_loss": 0.78127813, "learning_rate": 2.9147585055218686e-06, "loss": 0.80313659, "num_input_tokens_seen": 131599465, "step": 6126, "time_per_iteration": 2.6783266067504883 }, { "auxiliary_loss_clip": 0.01128437, "auxiliary_loss_mlp": 0.01045021, "balance_loss_clip": 1.0490706, "balance_loss_mlp": 1.02659082, "epoch": 0.36837516909664814, "flos": 19865065259520.0, "grad_norm": 2.7490159956422575, "language_loss": 0.66118228, "learning_rate": 2.914412150914888e-06, "loss": 0.68291688, "num_input_tokens_seen": 131618330, "step": 6127, "time_per_iteration": 4.20530891418457 }, { "auxiliary_loss_clip": 0.01120142, "auxiliary_loss_mlp": 0.01046706, "balance_loss_clip": 1.05205703, "balance_loss_mlp": 1.02980185, "epoch": 0.3684352923493161, "flos": 37626984362880.0, "grad_norm": 1.8515813315176315, "language_loss": 0.70152593, "learning_rate": 2.9140657616333074e-06, "loss": 0.72319436, "num_input_tokens_seen": 131638960, "step": 6128, "time_per_iteration": 4.498606204986572 }, { "auxiliary_loss_clip": 0.0112131, "auxiliary_loss_mlp": 0.01046424, "balance_loss_clip": 1.05264103, "balance_loss_mlp": 1.02957964, "epoch": 0.36849541560198407, "flos": 14465501182080.0, "grad_norm": 2.3245894967836698, "language_loss": 0.75067866, "learning_rate": 2.9137193376902614e-06, "loss": 0.77235603, "num_input_tokens_seen": 131657440, "step": 6129, "time_per_iteration": 2.6874284744262695 }, { "auxiliary_loss_clip": 0.01118674, "auxiliary_loss_mlp": 0.01040759, "balance_loss_clip": 1.04533887, "balance_loss_mlp": 1.02403355, "epoch": 0.36855553885465203, "flos": 25770753924480.0, "grad_norm": 1.6533761140504426, "language_loss": 0.84758681, "learning_rate": 2.9133728790988868e-06, "loss": 0.86918116, "num_input_tokens_seen": 131678035, "step": 6130, "time_per_iteration": 2.729963541030884 }, { "auxiliary_loss_clip": 0.0102639, "auxiliary_loss_mlp": 0.01017875, "balance_loss_clip": 1.02295637, "balance_loss_mlp": 1.01620567, "epoch": 0.36861566210732, "flos": 65049417377280.0, "grad_norm": 0.8481176099425293, "language_loss": 0.60254776, "learning_rate": 2.913026385872321e-06, "loss": 0.62299049, "num_input_tokens_seen": 131742470, "step": 6131, "time_per_iteration": 3.2806124687194824 }, { "auxiliary_loss_clip": 0.01097122, "auxiliary_loss_mlp": 0.01035652, "balance_loss_clip": 1.04542315, "balance_loss_mlp": 1.01914179, "epoch": 0.36867578535998796, "flos": 30954495133440.0, "grad_norm": 1.5587449528822306, "language_loss": 0.73085582, "learning_rate": 2.9126798580237034e-06, "loss": 0.75218356, "num_input_tokens_seen": 131764570, "step": 6132, "time_per_iteration": 2.781385898590088 }, { "auxiliary_loss_clip": 0.01127214, "auxiliary_loss_mlp": 0.01039387, "balance_loss_clip": 1.04795551, "balance_loss_mlp": 1.02187514, "epoch": 0.3687359086126559, "flos": 28837956182400.0, "grad_norm": 1.9292425463255205, "language_loss": 0.74192035, "learning_rate": 2.9123332955661736e-06, "loss": 0.76358628, "num_input_tokens_seen": 131785720, "step": 6133, "time_per_iteration": 2.718660831451416 }, { "auxiliary_loss_clip": 0.01072831, "auxiliary_loss_mlp": 0.01049093, "balance_loss_clip": 1.041502, "balance_loss_mlp": 1.03042495, "epoch": 0.3687960318653239, "flos": 21396798881280.0, "grad_norm": 1.8863128538280483, "language_loss": 0.71522588, "learning_rate": 2.911986698512874e-06, "loss": 0.73644507, "num_input_tokens_seen": 131804430, "step": 6134, "time_per_iteration": 2.8003294467926025 }, { "auxiliary_loss_clip": 0.01102901, "auxiliary_loss_mlp": 0.01034768, "balance_loss_clip": 1.0472008, "balance_loss_mlp": 1.01838863, "epoch": 0.36885615511799186, "flos": 20266043760000.0, "grad_norm": 1.6906065874809195, "language_loss": 0.75386798, "learning_rate": 2.9116400668769477e-06, "loss": 0.77524465, "num_input_tokens_seen": 131822060, "step": 6135, "time_per_iteration": 2.7916624546051025 }, { "auxiliary_loss_clip": 0.01030435, "auxiliary_loss_mlp": 0.01019879, "balance_loss_clip": 1.0281316, "balance_loss_mlp": 1.01760185, "epoch": 0.3689162783706599, "flos": 63088836301440.0, "grad_norm": 0.8159837123545765, "language_loss": 0.58766222, "learning_rate": 2.9112934006715376e-06, "loss": 0.60816532, "num_input_tokens_seen": 131880715, "step": 6136, "time_per_iteration": 3.2766408920288086 }, { "auxiliary_loss_clip": 0.01106354, "auxiliary_loss_mlp": 0.01043903, "balance_loss_clip": 1.04497695, "balance_loss_mlp": 1.02723718, "epoch": 0.36897640162332784, "flos": 10961984419200.0, "grad_norm": 2.3780452593473393, "language_loss": 0.79126394, "learning_rate": 2.9109466999097918e-06, "loss": 0.81276655, "num_input_tokens_seen": 131895850, "step": 6137, "time_per_iteration": 2.8411052227020264 }, { "auxiliary_loss_clip": 0.011261, "auxiliary_loss_mlp": 0.01043272, "balance_loss_clip": 1.04803205, "balance_loss_mlp": 1.02645159, "epoch": 0.3690365248759958, "flos": 20704297599360.0, "grad_norm": 2.0312275113078337, "language_loss": 0.7454071, "learning_rate": 2.9105999646048552e-06, "loss": 0.76710081, "num_input_tokens_seen": 131915775, "step": 6138, "time_per_iteration": 2.7210230827331543 }, { "auxiliary_loss_clip": 0.01090918, "auxiliary_loss_mlp": 0.01042472, "balance_loss_clip": 1.04320955, "balance_loss_mlp": 1.0259856, "epoch": 0.3690966481286638, "flos": 31826369957760.0, "grad_norm": 2.0947758027881767, "language_loss": 0.64676917, "learning_rate": 2.9102531947698764e-06, "loss": 0.66810304, "num_input_tokens_seen": 131935715, "step": 6139, "time_per_iteration": 2.8667304515838623 }, { "auxiliary_loss_clip": 0.01095075, "auxiliary_loss_mlp": 0.01042873, "balance_loss_clip": 1.04443955, "balance_loss_mlp": 1.02646971, "epoch": 0.36915677138133174, "flos": 13114936782720.0, "grad_norm": 2.1146776737326998, "language_loss": 0.71764016, "learning_rate": 2.909906390418006e-06, "loss": 0.73901963, "num_input_tokens_seen": 131954120, "step": 6140, "time_per_iteration": 2.718100070953369 }, { "auxiliary_loss_clip": 0.01017799, "auxiliary_loss_mlp": 0.01004631, "balance_loss_clip": 1.02079976, "balance_loss_mlp": 1.00281894, "epoch": 0.3692168946339997, "flos": 68686879956480.0, "grad_norm": 0.7503567012350645, "language_loss": 0.59252203, "learning_rate": 2.9095595515623934e-06, "loss": 0.61274636, "num_input_tokens_seen": 132017485, "step": 6141, "time_per_iteration": 3.3003833293914795 }, { "auxiliary_loss_clip": 0.01122088, "auxiliary_loss_mlp": 0.01040836, "balance_loss_clip": 1.04716861, "balance_loss_mlp": 1.02458787, "epoch": 0.36927701788666767, "flos": 22017873968640.0, "grad_norm": 1.900744005055956, "language_loss": 0.75374687, "learning_rate": 2.909212678216192e-06, "loss": 0.77537608, "num_input_tokens_seen": 132036760, "step": 6142, "time_per_iteration": 2.707676410675049 }, { "auxiliary_loss_clip": 0.01122008, "auxiliary_loss_mlp": 0.01037683, "balance_loss_clip": 1.04708242, "balance_loss_mlp": 1.02276349, "epoch": 0.36933714113933563, "flos": 21835591424640.0, "grad_norm": 2.0868371024046346, "language_loss": 0.77474618, "learning_rate": 2.908865770392555e-06, "loss": 0.79634303, "num_input_tokens_seen": 132056935, "step": 6143, "time_per_iteration": 2.6308929920196533 }, { "auxiliary_loss_clip": 0.01122961, "auxiliary_loss_mlp": 0.01033227, "balance_loss_clip": 1.04840302, "balance_loss_mlp": 1.01860011, "epoch": 0.3693972643920036, "flos": 23691705793920.0, "grad_norm": 2.7754530777388555, "language_loss": 0.82127941, "learning_rate": 2.9085188281046364e-06, "loss": 0.84284127, "num_input_tokens_seen": 132077285, "step": 6144, "time_per_iteration": 2.7094409465789795 }, { "auxiliary_loss_clip": 0.01126238, "auxiliary_loss_mlp": 0.01040495, "balance_loss_clip": 1.0479883, "balance_loss_mlp": 1.02547419, "epoch": 0.36945738764467156, "flos": 22856747172480.0, "grad_norm": 2.260022101229928, "language_loss": 0.774791, "learning_rate": 2.908171851365593e-06, "loss": 0.79645836, "num_input_tokens_seen": 132095520, "step": 6145, "time_per_iteration": 2.6951241493225098 }, { "auxiliary_loss_clip": 0.01120499, "auxiliary_loss_mlp": 0.01030806, "balance_loss_clip": 1.04903388, "balance_loss_mlp": 1.01503491, "epoch": 0.36951751089733953, "flos": 16615939593600.0, "grad_norm": 2.2611713814894423, "language_loss": 0.76861286, "learning_rate": 2.9078248401885815e-06, "loss": 0.79012597, "num_input_tokens_seen": 132112810, "step": 6146, "time_per_iteration": 2.6205246448516846 }, { "auxiliary_loss_clip": 0.0110988, "auxiliary_loss_mlp": 0.01042802, "balance_loss_clip": 1.04717457, "balance_loss_mlp": 1.02518249, "epoch": 0.3695776341500075, "flos": 18914545607040.0, "grad_norm": 3.3549376840260394, "language_loss": 0.80945081, "learning_rate": 2.907477794586761e-06, "loss": 0.83097762, "num_input_tokens_seen": 132131615, "step": 6147, "time_per_iteration": 2.7176942825317383 }, { "auxiliary_loss_clip": 0.01108097, "auxiliary_loss_mlp": 0.00773519, "balance_loss_clip": 1.05041718, "balance_loss_mlp": 1.00029731, "epoch": 0.36963775740267546, "flos": 20808474019200.0, "grad_norm": 1.8104892137163535, "language_loss": 0.83325249, "learning_rate": 2.9071307145732926e-06, "loss": 0.85206866, "num_input_tokens_seen": 132149585, "step": 6148, "time_per_iteration": 2.7764229774475098 }, { "auxiliary_loss_clip": 0.01121751, "auxiliary_loss_mlp": 0.01033697, "balance_loss_clip": 1.04946411, "balance_loss_mlp": 1.01843238, "epoch": 0.3696978806553435, "flos": 26061881656320.0, "grad_norm": 2.472295207741171, "language_loss": 0.74167144, "learning_rate": 2.9067836001613357e-06, "loss": 0.76322597, "num_input_tokens_seen": 132165555, "step": 6149, "time_per_iteration": 2.729785680770874 }, { "auxiliary_loss_clip": 0.01141043, "auxiliary_loss_mlp": 0.01040796, "balance_loss_clip": 1.0524776, "balance_loss_mlp": 1.02347541, "epoch": 0.36975800390801145, "flos": 26833925606400.0, "grad_norm": 2.18045381202803, "language_loss": 0.71229833, "learning_rate": 2.906436451364054e-06, "loss": 0.73411667, "num_input_tokens_seen": 132185100, "step": 6150, "time_per_iteration": 2.6558914184570312 }, { "auxiliary_loss_clip": 0.01112432, "auxiliary_loss_mlp": 0.0104236, "balance_loss_clip": 1.04834723, "balance_loss_mlp": 1.02634454, "epoch": 0.3698181271606794, "flos": 21142623265920.0, "grad_norm": 2.1283605732632487, "language_loss": 0.82001126, "learning_rate": 2.906089268194611e-06, "loss": 0.84155917, "num_input_tokens_seen": 132203930, "step": 6151, "time_per_iteration": 2.811908483505249 }, { "auxiliary_loss_clip": 0.0104085, "auxiliary_loss_mlp": 0.01012111, "balance_loss_clip": 1.02895284, "balance_loss_mlp": 1.01035905, "epoch": 0.3698782504133474, "flos": 66742639568640.0, "grad_norm": 0.8434423047890295, "language_loss": 0.63103437, "learning_rate": 2.9057420506661726e-06, "loss": 0.651564, "num_input_tokens_seen": 132263845, "step": 6152, "time_per_iteration": 3.283348798751831 }, { "auxiliary_loss_clip": 0.01083912, "auxiliary_loss_mlp": 0.01046371, "balance_loss_clip": 1.04603028, "balance_loss_mlp": 1.02939606, "epoch": 0.36993837366601534, "flos": 24311523905280.0, "grad_norm": 2.101714417244525, "language_loss": 0.70249707, "learning_rate": 2.9053947987919044e-06, "loss": 0.72379988, "num_input_tokens_seen": 132282350, "step": 6153, "time_per_iteration": 2.776003837585449 }, { "auxiliary_loss_clip": 0.01126735, "auxiliary_loss_mlp": 0.01038393, "balance_loss_clip": 1.04984677, "balance_loss_mlp": 1.02176309, "epoch": 0.3699984969186833, "flos": 24349194293760.0, "grad_norm": 1.5983560512083512, "language_loss": 0.72364891, "learning_rate": 2.9050475125849755e-06, "loss": 0.74530017, "num_input_tokens_seen": 132301930, "step": 6154, "time_per_iteration": 2.7031455039978027 }, { "auxiliary_loss_clip": 0.01108862, "auxiliary_loss_mlp": 0.01038947, "balance_loss_clip": 1.04792106, "balance_loss_mlp": 1.02376008, "epoch": 0.37005862017135127, "flos": 19829154637440.0, "grad_norm": 1.6579101756116525, "language_loss": 0.67716074, "learning_rate": 2.9047001920585534e-06, "loss": 0.6986388, "num_input_tokens_seen": 132320915, "step": 6155, "time_per_iteration": 2.7716591358184814 }, { "auxiliary_loss_clip": 0.01124062, "auxiliary_loss_mlp": 0.01032665, "balance_loss_clip": 1.04789114, "balance_loss_mlp": 1.0171442, "epoch": 0.37011874342401924, "flos": 19573793873280.0, "grad_norm": 1.797024775246088, "language_loss": 0.68048114, "learning_rate": 2.9043528372258097e-06, "loss": 0.70204842, "num_input_tokens_seen": 132340415, "step": 6156, "time_per_iteration": 2.7830615043640137 }, { "auxiliary_loss_clip": 0.01109781, "auxiliary_loss_mlp": 0.0103684, "balance_loss_clip": 1.04603815, "balance_loss_mlp": 1.02202225, "epoch": 0.3701788666766872, "flos": 20374350243840.0, "grad_norm": 1.8485807917443284, "language_loss": 0.82232833, "learning_rate": 2.904005448099916e-06, "loss": 0.84379458, "num_input_tokens_seen": 132358600, "step": 6157, "time_per_iteration": 2.676429033279419 }, { "auxiliary_loss_clip": 0.01087924, "auxiliary_loss_mlp": 0.01042208, "balance_loss_clip": 1.04360199, "balance_loss_mlp": 1.02474344, "epoch": 0.37023898992935517, "flos": 15340931452800.0, "grad_norm": 2.2992188770836175, "language_loss": 0.76899838, "learning_rate": 2.9036580246940444e-06, "loss": 0.79029977, "num_input_tokens_seen": 132373160, "step": 6158, "time_per_iteration": 2.7764365673065186 }, { "auxiliary_loss_clip": 0.01138492, "auxiliary_loss_mlp": 0.01037057, "balance_loss_clip": 1.0489651, "balance_loss_mlp": 1.01997483, "epoch": 0.37029911318202313, "flos": 19573937527680.0, "grad_norm": 2.8360595009252196, "language_loss": 0.68930852, "learning_rate": 2.9033105670213708e-06, "loss": 0.71106398, "num_input_tokens_seen": 132392345, "step": 6159, "time_per_iteration": 2.664858818054199 }, { "auxiliary_loss_clip": 0.01110756, "auxiliary_loss_mlp": 0.01035031, "balance_loss_clip": 1.049088, "balance_loss_mlp": 1.02067792, "epoch": 0.3703592364346911, "flos": 26213353309440.0, "grad_norm": 2.9956624327703523, "language_loss": 0.71067882, "learning_rate": 2.9029630750950697e-06, "loss": 0.73213673, "num_input_tokens_seen": 132412620, "step": 6160, "time_per_iteration": 2.757081985473633 }, { "auxiliary_loss_clip": 0.01106906, "auxiliary_loss_mlp": 0.01033059, "balance_loss_clip": 1.04698467, "balance_loss_mlp": 1.01918936, "epoch": 0.37041935968735906, "flos": 20048317470720.0, "grad_norm": 2.0439504076987403, "language_loss": 0.79205775, "learning_rate": 2.9026155489283176e-06, "loss": 0.81345737, "num_input_tokens_seen": 132431570, "step": 6161, "time_per_iteration": 2.8008711338043213 }, { "auxiliary_loss_clip": 0.01136197, "auxiliary_loss_mlp": 0.01038947, "balance_loss_clip": 1.04960537, "balance_loss_mlp": 1.02284193, "epoch": 0.3704794829400271, "flos": 24133802388480.0, "grad_norm": 2.0425786778899058, "language_loss": 0.79665029, "learning_rate": 2.902267988534295e-06, "loss": 0.81840169, "num_input_tokens_seen": 132451525, "step": 6162, "time_per_iteration": 4.2554450035095215 }, { "auxiliary_loss_clip": 0.01107039, "auxiliary_loss_mlp": 0.00773743, "balance_loss_clip": 1.0442729, "balance_loss_mlp": 1.00038123, "epoch": 0.37053960619269505, "flos": 14866874732160.0, "grad_norm": 2.0272159369395193, "language_loss": 0.79314882, "learning_rate": 2.9019203939261783e-06, "loss": 0.81195664, "num_input_tokens_seen": 132469875, "step": 6163, "time_per_iteration": 2.753324508666992 }, { "auxiliary_loss_clip": 0.0112147, "auxiliary_loss_mlp": 0.01039825, "balance_loss_clip": 1.04676855, "balance_loss_mlp": 1.02351689, "epoch": 0.370599729445363, "flos": 21361498790400.0, "grad_norm": 1.847799951808159, "language_loss": 0.67843366, "learning_rate": 2.9015727651171507e-06, "loss": 0.7000466, "num_input_tokens_seen": 132488360, "step": 6164, "time_per_iteration": 2.7885541915893555 }, { "auxiliary_loss_clip": 0.01109766, "auxiliary_loss_mlp": 0.01045808, "balance_loss_clip": 1.04918885, "balance_loss_mlp": 1.02877307, "epoch": 0.370659852698031, "flos": 26829041356800.0, "grad_norm": 2.0007288653084334, "language_loss": 0.83441198, "learning_rate": 2.9012251021203935e-06, "loss": 0.85596776, "num_input_tokens_seen": 132508630, "step": 6165, "time_per_iteration": 4.3637871742248535 }, { "auxiliary_loss_clip": 0.01115767, "auxiliary_loss_mlp": 0.01037848, "balance_loss_clip": 1.0473845, "balance_loss_mlp": 1.02026439, "epoch": 0.37071997595069894, "flos": 19099018880640.0, "grad_norm": 1.7502292049636352, "language_loss": 0.69057518, "learning_rate": 2.9008774049490896e-06, "loss": 0.71211129, "num_input_tokens_seen": 132527465, "step": 6166, "time_per_iteration": 2.6754019260406494 }, { "auxiliary_loss_clip": 0.01032616, "auxiliary_loss_mlp": 0.01025464, "balance_loss_clip": 1.03081024, "balance_loss_mlp": 1.02362847, "epoch": 0.3707800992033669, "flos": 52178384920320.0, "grad_norm": 0.8028866408552083, "language_loss": 0.5688796, "learning_rate": 2.9005296736164244e-06, "loss": 0.58946037, "num_input_tokens_seen": 132579940, "step": 6167, "time_per_iteration": 6.357440233230591 }, { "auxiliary_loss_clip": 0.01110244, "auxiliary_loss_mlp": 0.01037896, "balance_loss_clip": 1.04592001, "balance_loss_mlp": 1.02284551, "epoch": 0.3708402224560349, "flos": 19901837808000.0, "grad_norm": 2.0812394742982203, "language_loss": 0.75159574, "learning_rate": 2.900181908135584e-06, "loss": 0.77307719, "num_input_tokens_seen": 132598390, "step": 6168, "time_per_iteration": 2.7107198238372803 }, { "auxiliary_loss_clip": 0.01117658, "auxiliary_loss_mlp": 0.00773774, "balance_loss_clip": 1.04381216, "balance_loss_mlp": 1.00029826, "epoch": 0.37090034570870284, "flos": 20007630339840.0, "grad_norm": 2.166706099657804, "language_loss": 0.73690271, "learning_rate": 2.899834108519755e-06, "loss": 0.755817, "num_input_tokens_seen": 132616920, "step": 6169, "time_per_iteration": 2.743741035461426 }, { "auxiliary_loss_clip": 0.0113208, "auxiliary_loss_mlp": 0.01038383, "balance_loss_clip": 1.0476737, "balance_loss_mlp": 1.02352989, "epoch": 0.3709604689613708, "flos": 24134700228480.0, "grad_norm": 1.6724632615545945, "language_loss": 0.79498589, "learning_rate": 2.899486274782127e-06, "loss": 0.81669056, "num_input_tokens_seen": 132637660, "step": 6170, "time_per_iteration": 2.738492727279663 }, { "auxiliary_loss_clip": 0.01122253, "auxiliary_loss_mlp": 0.01045679, "balance_loss_clip": 1.04780805, "balance_loss_mlp": 1.02913237, "epoch": 0.37102059221403877, "flos": 23876071326720.0, "grad_norm": 1.739457755704792, "language_loss": 0.76506341, "learning_rate": 2.8991384069358885e-06, "loss": 0.78674281, "num_input_tokens_seen": 132657635, "step": 6171, "time_per_iteration": 2.6531472206115723 }, { "auxiliary_loss_clip": 0.01112543, "auxiliary_loss_mlp": 0.01041865, "balance_loss_clip": 1.05081654, "balance_loss_mlp": 1.02546144, "epoch": 0.37108071546670673, "flos": 14501268149760.0, "grad_norm": 2.0084032146250608, "language_loss": 0.80705774, "learning_rate": 2.898790504994232e-06, "loss": 0.82860184, "num_input_tokens_seen": 132674455, "step": 6172, "time_per_iteration": 2.6587960720062256 }, { "auxiliary_loss_clip": 0.01125694, "auxiliary_loss_mlp": 0.01044257, "balance_loss_clip": 1.0475564, "balance_loss_mlp": 1.02747262, "epoch": 0.3711408387193747, "flos": 34562619279360.0, "grad_norm": 2.410153405618026, "language_loss": 0.59260982, "learning_rate": 2.89844256897035e-06, "loss": 0.61430931, "num_input_tokens_seen": 132695140, "step": 6173, "time_per_iteration": 2.738430976867676 }, { "auxiliary_loss_clip": 0.01110933, "auxiliary_loss_mlp": 0.01044385, "balance_loss_clip": 1.04549873, "balance_loss_mlp": 1.02885222, "epoch": 0.37120096197204266, "flos": 17310703432320.0, "grad_norm": 1.954423749693878, "language_loss": 0.80869365, "learning_rate": 2.898094598877435e-06, "loss": 0.83024681, "num_input_tokens_seen": 132712470, "step": 6174, "time_per_iteration": 2.7166690826416016 }, { "auxiliary_loss_clip": 0.01129522, "auxiliary_loss_mlp": 0.01045042, "balance_loss_clip": 1.04628158, "balance_loss_mlp": 1.03025961, "epoch": 0.37126108522471063, "flos": 30664049760000.0, "grad_norm": 2.1592050046005, "language_loss": 0.79910219, "learning_rate": 2.8977465947286826e-06, "loss": 0.82084787, "num_input_tokens_seen": 132732945, "step": 6175, "time_per_iteration": 2.6746280193328857 }, { "auxiliary_loss_clip": 0.011267, "auxiliary_loss_mlp": 0.01053826, "balance_loss_clip": 1.05173898, "balance_loss_mlp": 1.0380547, "epoch": 0.37132120847737865, "flos": 25155640494720.0, "grad_norm": 2.2578092376668315, "language_loss": 0.88735723, "learning_rate": 2.89739855653729e-06, "loss": 0.90916252, "num_input_tokens_seen": 132752470, "step": 6176, "time_per_iteration": 2.6791093349456787 }, { "auxiliary_loss_clip": 0.01124216, "auxiliary_loss_mlp": 0.01042973, "balance_loss_clip": 1.04811859, "balance_loss_mlp": 1.02713037, "epoch": 0.3713813317300466, "flos": 21213474842880.0, "grad_norm": 1.5716198978013565, "language_loss": 0.73431349, "learning_rate": 2.8970504843164546e-06, "loss": 0.75598538, "num_input_tokens_seen": 132771485, "step": 6177, "time_per_iteration": 2.6808605194091797 }, { "auxiliary_loss_clip": 0.01102086, "auxiliary_loss_mlp": 0.01051929, "balance_loss_clip": 1.04524541, "balance_loss_mlp": 1.03575838, "epoch": 0.3714414549827146, "flos": 21616644072960.0, "grad_norm": 2.0030850547718915, "language_loss": 0.75349051, "learning_rate": 2.896702378079374e-06, "loss": 0.77503073, "num_input_tokens_seen": 132791465, "step": 6178, "time_per_iteration": 2.7112066745758057 }, { "auxiliary_loss_clip": 0.0107122, "auxiliary_loss_mlp": 0.01050415, "balance_loss_clip": 1.04323864, "balance_loss_mlp": 1.03208089, "epoch": 0.37150157823538255, "flos": 19972294335360.0, "grad_norm": 2.0305314414463136, "language_loss": 0.72141892, "learning_rate": 2.8963542378392502e-06, "loss": 0.74263525, "num_input_tokens_seen": 132810160, "step": 6179, "time_per_iteration": 2.7965877056121826 }, { "auxiliary_loss_clip": 0.01137504, "auxiliary_loss_mlp": 0.01046799, "balance_loss_clip": 1.05008841, "balance_loss_mlp": 1.03018165, "epoch": 0.3715617014880505, "flos": 24860562266880.0, "grad_norm": 2.387630814732786, "language_loss": 0.6993162, "learning_rate": 2.896006063609283e-06, "loss": 0.72115916, "num_input_tokens_seen": 132831265, "step": 6180, "time_per_iteration": 2.695232391357422 }, { "auxiliary_loss_clip": 0.01113448, "auxiliary_loss_mlp": 0.01037109, "balance_loss_clip": 1.04914021, "balance_loss_mlp": 1.02208257, "epoch": 0.3716218247407185, "flos": 20449080489600.0, "grad_norm": 2.1080005695464243, "language_loss": 0.77920252, "learning_rate": 2.8956578554026767e-06, "loss": 0.80070812, "num_input_tokens_seen": 132850005, "step": 6181, "time_per_iteration": 2.7087795734405518 }, { "auxiliary_loss_clip": 0.01123157, "auxiliary_loss_mlp": 0.01041815, "balance_loss_clip": 1.05016994, "balance_loss_mlp": 1.02525139, "epoch": 0.37168194799338644, "flos": 24133479166080.0, "grad_norm": 2.570629027716188, "language_loss": 0.79222846, "learning_rate": 2.8953096132326343e-06, "loss": 0.81387818, "num_input_tokens_seen": 132865790, "step": 6182, "time_per_iteration": 2.6541473865509033 }, { "auxiliary_loss_clip": 0.01041849, "auxiliary_loss_mlp": 0.01016945, "balance_loss_clip": 1.03053021, "balance_loss_mlp": 1.01533604, "epoch": 0.3717420712460544, "flos": 67408926900480.0, "grad_norm": 0.7830434308203498, "language_loss": 0.57445002, "learning_rate": 2.894961337112362e-06, "loss": 0.59503794, "num_input_tokens_seen": 132921775, "step": 6183, "time_per_iteration": 3.191969633102417 }, { "auxiliary_loss_clip": 0.01126783, "auxiliary_loss_mlp": 0.00775242, "balance_loss_clip": 1.04496169, "balance_loss_mlp": 1.00043631, "epoch": 0.37180219449872237, "flos": 22376908362240.0, "grad_norm": 1.9647478507461604, "language_loss": 0.76617277, "learning_rate": 2.894613027055066e-06, "loss": 0.78519297, "num_input_tokens_seen": 132941060, "step": 6184, "time_per_iteration": 2.7096588611602783 }, { "auxiliary_loss_clip": 0.01090654, "auxiliary_loss_mlp": 0.01039062, "balance_loss_clip": 1.04084587, "balance_loss_mlp": 1.02344596, "epoch": 0.37186231775139034, "flos": 21869885934720.0, "grad_norm": 2.1021072738728717, "language_loss": 0.7217713, "learning_rate": 2.894264683073954e-06, "loss": 0.74306846, "num_input_tokens_seen": 132961850, "step": 6185, "time_per_iteration": 2.739130735397339 }, { "auxiliary_loss_clip": 0.01081138, "auxiliary_loss_mlp": 0.01034498, "balance_loss_clip": 1.04156423, "balance_loss_mlp": 1.01805878, "epoch": 0.3719224410040583, "flos": 22415225195520.0, "grad_norm": 2.1871647895832496, "language_loss": 0.76805776, "learning_rate": 2.8939163051822363e-06, "loss": 0.78921413, "num_input_tokens_seen": 132981625, "step": 6186, "time_per_iteration": 2.779510259628296 }, { "auxiliary_loss_clip": 0.01131414, "auxiliary_loss_mlp": 0.01042221, "balance_loss_clip": 1.05090106, "balance_loss_mlp": 1.02491212, "epoch": 0.37198256425672627, "flos": 25151223121920.0, "grad_norm": 1.8929067887672733, "language_loss": 0.84037393, "learning_rate": 2.8935678933931224e-06, "loss": 0.86211032, "num_input_tokens_seen": 133001225, "step": 6187, "time_per_iteration": 2.67541241645813 }, { "auxiliary_loss_clip": 0.01120953, "auxiliary_loss_mlp": 0.01040882, "balance_loss_clip": 1.04474545, "balance_loss_mlp": 1.02553999, "epoch": 0.37204268750939423, "flos": 21138313633920.0, "grad_norm": 1.7194664317181616, "language_loss": 0.84831274, "learning_rate": 2.893219447719824e-06, "loss": 0.86993104, "num_input_tokens_seen": 133018820, "step": 6188, "time_per_iteration": 2.6241226196289062 }, { "auxiliary_loss_clip": 0.01108827, "auxiliary_loss_mlp": 0.01040814, "balance_loss_clip": 1.04934168, "balance_loss_mlp": 1.02501917, "epoch": 0.37210281076206225, "flos": 21506829217920.0, "grad_norm": 2.498329305558477, "language_loss": 0.65702367, "learning_rate": 2.8928709681755548e-06, "loss": 0.67852014, "num_input_tokens_seen": 133040205, "step": 6189, "time_per_iteration": 2.724707841873169 }, { "auxiliary_loss_clip": 0.01112219, "auxiliary_loss_mlp": 0.0104713, "balance_loss_clip": 1.0451889, "balance_loss_mlp": 1.03045225, "epoch": 0.3721629340147302, "flos": 17347835116800.0, "grad_norm": 1.9571366893805608, "language_loss": 0.84120989, "learning_rate": 2.8925224547735293e-06, "loss": 0.86280334, "num_input_tokens_seen": 133058095, "step": 6190, "time_per_iteration": 2.719454050064087 }, { "auxiliary_loss_clip": 0.01109992, "auxiliary_loss_mlp": 0.01041587, "balance_loss_clip": 1.0465343, "balance_loss_mlp": 1.02571416, "epoch": 0.3722230572673982, "flos": 16432400073600.0, "grad_norm": 4.021000090429005, "language_loss": 0.87807733, "learning_rate": 2.8921739075269633e-06, "loss": 0.89959311, "num_input_tokens_seen": 133071530, "step": 6191, "time_per_iteration": 2.7081027030944824 }, { "auxiliary_loss_clip": 0.0108777, "auxiliary_loss_mlp": 0.01037991, "balance_loss_clip": 1.04300189, "balance_loss_mlp": 1.01962125, "epoch": 0.37228318052006615, "flos": 22674716023680.0, "grad_norm": 3.7199150853096508, "language_loss": 0.74228656, "learning_rate": 2.891825326449073e-06, "loss": 0.7635442, "num_input_tokens_seen": 133091410, "step": 6192, "time_per_iteration": 2.8161356449127197 }, { "auxiliary_loss_clip": 0.01134777, "auxiliary_loss_mlp": 0.0104013, "balance_loss_clip": 1.04818201, "balance_loss_mlp": 1.02497888, "epoch": 0.3723433037727341, "flos": 25265491263360.0, "grad_norm": 2.31871347399746, "language_loss": 0.80621845, "learning_rate": 2.8914767115530766e-06, "loss": 0.82796752, "num_input_tokens_seen": 133110365, "step": 6193, "time_per_iteration": 2.661550760269165 }, { "auxiliary_loss_clip": 0.01101478, "auxiliary_loss_mlp": 0.01041083, "balance_loss_clip": 1.04354334, "balance_loss_mlp": 1.02522826, "epoch": 0.3724034270254021, "flos": 10524664333440.0, "grad_norm": 2.475173523724827, "language_loss": 0.84729886, "learning_rate": 2.891128062852194e-06, "loss": 0.86872447, "num_input_tokens_seen": 133128255, "step": 6194, "time_per_iteration": 2.711531400680542 }, { "auxiliary_loss_clip": 0.0111161, "auxiliary_loss_mlp": 0.010372, "balance_loss_clip": 1.04650784, "balance_loss_mlp": 1.02142286, "epoch": 0.37246355027807004, "flos": 20266223328000.0, "grad_norm": 9.44838101604173, "language_loss": 0.77016377, "learning_rate": 2.890779380359646e-06, "loss": 0.79165184, "num_input_tokens_seen": 133143975, "step": 6195, "time_per_iteration": 2.6527512073516846 }, { "auxiliary_loss_clip": 0.01112195, "auxiliary_loss_mlp": 0.0103539, "balance_loss_clip": 1.0468967, "balance_loss_mlp": 1.02030444, "epoch": 0.372523673530738, "flos": 19500571998720.0, "grad_norm": 1.7021548935758455, "language_loss": 0.79216856, "learning_rate": 2.890430664088655e-06, "loss": 0.81364441, "num_input_tokens_seen": 133162935, "step": 6196, "time_per_iteration": 2.6642892360687256 }, { "auxiliary_loss_clip": 0.01124648, "auxiliary_loss_mlp": 0.01038359, "balance_loss_clip": 1.04975688, "balance_loss_mlp": 1.0240953, "epoch": 0.372583796783406, "flos": 16764250849920.0, "grad_norm": 2.570886031241156, "language_loss": 0.83998835, "learning_rate": 2.890081914052443e-06, "loss": 0.8616184, "num_input_tokens_seen": 133181180, "step": 6197, "time_per_iteration": 2.627305030822754 }, { "auxiliary_loss_clip": 0.01131102, "auxiliary_loss_mlp": 0.01040963, "balance_loss_clip": 1.04697967, "balance_loss_mlp": 1.02488184, "epoch": 0.37264392003607394, "flos": 22637979388800.0, "grad_norm": 1.697216275583005, "language_loss": 0.64450538, "learning_rate": 2.889733130264237e-06, "loss": 0.66622603, "num_input_tokens_seen": 133199615, "step": 6198, "time_per_iteration": 2.606621503829956 }, { "auxiliary_loss_clip": 0.01120059, "auxiliary_loss_mlp": 0.01044451, "balance_loss_clip": 1.04676938, "balance_loss_mlp": 1.02959776, "epoch": 0.3727040432887419, "flos": 19973120348160.0, "grad_norm": 1.4273324893736263, "language_loss": 0.737185, "learning_rate": 2.889384312737261e-06, "loss": 0.75883007, "num_input_tokens_seen": 133219650, "step": 6199, "time_per_iteration": 2.78157901763916 }, { "auxiliary_loss_clip": 0.01105963, "auxiliary_loss_mlp": 0.01037053, "balance_loss_clip": 1.04564095, "balance_loss_mlp": 1.02154374, "epoch": 0.37276416654140987, "flos": 63899122279680.0, "grad_norm": 2.2948998309451905, "language_loss": 0.80481982, "learning_rate": 2.889035461484742e-06, "loss": 0.82624996, "num_input_tokens_seen": 133245675, "step": 6200, "time_per_iteration": 3.0623533725738525 }, { "auxiliary_loss_clip": 0.0109608, "auxiliary_loss_mlp": 0.01045798, "balance_loss_clip": 1.04552174, "balance_loss_mlp": 1.03016961, "epoch": 0.37282428979407783, "flos": 39785970211200.0, "grad_norm": 2.0774746879263746, "language_loss": 0.60494614, "learning_rate": 2.88868657651991e-06, "loss": 0.62636495, "num_input_tokens_seen": 133266905, "step": 6201, "time_per_iteration": 2.8960700035095215 }, { "auxiliary_loss_clip": 0.01125447, "auxiliary_loss_mlp": 0.01039384, "balance_loss_clip": 1.0489639, "balance_loss_mlp": 1.02346373, "epoch": 0.37288441304674586, "flos": 22709046447360.0, "grad_norm": 1.870117482164085, "language_loss": 0.72692698, "learning_rate": 2.8883376578559934e-06, "loss": 0.74857527, "num_input_tokens_seen": 133286865, "step": 6202, "time_per_iteration": 4.202298402786255 }, { "auxiliary_loss_clip": 0.01110741, "auxiliary_loss_mlp": 0.01033326, "balance_loss_clip": 1.04642594, "balance_loss_mlp": 1.01800799, "epoch": 0.3729445362994138, "flos": 18770292587520.0, "grad_norm": 2.0679450432005666, "language_loss": 0.74148834, "learning_rate": 2.8879887055062243e-06, "loss": 0.76292896, "num_input_tokens_seen": 133305295, "step": 6203, "time_per_iteration": 2.7268033027648926 }, { "auxiliary_loss_clip": 0.01106859, "auxiliary_loss_mlp": 0.01038826, "balance_loss_clip": 1.04595554, "balance_loss_mlp": 1.02524805, "epoch": 0.3730046595520818, "flos": 22456199635200.0, "grad_norm": 1.649450499506288, "language_loss": 0.81921744, "learning_rate": 2.8876397194838353e-06, "loss": 0.84067428, "num_input_tokens_seen": 133324625, "step": 6204, "time_per_iteration": 4.347074747085571 }, { "auxiliary_loss_clip": 0.01123916, "auxiliary_loss_mlp": 0.01044159, "balance_loss_clip": 1.04827762, "balance_loss_mlp": 1.02794707, "epoch": 0.37306478280474975, "flos": 24316372241280.0, "grad_norm": 1.675399556922802, "language_loss": 0.74961317, "learning_rate": 2.8872906998020577e-06, "loss": 0.77129394, "num_input_tokens_seen": 133344625, "step": 6205, "time_per_iteration": 2.66701602935791 }, { "auxiliary_loss_clip": 0.01117233, "auxiliary_loss_mlp": 0.01045323, "balance_loss_clip": 1.04337549, "balance_loss_mlp": 1.02857447, "epoch": 0.3731249060574177, "flos": 15815167741440.0, "grad_norm": 1.8318607259579, "language_loss": 0.7815854, "learning_rate": 2.886941646474128e-06, "loss": 0.80321097, "num_input_tokens_seen": 133363605, "step": 6206, "time_per_iteration": 4.202580451965332 }, { "auxiliary_loss_clip": 0.01134488, "auxiliary_loss_mlp": 0.01039926, "balance_loss_clip": 1.04804325, "balance_loss_mlp": 1.02317739, "epoch": 0.3731850293100857, "flos": 19828077229440.0, "grad_norm": 2.3232535418166256, "language_loss": 0.93322426, "learning_rate": 2.886592559513283e-06, "loss": 0.95496845, "num_input_tokens_seen": 133379405, "step": 6207, "time_per_iteration": 4.318574666976929 }, { "auxiliary_loss_clip": 0.01105421, "auxiliary_loss_mlp": 0.0103386, "balance_loss_clip": 1.0478878, "balance_loss_mlp": 1.01876843, "epoch": 0.37324515256275365, "flos": 19062354072960.0, "grad_norm": 3.0736568130228363, "language_loss": 0.82651198, "learning_rate": 2.886243438932759e-06, "loss": 0.8479048, "num_input_tokens_seen": 133397585, "step": 6208, "time_per_iteration": 2.749662160873413 }, { "auxiliary_loss_clip": 0.01122225, "auxiliary_loss_mlp": 0.0103968, "balance_loss_clip": 1.04488516, "balance_loss_mlp": 1.0223707, "epoch": 0.3733052758154216, "flos": 20704333512960.0, "grad_norm": 2.0157740087962845, "language_loss": 0.73122764, "learning_rate": 2.8858942847457953e-06, "loss": 0.75284666, "num_input_tokens_seen": 133415365, "step": 6209, "time_per_iteration": 2.6315791606903076 }, { "auxiliary_loss_clip": 0.01095649, "auxiliary_loss_mlp": 0.01037134, "balance_loss_clip": 1.04820108, "balance_loss_mlp": 1.02065969, "epoch": 0.3733653990680896, "flos": 20193504243840.0, "grad_norm": 1.9650719997143145, "language_loss": 0.70413053, "learning_rate": 2.8855450969656305e-06, "loss": 0.72545838, "num_input_tokens_seen": 133435700, "step": 6210, "time_per_iteration": 2.7484405040740967 }, { "auxiliary_loss_clip": 0.01072484, "auxiliary_loss_mlp": 0.01045611, "balance_loss_clip": 1.03769457, "balance_loss_mlp": 1.02674007, "epoch": 0.37342552232075754, "flos": 20339660684160.0, "grad_norm": 2.0510282142916427, "language_loss": 0.77773547, "learning_rate": 2.8851958756055073e-06, "loss": 0.79891646, "num_input_tokens_seen": 133455180, "step": 6211, "time_per_iteration": 2.706294536590576 }, { "auxiliary_loss_clip": 0.01122999, "auxiliary_loss_mlp": 0.01042393, "balance_loss_clip": 1.04602683, "balance_loss_mlp": 1.02645469, "epoch": 0.3734856455734255, "flos": 35517879527040.0, "grad_norm": 1.675173432335243, "language_loss": 0.73258781, "learning_rate": 2.884846620678668e-06, "loss": 0.7542417, "num_input_tokens_seen": 133476715, "step": 6212, "time_per_iteration": 2.788787841796875 }, { "auxiliary_loss_clip": 0.01131124, "auxiliary_loss_mlp": 0.01047595, "balance_loss_clip": 1.05055571, "balance_loss_mlp": 1.03106034, "epoch": 0.37354576882609347, "flos": 21142300043520.0, "grad_norm": 1.9808770110660865, "language_loss": 0.81656909, "learning_rate": 2.884497332198356e-06, "loss": 0.83835626, "num_input_tokens_seen": 133494550, "step": 6213, "time_per_iteration": 2.6829304695129395 }, { "auxiliary_loss_clip": 0.01089374, "auxiliary_loss_mlp": 0.01046172, "balance_loss_clip": 1.0412662, "balance_loss_mlp": 1.02843404, "epoch": 0.37360589207876144, "flos": 21506793304320.0, "grad_norm": 2.223600899112558, "language_loss": 0.78999674, "learning_rate": 2.8841480101778167e-06, "loss": 0.81135225, "num_input_tokens_seen": 133512640, "step": 6214, "time_per_iteration": 2.674373149871826 }, { "auxiliary_loss_clip": 0.01109052, "auxiliary_loss_mlp": 0.01044175, "balance_loss_clip": 1.04420567, "balance_loss_mlp": 1.02827835, "epoch": 0.37366601533142946, "flos": 38435800861440.0, "grad_norm": 1.9266500277332215, "language_loss": 0.84611148, "learning_rate": 2.883798654630296e-06, "loss": 0.86764371, "num_input_tokens_seen": 133535540, "step": 6215, "time_per_iteration": 2.8276026248931885 }, { "auxiliary_loss_clip": 0.01100197, "auxiliary_loss_mlp": 0.01039814, "balance_loss_clip": 1.04435837, "balance_loss_mlp": 1.02298141, "epoch": 0.3737261385840974, "flos": 18441171244800.0, "grad_norm": 1.8731663372997254, "language_loss": 0.67690969, "learning_rate": 2.8834492655690423e-06, "loss": 0.69830984, "num_input_tokens_seen": 133555795, "step": 6216, "time_per_iteration": 2.724090576171875 }, { "auxiliary_loss_clip": 0.01111654, "auxiliary_loss_mlp": 0.01042601, "balance_loss_clip": 1.045977, "balance_loss_mlp": 1.02578092, "epoch": 0.3737862618367654, "flos": 22929861306240.0, "grad_norm": 2.3172976096058853, "language_loss": 0.65993899, "learning_rate": 2.883099843007303e-06, "loss": 0.68148154, "num_input_tokens_seen": 133575905, "step": 6217, "time_per_iteration": 2.7126269340515137 }, { "auxiliary_loss_clip": 0.01115905, "auxiliary_loss_mlp": 0.01039702, "balance_loss_clip": 1.0483315, "balance_loss_mlp": 1.02264857, "epoch": 0.37384638508943335, "flos": 15409664127360.0, "grad_norm": 2.0273109551694777, "language_loss": 0.80449212, "learning_rate": 2.88275038695833e-06, "loss": 0.82604814, "num_input_tokens_seen": 133592585, "step": 6218, "time_per_iteration": 2.680894374847412 }, { "auxiliary_loss_clip": 0.01115539, "auxiliary_loss_mlp": 0.0103289, "balance_loss_clip": 1.04488862, "balance_loss_mlp": 1.01760781, "epoch": 0.3739065083421013, "flos": 24280820755200.0, "grad_norm": 1.5960804840892617, "language_loss": 0.78692639, "learning_rate": 2.8824008974353736e-06, "loss": 0.80841064, "num_input_tokens_seen": 133615070, "step": 6219, "time_per_iteration": 2.6683976650238037 }, { "auxiliary_loss_clip": 0.01107805, "auxiliary_loss_mlp": 0.01040758, "balance_loss_clip": 1.04602623, "balance_loss_mlp": 1.0247364, "epoch": 0.3739666315947693, "flos": 23002831785600.0, "grad_norm": 1.8103875982928064, "language_loss": 0.77023458, "learning_rate": 2.8820513744516866e-06, "loss": 0.79172027, "num_input_tokens_seen": 133633490, "step": 6220, "time_per_iteration": 2.670686960220337 }, { "auxiliary_loss_clip": 0.01105245, "auxiliary_loss_mlp": 0.01041158, "balance_loss_clip": 1.04717016, "balance_loss_mlp": 1.02473164, "epoch": 0.37402675484743725, "flos": 19391116279680.0, "grad_norm": 3.4153989861378204, "language_loss": 0.8298834, "learning_rate": 2.8817018180205235e-06, "loss": 0.85134745, "num_input_tokens_seen": 133653425, "step": 6221, "time_per_iteration": 2.730738401412964 }, { "auxiliary_loss_clip": 0.01108391, "auxiliary_loss_mlp": 0.01043965, "balance_loss_clip": 1.04499435, "balance_loss_mlp": 1.02825367, "epoch": 0.3740868781001052, "flos": 17126158331520.0, "grad_norm": 1.9668982067313725, "language_loss": 0.75944567, "learning_rate": 2.8813522281551387e-06, "loss": 0.78096926, "num_input_tokens_seen": 133670220, "step": 6222, "time_per_iteration": 2.62052321434021 }, { "auxiliary_loss_clip": 0.01103117, "auxiliary_loss_mlp": 0.00772891, "balance_loss_clip": 1.04785156, "balance_loss_mlp": 1.00029564, "epoch": 0.3741470013527732, "flos": 20043505048320.0, "grad_norm": 1.8881600065301847, "language_loss": 0.70621789, "learning_rate": 2.881002604868789e-06, "loss": 0.72497797, "num_input_tokens_seen": 133688910, "step": 6223, "time_per_iteration": 2.7686285972595215 }, { "auxiliary_loss_clip": 0.01104752, "auxiliary_loss_mlp": 0.01035203, "balance_loss_clip": 1.05155015, "balance_loss_mlp": 1.02057576, "epoch": 0.37420712460544114, "flos": 36897279569280.0, "grad_norm": 2.1852519558340644, "language_loss": 0.6875304, "learning_rate": 2.8806529481747325e-06, "loss": 0.7089299, "num_input_tokens_seen": 133708690, "step": 6224, "time_per_iteration": 2.817263126373291 }, { "auxiliary_loss_clip": 0.01091747, "auxiliary_loss_mlp": 0.01036393, "balance_loss_clip": 1.04859614, "balance_loss_mlp": 1.02059817, "epoch": 0.3742672478581091, "flos": 22201198007040.0, "grad_norm": 2.246642459489035, "language_loss": 0.70192593, "learning_rate": 2.880303258086228e-06, "loss": 0.72320735, "num_input_tokens_seen": 133728095, "step": 6225, "time_per_iteration": 2.785083532333374 }, { "auxiliary_loss_clip": 0.01088757, "auxiliary_loss_mlp": 0.01048544, "balance_loss_clip": 1.04366183, "balance_loss_mlp": 1.03175974, "epoch": 0.3743273711107771, "flos": 24681547860480.0, "grad_norm": 2.1768682992812236, "language_loss": 0.7896018, "learning_rate": 2.879953534616536e-06, "loss": 0.81097472, "num_input_tokens_seen": 133745590, "step": 6226, "time_per_iteration": 2.7403974533081055 }, { "auxiliary_loss_clip": 0.01105293, "auxiliary_loss_mlp": 0.01039029, "balance_loss_clip": 1.04631484, "balance_loss_mlp": 1.02303696, "epoch": 0.37438749436344504, "flos": 24459619680000.0, "grad_norm": 1.7825799805329443, "language_loss": 0.67965841, "learning_rate": 2.879603777778917e-06, "loss": 0.70110166, "num_input_tokens_seen": 133766155, "step": 6227, "time_per_iteration": 2.6975693702697754 }, { "auxiliary_loss_clip": 0.01099252, "auxiliary_loss_mlp": 0.01034493, "balance_loss_clip": 1.04493213, "balance_loss_mlp": 1.01890039, "epoch": 0.374447617616113, "flos": 21798747048960.0, "grad_norm": 1.9005486801766094, "language_loss": 0.829476, "learning_rate": 2.879253987586635e-06, "loss": 0.85081351, "num_input_tokens_seen": 133783185, "step": 6228, "time_per_iteration": 2.7754271030426025 }, { "auxiliary_loss_clip": 0.01090082, "auxiliary_loss_mlp": 0.01048677, "balance_loss_clip": 1.04396605, "balance_loss_mlp": 1.03159404, "epoch": 0.374507740868781, "flos": 17968191932160.0, "grad_norm": 1.6406992237121778, "language_loss": 0.74450547, "learning_rate": 2.8789041640529535e-06, "loss": 0.76589304, "num_input_tokens_seen": 133800975, "step": 6229, "time_per_iteration": 2.6378824710845947 }, { "auxiliary_loss_clip": 0.0109707, "auxiliary_loss_mlp": 0.01035996, "balance_loss_clip": 1.0470053, "balance_loss_mlp": 1.01971197, "epoch": 0.374567864121449, "flos": 16105828596480.0, "grad_norm": 2.127994694324029, "language_loss": 0.83782691, "learning_rate": 2.8785543071911383e-06, "loss": 0.85915756, "num_input_tokens_seen": 133818020, "step": 6230, "time_per_iteration": 2.6857657432556152 }, { "auxiliary_loss_clip": 0.0112393, "auxiliary_loss_mlp": 0.01041627, "balance_loss_clip": 1.04905128, "balance_loss_mlp": 1.02556968, "epoch": 0.37462798737411696, "flos": 25773160135680.0, "grad_norm": 2.8382818326589145, "language_loss": 0.735865, "learning_rate": 2.878204417014456e-06, "loss": 0.75752056, "num_input_tokens_seen": 133840690, "step": 6231, "time_per_iteration": 2.7082016468048096 }, { "auxiliary_loss_clip": 0.0112579, "auxiliary_loss_mlp": 0.01046917, "balance_loss_clip": 1.05376148, "balance_loss_mlp": 1.03075266, "epoch": 0.3746881106267849, "flos": 16654507822080.0, "grad_norm": 2.9683381932525665, "language_loss": 0.7412858, "learning_rate": 2.8778544935361735e-06, "loss": 0.76301289, "num_input_tokens_seen": 133858350, "step": 6232, "time_per_iteration": 2.5764057636260986 }, { "auxiliary_loss_clip": 0.01106131, "auxiliary_loss_mlp": 0.01039245, "balance_loss_clip": 1.04461622, "balance_loss_mlp": 1.02237701, "epoch": 0.3747482338794529, "flos": 26177981391360.0, "grad_norm": 2.121427790242168, "language_loss": 0.77296579, "learning_rate": 2.877504536769561e-06, "loss": 0.79441959, "num_input_tokens_seen": 133879775, "step": 6233, "time_per_iteration": 2.692286252975464 }, { "auxiliary_loss_clip": 0.01118513, "auxiliary_loss_mlp": 0.01040639, "balance_loss_clip": 1.05093503, "balance_loss_mlp": 1.024593, "epoch": 0.37480835713212085, "flos": 12021061950720.0, "grad_norm": 1.8446337373318833, "language_loss": 0.69493848, "learning_rate": 2.8771545467278883e-06, "loss": 0.71652997, "num_input_tokens_seen": 133898295, "step": 6234, "time_per_iteration": 2.658332586288452 }, { "auxiliary_loss_clip": 0.01123531, "auxiliary_loss_mlp": 0.01042963, "balance_loss_clip": 1.04885483, "balance_loss_mlp": 1.02833033, "epoch": 0.3748684803847888, "flos": 19679263182720.0, "grad_norm": 1.9015387878630694, "language_loss": 0.82462788, "learning_rate": 2.8768045234244276e-06, "loss": 0.84629285, "num_input_tokens_seen": 133915230, "step": 6235, "time_per_iteration": 2.591198682785034 }, { "auxiliary_loss_clip": 0.01140927, "auxiliary_loss_mlp": 0.0103602, "balance_loss_clip": 1.05301189, "balance_loss_mlp": 1.02021289, "epoch": 0.3749286036374568, "flos": 20521189042560.0, "grad_norm": 1.8869628373328378, "language_loss": 0.78439927, "learning_rate": 2.8764544668724517e-06, "loss": 0.80616879, "num_input_tokens_seen": 133934110, "step": 6236, "time_per_iteration": 2.6754372119903564 }, { "auxiliary_loss_clip": 0.01118225, "auxiliary_loss_mlp": 0.01050242, "balance_loss_clip": 1.04519606, "balance_loss_mlp": 1.03202713, "epoch": 0.37498872689012475, "flos": 20704620821760.0, "grad_norm": 2.0770406770017242, "language_loss": 0.74357057, "learning_rate": 2.876104377085234e-06, "loss": 0.76525521, "num_input_tokens_seen": 133952395, "step": 6237, "time_per_iteration": 2.6760342121124268 }, { "auxiliary_loss_clip": 0.01114513, "auxiliary_loss_mlp": 0.00773766, "balance_loss_clip": 1.04626942, "balance_loss_mlp": 1.00036037, "epoch": 0.3750488501427927, "flos": 21574843620480.0, "grad_norm": 2.0699756536584633, "language_loss": 0.93258965, "learning_rate": 2.8757542540760508e-06, "loss": 0.95147252, "num_input_tokens_seen": 133969635, "step": 6238, "time_per_iteration": 2.6805243492126465 }, { "auxiliary_loss_clip": 0.01137619, "auxiliary_loss_mlp": 0.01037341, "balance_loss_clip": 1.04995167, "balance_loss_mlp": 1.02081275, "epoch": 0.3751089733954607, "flos": 15923869274880.0, "grad_norm": 2.3841921025147284, "language_loss": 0.70885909, "learning_rate": 2.8754040978581777e-06, "loss": 0.73060858, "num_input_tokens_seen": 133987215, "step": 6239, "time_per_iteration": 2.548285961151123 }, { "auxiliary_loss_clip": 0.01068531, "auxiliary_loss_mlp": 0.01040031, "balance_loss_clip": 1.04656243, "balance_loss_mlp": 1.02303219, "epoch": 0.37516909664812864, "flos": 36284644177920.0, "grad_norm": 1.601808094344726, "language_loss": 0.65752542, "learning_rate": 2.875053908444895e-06, "loss": 0.67861104, "num_input_tokens_seen": 134009250, "step": 6240, "time_per_iteration": 3.016897201538086 }, { "auxiliary_loss_clip": 0.01101858, "auxiliary_loss_mlp": 0.00773445, "balance_loss_clip": 1.04618907, "balance_loss_mlp": 1.00033951, "epoch": 0.3752292199007966, "flos": 13515915283200.0, "grad_norm": 2.721418670308367, "language_loss": 0.75816065, "learning_rate": 2.8747036858494795e-06, "loss": 0.7769137, "num_input_tokens_seen": 134026875, "step": 6241, "time_per_iteration": 4.402552843093872 }, { "auxiliary_loss_clip": 0.01103844, "auxiliary_loss_mlp": 0.01044119, "balance_loss_clip": 1.04654765, "balance_loss_mlp": 1.0276264, "epoch": 0.3752893431534646, "flos": 27198095644800.0, "grad_norm": 2.108703330368865, "language_loss": 0.83791685, "learning_rate": 2.874353430085213e-06, "loss": 0.85939646, "num_input_tokens_seen": 134047185, "step": 6242, "time_per_iteration": 2.7508704662323 }, { "auxiliary_loss_clip": 0.01110348, "auxiliary_loss_mlp": 0.01048171, "balance_loss_clip": 1.04799628, "balance_loss_mlp": 1.03319848, "epoch": 0.3753494664061326, "flos": 30007674581760.0, "grad_norm": 2.4924519814208774, "language_loss": 0.68438506, "learning_rate": 2.8740031411653766e-06, "loss": 0.70597029, "num_input_tokens_seen": 134067330, "step": 6243, "time_per_iteration": 2.7814478874206543 }, { "auxiliary_loss_clip": 0.01056696, "auxiliary_loss_mlp": 0.00776554, "balance_loss_clip": 1.04175019, "balance_loss_mlp": 1.00038528, "epoch": 0.37540958965880056, "flos": 24461954064000.0, "grad_norm": 1.7699519682943652, "language_loss": 0.84165168, "learning_rate": 2.8736528191032535e-06, "loss": 0.85998416, "num_input_tokens_seen": 134085525, "step": 6244, "time_per_iteration": 4.510041952133179 }, { "auxiliary_loss_clip": 0.01074238, "auxiliary_loss_mlp": 0.01042872, "balance_loss_clip": 1.03981614, "balance_loss_mlp": 1.02712417, "epoch": 0.3754697129114685, "flos": 16508387295360.0, "grad_norm": 2.7453088605805616, "language_loss": 0.82679987, "learning_rate": 2.8733024639121277e-06, "loss": 0.84797096, "num_input_tokens_seen": 134101855, "step": 6245, "time_per_iteration": 4.745215654373169 }, { "auxiliary_loss_clip": 0.01096909, "auxiliary_loss_mlp": 0.0104658, "balance_loss_clip": 1.04049206, "balance_loss_mlp": 1.0296756, "epoch": 0.3755298361641365, "flos": 19390900798080.0, "grad_norm": 8.46557879021872, "language_loss": 0.63902843, "learning_rate": 2.8729520756052853e-06, "loss": 0.66046333, "num_input_tokens_seen": 134119360, "step": 6246, "time_per_iteration": 4.33053731918335 }, { "auxiliary_loss_clip": 0.01112093, "auxiliary_loss_mlp": 0.0104355, "balance_loss_clip": 1.04961443, "balance_loss_mlp": 1.0264082, "epoch": 0.37558995941680445, "flos": 14720395069440.0, "grad_norm": 2.0508038288587183, "language_loss": 0.74467009, "learning_rate": 2.8726016541960124e-06, "loss": 0.76622653, "num_input_tokens_seen": 134137475, "step": 6247, "time_per_iteration": 2.688081979751587 }, { "auxiliary_loss_clip": 0.01126872, "auxiliary_loss_mlp": 0.01037368, "balance_loss_clip": 1.05022037, "balance_loss_mlp": 1.02133489, "epoch": 0.3756500826694724, "flos": 21689901861120.0, "grad_norm": 2.703785960910372, "language_loss": 0.5497098, "learning_rate": 2.872251199697598e-06, "loss": 0.57135224, "num_input_tokens_seen": 134154580, "step": 6248, "time_per_iteration": 2.6308822631835938 }, { "auxiliary_loss_clip": 0.01117073, "auxiliary_loss_mlp": 0.01036379, "balance_loss_clip": 1.04465234, "balance_loss_mlp": 1.0200597, "epoch": 0.3757102059221404, "flos": 26505666190080.0, "grad_norm": 4.209721572066423, "language_loss": 0.84492457, "learning_rate": 2.8719007121233297e-06, "loss": 0.86645913, "num_input_tokens_seen": 134174285, "step": 6249, "time_per_iteration": 2.6539809703826904 }, { "auxiliary_loss_clip": 0.01107733, "auxiliary_loss_mlp": 0.01035495, "balance_loss_clip": 1.04784632, "balance_loss_mlp": 1.01956248, "epoch": 0.37577032917480835, "flos": 37338083274240.0, "grad_norm": 1.546160982958922, "language_loss": 0.67701882, "learning_rate": 2.8715501914864993e-06, "loss": 0.69845104, "num_input_tokens_seen": 134195940, "step": 6250, "time_per_iteration": 2.787398338317871 }, { "auxiliary_loss_clip": 0.01117019, "auxiliary_loss_mlp": 0.01044359, "balance_loss_clip": 1.04946029, "balance_loss_mlp": 1.0293386, "epoch": 0.3758304524274763, "flos": 21908597817600.0, "grad_norm": 1.960309683567346, "language_loss": 0.77824795, "learning_rate": 2.8711996378003987e-06, "loss": 0.79986179, "num_input_tokens_seen": 134212235, "step": 6251, "time_per_iteration": 2.7143123149871826 }, { "auxiliary_loss_clip": 0.01121024, "auxiliary_loss_mlp": 0.01039102, "balance_loss_clip": 1.04994178, "balance_loss_mlp": 1.0236522, "epoch": 0.3758905756801443, "flos": 36569343375360.0, "grad_norm": 2.527016245081176, "language_loss": 0.58002663, "learning_rate": 2.8708490510783203e-06, "loss": 0.60162789, "num_input_tokens_seen": 134233810, "step": 6252, "time_per_iteration": 2.716597557067871 }, { "auxiliary_loss_clip": 0.01116459, "auxiliary_loss_mlp": 0.01042556, "balance_loss_clip": 1.05007291, "balance_loss_mlp": 1.0260098, "epoch": 0.37595069893281224, "flos": 24528783317760.0, "grad_norm": 4.856643583290163, "language_loss": 0.89482141, "learning_rate": 2.8704984313335584e-06, "loss": 0.91641152, "num_input_tokens_seen": 134252020, "step": 6253, "time_per_iteration": 2.701361894607544 }, { "auxiliary_loss_clip": 0.01098154, "auxiliary_loss_mlp": 0.01040398, "balance_loss_clip": 1.04815936, "balance_loss_mlp": 1.02562761, "epoch": 0.3760108221854802, "flos": 16435021766400.0, "grad_norm": 2.218099502464204, "language_loss": 0.76568806, "learning_rate": 2.8701477785794097e-06, "loss": 0.78707361, "num_input_tokens_seen": 134269495, "step": 6254, "time_per_iteration": 2.6995303630828857 }, { "auxiliary_loss_clip": 0.01096995, "auxiliary_loss_mlp": 0.01043484, "balance_loss_clip": 1.04379475, "balance_loss_mlp": 1.02628207, "epoch": 0.37607094543814823, "flos": 13771742924160.0, "grad_norm": 2.131769376763656, "language_loss": 0.6180023, "learning_rate": 2.869797092829169e-06, "loss": 0.6394071, "num_input_tokens_seen": 134287035, "step": 6255, "time_per_iteration": 2.7164864540100098 }, { "auxiliary_loss_clip": 0.01127282, "auxiliary_loss_mlp": 0.01036673, "balance_loss_clip": 1.04883361, "balance_loss_mlp": 1.02017426, "epoch": 0.3761310686908162, "flos": 19857918453120.0, "grad_norm": 2.6629341180561545, "language_loss": 0.74404681, "learning_rate": 2.869446374096135e-06, "loss": 0.76568639, "num_input_tokens_seen": 134304840, "step": 6256, "time_per_iteration": 2.588169574737549 }, { "auxiliary_loss_clip": 0.01127124, "auxiliary_loss_mlp": 0.01046358, "balance_loss_clip": 1.04913831, "balance_loss_mlp": 1.02977645, "epoch": 0.37619119194348416, "flos": 12750802657920.0, "grad_norm": 2.3087979716808937, "language_loss": 0.702447, "learning_rate": 2.8690956223936088e-06, "loss": 0.72418177, "num_input_tokens_seen": 134323180, "step": 6257, "time_per_iteration": 2.701555013656616 }, { "auxiliary_loss_clip": 0.01110787, "auxiliary_loss_mlp": 0.01033343, "balance_loss_clip": 1.04812109, "balance_loss_mlp": 1.01796508, "epoch": 0.3762513151961521, "flos": 17530548624000.0, "grad_norm": 1.673769537318751, "language_loss": 0.84842372, "learning_rate": 2.868744837734889e-06, "loss": 0.86986494, "num_input_tokens_seen": 134341390, "step": 6258, "time_per_iteration": 2.6336703300476074 }, { "auxiliary_loss_clip": 0.01091689, "auxiliary_loss_mlp": 0.01041654, "balance_loss_clip": 1.04571128, "balance_loss_mlp": 1.0271697, "epoch": 0.3763114384488201, "flos": 23617406511360.0, "grad_norm": 1.4940028515654036, "language_loss": 0.80920124, "learning_rate": 2.868394020133277e-06, "loss": 0.83053464, "num_input_tokens_seen": 134360425, "step": 6259, "time_per_iteration": 2.752392053604126 }, { "auxiliary_loss_clip": 0.01093234, "auxiliary_loss_mlp": 0.01046443, "balance_loss_clip": 1.04547083, "balance_loss_mlp": 1.02969444, "epoch": 0.37637156170148806, "flos": 25406978935680.0, "grad_norm": 2.4951694968605627, "language_loss": 0.71285564, "learning_rate": 2.8680431696020783e-06, "loss": 0.73425239, "num_input_tokens_seen": 134379775, "step": 6260, "time_per_iteration": 2.782561779022217 }, { "auxiliary_loss_clip": 0.01107136, "auxiliary_loss_mlp": 0.01039319, "balance_loss_clip": 1.04386747, "balance_loss_mlp": 1.02305889, "epoch": 0.376431684954156, "flos": 23440906056960.0, "grad_norm": 1.627422352949978, "language_loss": 0.78342533, "learning_rate": 2.867692286154594e-06, "loss": 0.80488986, "num_input_tokens_seen": 134400315, "step": 6261, "time_per_iteration": 2.6978867053985596 }, { "auxiliary_loss_clip": 0.01112259, "auxiliary_loss_mlp": 0.01048861, "balance_loss_clip": 1.04744315, "balance_loss_mlp": 1.0312773, "epoch": 0.376491808206824, "flos": 34204482725760.0, "grad_norm": 2.418447947297228, "language_loss": 0.80871278, "learning_rate": 2.867341369804132e-06, "loss": 0.83032399, "num_input_tokens_seen": 134422875, "step": 6262, "time_per_iteration": 2.852675437927246 }, { "auxiliary_loss_clip": 0.01115101, "auxiliary_loss_mlp": 0.01038136, "balance_loss_clip": 1.04584765, "balance_loss_mlp": 1.02277565, "epoch": 0.37655193145949195, "flos": 35185669614720.0, "grad_norm": 2.9875520790285774, "language_loss": 0.80295742, "learning_rate": 2.866990420563998e-06, "loss": 0.82448983, "num_input_tokens_seen": 134443025, "step": 6263, "time_per_iteration": 2.785395622253418 }, { "auxiliary_loss_clip": 0.01140252, "auxiliary_loss_mlp": 0.01045838, "balance_loss_clip": 1.05247605, "balance_loss_mlp": 1.0300312, "epoch": 0.3766120547121599, "flos": 16761844638720.0, "grad_norm": 2.896352989954936, "language_loss": 0.79601765, "learning_rate": 2.866639438447501e-06, "loss": 0.81787854, "num_input_tokens_seen": 134460945, "step": 6264, "time_per_iteration": 2.581125497817993 }, { "auxiliary_loss_clip": 0.01133548, "auxiliary_loss_mlp": 0.0105155, "balance_loss_clip": 1.04770851, "balance_loss_mlp": 1.03557551, "epoch": 0.3766721779648279, "flos": 23550361776000.0, "grad_norm": 2.0921625870578913, "language_loss": 0.73808366, "learning_rate": 2.8662884234679497e-06, "loss": 0.75993466, "num_input_tokens_seen": 134480440, "step": 6265, "time_per_iteration": 2.6998226642608643 }, { "auxiliary_loss_clip": 0.01123221, "auxiliary_loss_mlp": 0.0103937, "balance_loss_clip": 1.05005145, "balance_loss_mlp": 1.02543402, "epoch": 0.37673230121749585, "flos": 29129191655040.0, "grad_norm": 1.9744000825782282, "language_loss": 0.68550873, "learning_rate": 2.865937375638654e-06, "loss": 0.70713472, "num_input_tokens_seen": 134501110, "step": 6266, "time_per_iteration": 2.6934731006622314 }, { "auxiliary_loss_clip": 0.01128105, "auxiliary_loss_mlp": 0.01041187, "balance_loss_clip": 1.04846668, "balance_loss_mlp": 1.02536833, "epoch": 0.3767924244701638, "flos": 28146783703680.0, "grad_norm": 3.437883319374573, "language_loss": 0.63078731, "learning_rate": 2.8655862949729264e-06, "loss": 0.65248024, "num_input_tokens_seen": 134522460, "step": 6267, "time_per_iteration": 2.7006735801696777 }, { "auxiliary_loss_clip": 0.01050407, "auxiliary_loss_mlp": 0.01011452, "balance_loss_clip": 1.02822745, "balance_loss_mlp": 1.00960469, "epoch": 0.37685254772283183, "flos": 60797197526400.0, "grad_norm": 0.7198108741876666, "language_loss": 0.58852816, "learning_rate": 2.8652351814840795e-06, "loss": 0.60914677, "num_input_tokens_seen": 134589545, "step": 6268, "time_per_iteration": 3.355120897293091 }, { "auxiliary_loss_clip": 0.011375, "auxiliary_loss_mlp": 0.01043603, "balance_loss_clip": 1.05033755, "balance_loss_mlp": 1.02698505, "epoch": 0.3769126709754998, "flos": 26032543223040.0, "grad_norm": 2.34128493463531, "language_loss": 0.65263468, "learning_rate": 2.8648840351854283e-06, "loss": 0.67444575, "num_input_tokens_seen": 134610550, "step": 6269, "time_per_iteration": 2.656585931777954 }, { "auxiliary_loss_clip": 0.01099912, "auxiliary_loss_mlp": 0.01041008, "balance_loss_clip": 1.04970932, "balance_loss_mlp": 1.02536798, "epoch": 0.37697279422816776, "flos": 23579879777280.0, "grad_norm": 1.5250715006737088, "language_loss": 0.7069717, "learning_rate": 2.8645328560902874e-06, "loss": 0.72838092, "num_input_tokens_seen": 134630485, "step": 6270, "time_per_iteration": 2.7498419284820557 }, { "auxiliary_loss_clip": 0.01059818, "auxiliary_loss_mlp": 0.01007405, "balance_loss_clip": 1.02900875, "balance_loss_mlp": 1.00581956, "epoch": 0.3770329174808357, "flos": 64745935367040.0, "grad_norm": 0.7193704591933474, "language_loss": 0.56122422, "learning_rate": 2.8641816442119746e-06, "loss": 0.58189648, "num_input_tokens_seen": 134693510, "step": 6271, "time_per_iteration": 3.1569089889526367 }, { "auxiliary_loss_clip": 0.01121208, "auxiliary_loss_mlp": 0.01042721, "balance_loss_clip": 1.04645681, "balance_loss_mlp": 1.02609181, "epoch": 0.3770930407335037, "flos": 21835304115840.0, "grad_norm": 2.1611051517344246, "language_loss": 0.79855239, "learning_rate": 2.8638303995638066e-06, "loss": 0.82019162, "num_input_tokens_seen": 134713115, "step": 6272, "time_per_iteration": 2.628180742263794 }, { "auxiliary_loss_clip": 0.01118748, "auxiliary_loss_mlp": 0.01033695, "balance_loss_clip": 1.0451988, "balance_loss_mlp": 1.01934206, "epoch": 0.37715316398617166, "flos": 22747901984640.0, "grad_norm": 2.0954681641544304, "language_loss": 0.73789483, "learning_rate": 2.863479122159103e-06, "loss": 0.75941932, "num_input_tokens_seen": 134732635, "step": 6273, "time_per_iteration": 2.7064390182495117 }, { "auxiliary_loss_clip": 0.01117899, "auxiliary_loss_mlp": 0.01044408, "balance_loss_clip": 1.04745209, "balance_loss_mlp": 1.02905381, "epoch": 0.3772132872388396, "flos": 18914581520640.0, "grad_norm": 1.6440580648783938, "language_loss": 0.71867502, "learning_rate": 2.8631278120111858e-06, "loss": 0.74029803, "num_input_tokens_seen": 134750695, "step": 6274, "time_per_iteration": 2.650559186935425 }, { "auxiliary_loss_clip": 0.01105418, "auxiliary_loss_mlp": 0.01040714, "balance_loss_clip": 1.04509926, "balance_loss_mlp": 1.02567029, "epoch": 0.3772734104915076, "flos": 17346219004800.0, "grad_norm": 1.9251108643001593, "language_loss": 0.83620244, "learning_rate": 2.8627764691333742e-06, "loss": 0.85766381, "num_input_tokens_seen": 134768935, "step": 6275, "time_per_iteration": 2.662346839904785 }, { "auxiliary_loss_clip": 0.01077547, "auxiliary_loss_mlp": 0.01035941, "balance_loss_clip": 1.04383206, "balance_loss_mlp": 1.02238655, "epoch": 0.37733353374417555, "flos": 32342370785280.0, "grad_norm": 1.4850375213112275, "language_loss": 0.75779188, "learning_rate": 2.8624250935389935e-06, "loss": 0.77892679, "num_input_tokens_seen": 134791260, "step": 6276, "time_per_iteration": 2.824374198913574 }, { "auxiliary_loss_clip": 0.01109985, "auxiliary_loss_mlp": 0.01039728, "balance_loss_clip": 1.04301822, "balance_loss_mlp": 1.02318192, "epoch": 0.3773936569968435, "flos": 23360681030400.0, "grad_norm": 1.996464283971086, "language_loss": 0.85758084, "learning_rate": 2.862073685241366e-06, "loss": 0.87907803, "num_input_tokens_seen": 134808350, "step": 6277, "time_per_iteration": 2.6880812644958496 }, { "auxiliary_loss_clip": 0.01123239, "auxiliary_loss_mlp": 0.01035838, "balance_loss_clip": 1.04981339, "balance_loss_mlp": 1.02147365, "epoch": 0.3774537802495115, "flos": 21466788531840.0, "grad_norm": 2.8692620956149613, "language_loss": 0.78788501, "learning_rate": 2.861722244253818e-06, "loss": 0.80947578, "num_input_tokens_seen": 134826005, "step": 6278, "time_per_iteration": 2.6566152572631836 }, { "auxiliary_loss_clip": 0.01104603, "auxiliary_loss_mlp": 0.01044359, "balance_loss_clip": 1.04592609, "balance_loss_mlp": 1.02740717, "epoch": 0.37751390350217945, "flos": 24973717086720.0, "grad_norm": 2.420687530183356, "language_loss": 0.8289634, "learning_rate": 2.8613707705896767e-06, "loss": 0.85045302, "num_input_tokens_seen": 134844995, "step": 6279, "time_per_iteration": 2.732966899871826 }, { "auxiliary_loss_clip": 0.01110227, "auxiliary_loss_mlp": 0.01039275, "balance_loss_clip": 1.04498839, "balance_loss_mlp": 1.02520263, "epoch": 0.3775740267548474, "flos": 27819098904960.0, "grad_norm": 5.36242068768128, "language_loss": 0.74968797, "learning_rate": 2.861019264262269e-06, "loss": 0.77118295, "num_input_tokens_seen": 134865285, "step": 6280, "time_per_iteration": 4.266780376434326 }, { "auxiliary_loss_clip": 0.01130932, "auxiliary_loss_mlp": 0.01036032, "balance_loss_clip": 1.04845715, "balance_loss_mlp": 1.02235854, "epoch": 0.3776341500075154, "flos": 22565224391040.0, "grad_norm": 1.4530407212668277, "language_loss": 0.76169163, "learning_rate": 2.8606677252849242e-06, "loss": 0.7833612, "num_input_tokens_seen": 134886535, "step": 6281, "time_per_iteration": 2.649930477142334 }, { "auxiliary_loss_clip": 0.01101629, "auxiliary_loss_mlp": 0.01040327, "balance_loss_clip": 1.04291892, "balance_loss_mlp": 1.02471018, "epoch": 0.3776942732601834, "flos": 23077238808960.0, "grad_norm": 2.430303484367767, "language_loss": 0.83814883, "learning_rate": 2.860316153670974e-06, "loss": 0.85956836, "num_input_tokens_seen": 134907435, "step": 6282, "time_per_iteration": 2.6882312297821045 }, { "auxiliary_loss_clip": 0.0111945, "auxiliary_loss_mlp": 0.0103679, "balance_loss_clip": 1.04452085, "balance_loss_mlp": 1.02134025, "epoch": 0.37775439651285136, "flos": 21724411852800.0, "grad_norm": 2.5787880774083725, "language_loss": 0.698241, "learning_rate": 2.8599645494337484e-06, "loss": 0.71980345, "num_input_tokens_seen": 134925360, "step": 6283, "time_per_iteration": 4.2020978927612305 }, { "auxiliary_loss_clip": 0.01072442, "auxiliary_loss_mlp": 0.01052062, "balance_loss_clip": 1.04226279, "balance_loss_mlp": 1.03394175, "epoch": 0.37781451976551933, "flos": 23987753688960.0, "grad_norm": 2.007181392308561, "language_loss": 0.76503819, "learning_rate": 2.859612912586581e-06, "loss": 0.78628325, "num_input_tokens_seen": 134944205, "step": 6284, "time_per_iteration": 4.349794387817383 }, { "auxiliary_loss_clip": 0.01142581, "auxiliary_loss_mlp": 0.01033355, "balance_loss_clip": 1.05249381, "balance_loss_mlp": 1.01713097, "epoch": 0.3778746430181873, "flos": 13727967223680.0, "grad_norm": 2.7318562260547554, "language_loss": 0.85677552, "learning_rate": 2.8592612431428055e-06, "loss": 0.87853491, "num_input_tokens_seen": 134960255, "step": 6285, "time_per_iteration": 2.6949870586395264 }, { "auxiliary_loss_clip": 0.01111269, "auxiliary_loss_mlp": 0.01042933, "balance_loss_clip": 1.04731882, "balance_loss_mlp": 1.02694702, "epoch": 0.37793476627085526, "flos": 19460495399040.0, "grad_norm": 1.8544385642750592, "language_loss": 0.84419537, "learning_rate": 2.858909541115758e-06, "loss": 0.86573738, "num_input_tokens_seen": 134978605, "step": 6286, "time_per_iteration": 4.541024684906006 }, { "auxiliary_loss_clip": 0.01120151, "auxiliary_loss_mlp": 0.01043503, "balance_loss_clip": 1.05024576, "balance_loss_mlp": 1.0280652, "epoch": 0.3779948895235232, "flos": 10707018704640.0, "grad_norm": 2.400905995704231, "language_loss": 0.81738019, "learning_rate": 2.858557806518775e-06, "loss": 0.83901674, "num_input_tokens_seen": 134995020, "step": 6287, "time_per_iteration": 2.6611125469207764 }, { "auxiliary_loss_clip": 0.01118978, "auxiliary_loss_mlp": 0.01041796, "balance_loss_clip": 1.04537022, "balance_loss_mlp": 1.02645934, "epoch": 0.3780550127761912, "flos": 22310007281280.0, "grad_norm": 3.0671932020533133, "language_loss": 0.73071134, "learning_rate": 2.8582060393651927e-06, "loss": 0.7523191, "num_input_tokens_seen": 135012620, "step": 6288, "time_per_iteration": 2.6759073734283447 }, { "auxiliary_loss_clip": 0.01124666, "auxiliary_loss_mlp": 0.01036773, "balance_loss_clip": 1.05113983, "balance_loss_mlp": 1.02115071, "epoch": 0.37811513602885916, "flos": 28950644125440.0, "grad_norm": 1.9644960153972613, "language_loss": 0.75616127, "learning_rate": 2.857854239668352e-06, "loss": 0.77777576, "num_input_tokens_seen": 135033365, "step": 6289, "time_per_iteration": 2.656367778778076 }, { "auxiliary_loss_clip": 0.0112159, "auxiliary_loss_mlp": 0.01035617, "balance_loss_clip": 1.04737473, "balance_loss_mlp": 1.02025056, "epoch": 0.3781752592815271, "flos": 23112933949440.0, "grad_norm": 1.7941331023092641, "language_loss": 0.73271513, "learning_rate": 2.857502407441593e-06, "loss": 0.75428718, "num_input_tokens_seen": 135052185, "step": 6290, "time_per_iteration": 2.740370512008667 }, { "auxiliary_loss_clip": 0.01098389, "auxiliary_loss_mlp": 0.01041015, "balance_loss_clip": 1.04425681, "balance_loss_mlp": 1.023193, "epoch": 0.3782353825341951, "flos": 19755932762880.0, "grad_norm": 8.943174604406142, "language_loss": 0.79843229, "learning_rate": 2.8571505426982566e-06, "loss": 0.81982636, "num_input_tokens_seen": 135070425, "step": 6291, "time_per_iteration": 2.729116916656494 }, { "auxiliary_loss_clip": 0.01101536, "auxiliary_loss_mlp": 0.01032627, "balance_loss_clip": 1.04736066, "balance_loss_mlp": 1.01611638, "epoch": 0.37829550578686305, "flos": 22050839675520.0, "grad_norm": 2.1381581001103203, "language_loss": 0.76017123, "learning_rate": 2.8567986454516854e-06, "loss": 0.78151298, "num_input_tokens_seen": 135090525, "step": 6292, "time_per_iteration": 2.7115557193756104 }, { "auxiliary_loss_clip": 0.0111659, "auxiliary_loss_mlp": 0.01045333, "balance_loss_clip": 1.04599166, "balance_loss_mlp": 1.02922773, "epoch": 0.378355629039531, "flos": 16470357770880.0, "grad_norm": 2.0329947363530616, "language_loss": 0.69857049, "learning_rate": 2.856446715715224e-06, "loss": 0.72018969, "num_input_tokens_seen": 135109575, "step": 6293, "time_per_iteration": 2.6687965393066406 }, { "auxiliary_loss_clip": 0.01133204, "auxiliary_loss_mlp": 0.01039264, "balance_loss_clip": 1.04852223, "balance_loss_mlp": 1.02307534, "epoch": 0.378415752292199, "flos": 19974844200960.0, "grad_norm": 2.030259976194038, "language_loss": 0.70870757, "learning_rate": 2.8560947535022173e-06, "loss": 0.73043227, "num_input_tokens_seen": 135127000, "step": 6294, "time_per_iteration": 2.600249767303467 }, { "auxiliary_loss_clip": 0.01115678, "auxiliary_loss_mlp": 0.01040569, "balance_loss_clip": 1.04706097, "balance_loss_mlp": 1.02365303, "epoch": 0.378475875544867, "flos": 14647388676480.0, "grad_norm": 4.788626069957177, "language_loss": 0.82803214, "learning_rate": 2.855742758826011e-06, "loss": 0.84959471, "num_input_tokens_seen": 135145285, "step": 6295, "time_per_iteration": 2.656090497970581 }, { "auxiliary_loss_clip": 0.0111937, "auxiliary_loss_mlp": 0.0103653, "balance_loss_clip": 1.04782999, "balance_loss_mlp": 1.02058005, "epoch": 0.37853599879753497, "flos": 26650996617600.0, "grad_norm": 9.577751233202987, "language_loss": 0.71744889, "learning_rate": 2.8553907316999547e-06, "loss": 0.73900783, "num_input_tokens_seen": 135165240, "step": 6296, "time_per_iteration": 2.6698925495147705 }, { "auxiliary_loss_clip": 0.01134516, "auxiliary_loss_mlp": 0.01043376, "balance_loss_clip": 1.05133939, "balance_loss_mlp": 1.02771211, "epoch": 0.37859612205020293, "flos": 17311960408320.0, "grad_norm": 3.288847845161644, "language_loss": 0.76889098, "learning_rate": 2.855038672137396e-06, "loss": 0.79066986, "num_input_tokens_seen": 135184045, "step": 6297, "time_per_iteration": 2.629037380218506 }, { "auxiliary_loss_clip": 0.01109354, "auxiliary_loss_mlp": 0.01038115, "balance_loss_clip": 1.04526067, "balance_loss_mlp": 1.02226055, "epoch": 0.3786562453028709, "flos": 18220392299520.0, "grad_norm": 1.9191527971099975, "language_loss": 0.79743183, "learning_rate": 2.854686580151684e-06, "loss": 0.81890655, "num_input_tokens_seen": 135202365, "step": 6298, "time_per_iteration": 2.673081874847412 }, { "auxiliary_loss_clip": 0.01075918, "auxiliary_loss_mlp": 0.01051187, "balance_loss_clip": 1.04113722, "balance_loss_mlp": 1.03267384, "epoch": 0.37871636855553886, "flos": 21214875473280.0, "grad_norm": 1.8248163373816215, "language_loss": 0.84369445, "learning_rate": 2.8543344557561722e-06, "loss": 0.86496556, "num_input_tokens_seen": 135220955, "step": 6299, "time_per_iteration": 2.748072862625122 }, { "auxiliary_loss_clip": 0.01104171, "auxiliary_loss_mlp": 0.01036156, "balance_loss_clip": 1.0473597, "balance_loss_mlp": 1.02021194, "epoch": 0.3787764918082068, "flos": 20952727038720.0, "grad_norm": 2.2683019346862587, "language_loss": 0.76286763, "learning_rate": 2.8539822989642116e-06, "loss": 0.78427088, "num_input_tokens_seen": 135239715, "step": 6300, "time_per_iteration": 2.742335796356201 }, { "auxiliary_loss_clip": 0.01118244, "auxiliary_loss_mlp": 0.01037884, "balance_loss_clip": 1.04743147, "balance_loss_mlp": 1.01999068, "epoch": 0.3788366150608748, "flos": 17308009912320.0, "grad_norm": 2.2544575031135863, "language_loss": 0.82409781, "learning_rate": 2.8536301097891577e-06, "loss": 0.84565908, "num_input_tokens_seen": 135257035, "step": 6301, "time_per_iteration": 2.6785736083984375 }, { "auxiliary_loss_clip": 0.01120863, "auxiliary_loss_mlp": 0.01039969, "balance_loss_clip": 1.04765666, "balance_loss_mlp": 1.02410781, "epoch": 0.37889673831354276, "flos": 24311092942080.0, "grad_norm": 2.7886341766039466, "language_loss": 0.67584914, "learning_rate": 2.8532778882443636e-06, "loss": 0.69745743, "num_input_tokens_seen": 135275720, "step": 6302, "time_per_iteration": 2.677690029144287 }, { "auxiliary_loss_clip": 0.01090953, "auxiliary_loss_mlp": 0.01043064, "balance_loss_clip": 1.04460323, "balance_loss_mlp": 1.02736425, "epoch": 0.3789568615662107, "flos": 26683603188480.0, "grad_norm": 1.752291551629032, "language_loss": 0.68745166, "learning_rate": 2.8529256343431867e-06, "loss": 0.70879185, "num_input_tokens_seen": 135294140, "step": 6303, "time_per_iteration": 2.8387813568115234 }, { "auxiliary_loss_clip": 0.01133092, "auxiliary_loss_mlp": 0.01039166, "balance_loss_clip": 1.04745388, "balance_loss_mlp": 1.02412772, "epoch": 0.3790169848188787, "flos": 23585194990080.0, "grad_norm": 1.8875159078783896, "language_loss": 0.77695227, "learning_rate": 2.8525733480989846e-06, "loss": 0.79867482, "num_input_tokens_seen": 135314845, "step": 6304, "time_per_iteration": 2.673499584197998 }, { "auxiliary_loss_clip": 0.01145067, "auxiliary_loss_mlp": 0.01040581, "balance_loss_clip": 1.05417812, "balance_loss_mlp": 1.02412987, "epoch": 0.37907710807154665, "flos": 18437436230400.0, "grad_norm": 2.779181085633227, "language_loss": 0.79659361, "learning_rate": 2.8522210295251146e-06, "loss": 0.81845009, "num_input_tokens_seen": 135333055, "step": 6305, "time_per_iteration": 2.5770838260650635 }, { "auxiliary_loss_clip": 0.01046795, "auxiliary_loss_mlp": 0.01001141, "balance_loss_clip": 1.02554131, "balance_loss_mlp": 0.99954396, "epoch": 0.3791372313242146, "flos": 50107165954560.0, "grad_norm": 0.9814261912828969, "language_loss": 0.64473259, "learning_rate": 2.8518686786349387e-06, "loss": 0.66521198, "num_input_tokens_seen": 135387865, "step": 6306, "time_per_iteration": 3.0782721042633057 }, { "auxiliary_loss_clip": 0.01111605, "auxiliary_loss_mlp": 0.01058558, "balance_loss_clip": 1.04987538, "balance_loss_mlp": 1.03932941, "epoch": 0.3791973545768826, "flos": 24316551809280.0, "grad_norm": 3.4757923579383343, "language_loss": 0.73271245, "learning_rate": 2.851516295441817e-06, "loss": 0.75441408, "num_input_tokens_seen": 135409095, "step": 6307, "time_per_iteration": 2.756335973739624 }, { "auxiliary_loss_clip": 0.01112868, "auxiliary_loss_mlp": 0.01041837, "balance_loss_clip": 1.04757965, "balance_loss_mlp": 1.02545738, "epoch": 0.3792574778295506, "flos": 21579907438080.0, "grad_norm": 1.5984922637838355, "language_loss": 0.78426826, "learning_rate": 2.851163879959112e-06, "loss": 0.80581522, "num_input_tokens_seen": 135429585, "step": 6308, "time_per_iteration": 2.7782399654388428 }, { "auxiliary_loss_clip": 0.01099815, "auxiliary_loss_mlp": 0.01047567, "balance_loss_clip": 1.04646075, "balance_loss_mlp": 1.03061557, "epoch": 0.37931760108221857, "flos": 22272731942400.0, "grad_norm": 30.20771720098995, "language_loss": 0.72349942, "learning_rate": 2.8508114322001876e-06, "loss": 0.74497324, "num_input_tokens_seen": 135446320, "step": 6309, "time_per_iteration": 2.779332399368286 }, { "auxiliary_loss_clip": 0.0107726, "auxiliary_loss_mlp": 0.01047463, "balance_loss_clip": 1.04217935, "balance_loss_mlp": 1.03061867, "epoch": 0.37937772433488653, "flos": 19682998197120.0, "grad_norm": 1.3823910789919382, "language_loss": 0.78832853, "learning_rate": 2.8504589521784083e-06, "loss": 0.8095758, "num_input_tokens_seen": 135465720, "step": 6310, "time_per_iteration": 2.771423101425171 }, { "auxiliary_loss_clip": 0.01125039, "auxiliary_loss_mlp": 0.0077385, "balance_loss_clip": 1.04667282, "balance_loss_mlp": 1.00038886, "epoch": 0.3794378475875545, "flos": 19099378016640.0, "grad_norm": 2.0276391959107687, "language_loss": 0.76350379, "learning_rate": 2.8501064399071403e-06, "loss": 0.78249264, "num_input_tokens_seen": 135485155, "step": 6311, "time_per_iteration": 2.6458020210266113 }, { "auxiliary_loss_clip": 0.01111162, "auxiliary_loss_mlp": 0.01038798, "balance_loss_clip": 1.04782593, "balance_loss_mlp": 1.02345526, "epoch": 0.37949797084022246, "flos": 20339660684160.0, "grad_norm": 1.662830094695082, "language_loss": 0.7082535, "learning_rate": 2.8497538953997504e-06, "loss": 0.72975308, "num_input_tokens_seen": 135502675, "step": 6312, "time_per_iteration": 2.719555377960205 }, { "auxiliary_loss_clip": 0.01023104, "auxiliary_loss_mlp": 0.01013837, "balance_loss_clip": 1.02154779, "balance_loss_mlp": 1.0123291, "epoch": 0.37955809409289043, "flos": 63972203477760.0, "grad_norm": 0.7865225154891, "language_loss": 0.56087357, "learning_rate": 2.849401318669608e-06, "loss": 0.58124298, "num_input_tokens_seen": 135562005, "step": 6313, "time_per_iteration": 3.2287843227386475 }, { "auxiliary_loss_clip": 0.01096229, "auxiliary_loss_mlp": 0.01051812, "balance_loss_clip": 1.04299724, "balance_loss_mlp": 1.03592694, "epoch": 0.3796182173455584, "flos": 31540665179520.0, "grad_norm": 1.6673731637282567, "language_loss": 0.71260917, "learning_rate": 2.849048709730083e-06, "loss": 0.73408955, "num_input_tokens_seen": 135582600, "step": 6314, "time_per_iteration": 2.7842931747436523 }, { "auxiliary_loss_clip": 0.01129376, "auxiliary_loss_mlp": 0.01048605, "balance_loss_clip": 1.04880047, "balance_loss_mlp": 1.03201127, "epoch": 0.37967834059822636, "flos": 12130804978560.0, "grad_norm": 2.0299747539506408, "language_loss": 0.73270208, "learning_rate": 2.848696068594545e-06, "loss": 0.75448191, "num_input_tokens_seen": 135600280, "step": 6315, "time_per_iteration": 2.6785545349121094 }, { "auxiliary_loss_clip": 0.01122054, "auxiliary_loss_mlp": 0.01048691, "balance_loss_clip": 1.0479691, "balance_loss_mlp": 1.03326535, "epoch": 0.3797384638508943, "flos": 39348578298240.0, "grad_norm": 2.0273248392275645, "language_loss": 0.71108794, "learning_rate": 2.8483433952763677e-06, "loss": 0.73279542, "num_input_tokens_seen": 135621560, "step": 6316, "time_per_iteration": 2.7634074687957764 }, { "auxiliary_loss_clip": 0.01099766, "auxiliary_loss_mlp": 0.01041876, "balance_loss_clip": 1.04686475, "balance_loss_mlp": 1.02733219, "epoch": 0.3797985871035623, "flos": 34054016653440.0, "grad_norm": 6.091183487708486, "language_loss": 0.6551193, "learning_rate": 2.847990689788923e-06, "loss": 0.67653567, "num_input_tokens_seen": 135641745, "step": 6317, "time_per_iteration": 2.8334715366363525 }, { "auxiliary_loss_clip": 0.01119227, "auxiliary_loss_mlp": 0.01036315, "balance_loss_clip": 1.04556906, "balance_loss_mlp": 1.02204525, "epoch": 0.37985871035623026, "flos": 23222174186880.0, "grad_norm": 2.5588148844770364, "language_loss": 0.85254991, "learning_rate": 2.8476379521455877e-06, "loss": 0.87410533, "num_input_tokens_seen": 135660650, "step": 6318, "time_per_iteration": 2.6611499786376953 }, { "auxiliary_loss_clip": 0.01113843, "auxiliary_loss_mlp": 0.01046062, "balance_loss_clip": 1.04669976, "balance_loss_mlp": 1.02933645, "epoch": 0.3799188336088982, "flos": 18114958903680.0, "grad_norm": 2.5013130494780254, "language_loss": 0.75813186, "learning_rate": 2.8472851823597354e-06, "loss": 0.77973092, "num_input_tokens_seen": 135679980, "step": 6319, "time_per_iteration": 2.643206834793091 }, { "auxiliary_loss_clip": 0.01136645, "auxiliary_loss_mlp": 0.01043703, "balance_loss_clip": 1.04961717, "balance_loss_mlp": 1.02813435, "epoch": 0.3799789568615662, "flos": 21871897096320.0, "grad_norm": 1.6614251909537696, "language_loss": 0.64298296, "learning_rate": 2.846932380444744e-06, "loss": 0.66478646, "num_input_tokens_seen": 135699400, "step": 6320, "time_per_iteration": 4.031519174575806 }, { "auxiliary_loss_clip": 0.01102323, "auxiliary_loss_mlp": 0.01046665, "balance_loss_clip": 1.05175698, "balance_loss_mlp": 1.03132319, "epoch": 0.3800390801142342, "flos": 32962943082240.0, "grad_norm": 2.289587921641626, "language_loss": 0.713642, "learning_rate": 2.846579546413992e-06, "loss": 0.73513186, "num_input_tokens_seen": 135723455, "step": 6321, "time_per_iteration": 2.8465514183044434 }, { "auxiliary_loss_clip": 0.01096183, "auxiliary_loss_mlp": 0.01042053, "balance_loss_clip": 1.04067016, "balance_loss_mlp": 1.02673435, "epoch": 0.38009920336690217, "flos": 26907075653760.0, "grad_norm": 1.7413772853733611, "language_loss": 0.74461544, "learning_rate": 2.846226680280859e-06, "loss": 0.76599777, "num_input_tokens_seen": 135744335, "step": 6322, "time_per_iteration": 4.407487630844116 }, { "auxiliary_loss_clip": 0.01122719, "auxiliary_loss_mlp": 0.01040835, "balance_loss_clip": 1.0462966, "balance_loss_mlp": 1.02587986, "epoch": 0.38015932661957014, "flos": 22488913946880.0, "grad_norm": 3.5770930684707527, "language_loss": 0.84908414, "learning_rate": 2.845873782058725e-06, "loss": 0.87071967, "num_input_tokens_seen": 135761440, "step": 6323, "time_per_iteration": 2.6349892616271973 }, { "auxiliary_loss_clip": 0.01111414, "auxiliary_loss_mlp": 0.01037556, "balance_loss_clip": 1.04454303, "balance_loss_mlp": 1.02075982, "epoch": 0.3802194498722381, "flos": 21980993679360.0, "grad_norm": 5.3693824839272954, "language_loss": 0.73171353, "learning_rate": 2.845520851760973e-06, "loss": 0.75320327, "num_input_tokens_seen": 135779955, "step": 6324, "time_per_iteration": 4.240839958190918 }, { "auxiliary_loss_clip": 0.01105568, "auxiliary_loss_mlp": 0.01038696, "balance_loss_clip": 1.04704404, "balance_loss_mlp": 1.02263856, "epoch": 0.38027957312490607, "flos": 21324869896320.0, "grad_norm": 1.716026134262254, "language_loss": 0.83859229, "learning_rate": 2.8451678894009847e-06, "loss": 0.86003488, "num_input_tokens_seen": 135799840, "step": 6325, "time_per_iteration": 2.72074818611145 }, { "auxiliary_loss_clip": 0.01110489, "auxiliary_loss_mlp": 0.01035658, "balance_loss_clip": 1.04811895, "balance_loss_mlp": 1.02094209, "epoch": 0.38033969637757403, "flos": 16691244456960.0, "grad_norm": 2.0321742163093264, "language_loss": 0.80093408, "learning_rate": 2.8448148949921465e-06, "loss": 0.82239556, "num_input_tokens_seen": 135817880, "step": 6326, "time_per_iteration": 4.313997030258179 }, { "auxiliary_loss_clip": 0.01119893, "auxiliary_loss_mlp": 0.01038876, "balance_loss_clip": 1.04593146, "balance_loss_mlp": 1.02497053, "epoch": 0.380399819630242, "flos": 36210847685760.0, "grad_norm": 1.80559395505396, "language_loss": 0.72578084, "learning_rate": 2.844461868547842e-06, "loss": 0.74736857, "num_input_tokens_seen": 135838940, "step": 6327, "time_per_iteration": 2.7500593662261963 }, { "auxiliary_loss_clip": 0.01134332, "auxiliary_loss_mlp": 0.00772576, "balance_loss_clip": 1.04898763, "balance_loss_mlp": 1.00039506, "epoch": 0.38045994288290996, "flos": 21288851533440.0, "grad_norm": 1.9791898832174752, "language_loss": 0.83074433, "learning_rate": 2.844108810081459e-06, "loss": 0.84981334, "num_input_tokens_seen": 135858325, "step": 6328, "time_per_iteration": 2.7503418922424316 }, { "auxiliary_loss_clip": 0.01119735, "auxiliary_loss_mlp": 0.01029986, "balance_loss_clip": 1.04522514, "balance_loss_mlp": 1.01522779, "epoch": 0.38052006613557793, "flos": 20922885815040.0, "grad_norm": 1.5313878449465446, "language_loss": 0.61713332, "learning_rate": 2.843755719606385e-06, "loss": 0.63863051, "num_input_tokens_seen": 135878430, "step": 6329, "time_per_iteration": 2.682016134262085 }, { "auxiliary_loss_clip": 0.01103557, "auxiliary_loss_mlp": 0.01040275, "balance_loss_clip": 1.04332185, "balance_loss_mlp": 1.02436066, "epoch": 0.3805801893882459, "flos": 20990720649600.0, "grad_norm": 1.9096594726999414, "language_loss": 0.56007183, "learning_rate": 2.8434025971360104e-06, "loss": 0.58151013, "num_input_tokens_seen": 135894755, "step": 6330, "time_per_iteration": 2.6704044342041016 }, { "auxiliary_loss_clip": 0.01088801, "auxiliary_loss_mlp": 0.01035148, "balance_loss_clip": 1.04801345, "balance_loss_mlp": 1.02142704, "epoch": 0.38064031264091386, "flos": 25558594243200.0, "grad_norm": 3.9882905607247046, "language_loss": 0.65945244, "learning_rate": 2.8430494426837243e-06, "loss": 0.6806919, "num_input_tokens_seen": 135918275, "step": 6331, "time_per_iteration": 2.750293731689453 }, { "auxiliary_loss_clip": 0.01120934, "auxiliary_loss_mlp": 0.01042908, "balance_loss_clip": 1.05122471, "balance_loss_mlp": 1.02723169, "epoch": 0.3807004358935818, "flos": 15085857997440.0, "grad_norm": 2.769340057272882, "language_loss": 0.7601527, "learning_rate": 2.842696256262919e-06, "loss": 0.78179109, "num_input_tokens_seen": 135937430, "step": 6332, "time_per_iteration": 2.64774227142334 }, { "auxiliary_loss_clip": 0.01073508, "auxiliary_loss_mlp": 0.00772959, "balance_loss_clip": 1.04594767, "balance_loss_mlp": 1.00029111, "epoch": 0.3807605591462498, "flos": 16399398453120.0, "grad_norm": 2.059894273755589, "language_loss": 0.8224051, "learning_rate": 2.842343037886987e-06, "loss": 0.84086972, "num_input_tokens_seen": 135954210, "step": 6333, "time_per_iteration": 2.7650275230407715 }, { "auxiliary_loss_clip": 0.01121534, "auxiliary_loss_mlp": 0.01033205, "balance_loss_clip": 1.04730785, "balance_loss_mlp": 1.01878643, "epoch": 0.3808206823989178, "flos": 29057083102080.0, "grad_norm": 1.5368445040683132, "language_loss": 0.8620519, "learning_rate": 2.8419897875693226e-06, "loss": 0.88359934, "num_input_tokens_seen": 135974425, "step": 6334, "time_per_iteration": 2.7348363399505615 }, { "auxiliary_loss_clip": 0.01123412, "auxiliary_loss_mlp": 0.01038067, "balance_loss_clip": 1.04626036, "balance_loss_mlp": 1.02280819, "epoch": 0.3808808056515858, "flos": 15705855676800.0, "grad_norm": 1.7714454860846107, "language_loss": 0.79359698, "learning_rate": 2.841636505323321e-06, "loss": 0.81521177, "num_input_tokens_seen": 135991985, "step": 6335, "time_per_iteration": 2.7020695209503174 }, { "auxiliary_loss_clip": 0.01121693, "auxiliary_loss_mlp": 0.01033758, "balance_loss_clip": 1.04490542, "balance_loss_mlp": 1.01847494, "epoch": 0.38094092890425374, "flos": 20704584908160.0, "grad_norm": 1.872444579903983, "language_loss": 0.72939491, "learning_rate": 2.8412831911623795e-06, "loss": 0.75094938, "num_input_tokens_seen": 136010015, "step": 6336, "time_per_iteration": 2.7088463306427 }, { "auxiliary_loss_clip": 0.01117324, "auxiliary_loss_mlp": 0.01033417, "balance_loss_clip": 1.04605365, "balance_loss_mlp": 1.01930285, "epoch": 0.3810010521569217, "flos": 20667956014080.0, "grad_norm": 2.014308937626889, "language_loss": 0.69164217, "learning_rate": 2.840929845099894e-06, "loss": 0.71314949, "num_input_tokens_seen": 136028440, "step": 6337, "time_per_iteration": 2.6832611560821533 }, { "auxiliary_loss_clip": 0.01111033, "auxiliary_loss_mlp": 0.01036513, "balance_loss_clip": 1.04483473, "balance_loss_mlp": 1.02133763, "epoch": 0.38106117540958967, "flos": 31827626933760.0, "grad_norm": 1.9800177042646252, "language_loss": 0.63416338, "learning_rate": 2.8405764671492652e-06, "loss": 0.65563887, "num_input_tokens_seen": 136048360, "step": 6338, "time_per_iteration": 2.8045074939727783 }, { "auxiliary_loss_clip": 0.01112594, "auxiliary_loss_mlp": 0.01041591, "balance_loss_clip": 1.04514265, "balance_loss_mlp": 1.02520001, "epoch": 0.38112129866225763, "flos": 16902757693440.0, "grad_norm": 2.42049576026076, "language_loss": 0.69146717, "learning_rate": 2.8402230573238923e-06, "loss": 0.713009, "num_input_tokens_seen": 136065500, "step": 6339, "time_per_iteration": 2.6873764991760254 }, { "auxiliary_loss_clip": 0.01107753, "auxiliary_loss_mlp": 0.01047128, "balance_loss_clip": 1.04493856, "balance_loss_mlp": 1.03165436, "epoch": 0.3811814219149256, "flos": 20887226588160.0, "grad_norm": 2.484915003961603, "language_loss": 0.68283296, "learning_rate": 2.839869615637177e-06, "loss": 0.70438182, "num_input_tokens_seen": 136084060, "step": 6340, "time_per_iteration": 2.730966567993164 }, { "auxiliary_loss_clip": 0.01098909, "auxiliary_loss_mlp": 0.01040765, "balance_loss_clip": 1.0444243, "balance_loss_mlp": 1.02449322, "epoch": 0.38124154516759357, "flos": 16690813493760.0, "grad_norm": 2.645956512625022, "language_loss": 0.89689833, "learning_rate": 2.839516142102522e-06, "loss": 0.91829509, "num_input_tokens_seen": 136102310, "step": 6341, "time_per_iteration": 2.7552878856658936 }, { "auxiliary_loss_clip": 0.01127861, "auxiliary_loss_mlp": 0.01042909, "balance_loss_clip": 1.04863834, "balance_loss_mlp": 1.02668464, "epoch": 0.38130166842026153, "flos": 19681956702720.0, "grad_norm": 2.1539523414578103, "language_loss": 0.75359344, "learning_rate": 2.83916263673333e-06, "loss": 0.7753011, "num_input_tokens_seen": 136120725, "step": 6342, "time_per_iteration": 2.6937670707702637 }, { "auxiliary_loss_clip": 0.01109868, "auxiliary_loss_mlp": 0.01035797, "balance_loss_clip": 1.04506934, "balance_loss_mlp": 1.02071738, "epoch": 0.3813617916729295, "flos": 22198432659840.0, "grad_norm": 1.797512240627555, "language_loss": 0.8348105, "learning_rate": 2.838809099543007e-06, "loss": 0.85626709, "num_input_tokens_seen": 136139105, "step": 6343, "time_per_iteration": 2.6647467613220215 }, { "auxiliary_loss_clip": 0.01073856, "auxiliary_loss_mlp": 0.01047466, "balance_loss_clip": 1.04339314, "balance_loss_mlp": 1.03099144, "epoch": 0.38142191492559746, "flos": 19096899978240.0, "grad_norm": 1.8507846773973766, "language_loss": 0.76930642, "learning_rate": 2.838455530544959e-06, "loss": 0.7905196, "num_input_tokens_seen": 136158265, "step": 6344, "time_per_iteration": 2.807464838027954 }, { "auxiliary_loss_clip": 0.01099031, "auxiliary_loss_mlp": 0.01049913, "balance_loss_clip": 1.04580665, "balance_loss_mlp": 1.03225255, "epoch": 0.3814820381782654, "flos": 24097748112000.0, "grad_norm": 2.0591822661314847, "language_loss": 0.73010087, "learning_rate": 2.838101929752593e-06, "loss": 0.75159037, "num_input_tokens_seen": 136176100, "step": 6345, "time_per_iteration": 2.756462574005127 }, { "auxiliary_loss_clip": 0.01094565, "auxiliary_loss_mlp": 0.00771987, "balance_loss_clip": 1.04568338, "balance_loss_mlp": 1.00028944, "epoch": 0.3815421614309334, "flos": 15778502933760.0, "grad_norm": 1.8320535118847152, "language_loss": 0.69709373, "learning_rate": 2.8377482971793187e-06, "loss": 0.71575922, "num_input_tokens_seen": 136195125, "step": 6346, "time_per_iteration": 2.7221782207489014 }, { "auxiliary_loss_clip": 0.01124746, "auxiliary_loss_mlp": 0.01038046, "balance_loss_clip": 1.04819, "balance_loss_mlp": 1.02297819, "epoch": 0.38160228468360136, "flos": 19899754819200.0, "grad_norm": 1.9952986193352877, "language_loss": 0.75480664, "learning_rate": 2.8373946328385437e-06, "loss": 0.77643454, "num_input_tokens_seen": 136213885, "step": 6347, "time_per_iteration": 2.646730422973633 }, { "auxiliary_loss_clip": 0.0112204, "auxiliary_loss_mlp": 0.01039786, "balance_loss_clip": 1.04638994, "balance_loss_mlp": 1.0253861, "epoch": 0.3816624079362694, "flos": 19281050029440.0, "grad_norm": 3.670871038619067, "language_loss": 0.74398822, "learning_rate": 2.8370409367436813e-06, "loss": 0.76560652, "num_input_tokens_seen": 136232700, "step": 6348, "time_per_iteration": 2.651153802871704 }, { "auxiliary_loss_clip": 0.01109969, "auxiliary_loss_mlp": 0.01037685, "balance_loss_clip": 1.04792547, "balance_loss_mlp": 1.0233444, "epoch": 0.38172253118893734, "flos": 21177564220800.0, "grad_norm": 2.7978232906816665, "language_loss": 0.87172502, "learning_rate": 2.836687208908142e-06, "loss": 0.89320159, "num_input_tokens_seen": 136248975, "step": 6349, "time_per_iteration": 2.693459987640381 }, { "auxiliary_loss_clip": 0.0112098, "auxiliary_loss_mlp": 0.01037146, "balance_loss_clip": 1.04788637, "balance_loss_mlp": 1.02244771, "epoch": 0.3817826544416053, "flos": 17529219820800.0, "grad_norm": 1.7341599494512197, "language_loss": 0.76554048, "learning_rate": 2.836333449345341e-06, "loss": 0.78712171, "num_input_tokens_seen": 136266710, "step": 6350, "time_per_iteration": 2.6194076538085938 }, { "auxiliary_loss_clip": 0.01104228, "auxiliary_loss_mlp": 0.01032221, "balance_loss_clip": 1.04922175, "balance_loss_mlp": 1.01640153, "epoch": 0.38184277769427327, "flos": 16326535714560.0, "grad_norm": 2.525722230514251, "language_loss": 0.75608248, "learning_rate": 2.8359796580686907e-06, "loss": 0.77744693, "num_input_tokens_seen": 136284445, "step": 6351, "time_per_iteration": 2.723487138748169 }, { "auxiliary_loss_clip": 0.01122109, "auxiliary_loss_mlp": 0.01037028, "balance_loss_clip": 1.04607773, "balance_loss_mlp": 1.02048135, "epoch": 0.38190290094694124, "flos": 30443450382720.0, "grad_norm": 2.201358799690427, "language_loss": 0.74001205, "learning_rate": 2.8356258350916085e-06, "loss": 0.76160336, "num_input_tokens_seen": 136305730, "step": 6352, "time_per_iteration": 2.6779909133911133 }, { "auxiliary_loss_clip": 0.01093469, "auxiliary_loss_mlp": 0.01035075, "balance_loss_clip": 1.04185915, "balance_loss_mlp": 1.02093625, "epoch": 0.3819630241996092, "flos": 14209924936320.0, "grad_norm": 1.7014377772216425, "language_loss": 0.64249897, "learning_rate": 2.8352719804275104e-06, "loss": 0.66378438, "num_input_tokens_seen": 136323850, "step": 6353, "time_per_iteration": 2.731860399246216 }, { "auxiliary_loss_clip": 0.01133265, "auxiliary_loss_mlp": 0.01039549, "balance_loss_clip": 1.04809213, "balance_loss_mlp": 1.02529204, "epoch": 0.38202314745227717, "flos": 25009699536000.0, "grad_norm": 2.7523604394748644, "language_loss": 0.83447051, "learning_rate": 2.834918094089816e-06, "loss": 0.85619861, "num_input_tokens_seen": 136344880, "step": 6354, "time_per_iteration": 2.665891170501709 }, { "auxiliary_loss_clip": 0.01132291, "auxiliary_loss_mlp": 0.01034862, "balance_loss_clip": 1.04866302, "balance_loss_mlp": 1.02162409, "epoch": 0.38208327070494513, "flos": 20814507504000.0, "grad_norm": 16.091226432139102, "language_loss": 0.80633152, "learning_rate": 2.834564176091943e-06, "loss": 0.82800299, "num_input_tokens_seen": 136366060, "step": 6355, "time_per_iteration": 2.6580965518951416 }, { "auxiliary_loss_clip": 0.01092469, "auxiliary_loss_mlp": 0.01037027, "balance_loss_clip": 1.04551625, "balance_loss_mlp": 1.02263832, "epoch": 0.3821433939576131, "flos": 22637727993600.0, "grad_norm": 1.8508447811900344, "language_loss": 0.75970227, "learning_rate": 2.8342102264473125e-06, "loss": 0.78099722, "num_input_tokens_seen": 136385625, "step": 6356, "time_per_iteration": 2.7381057739257812 }, { "auxiliary_loss_clip": 0.01123851, "auxiliary_loss_mlp": 0.00772749, "balance_loss_clip": 1.04802036, "balance_loss_mlp": 1.00034022, "epoch": 0.38220351721028106, "flos": 26869872142080.0, "grad_norm": 2.3854964939919188, "language_loss": 0.81208009, "learning_rate": 2.833856245169348e-06, "loss": 0.8310461, "num_input_tokens_seen": 136405750, "step": 6357, "time_per_iteration": 2.8209376335144043 }, { "auxiliary_loss_clip": 0.01118527, "auxiliary_loss_mlp": 0.01044748, "balance_loss_clip": 1.05246222, "balance_loss_mlp": 1.02842796, "epoch": 0.38226364046294903, "flos": 23367468700800.0, "grad_norm": 2.215929075758269, "language_loss": 0.77378345, "learning_rate": 2.8335022322714695e-06, "loss": 0.79541618, "num_input_tokens_seen": 136426085, "step": 6358, "time_per_iteration": 2.7004640102386475 }, { "auxiliary_loss_clip": 0.01115504, "auxiliary_loss_mlp": 0.01047061, "balance_loss_clip": 1.0469476, "balance_loss_mlp": 1.03118849, "epoch": 0.382323763715617, "flos": 19646225648640.0, "grad_norm": 3.6635579737055837, "language_loss": 0.78477705, "learning_rate": 2.8331481877671036e-06, "loss": 0.80640268, "num_input_tokens_seen": 136442670, "step": 6359, "time_per_iteration": 4.184551954269409 }, { "auxiliary_loss_clip": 0.01065181, "auxiliary_loss_mlp": 0.01052018, "balance_loss_clip": 1.03820515, "balance_loss_mlp": 1.03462481, "epoch": 0.38238388696828496, "flos": 54124741232640.0, "grad_norm": 1.6779400536158158, "language_loss": 0.69735414, "learning_rate": 2.8327941116696754e-06, "loss": 0.71852612, "num_input_tokens_seen": 136465730, "step": 6360, "time_per_iteration": 3.1072845458984375 }, { "auxiliary_loss_clip": 0.01102455, "auxiliary_loss_mlp": 0.01037366, "balance_loss_clip": 1.04502857, "balance_loss_mlp": 1.02189279, "epoch": 0.382444010220953, "flos": 24936190352640.0, "grad_norm": 1.5790785802582266, "language_loss": 0.79362941, "learning_rate": 2.83244000399261e-06, "loss": 0.81502759, "num_input_tokens_seen": 136487215, "step": 6361, "time_per_iteration": 4.285314559936523 }, { "auxiliary_loss_clip": 0.01111113, "auxiliary_loss_mlp": 0.01043827, "balance_loss_clip": 1.04649949, "balance_loss_mlp": 1.02906859, "epoch": 0.38250413347362094, "flos": 42337351209600.0, "grad_norm": 1.9067122847602551, "language_loss": 0.65606177, "learning_rate": 2.832085864749337e-06, "loss": 0.67761117, "num_input_tokens_seen": 136510365, "step": 6362, "time_per_iteration": 2.8447117805480957 }, { "auxiliary_loss_clip": 0.0113439, "auxiliary_loss_mlp": 0.01035947, "balance_loss_clip": 1.0483737, "balance_loss_mlp": 1.01978207, "epoch": 0.3825642567262889, "flos": 16289224462080.0, "grad_norm": 2.3383155012254284, "language_loss": 0.82138497, "learning_rate": 2.8317316939532848e-06, "loss": 0.84308833, "num_input_tokens_seen": 136527100, "step": 6363, "time_per_iteration": 4.166736602783203 }, { "auxiliary_loss_clip": 0.01075728, "auxiliary_loss_mlp": 0.01042552, "balance_loss_clip": 1.04349709, "balance_loss_mlp": 1.02707291, "epoch": 0.3826243799789569, "flos": 45654778586880.0, "grad_norm": 2.1311203141010835, "language_loss": 0.59044886, "learning_rate": 2.8313774916178825e-06, "loss": 0.61163169, "num_input_tokens_seen": 136550870, "step": 6364, "time_per_iteration": 3.006801128387451 }, { "auxiliary_loss_clip": 0.01122076, "auxiliary_loss_mlp": 0.01041213, "balance_loss_clip": 1.05097353, "balance_loss_mlp": 1.02542353, "epoch": 0.38268450323162484, "flos": 25301581453440.0, "grad_norm": 1.9239689491626994, "language_loss": 0.68903065, "learning_rate": 2.8310232577565635e-06, "loss": 0.7106635, "num_input_tokens_seen": 136569895, "step": 6365, "time_per_iteration": 2.695068597793579 }, { "auxiliary_loss_clip": 0.01123716, "auxiliary_loss_mlp": 0.01039809, "balance_loss_clip": 1.04955769, "balance_loss_mlp": 1.02366817, "epoch": 0.3827446264842928, "flos": 21836022387840.0, "grad_norm": 2.0334034116186137, "language_loss": 0.73193848, "learning_rate": 2.830668992382758e-06, "loss": 0.75357372, "num_input_tokens_seen": 136588585, "step": 6366, "time_per_iteration": 4.418980598449707 }, { "auxiliary_loss_clip": 0.01115964, "auxiliary_loss_mlp": 0.01038347, "balance_loss_clip": 1.04846239, "balance_loss_mlp": 1.02265882, "epoch": 0.38280474973696077, "flos": 25734591907200.0, "grad_norm": 2.4539991484931645, "language_loss": 0.68623614, "learning_rate": 2.830314695509902e-06, "loss": 0.70777929, "num_input_tokens_seen": 136606640, "step": 6367, "time_per_iteration": 2.6878082752227783 }, { "auxiliary_loss_clip": 0.01125961, "auxiliary_loss_mlp": 0.01037618, "balance_loss_clip": 1.05120409, "balance_loss_mlp": 1.02256823, "epoch": 0.38286487298962874, "flos": 24895934184960.0, "grad_norm": 2.196344444241347, "language_loss": 0.64423102, "learning_rate": 2.82996036715143e-06, "loss": 0.66586685, "num_input_tokens_seen": 136624940, "step": 6368, "time_per_iteration": 2.6698646545410156 }, { "auxiliary_loss_clip": 0.01139795, "auxiliary_loss_mlp": 0.01040116, "balance_loss_clip": 1.05269098, "balance_loss_mlp": 1.02390361, "epoch": 0.3829249962422967, "flos": 28543703967360.0, "grad_norm": 1.346024597035963, "language_loss": 0.684017, "learning_rate": 2.8296060073207763e-06, "loss": 0.70581615, "num_input_tokens_seen": 136645540, "step": 6369, "time_per_iteration": 2.7156169414520264 }, { "auxiliary_loss_clip": 0.01084469, "auxiliary_loss_mlp": 0.01039929, "balance_loss_clip": 1.04267466, "balance_loss_mlp": 1.02391946, "epoch": 0.38298511949496467, "flos": 21471205904640.0, "grad_norm": 1.7824237306329542, "language_loss": 0.78701794, "learning_rate": 2.8292516160313804e-06, "loss": 0.80826187, "num_input_tokens_seen": 136664530, "step": 6370, "time_per_iteration": 2.7351901531219482 }, { "auxiliary_loss_clip": 0.01121027, "auxiliary_loss_mlp": 0.01050163, "balance_loss_clip": 1.04909503, "balance_loss_mlp": 1.03279376, "epoch": 0.38304524274763263, "flos": 31679998035840.0, "grad_norm": 2.5095706519371794, "language_loss": 0.65098304, "learning_rate": 2.8288971932966805e-06, "loss": 0.67269492, "num_input_tokens_seen": 136682315, "step": 6371, "time_per_iteration": 2.739689350128174 }, { "auxiliary_loss_clip": 0.01110581, "auxiliary_loss_mlp": 0.01041968, "balance_loss_clip": 1.04938042, "balance_loss_mlp": 1.02471852, "epoch": 0.3831053660003006, "flos": 25076816098560.0, "grad_norm": 3.269308088463154, "language_loss": 0.7304002, "learning_rate": 2.8285427391301155e-06, "loss": 0.75192571, "num_input_tokens_seen": 136701185, "step": 6372, "time_per_iteration": 2.7497966289520264 }, { "auxiliary_loss_clip": 0.01127864, "auxiliary_loss_mlp": 0.01034223, "balance_loss_clip": 1.05050421, "balance_loss_mlp": 1.01848698, "epoch": 0.38316548925296856, "flos": 23259018562560.0, "grad_norm": 1.83316702621751, "language_loss": 0.8491025, "learning_rate": 2.8281882535451266e-06, "loss": 0.87072337, "num_input_tokens_seen": 136721265, "step": 6373, "time_per_iteration": 2.6510777473449707 }, { "auxiliary_loss_clip": 0.01084717, "auxiliary_loss_mlp": 0.01048262, "balance_loss_clip": 1.0416218, "balance_loss_mlp": 1.0316565, "epoch": 0.3832256125056366, "flos": 34423465991040.0, "grad_norm": 2.287485479433922, "language_loss": 0.74893212, "learning_rate": 2.8278337365551567e-06, "loss": 0.770262, "num_input_tokens_seen": 136741885, "step": 6374, "time_per_iteration": 2.8658056259155273 }, { "auxiliary_loss_clip": 0.01130215, "auxiliary_loss_mlp": 0.01042427, "balance_loss_clip": 1.05264366, "balance_loss_mlp": 1.02613068, "epoch": 0.38328573575830455, "flos": 21762764599680.0, "grad_norm": 7.5426595342284735, "language_loss": 0.75737238, "learning_rate": 2.8274791881736485e-06, "loss": 0.77909875, "num_input_tokens_seen": 136760905, "step": 6375, "time_per_iteration": 2.6622958183288574 }, { "auxiliary_loss_clip": 0.01126708, "auxiliary_loss_mlp": 0.01039776, "balance_loss_clip": 1.05043924, "balance_loss_mlp": 1.0244453, "epoch": 0.3833458590109725, "flos": 17380010724480.0, "grad_norm": 2.1246389624552435, "language_loss": 0.72777182, "learning_rate": 2.8271246084140457e-06, "loss": 0.74943662, "num_input_tokens_seen": 136777240, "step": 6376, "time_per_iteration": 2.6562421321868896 }, { "auxiliary_loss_clip": 0.01122147, "auxiliary_loss_mlp": 0.01039822, "balance_loss_clip": 1.04791379, "balance_loss_mlp": 1.02381194, "epoch": 0.3834059822636405, "flos": 29424557191680.0, "grad_norm": 1.7414598633373413, "language_loss": 0.67441249, "learning_rate": 2.826769997289796e-06, "loss": 0.69603217, "num_input_tokens_seen": 136801040, "step": 6377, "time_per_iteration": 2.779766798019409 }, { "auxiliary_loss_clip": 0.01110002, "auxiliary_loss_mlp": 0.01041228, "balance_loss_clip": 1.05152845, "balance_loss_mlp": 1.02421689, "epoch": 0.38346610551630844, "flos": 21470739027840.0, "grad_norm": 2.377659826482013, "language_loss": 0.73287642, "learning_rate": 2.826415354814344e-06, "loss": 0.75438869, "num_input_tokens_seen": 136819495, "step": 6378, "time_per_iteration": 2.7345829010009766 }, { "auxiliary_loss_clip": 0.01085335, "auxiliary_loss_mlp": 0.01042694, "balance_loss_clip": 1.0479784, "balance_loss_mlp": 1.02707767, "epoch": 0.3835262287689764, "flos": 27561224188800.0, "grad_norm": 2.283576437082984, "language_loss": 0.69473612, "learning_rate": 2.8260606810011396e-06, "loss": 0.71601641, "num_input_tokens_seen": 136838840, "step": 6379, "time_per_iteration": 2.7592358589172363 }, { "auxiliary_loss_clip": 0.01124706, "auxiliary_loss_mlp": 0.01036177, "balance_loss_clip": 1.0516969, "balance_loss_mlp": 1.02094209, "epoch": 0.3835863520216444, "flos": 15523716787200.0, "grad_norm": 1.8393672130560537, "language_loss": 0.83356249, "learning_rate": 2.8257059758636315e-06, "loss": 0.85517132, "num_input_tokens_seen": 136854425, "step": 6380, "time_per_iteration": 2.6572370529174805 }, { "auxiliary_loss_clip": 0.01135434, "auxiliary_loss_mlp": 0.01035321, "balance_loss_clip": 1.05187774, "balance_loss_mlp": 1.02010989, "epoch": 0.38364647527431234, "flos": 21904934630400.0, "grad_norm": 1.5891747666862521, "language_loss": 0.8141042, "learning_rate": 2.8253512394152697e-06, "loss": 0.83581179, "num_input_tokens_seen": 136874355, "step": 6381, "time_per_iteration": 2.7251663208007812 }, { "auxiliary_loss_clip": 0.01057344, "auxiliary_loss_mlp": 0.01005901, "balance_loss_clip": 1.02759361, "balance_loss_mlp": 1.00418437, "epoch": 0.3837065985269803, "flos": 65534927558400.0, "grad_norm": 0.7954141143291842, "language_loss": 0.60376751, "learning_rate": 2.8249964716695068e-06, "loss": 0.62440002, "num_input_tokens_seen": 136937475, "step": 6382, "time_per_iteration": 3.1750948429107666 }, { "auxiliary_loss_clip": 0.01139607, "auxiliary_loss_mlp": 0.0103679, "balance_loss_clip": 1.05060625, "balance_loss_mlp": 1.02099442, "epoch": 0.38376672177964827, "flos": 28256598558720.0, "grad_norm": 3.8324285625149925, "language_loss": 0.66432369, "learning_rate": 2.824641672639794e-06, "loss": 0.68608773, "num_input_tokens_seen": 136955805, "step": 6383, "time_per_iteration": 2.7543957233428955 }, { "auxiliary_loss_clip": 0.01103794, "auxiliary_loss_mlp": 0.01039577, "balance_loss_clip": 1.04783142, "balance_loss_mlp": 1.02375221, "epoch": 0.38382684503231623, "flos": 20631363033600.0, "grad_norm": 2.110615575498957, "language_loss": 0.75144917, "learning_rate": 2.824286842339587e-06, "loss": 0.77288288, "num_input_tokens_seen": 136975240, "step": 6384, "time_per_iteration": 2.7796735763549805 }, { "auxiliary_loss_clip": 0.01122869, "auxiliary_loss_mlp": 0.01040365, "balance_loss_clip": 1.05156231, "balance_loss_mlp": 1.02510643, "epoch": 0.3838869682849842, "flos": 19605825826560.0, "grad_norm": 1.5394774946197278, "language_loss": 0.76096714, "learning_rate": 2.823931980782341e-06, "loss": 0.78259945, "num_input_tokens_seen": 136994985, "step": 6385, "time_per_iteration": 2.6831300258636475 }, { "auxiliary_loss_clip": 0.01046831, "auxiliary_loss_mlp": 0.01001133, "balance_loss_clip": 1.02648735, "balance_loss_mlp": 0.99943984, "epoch": 0.38394709153765216, "flos": 56556110891520.0, "grad_norm": 0.9063295744618779, "language_loss": 0.66955769, "learning_rate": 2.82357708798151e-06, "loss": 0.69003725, "num_input_tokens_seen": 137046290, "step": 6386, "time_per_iteration": 3.0693411827087402 }, { "auxiliary_loss_clip": 0.0109652, "auxiliary_loss_mlp": 0.01041859, "balance_loss_clip": 1.04551756, "balance_loss_mlp": 1.02686286, "epoch": 0.3840072147903202, "flos": 15888748752000.0, "grad_norm": 1.7986188221191803, "language_loss": 0.7215755, "learning_rate": 2.8232221639505547e-06, "loss": 0.74295932, "num_input_tokens_seen": 137064725, "step": 6387, "time_per_iteration": 2.736774206161499 }, { "auxiliary_loss_clip": 0.01134624, "auxiliary_loss_mlp": 0.01044946, "balance_loss_clip": 1.05156994, "balance_loss_mlp": 1.03039086, "epoch": 0.38406733804298815, "flos": 28218030330240.0, "grad_norm": 1.6374516085838389, "language_loss": 0.8088249, "learning_rate": 2.822867208702932e-06, "loss": 0.83062065, "num_input_tokens_seen": 137086030, "step": 6388, "time_per_iteration": 2.782958507537842 }, { "auxiliary_loss_clip": 0.01103471, "auxiliary_loss_mlp": 0.01047592, "balance_loss_clip": 1.04727554, "balance_loss_mlp": 1.03298843, "epoch": 0.3841274612956561, "flos": 18223588609920.0, "grad_norm": 1.7872750649564642, "language_loss": 0.76085746, "learning_rate": 2.8225122222521026e-06, "loss": 0.78236812, "num_input_tokens_seen": 137105400, "step": 6389, "time_per_iteration": 2.6644833087921143 }, { "auxiliary_loss_clip": 0.01119906, "auxiliary_loss_mlp": 0.0104877, "balance_loss_clip": 1.05389404, "balance_loss_mlp": 1.03203344, "epoch": 0.3841875845483241, "flos": 19792884879360.0, "grad_norm": 4.9507505317589775, "language_loss": 0.76550084, "learning_rate": 2.8221572046115273e-06, "loss": 0.78718758, "num_input_tokens_seen": 137124985, "step": 6390, "time_per_iteration": 2.825714588165283 }, { "auxiliary_loss_clip": 0.01090482, "auxiliary_loss_mlp": 0.01048203, "balance_loss_clip": 1.04517913, "balance_loss_mlp": 1.03196096, "epoch": 0.38424770780099204, "flos": 29898829393920.0, "grad_norm": 1.7614871223783444, "language_loss": 0.70377523, "learning_rate": 2.821802155794668e-06, "loss": 0.72516215, "num_input_tokens_seen": 137146745, "step": 6391, "time_per_iteration": 2.918065309524536 }, { "auxiliary_loss_clip": 0.01125443, "auxiliary_loss_mlp": 0.01036977, "balance_loss_clip": 1.04874265, "balance_loss_mlp": 1.02158153, "epoch": 0.38430783105366, "flos": 20813717404800.0, "grad_norm": 1.7948670510085722, "language_loss": 0.84005457, "learning_rate": 2.8214470758149884e-06, "loss": 0.86167878, "num_input_tokens_seen": 137163195, "step": 6392, "time_per_iteration": 2.679427146911621 }, { "auxiliary_loss_clip": 0.01122701, "auxiliary_loss_mlp": 0.01037128, "balance_loss_clip": 1.04846168, "balance_loss_mlp": 1.0227809, "epoch": 0.384367954306328, "flos": 10998577399680.0, "grad_norm": 2.3141685884805145, "language_loss": 0.6062203, "learning_rate": 2.8210919646859536e-06, "loss": 0.62781858, "num_input_tokens_seen": 137179330, "step": 6393, "time_per_iteration": 2.6622374057769775 }, { "auxiliary_loss_clip": 0.01110672, "auxiliary_loss_mlp": 0.01036894, "balance_loss_clip": 1.04954767, "balance_loss_mlp": 1.02025223, "epoch": 0.38442807755899594, "flos": 25338030779520.0, "grad_norm": 1.7908313499382054, "language_loss": 0.70639426, "learning_rate": 2.820736822421029e-06, "loss": 0.72786993, "num_input_tokens_seen": 137198655, "step": 6394, "time_per_iteration": 2.7460365295410156 }, { "auxiliary_loss_clip": 0.01123613, "auxiliary_loss_mlp": 0.01035163, "balance_loss_clip": 1.04763663, "balance_loss_mlp": 1.01871169, "epoch": 0.3844882008116639, "flos": 21069760527360.0, "grad_norm": 2.646318489707099, "language_loss": 0.81774974, "learning_rate": 2.8203816490336822e-06, "loss": 0.83933747, "num_input_tokens_seen": 137217120, "step": 6395, "time_per_iteration": 2.676023006439209 }, { "auxiliary_loss_clip": 0.01129196, "auxiliary_loss_mlp": 0.01046949, "balance_loss_clip": 1.05485177, "balance_loss_mlp": 1.03209007, "epoch": 0.38454832406433187, "flos": 17963235855360.0, "grad_norm": 1.9755185808990787, "language_loss": 0.71031433, "learning_rate": 2.8200264445373813e-06, "loss": 0.73207581, "num_input_tokens_seen": 137234410, "step": 6396, "time_per_iteration": 2.7082455158233643 }, { "auxiliary_loss_clip": 0.01044031, "auxiliary_loss_mlp": 0.0100801, "balance_loss_clip": 1.02689695, "balance_loss_mlp": 1.00657308, "epoch": 0.38460844731699984, "flos": 67924999555200.0, "grad_norm": 0.8839433118134116, "language_loss": 0.59671199, "learning_rate": 2.8196712089455954e-06, "loss": 0.61723238, "num_input_tokens_seen": 137294940, "step": 6397, "time_per_iteration": 3.2412428855895996 }, { "auxiliary_loss_clip": 0.01137376, "auxiliary_loss_mlp": 0.01035554, "balance_loss_clip": 1.05209756, "balance_loss_mlp": 1.02044976, "epoch": 0.3846685705696678, "flos": 25849075530240.0, "grad_norm": 2.648974669995796, "language_loss": 0.85017276, "learning_rate": 2.819315942271794e-06, "loss": 0.87190199, "num_input_tokens_seen": 137315035, "step": 6398, "time_per_iteration": 2.7374656200408936 }, { "auxiliary_loss_clip": 0.01136492, "auxiliary_loss_mlp": 0.01030698, "balance_loss_clip": 1.0517211, "balance_loss_mlp": 1.0165, "epoch": 0.38472869382233577, "flos": 16290194129280.0, "grad_norm": 2.1032431430060075, "language_loss": 0.79989493, "learning_rate": 2.8189606445294515e-06, "loss": 0.82156688, "num_input_tokens_seen": 137333155, "step": 6399, "time_per_iteration": 4.446218729019165 }, { "auxiliary_loss_clip": 0.0113807, "auxiliary_loss_mlp": 0.00773562, "balance_loss_clip": 1.05109119, "balance_loss_mlp": 1.00025833, "epoch": 0.38478881707500373, "flos": 19353122668800.0, "grad_norm": 3.0376300513317416, "language_loss": 0.67328328, "learning_rate": 2.818605315732038e-06, "loss": 0.69239962, "num_input_tokens_seen": 137351515, "step": 6400, "time_per_iteration": 2.6920905113220215 }, { "auxiliary_loss_clip": 0.01122811, "auxiliary_loss_mlp": 0.01042029, "balance_loss_clip": 1.05546772, "balance_loss_mlp": 1.0264008, "epoch": 0.38484894032767175, "flos": 24860849575680.0, "grad_norm": 11.158483612058907, "language_loss": 0.73623443, "learning_rate": 2.81824995589303e-06, "loss": 0.75788283, "num_input_tokens_seen": 137371255, "step": 6401, "time_per_iteration": 4.2371673583984375 }, { "auxiliary_loss_clip": 0.01102005, "auxiliary_loss_mlp": 0.01039851, "balance_loss_clip": 1.04852486, "balance_loss_mlp": 1.02387738, "epoch": 0.3849090635803397, "flos": 14501806853760.0, "grad_norm": 2.0006804524577233, "language_loss": 0.72059876, "learning_rate": 2.8178945650259012e-06, "loss": 0.74201727, "num_input_tokens_seen": 137388980, "step": 6402, "time_per_iteration": 2.686413288116455 }, { "auxiliary_loss_clip": 0.0113478, "auxiliary_loss_mlp": 0.01035082, "balance_loss_clip": 1.05094552, "balance_loss_mlp": 1.02016854, "epoch": 0.3849691868330077, "flos": 18515865576960.0, "grad_norm": 2.094788133183166, "language_loss": 0.82884681, "learning_rate": 2.817539143144128e-06, "loss": 0.85054541, "num_input_tokens_seen": 137406885, "step": 6403, "time_per_iteration": 4.234680891036987 }, { "auxiliary_loss_clip": 0.01078109, "auxiliary_loss_mlp": 0.01040581, "balance_loss_clip": 1.04205656, "balance_loss_mlp": 1.02466702, "epoch": 0.38502931008567565, "flos": 21616392677760.0, "grad_norm": 4.587008789601206, "language_loss": 0.82845348, "learning_rate": 2.817183690261189e-06, "loss": 0.84964037, "num_input_tokens_seen": 137425535, "step": 6404, "time_per_iteration": 2.777756452560425 }, { "auxiliary_loss_clip": 0.0111195, "auxiliary_loss_mlp": 0.01034861, "balance_loss_clip": 1.04970074, "balance_loss_mlp": 1.02046084, "epoch": 0.3850894333383436, "flos": 25415346804480.0, "grad_norm": 2.6287869212560646, "language_loss": 0.69417107, "learning_rate": 2.816828206390563e-06, "loss": 0.71563923, "num_input_tokens_seen": 137447700, "step": 6405, "time_per_iteration": 4.478301286697388 }, { "auxiliary_loss_clip": 0.01102381, "auxiliary_loss_mlp": 0.01038086, "balance_loss_clip": 1.0438571, "balance_loss_mlp": 1.02414417, "epoch": 0.3851495565910116, "flos": 20227870581120.0, "grad_norm": 1.9306681180439358, "language_loss": 0.79248095, "learning_rate": 2.816472691545729e-06, "loss": 0.81388557, "num_input_tokens_seen": 137462245, "step": 6406, "time_per_iteration": 2.7157816886901855 }, { "auxiliary_loss_clip": 0.01129296, "auxiliary_loss_mlp": 0.01040841, "balance_loss_clip": 1.05465746, "balance_loss_mlp": 1.02483082, "epoch": 0.38520967984367954, "flos": 16508459122560.0, "grad_norm": 5.929375109580111, "language_loss": 0.84107637, "learning_rate": 2.8161171457401694e-06, "loss": 0.86277771, "num_input_tokens_seen": 137476455, "step": 6407, "time_per_iteration": 2.6058037281036377 }, { "auxiliary_loss_clip": 0.01049614, "auxiliary_loss_mlp": 0.00999678, "balance_loss_clip": 1.03001904, "balance_loss_mlp": 0.99828893, "epoch": 0.3852698030963475, "flos": 61313772971520.0, "grad_norm": 0.845548946049954, "language_loss": 0.64919412, "learning_rate": 2.815761568987365e-06, "loss": 0.66968703, "num_input_tokens_seen": 137539845, "step": 6408, "time_per_iteration": 3.2015879154205322 }, { "auxiliary_loss_clip": 0.01110915, "auxiliary_loss_mlp": 0.01042045, "balance_loss_clip": 1.05201948, "balance_loss_mlp": 1.02547526, "epoch": 0.3853299263490155, "flos": 22893016930560.0, "grad_norm": 1.5734517214124462, "language_loss": 0.73444313, "learning_rate": 2.8154059613008e-06, "loss": 0.75597274, "num_input_tokens_seen": 137559880, "step": 6409, "time_per_iteration": 2.683310031890869 }, { "auxiliary_loss_clip": 0.01099042, "auxiliary_loss_mlp": 0.01052587, "balance_loss_clip": 1.05162942, "balance_loss_mlp": 1.03458679, "epoch": 0.38539004960168344, "flos": 20047491457920.0, "grad_norm": 3.095928763270071, "language_loss": 0.70505756, "learning_rate": 2.81505032269396e-06, "loss": 0.72657388, "num_input_tokens_seen": 137578225, "step": 6410, "time_per_iteration": 2.7694053649902344 }, { "auxiliary_loss_clip": 0.01018797, "auxiliary_loss_mlp": 0.00754046, "balance_loss_clip": 1.02754462, "balance_loss_mlp": 1.00070059, "epoch": 0.3854501728543514, "flos": 68730691570560.0, "grad_norm": 0.6824056349925876, "language_loss": 0.6019417, "learning_rate": 2.81469465318033e-06, "loss": 0.61967015, "num_input_tokens_seen": 137645770, "step": 6411, "time_per_iteration": 3.3692543506622314 }, { "auxiliary_loss_clip": 0.01091571, "auxiliary_loss_mlp": 0.01029185, "balance_loss_clip": 1.04337883, "balance_loss_mlp": 1.01451063, "epoch": 0.38551029610701937, "flos": 20485027025280.0, "grad_norm": 2.4386958956664344, "language_loss": 0.78219938, "learning_rate": 2.814338952773397e-06, "loss": 0.80340695, "num_input_tokens_seen": 137664090, "step": 6412, "time_per_iteration": 2.7462196350097656 }, { "auxiliary_loss_clip": 0.01097982, "auxiliary_loss_mlp": 0.01037754, "balance_loss_clip": 1.04309821, "balance_loss_mlp": 1.01995587, "epoch": 0.38557041935968733, "flos": 23471788775040.0, "grad_norm": 2.0249224045322802, "language_loss": 0.78112727, "learning_rate": 2.8139832214866493e-06, "loss": 0.80248463, "num_input_tokens_seen": 137683190, "step": 6413, "time_per_iteration": 2.768624782562256 }, { "auxiliary_loss_clip": 0.01056912, "auxiliary_loss_mlp": 0.01003998, "balance_loss_clip": 1.02733278, "balance_loss_mlp": 1.00254369, "epoch": 0.38563054261235535, "flos": 63966636869760.0, "grad_norm": 0.8082958368118873, "language_loss": 0.61342072, "learning_rate": 2.813627459333576e-06, "loss": 0.63402981, "num_input_tokens_seen": 137737315, "step": 6414, "time_per_iteration": 2.983466625213623 }, { "auxiliary_loss_clip": 0.01103716, "auxiliary_loss_mlp": 0.01038577, "balance_loss_clip": 1.05065155, "balance_loss_mlp": 1.02302015, "epoch": 0.3856906658650233, "flos": 23987789602560.0, "grad_norm": 2.2111312580879106, "language_loss": 0.77225536, "learning_rate": 2.8132716663276685e-06, "loss": 0.79367828, "num_input_tokens_seen": 137753535, "step": 6415, "time_per_iteration": 2.7486205101013184 }, { "auxiliary_loss_clip": 0.01109368, "auxiliary_loss_mlp": 0.01030786, "balance_loss_clip": 1.04894936, "balance_loss_mlp": 1.01676726, "epoch": 0.3857507891176913, "flos": 25007436979200.0, "grad_norm": 1.644505635534703, "language_loss": 0.80036473, "learning_rate": 2.8129158424824173e-06, "loss": 0.82176626, "num_input_tokens_seen": 137773405, "step": 6416, "time_per_iteration": 2.709200859069824 }, { "auxiliary_loss_clip": 0.0112133, "auxiliary_loss_mlp": 0.00771665, "balance_loss_clip": 1.04777813, "balance_loss_mlp": 1.00020468, "epoch": 0.38581091237035925, "flos": 21536778182400.0, "grad_norm": 1.8974153334913886, "language_loss": 0.78746861, "learning_rate": 2.8125599878113155e-06, "loss": 0.80639857, "num_input_tokens_seen": 137790810, "step": 6417, "time_per_iteration": 2.6839869022369385 }, { "auxiliary_loss_clip": 0.01106617, "auxiliary_loss_mlp": 0.0103795, "balance_loss_clip": 1.04771507, "balance_loss_mlp": 1.02424121, "epoch": 0.3858710356230272, "flos": 17383889393280.0, "grad_norm": 1.8492847143532247, "language_loss": 0.80066824, "learning_rate": 2.8122041023278583e-06, "loss": 0.82211387, "num_input_tokens_seen": 137810265, "step": 6418, "time_per_iteration": 2.709463119506836 }, { "auxiliary_loss_clip": 0.01106426, "auxiliary_loss_mlp": 0.01035927, "balance_loss_clip": 1.04606509, "balance_loss_mlp": 1.02115691, "epoch": 0.3859311588756952, "flos": 20339588856960.0, "grad_norm": 2.0121704661475524, "language_loss": 0.79591382, "learning_rate": 2.8118481860455407e-06, "loss": 0.81733727, "num_input_tokens_seen": 137828580, "step": 6419, "time_per_iteration": 2.687030553817749 }, { "auxiliary_loss_clip": 0.01109367, "auxiliary_loss_mlp": 0.01035627, "balance_loss_clip": 1.04662013, "balance_loss_mlp": 1.0194031, "epoch": 0.38599128212836314, "flos": 26321157002880.0, "grad_norm": 2.202509680177809, "language_loss": 0.67581224, "learning_rate": 2.8114922389778573e-06, "loss": 0.69726223, "num_input_tokens_seen": 137846145, "step": 6420, "time_per_iteration": 2.7517049312591553 }, { "auxiliary_loss_clip": 0.01089731, "auxiliary_loss_mlp": 0.01053637, "balance_loss_clip": 1.04479241, "balance_loss_mlp": 1.03771043, "epoch": 0.3860514053810311, "flos": 13553837066880.0, "grad_norm": 2.406147976104497, "language_loss": 0.81137526, "learning_rate": 2.8111362611383076e-06, "loss": 0.83280897, "num_input_tokens_seen": 137863705, "step": 6421, "time_per_iteration": 2.970040798187256 }, { "auxiliary_loss_clip": 0.01108309, "auxiliary_loss_mlp": 0.01040046, "balance_loss_clip": 1.04625583, "balance_loss_mlp": 1.02510345, "epoch": 0.3861115286336991, "flos": 20954271323520.0, "grad_norm": 2.6092074943148797, "language_loss": 0.71989834, "learning_rate": 2.8107802525403886e-06, "loss": 0.74138188, "num_input_tokens_seen": 137880285, "step": 6422, "time_per_iteration": 2.690490961074829 }, { "auxiliary_loss_clip": 0.01104575, "auxiliary_loss_mlp": 0.0104152, "balance_loss_clip": 1.04663455, "balance_loss_mlp": 1.02759588, "epoch": 0.38617165188636704, "flos": 16362697731840.0, "grad_norm": 1.6942063430957965, "language_loss": 0.66644311, "learning_rate": 2.8104242131976025e-06, "loss": 0.687904, "num_input_tokens_seen": 137898335, "step": 6423, "time_per_iteration": 2.6189329624176025 }, { "auxiliary_loss_clip": 0.01128312, "auxiliary_loss_mlp": 0.01042786, "balance_loss_clip": 1.05139875, "balance_loss_mlp": 1.02860618, "epoch": 0.386231775139035, "flos": 34787276893440.0, "grad_norm": 2.1536039728580394, "language_loss": 0.68359423, "learning_rate": 2.810068143123449e-06, "loss": 0.70530522, "num_input_tokens_seen": 137918605, "step": 6424, "time_per_iteration": 2.7609992027282715 }, { "auxiliary_loss_clip": 0.01098796, "auxiliary_loss_mlp": 0.01038309, "balance_loss_clip": 1.04750848, "balance_loss_mlp": 1.02387285, "epoch": 0.38629189839170297, "flos": 21726171619200.0, "grad_norm": 1.4481478329406698, "language_loss": 0.72367114, "learning_rate": 2.809712042331429e-06, "loss": 0.7450422, "num_input_tokens_seen": 137938245, "step": 6425, "time_per_iteration": 2.7069387435913086 }, { "auxiliary_loss_clip": 0.01099551, "auxiliary_loss_mlp": 0.00773141, "balance_loss_clip": 1.0428803, "balance_loss_mlp": 1.00013173, "epoch": 0.38635202164437094, "flos": 27923634460800.0, "grad_norm": 2.52438881915832, "language_loss": 0.80258477, "learning_rate": 2.8093559108350484e-06, "loss": 0.82131171, "num_input_tokens_seen": 137956770, "step": 6426, "time_per_iteration": 2.8976056575775146 }, { "auxiliary_loss_clip": 0.01125602, "auxiliary_loss_mlp": 0.0103515, "balance_loss_clip": 1.04929447, "balance_loss_mlp": 1.02013016, "epoch": 0.38641214489703896, "flos": 23586631534080.0, "grad_norm": 2.2578291383073825, "language_loss": 0.7536087, "learning_rate": 2.80899974864781e-06, "loss": 0.77521622, "num_input_tokens_seen": 137977040, "step": 6427, "time_per_iteration": 2.7281436920166016 }, { "auxiliary_loss_clip": 0.01075932, "auxiliary_loss_mlp": 0.01057335, "balance_loss_clip": 1.04142189, "balance_loss_mlp": 1.04013276, "epoch": 0.3864722681497069, "flos": 12641239198080.0, "grad_norm": 2.0875975256988055, "language_loss": 0.69435054, "learning_rate": 2.8086435557832203e-06, "loss": 0.71568322, "num_input_tokens_seen": 137993545, "step": 6428, "time_per_iteration": 2.7289116382598877 }, { "auxiliary_loss_clip": 0.01113154, "auxiliary_loss_mlp": 0.01042018, "balance_loss_clip": 1.04947257, "balance_loss_mlp": 1.02729535, "epoch": 0.3865323914023749, "flos": 17598922162560.0, "grad_norm": 2.847119477349317, "language_loss": 0.8444519, "learning_rate": 2.8082873322547863e-06, "loss": 0.86600363, "num_input_tokens_seen": 138010140, "step": 6429, "time_per_iteration": 2.7385170459747314 }, { "auxiliary_loss_clip": 0.01110797, "auxiliary_loss_mlp": 0.01038599, "balance_loss_clip": 1.04555535, "balance_loss_mlp": 1.02423429, "epoch": 0.38659251465504285, "flos": 18478949374080.0, "grad_norm": 2.174010980525696, "language_loss": 0.80673695, "learning_rate": 2.807931078076015e-06, "loss": 0.82823092, "num_input_tokens_seen": 138028880, "step": 6430, "time_per_iteration": 2.660228967666626 }, { "auxiliary_loss_clip": 0.0102628, "auxiliary_loss_mlp": 0.01015101, "balance_loss_clip": 1.02508974, "balance_loss_mlp": 1.01382565, "epoch": 0.3866526379077108, "flos": 64165726978560.0, "grad_norm": 0.719429045650031, "language_loss": 0.58803207, "learning_rate": 2.807574793260416e-06, "loss": 0.60844588, "num_input_tokens_seen": 138098090, "step": 6431, "time_per_iteration": 3.2772469520568848 }, { "auxiliary_loss_clip": 0.01086398, "auxiliary_loss_mlp": 0.01039293, "balance_loss_clip": 1.04541588, "balance_loss_mlp": 1.02296114, "epoch": 0.3867127611603788, "flos": 14388292897920.0, "grad_norm": 2.1660589654497424, "language_loss": 0.79041815, "learning_rate": 2.8072184778215004e-06, "loss": 0.81167507, "num_input_tokens_seen": 138114735, "step": 6432, "time_per_iteration": 2.7949061393737793 }, { "auxiliary_loss_clip": 0.01125593, "auxiliary_loss_mlp": 0.01048624, "balance_loss_clip": 1.04708362, "balance_loss_mlp": 1.03231645, "epoch": 0.38677288441304675, "flos": 20010754823040.0, "grad_norm": 2.0695366497364294, "language_loss": 0.80186564, "learning_rate": 2.806862131772779e-06, "loss": 0.82360786, "num_input_tokens_seen": 138130480, "step": 6433, "time_per_iteration": 2.6526312828063965 }, { "auxiliary_loss_clip": 0.01111087, "auxiliary_loss_mlp": 0.01037922, "balance_loss_clip": 1.04934025, "balance_loss_mlp": 1.02162611, "epoch": 0.3868330076657147, "flos": 22236893147520.0, "grad_norm": 1.6267030007711512, "language_loss": 0.70441496, "learning_rate": 2.806505755127765e-06, "loss": 0.72590506, "num_input_tokens_seen": 138150640, "step": 6434, "time_per_iteration": 2.6985394954681396 }, { "auxiliary_loss_clip": 0.01097728, "auxiliary_loss_mlp": 0.01047403, "balance_loss_clip": 1.04536152, "balance_loss_mlp": 1.03008235, "epoch": 0.3868931309183827, "flos": 16727442387840.0, "grad_norm": 1.7348790517482282, "language_loss": 0.77462173, "learning_rate": 2.806149347899972e-06, "loss": 0.79607308, "num_input_tokens_seen": 138169700, "step": 6435, "time_per_iteration": 2.7326719760894775 }, { "auxiliary_loss_clip": 0.01119609, "auxiliary_loss_mlp": 0.01035834, "balance_loss_clip": 1.04651809, "balance_loss_mlp": 1.0208497, "epoch": 0.38695325417105064, "flos": 22674716023680.0, "grad_norm": 2.3575842278582813, "language_loss": 0.79599082, "learning_rate": 2.805792910102915e-06, "loss": 0.81754529, "num_input_tokens_seen": 138185835, "step": 6436, "time_per_iteration": 2.6643154621124268 }, { "auxiliary_loss_clip": 0.01107099, "auxiliary_loss_mlp": 0.01036499, "balance_loss_clip": 1.04809546, "balance_loss_mlp": 1.0215621, "epoch": 0.3870133774237186, "flos": 23112036109440.0, "grad_norm": 1.9038851888933561, "language_loss": 0.76043606, "learning_rate": 2.8054364417501093e-06, "loss": 0.78187203, "num_input_tokens_seen": 138204080, "step": 6437, "time_per_iteration": 2.701834201812744 }, { "auxiliary_loss_clip": 0.01110073, "auxiliary_loss_mlp": 0.01037115, "balance_loss_clip": 1.04696321, "balance_loss_mlp": 1.02374589, "epoch": 0.3870735006763866, "flos": 17675699483520.0, "grad_norm": 2.022501790448194, "language_loss": 0.81817484, "learning_rate": 2.805079942855074e-06, "loss": 0.8396467, "num_input_tokens_seen": 138220710, "step": 6438, "time_per_iteration": 4.327820539474487 }, { "auxiliary_loss_clip": 0.01111326, "auxiliary_loss_mlp": 0.0077319, "balance_loss_clip": 1.04504764, "balance_loss_mlp": 1.00027561, "epoch": 0.38713362392905454, "flos": 23295791111040.0, "grad_norm": 1.7517226143139228, "language_loss": 0.75388491, "learning_rate": 2.804723413431326e-06, "loss": 0.77273011, "num_input_tokens_seen": 138241720, "step": 6439, "time_per_iteration": 2.797830104827881 }, { "auxiliary_loss_clip": 0.01131277, "auxiliary_loss_mlp": 0.01037901, "balance_loss_clip": 1.04915833, "balance_loss_mlp": 1.0235002, "epoch": 0.38719374718172256, "flos": 21031192298880.0, "grad_norm": 1.7565856090832077, "language_loss": 0.74071443, "learning_rate": 2.8043668534923855e-06, "loss": 0.76240611, "num_input_tokens_seen": 138261885, "step": 6440, "time_per_iteration": 4.2160422801971436 }, { "auxiliary_loss_clip": 0.01125111, "auxiliary_loss_mlp": 0.01034995, "balance_loss_clip": 1.04927301, "balance_loss_mlp": 1.01949763, "epoch": 0.3872538704343905, "flos": 19609776322560.0, "grad_norm": 2.101028456947384, "language_loss": 0.82017142, "learning_rate": 2.804010263051774e-06, "loss": 0.84177244, "num_input_tokens_seen": 138280255, "step": 6441, "time_per_iteration": 4.199851036071777 }, { "auxiliary_loss_clip": 0.0113476, "auxiliary_loss_mlp": 0.01039285, "balance_loss_clip": 1.05011272, "balance_loss_mlp": 1.02490842, "epoch": 0.3873139936870585, "flos": 17530045833600.0, "grad_norm": 2.8802239922493147, "language_loss": 0.80824792, "learning_rate": 2.8036536421230118e-06, "loss": 0.82998842, "num_input_tokens_seen": 138296675, "step": 6442, "time_per_iteration": 2.6942524909973145 }, { "auxiliary_loss_clip": 0.01090073, "auxiliary_loss_mlp": 0.01032275, "balance_loss_clip": 1.04431343, "balance_loss_mlp": 1.01747537, "epoch": 0.38737411693972645, "flos": 17786555832960.0, "grad_norm": 2.1593394156288044, "language_loss": 0.84054118, "learning_rate": 2.803296990719624e-06, "loss": 0.86176467, "num_input_tokens_seen": 138314985, "step": 6443, "time_per_iteration": 2.6660094261169434 }, { "auxiliary_loss_clip": 0.01033878, "auxiliary_loss_mlp": 0.01000185, "balance_loss_clip": 1.02513885, "balance_loss_mlp": 0.99879646, "epoch": 0.3874342401923944, "flos": 58304637048960.0, "grad_norm": 0.7605185654135588, "language_loss": 0.50208193, "learning_rate": 2.8029403088551327e-06, "loss": 0.52242255, "num_input_tokens_seen": 138373275, "step": 6444, "time_per_iteration": 4.807433128356934 }, { "auxiliary_loss_clip": 0.01086333, "auxiliary_loss_mlp": 0.00773648, "balance_loss_clip": 1.04187298, "balance_loss_mlp": 1.00033963, "epoch": 0.3874943634450624, "flos": 17711933328000.0, "grad_norm": 1.4666177781563792, "language_loss": 0.78874767, "learning_rate": 2.802583596543065e-06, "loss": 0.80734754, "num_input_tokens_seen": 138391145, "step": 6445, "time_per_iteration": 2.689142942428589 }, { "auxiliary_loss_clip": 0.0111426, "auxiliary_loss_mlp": 0.0103959, "balance_loss_clip": 1.04754841, "balance_loss_mlp": 1.02445602, "epoch": 0.38755448669773035, "flos": 19244852098560.0, "grad_norm": 2.4274750437973958, "language_loss": 0.81207073, "learning_rate": 2.8022268537969474e-06, "loss": 0.83360916, "num_input_tokens_seen": 138409875, "step": 6446, "time_per_iteration": 2.6582860946655273 }, { "auxiliary_loss_clip": 0.01107394, "auxiliary_loss_mlp": 0.01037275, "balance_loss_clip": 1.04530001, "balance_loss_mlp": 1.02277923, "epoch": 0.3876146099503983, "flos": 20594267262720.0, "grad_norm": 3.0137556994939887, "language_loss": 0.77366996, "learning_rate": 2.801870080630306e-06, "loss": 0.79511666, "num_input_tokens_seen": 138428965, "step": 6447, "time_per_iteration": 2.727285146713257 }, { "auxiliary_loss_clip": 0.01108854, "auxiliary_loss_mlp": 0.01037807, "balance_loss_clip": 1.04590762, "balance_loss_mlp": 1.02378821, "epoch": 0.3876747332030663, "flos": 19281121856640.0, "grad_norm": 2.4450461903172562, "language_loss": 0.76364803, "learning_rate": 2.801513277056671e-06, "loss": 0.78511459, "num_input_tokens_seen": 138448090, "step": 6448, "time_per_iteration": 2.663989543914795 }, { "auxiliary_loss_clip": 0.01102873, "auxiliary_loss_mlp": 0.01038866, "balance_loss_clip": 1.04449654, "balance_loss_mlp": 1.02322626, "epoch": 0.38773485645573424, "flos": 18945895201920.0, "grad_norm": 1.6490971101368535, "language_loss": 0.76146352, "learning_rate": 2.8011564430895725e-06, "loss": 0.7828809, "num_input_tokens_seen": 138466105, "step": 6449, "time_per_iteration": 2.806537628173828 }, { "auxiliary_loss_clip": 0.01098531, "auxiliary_loss_mlp": 0.00772575, "balance_loss_clip": 1.04406381, "balance_loss_mlp": 1.00027394, "epoch": 0.3877949797084022, "flos": 23071348978560.0, "grad_norm": 2.0995234377866985, "language_loss": 0.78572172, "learning_rate": 2.800799578742542e-06, "loss": 0.80443275, "num_input_tokens_seen": 138485160, "step": 6450, "time_per_iteration": 2.7541351318359375 }, { "auxiliary_loss_clip": 0.01137663, "auxiliary_loss_mlp": 0.01039948, "balance_loss_clip": 1.04827702, "balance_loss_mlp": 1.02452803, "epoch": 0.3878551029610702, "flos": 29095543589760.0, "grad_norm": 2.5655640440870946, "language_loss": 0.78046334, "learning_rate": 2.8004426840291106e-06, "loss": 0.80223942, "num_input_tokens_seen": 138504135, "step": 6451, "time_per_iteration": 2.6868700981140137 }, { "auxiliary_loss_clip": 0.01126689, "auxiliary_loss_mlp": 0.01031637, "balance_loss_clip": 1.04576159, "balance_loss_mlp": 1.01696229, "epoch": 0.38791522621373814, "flos": 20996394998400.0, "grad_norm": 2.633183178462793, "language_loss": 0.76404589, "learning_rate": 2.800085758962812e-06, "loss": 0.78562915, "num_input_tokens_seen": 138523955, "step": 6452, "time_per_iteration": 2.708750009536743 }, { "auxiliary_loss_clip": 0.01103834, "auxiliary_loss_mlp": 0.01042785, "balance_loss_clip": 1.04665875, "balance_loss_mlp": 1.0285815, "epoch": 0.3879753494664061, "flos": 15486836497920.0, "grad_norm": 1.5878969811553463, "language_loss": 0.79534453, "learning_rate": 2.799728803557182e-06, "loss": 0.81681073, "num_input_tokens_seen": 138541655, "step": 6453, "time_per_iteration": 2.7226593494415283 }, { "auxiliary_loss_clip": 0.0112782, "auxiliary_loss_mlp": 0.0104096, "balance_loss_clip": 1.04889584, "balance_loss_mlp": 1.02560616, "epoch": 0.3880354727190741, "flos": 22053964158720.0, "grad_norm": 19.957823861770734, "language_loss": 0.71643323, "learning_rate": 2.7993718178257555e-06, "loss": 0.73812103, "num_input_tokens_seen": 138560860, "step": 6454, "time_per_iteration": 2.7265548706054688 }, { "auxiliary_loss_clip": 0.01137183, "auxiliary_loss_mlp": 0.01043076, "balance_loss_clip": 1.04976404, "balance_loss_mlp": 1.02693522, "epoch": 0.3880955959717421, "flos": 20340307128960.0, "grad_norm": 2.029110970619929, "language_loss": 0.77489239, "learning_rate": 2.7990148017820694e-06, "loss": 0.79669499, "num_input_tokens_seen": 138580200, "step": 6455, "time_per_iteration": 2.7688205242156982 }, { "auxiliary_loss_clip": 0.01131496, "auxiliary_loss_mlp": 0.01043781, "balance_loss_clip": 1.04975748, "balance_loss_mlp": 1.02897501, "epoch": 0.38815571922441006, "flos": 23075407215360.0, "grad_norm": 1.8133016626985128, "language_loss": 0.76193333, "learning_rate": 2.798657755439662e-06, "loss": 0.78368604, "num_input_tokens_seen": 138598315, "step": 6456, "time_per_iteration": 2.6894283294677734 }, { "auxiliary_loss_clip": 0.01059894, "auxiliary_loss_mlp": 0.01038862, "balance_loss_clip": 1.04251969, "balance_loss_mlp": 1.02365136, "epoch": 0.388215842477078, "flos": 20776944856320.0, "grad_norm": 9.416859416659493, "language_loss": 0.59422505, "learning_rate": 2.7983006788120726e-06, "loss": 0.61521268, "num_input_tokens_seen": 138615695, "step": 6457, "time_per_iteration": 2.8189444541931152 }, { "auxiliary_loss_clip": 0.01136561, "auxiliary_loss_mlp": 0.01039144, "balance_loss_clip": 1.04989612, "balance_loss_mlp": 1.02262187, "epoch": 0.388275965729746, "flos": 20448182649600.0, "grad_norm": 2.336997181985419, "language_loss": 0.79927063, "learning_rate": 2.797943571912841e-06, "loss": 0.82102776, "num_input_tokens_seen": 138633180, "step": 6458, "time_per_iteration": 2.66198992729187 }, { "auxiliary_loss_clip": 0.01081764, "auxiliary_loss_mlp": 0.0104529, "balance_loss_clip": 1.04428816, "balance_loss_mlp": 1.02855277, "epoch": 0.38833608898241395, "flos": 27892392606720.0, "grad_norm": 2.218973373394608, "language_loss": 0.81497735, "learning_rate": 2.797586434755509e-06, "loss": 0.83624792, "num_input_tokens_seen": 138654785, "step": 6459, "time_per_iteration": 2.780120611190796 }, { "auxiliary_loss_clip": 0.01105714, "auxiliary_loss_mlp": 0.01037251, "balance_loss_clip": 1.04633725, "balance_loss_mlp": 1.0236907, "epoch": 0.3883962122350819, "flos": 18076390675200.0, "grad_norm": 1.942341955564712, "language_loss": 0.62001127, "learning_rate": 2.7972292673536202e-06, "loss": 0.64144087, "num_input_tokens_seen": 138673330, "step": 6460, "time_per_iteration": 2.625399112701416 }, { "auxiliary_loss_clip": 0.01120569, "auxiliary_loss_mlp": 0.01032391, "balance_loss_clip": 1.04955411, "balance_loss_mlp": 1.01920033, "epoch": 0.3884563354877499, "flos": 23622254847360.0, "grad_norm": 1.928823237011181, "language_loss": 0.86226058, "learning_rate": 2.796872069720717e-06, "loss": 0.88379019, "num_input_tokens_seen": 138694185, "step": 6461, "time_per_iteration": 2.6901583671569824 }, { "auxiliary_loss_clip": 0.0111976, "auxiliary_loss_mlp": 0.01038779, "balance_loss_clip": 1.04810238, "balance_loss_mlp": 1.0244205, "epoch": 0.38851645874041785, "flos": 27453528236160.0, "grad_norm": 4.963760229824091, "language_loss": 0.70659202, "learning_rate": 2.7965148418703456e-06, "loss": 0.72817743, "num_input_tokens_seen": 138714625, "step": 6462, "time_per_iteration": 2.7463371753692627 }, { "auxiliary_loss_clip": 0.01086013, "auxiliary_loss_mlp": 0.01043745, "balance_loss_clip": 1.04045033, "balance_loss_mlp": 1.02786636, "epoch": 0.3885765819930858, "flos": 25228072270080.0, "grad_norm": 2.747031306466439, "language_loss": 0.76228201, "learning_rate": 2.796157583816052e-06, "loss": 0.78357965, "num_input_tokens_seen": 138733585, "step": 6463, "time_per_iteration": 2.7231578826904297 }, { "auxiliary_loss_clip": 0.01103201, "auxiliary_loss_mlp": 0.0104459, "balance_loss_clip": 1.05013013, "balance_loss_mlp": 1.02841353, "epoch": 0.3886367052457538, "flos": 16946605221120.0, "grad_norm": 3.605418601568306, "language_loss": 0.70244539, "learning_rate": 2.795800295571382e-06, "loss": 0.72392333, "num_input_tokens_seen": 138752335, "step": 6464, "time_per_iteration": 2.773066759109497 }, { "auxiliary_loss_clip": 0.01110861, "auxiliary_loss_mlp": 0.01037039, "balance_loss_clip": 1.04950786, "balance_loss_mlp": 1.02211452, "epoch": 0.38869682849842174, "flos": 27154140376320.0, "grad_norm": 2.8184770761764777, "language_loss": 0.69632983, "learning_rate": 2.7954429771498858e-06, "loss": 0.71780872, "num_input_tokens_seen": 138768450, "step": 6465, "time_per_iteration": 2.7013487815856934 }, { "auxiliary_loss_clip": 0.01097351, "auxiliary_loss_mlp": 0.01041748, "balance_loss_clip": 1.04837847, "balance_loss_mlp": 1.02645373, "epoch": 0.3887569517510897, "flos": 21063619301760.0, "grad_norm": 2.665243237814177, "language_loss": 0.78489739, "learning_rate": 2.7950856285651117e-06, "loss": 0.80628836, "num_input_tokens_seen": 138786775, "step": 6466, "time_per_iteration": 2.736819267272949 }, { "auxiliary_loss_clip": 0.01095374, "auxiliary_loss_mlp": 0.01037568, "balance_loss_clip": 1.0463171, "balance_loss_mlp": 1.02242851, "epoch": 0.38881707500375773, "flos": 29497384016640.0, "grad_norm": 1.6522613533538497, "language_loss": 0.69341898, "learning_rate": 2.794728249830611e-06, "loss": 0.71474838, "num_input_tokens_seen": 138810100, "step": 6467, "time_per_iteration": 2.778083324432373 }, { "auxiliary_loss_clip": 0.01098114, "auxiliary_loss_mlp": 0.01048152, "balance_loss_clip": 1.04706931, "balance_loss_mlp": 1.0326246, "epoch": 0.3888771982564257, "flos": 17488281294720.0, "grad_norm": 3.2276382920067817, "language_loss": 0.84199375, "learning_rate": 2.794370840959936e-06, "loss": 0.86345637, "num_input_tokens_seen": 138825140, "step": 6468, "time_per_iteration": 2.6842098236083984 }, { "auxiliary_loss_clip": 0.01108569, "auxiliary_loss_mlp": 0.01036235, "balance_loss_clip": 1.048172, "balance_loss_mlp": 1.0227766, "epoch": 0.38893732150909366, "flos": 21942425450880.0, "grad_norm": 1.8219377355536144, "language_loss": 0.84232908, "learning_rate": 2.7940134019666383e-06, "loss": 0.86377716, "num_input_tokens_seen": 138844115, "step": 6469, "time_per_iteration": 2.7538135051727295 }, { "auxiliary_loss_clip": 0.0109067, "auxiliary_loss_mlp": 0.01048288, "balance_loss_clip": 1.04416847, "balance_loss_mlp": 1.03205132, "epoch": 0.3889974447617616, "flos": 24276367468800.0, "grad_norm": 2.339210402911935, "language_loss": 0.75173676, "learning_rate": 2.793655932864273e-06, "loss": 0.7731263, "num_input_tokens_seen": 138860860, "step": 6470, "time_per_iteration": 2.7425949573516846 }, { "auxiliary_loss_clip": 0.01095528, "auxiliary_loss_mlp": 0.00772188, "balance_loss_clip": 1.0480423, "balance_loss_mlp": 1.00016475, "epoch": 0.3890575680144296, "flos": 25667116208640.0, "grad_norm": 1.5943716760052937, "language_loss": 0.74977577, "learning_rate": 2.7932984336663953e-06, "loss": 0.76845288, "num_input_tokens_seen": 138881910, "step": 6471, "time_per_iteration": 2.8880369663238525 }, { "auxiliary_loss_clip": 0.01077518, "auxiliary_loss_mlp": 0.01049277, "balance_loss_clip": 1.03879571, "balance_loss_mlp": 1.03336215, "epoch": 0.38911769126709755, "flos": 22855274714880.0, "grad_norm": 2.421548050110463, "language_loss": 0.67984551, "learning_rate": 2.792940904386562e-06, "loss": 0.70111346, "num_input_tokens_seen": 138900975, "step": 6472, "time_per_iteration": 2.7776875495910645 }, { "auxiliary_loss_clip": 0.01103596, "auxiliary_loss_mlp": 0.01043152, "balance_loss_clip": 1.04819107, "balance_loss_mlp": 1.02974129, "epoch": 0.3891778145197655, "flos": 25447522412160.0, "grad_norm": 1.8102352941433608, "language_loss": 0.76068687, "learning_rate": 2.7925833450383293e-06, "loss": 0.78215432, "num_input_tokens_seen": 138920795, "step": 6473, "time_per_iteration": 2.7568469047546387 }, { "auxiliary_loss_clip": 0.01113975, "auxiliary_loss_mlp": 0.01046096, "balance_loss_clip": 1.05217087, "balance_loss_mlp": 1.03031242, "epoch": 0.3892379377724335, "flos": 14027965614720.0, "grad_norm": 2.045216735434868, "language_loss": 0.70959115, "learning_rate": 2.792225755635257e-06, "loss": 0.73119187, "num_input_tokens_seen": 138938770, "step": 6474, "time_per_iteration": 2.6930696964263916 }, { "auxiliary_loss_clip": 0.01135028, "auxiliary_loss_mlp": 0.01042055, "balance_loss_clip": 1.05145836, "balance_loss_mlp": 1.02861369, "epoch": 0.38929806102510145, "flos": 20157449967360.0, "grad_norm": 1.5519949793695216, "language_loss": 0.69049072, "learning_rate": 2.7918681361909046e-06, "loss": 0.71226156, "num_input_tokens_seen": 138958880, "step": 6475, "time_per_iteration": 2.670830011367798 }, { "auxiliary_loss_clip": 0.01110637, "auxiliary_loss_mlp": 0.01057592, "balance_loss_clip": 1.04578567, "balance_loss_mlp": 1.03981757, "epoch": 0.3893581842777694, "flos": 22163958581760.0, "grad_norm": 1.9596553320764234, "language_loss": 0.75820196, "learning_rate": 2.7915104867188332e-06, "loss": 0.77988434, "num_input_tokens_seen": 138977240, "step": 6476, "time_per_iteration": 2.683980941772461 }, { "auxiliary_loss_clip": 0.01039888, "auxiliary_loss_mlp": 0.01002183, "balance_loss_clip": 1.02862918, "balance_loss_mlp": 1.00084782, "epoch": 0.3894183075304374, "flos": 67301877392640.0, "grad_norm": 0.7759740468574157, "language_loss": 0.58146399, "learning_rate": 2.7911528072326055e-06, "loss": 0.60188472, "num_input_tokens_seen": 139039035, "step": 6477, "time_per_iteration": 3.2430496215820312 }, { "auxiliary_loss_clip": 0.01092497, "auxiliary_loss_mlp": 0.01040603, "balance_loss_clip": 1.04780793, "balance_loss_mlp": 1.02428961, "epoch": 0.38947843078310534, "flos": 18547502480640.0, "grad_norm": 1.9073891309950948, "language_loss": 0.78554142, "learning_rate": 2.7907950977457832e-06, "loss": 0.80687243, "num_input_tokens_seen": 139055560, "step": 6478, "time_per_iteration": 4.241156339645386 }, { "auxiliary_loss_clip": 0.01116081, "auxiliary_loss_mlp": 0.0103975, "balance_loss_clip": 1.04505491, "balance_loss_mlp": 1.02545047, "epoch": 0.3895385540357733, "flos": 14605875532800.0, "grad_norm": 2.6992371438810783, "language_loss": 0.82647753, "learning_rate": 2.7904373582719317e-06, "loss": 0.84803581, "num_input_tokens_seen": 139071865, "step": 6479, "time_per_iteration": 4.1569294929504395 }, { "auxiliary_loss_clip": 0.01131381, "auxiliary_loss_mlp": 0.01036344, "balance_loss_clip": 1.04886651, "balance_loss_mlp": 1.02161551, "epoch": 0.38959867728844133, "flos": 19975203336960.0, "grad_norm": 2.334048099077096, "language_loss": 0.79657412, "learning_rate": 2.790079588824617e-06, "loss": 0.81825137, "num_input_tokens_seen": 139089640, "step": 6480, "time_per_iteration": 4.170635938644409 }, { "auxiliary_loss_clip": 0.0110471, "auxiliary_loss_mlp": 0.01032466, "balance_loss_clip": 1.04561472, "balance_loss_mlp": 1.01822066, "epoch": 0.3896588005411093, "flos": 22672130244480.0, "grad_norm": 6.364109786330533, "language_loss": 0.83021134, "learning_rate": 2.7897217894174038e-06, "loss": 0.85158312, "num_input_tokens_seen": 139109365, "step": 6481, "time_per_iteration": 2.638821840286255 }, { "auxiliary_loss_clip": 0.01102815, "auxiliary_loss_mlp": 0.01038843, "balance_loss_clip": 1.04740214, "balance_loss_mlp": 1.02503228, "epoch": 0.38971892379377726, "flos": 20996035862400.0, "grad_norm": 1.7002276765936415, "language_loss": 0.75389051, "learning_rate": 2.789363960063863e-06, "loss": 0.77530706, "num_input_tokens_seen": 139128260, "step": 6482, "time_per_iteration": 2.5737624168395996 }, { "auxiliary_loss_clip": 0.01100553, "auxiliary_loss_mlp": 0.01035815, "balance_loss_clip": 1.04781246, "balance_loss_mlp": 1.02164662, "epoch": 0.3897790470464452, "flos": 22528487756160.0, "grad_norm": 2.0703094316503554, "language_loss": 0.78786725, "learning_rate": 2.78900610077756e-06, "loss": 0.80923092, "num_input_tokens_seen": 139147315, "step": 6483, "time_per_iteration": 2.6177117824554443 }, { "auxiliary_loss_clip": 0.01121516, "auxiliary_loss_mlp": 0.01030702, "balance_loss_clip": 1.04790664, "balance_loss_mlp": 1.01487088, "epoch": 0.3898391702991132, "flos": 26209905603840.0, "grad_norm": 1.6677367088018817, "language_loss": 0.79871929, "learning_rate": 2.788648211572067e-06, "loss": 0.82024151, "num_input_tokens_seen": 139167270, "step": 6484, "time_per_iteration": 4.221461534500122 }, { "auxiliary_loss_clip": 0.01119394, "auxiliary_loss_mlp": 0.01051487, "balance_loss_clip": 1.05063844, "balance_loss_mlp": 1.03472662, "epoch": 0.38989929355178116, "flos": 21065558636160.0, "grad_norm": 2.1008000508061104, "language_loss": 0.77901775, "learning_rate": 2.7882902924609557e-06, "loss": 0.80072653, "num_input_tokens_seen": 139185970, "step": 6485, "time_per_iteration": 2.664097785949707 }, { "auxiliary_loss_clip": 0.01085813, "auxiliary_loss_mlp": 0.01036912, "balance_loss_clip": 1.0427084, "balance_loss_mlp": 1.02207613, "epoch": 0.3899594168044491, "flos": 25484115392640.0, "grad_norm": 6.223818029706007, "language_loss": 0.85190272, "learning_rate": 2.7879323434577965e-06, "loss": 0.87312996, "num_input_tokens_seen": 139203730, "step": 6486, "time_per_iteration": 2.8325467109680176 }, { "auxiliary_loss_clip": 0.01111569, "auxiliary_loss_mlp": 0.01033392, "balance_loss_clip": 1.04786611, "balance_loss_mlp": 1.01883638, "epoch": 0.3900195400571171, "flos": 31139363456640.0, "grad_norm": 2.4250185390770618, "language_loss": 0.85333234, "learning_rate": 2.7875743645761645e-06, "loss": 0.87478197, "num_input_tokens_seen": 139222560, "step": 6487, "time_per_iteration": 2.8390486240386963 }, { "auxiliary_loss_clip": 0.01103222, "auxiliary_loss_mlp": 0.01032994, "balance_loss_clip": 1.04449213, "balance_loss_mlp": 1.01793766, "epoch": 0.39007966330978505, "flos": 20229917656320.0, "grad_norm": 1.5390409302603854, "language_loss": 0.72954559, "learning_rate": 2.787216355829633e-06, "loss": 0.75090778, "num_input_tokens_seen": 139242165, "step": 6488, "time_per_iteration": 2.7613236904144287 }, { "auxiliary_loss_clip": 0.01096805, "auxiliary_loss_mlp": 0.01044873, "balance_loss_clip": 1.04673266, "balance_loss_mlp": 1.02771914, "epoch": 0.390139786562453, "flos": 22528739151360.0, "grad_norm": 2.6420160637986383, "language_loss": 0.68467176, "learning_rate": 2.786858317231779e-06, "loss": 0.70608854, "num_input_tokens_seen": 139262525, "step": 6489, "time_per_iteration": 2.746307849884033 }, { "auxiliary_loss_clip": 0.01108111, "auxiliary_loss_mlp": 0.01041602, "balance_loss_clip": 1.04793715, "balance_loss_mlp": 1.02673674, "epoch": 0.390199909815121, "flos": 26432911192320.0, "grad_norm": 1.6912118236512272, "language_loss": 0.80629271, "learning_rate": 2.7865002487961788e-06, "loss": 0.82778984, "num_input_tokens_seen": 139282835, "step": 6490, "time_per_iteration": 2.7116847038269043 }, { "auxiliary_loss_clip": 0.01124963, "auxiliary_loss_mlp": 0.01033045, "balance_loss_clip": 1.04856181, "balance_loss_mlp": 1.0187161, "epoch": 0.39026003306778895, "flos": 17274577328640.0, "grad_norm": 3.073568327903315, "language_loss": 0.89115125, "learning_rate": 2.7861421505364104e-06, "loss": 0.91273135, "num_input_tokens_seen": 139299490, "step": 6491, "time_per_iteration": 2.6211190223693848 }, { "auxiliary_loss_clip": 0.01092029, "auxiliary_loss_mlp": 0.01045074, "balance_loss_clip": 1.04406416, "balance_loss_mlp": 1.02952874, "epoch": 0.3903201563204569, "flos": 24532841554560.0, "grad_norm": 1.8064559635296407, "language_loss": 0.78637981, "learning_rate": 2.7857840224660523e-06, "loss": 0.80775088, "num_input_tokens_seen": 139317865, "step": 6492, "time_per_iteration": 2.7505667209625244 }, { "auxiliary_loss_clip": 0.01108778, "auxiliary_loss_mlp": 0.01041967, "balance_loss_clip": 1.04486537, "balance_loss_mlp": 1.02735257, "epoch": 0.39038027957312493, "flos": 23767944410880.0, "grad_norm": 1.7227367696506604, "language_loss": 0.74431908, "learning_rate": 2.7854258645986857e-06, "loss": 0.76582652, "num_input_tokens_seen": 139339840, "step": 6493, "time_per_iteration": 2.7200233936309814 }, { "auxiliary_loss_clip": 0.01091358, "auxiliary_loss_mlp": 0.01040258, "balance_loss_clip": 1.04613161, "balance_loss_mlp": 1.02549398, "epoch": 0.3904404028257929, "flos": 14100612871680.0, "grad_norm": 2.9656676182999395, "language_loss": 0.7637316, "learning_rate": 2.7850676769478916e-06, "loss": 0.78504777, "num_input_tokens_seen": 139357555, "step": 6494, "time_per_iteration": 2.6818442344665527 }, { "auxiliary_loss_clip": 0.01131498, "auxiliary_loss_mlp": 0.01048378, "balance_loss_clip": 1.0500524, "balance_loss_mlp": 1.03182006, "epoch": 0.39050052607846086, "flos": 16910048154240.0, "grad_norm": 2.1152980497113782, "language_loss": 0.74208486, "learning_rate": 2.7847094595272525e-06, "loss": 0.76388359, "num_input_tokens_seen": 139374455, "step": 6495, "time_per_iteration": 2.6432337760925293 }, { "auxiliary_loss_clip": 0.01137243, "auxiliary_loss_mlp": 0.01045454, "balance_loss_clip": 1.05153751, "balance_loss_mlp": 1.02913451, "epoch": 0.39056064933112883, "flos": 25915761129600.0, "grad_norm": 2.402575660392066, "language_loss": 0.67757058, "learning_rate": 2.784351212350352e-06, "loss": 0.69939756, "num_input_tokens_seen": 139394770, "step": 6496, "time_per_iteration": 2.762009859085083 }, { "auxiliary_loss_clip": 0.01023856, "auxiliary_loss_mlp": 0.01010625, "balance_loss_clip": 1.02393842, "balance_loss_mlp": 1.00925446, "epoch": 0.3906207725837968, "flos": 60028421713920.0, "grad_norm": 0.6655460592599327, "language_loss": 0.53920811, "learning_rate": 2.783992935430775e-06, "loss": 0.55955297, "num_input_tokens_seen": 139454760, "step": 6497, "time_per_iteration": 3.351006507873535 }, { "auxiliary_loss_clip": 0.01094838, "auxiliary_loss_mlp": 0.00772151, "balance_loss_clip": 1.0476501, "balance_loss_mlp": 1.00038421, "epoch": 0.39068089583646476, "flos": 21068683119360.0, "grad_norm": 2.7558428999232847, "language_loss": 0.6865977, "learning_rate": 2.7836346287821068e-06, "loss": 0.70526755, "num_input_tokens_seen": 139472645, "step": 6498, "time_per_iteration": 2.7838692665100098 }, { "auxiliary_loss_clip": 0.01022021, "auxiliary_loss_mlp": 0.01009741, "balance_loss_clip": 1.02064919, "balance_loss_mlp": 1.00839996, "epoch": 0.3907410190891327, "flos": 70445677403520.0, "grad_norm": 0.7248596102007157, "language_loss": 0.51767612, "learning_rate": 2.783276292417936e-06, "loss": 0.53799379, "num_input_tokens_seen": 139536730, "step": 6499, "time_per_iteration": 3.2980377674102783 }, { "auxiliary_loss_clip": 0.01122618, "auxiliary_loss_mlp": 0.01044387, "balance_loss_clip": 1.04676056, "balance_loss_mlp": 1.02793658, "epoch": 0.3908011423418007, "flos": 27962454084480.0, "grad_norm": 1.973185164339423, "language_loss": 0.73842579, "learning_rate": 2.7829179263518487e-06, "loss": 0.76009583, "num_input_tokens_seen": 139557540, "step": 6500, "time_per_iteration": 2.7239198684692383 }, { "auxiliary_loss_clip": 0.01125366, "auxiliary_loss_mlp": 0.01037256, "balance_loss_clip": 1.05035591, "balance_loss_mlp": 1.02246249, "epoch": 0.39086126559446865, "flos": 24462097718400.0, "grad_norm": 2.6021512056662814, "language_loss": 0.68837166, "learning_rate": 2.7825595305974354e-06, "loss": 0.70999795, "num_input_tokens_seen": 139576875, "step": 6501, "time_per_iteration": 2.6926429271698 }, { "auxiliary_loss_clip": 0.01122637, "auxiliary_loss_mlp": 0.0103859, "balance_loss_clip": 1.04832482, "balance_loss_mlp": 1.02442181, "epoch": 0.3909213888471366, "flos": 16941541403520.0, "grad_norm": 2.1384909443348246, "language_loss": 0.78875881, "learning_rate": 2.782201105168287e-06, "loss": 0.8103711, "num_input_tokens_seen": 139594295, "step": 6502, "time_per_iteration": 2.647021770477295 }, { "auxiliary_loss_clip": 0.01109811, "auxiliary_loss_mlp": 0.01035328, "balance_loss_clip": 1.04876852, "balance_loss_mlp": 1.02171457, "epoch": 0.3909815120998046, "flos": 29278400751360.0, "grad_norm": 3.671996146003432, "language_loss": 0.80537987, "learning_rate": 2.7818426500779932e-06, "loss": 0.82683128, "num_input_tokens_seen": 139614080, "step": 6503, "time_per_iteration": 2.7318384647369385 }, { "auxiliary_loss_clip": 0.0110371, "auxiliary_loss_mlp": 0.01031248, "balance_loss_clip": 1.04387689, "balance_loss_mlp": 1.01760423, "epoch": 0.39104163535247255, "flos": 18951246328320.0, "grad_norm": 1.848076786389183, "language_loss": 0.71439689, "learning_rate": 2.7814841653401485e-06, "loss": 0.7357465, "num_input_tokens_seen": 139632755, "step": 6504, "time_per_iteration": 2.6983554363250732 }, { "auxiliary_loss_clip": 0.01130195, "auxiliary_loss_mlp": 0.01034576, "balance_loss_clip": 1.0459981, "balance_loss_mlp": 1.0199374, "epoch": 0.3911017586051405, "flos": 26323347732480.0, "grad_norm": 1.4848516480735832, "language_loss": 0.83245611, "learning_rate": 2.7811256509683454e-06, "loss": 0.8541038, "num_input_tokens_seen": 139654205, "step": 6505, "time_per_iteration": 2.6663267612457275 }, { "auxiliary_loss_clip": 0.01131259, "auxiliary_loss_mlp": 0.01036964, "balance_loss_clip": 1.04880178, "balance_loss_mlp": 1.02123427, "epoch": 0.3911618818578085, "flos": 21835770992640.0, "grad_norm": 1.9330872564568533, "language_loss": 0.71352887, "learning_rate": 2.7807671069761797e-06, "loss": 0.73521107, "num_input_tokens_seen": 139673595, "step": 6506, "time_per_iteration": 2.6168534755706787 }, { "auxiliary_loss_clip": 0.01105925, "auxiliary_loss_mlp": 0.01036209, "balance_loss_clip": 1.04536867, "balance_loss_mlp": 1.02267289, "epoch": 0.3912220051104765, "flos": 16359680989440.0, "grad_norm": 2.106647299507305, "language_loss": 0.75086504, "learning_rate": 2.7804085333772477e-06, "loss": 0.77228636, "num_input_tokens_seen": 139690565, "step": 6507, "time_per_iteration": 2.8207101821899414 }, { "auxiliary_loss_clip": 0.01053146, "auxiliary_loss_mlp": 0.01002126, "balance_loss_clip": 1.02403712, "balance_loss_mlp": 1.00068331, "epoch": 0.39128212836314447, "flos": 71050986420480.0, "grad_norm": 0.9386901837185221, "language_loss": 0.56488812, "learning_rate": 2.7800499301851446e-06, "loss": 0.58544087, "num_input_tokens_seen": 139749420, "step": 6508, "time_per_iteration": 3.3985793590545654 }, { "auxiliary_loss_clip": 0.01121659, "auxiliary_loss_mlp": 0.01038464, "balance_loss_clip": 1.05045915, "balance_loss_mlp": 1.02476096, "epoch": 0.39134225161581243, "flos": 20331975173760.0, "grad_norm": 2.0207920703954936, "language_loss": 0.76855135, "learning_rate": 2.779691297413471e-06, "loss": 0.79015261, "num_input_tokens_seen": 139766265, "step": 6509, "time_per_iteration": 2.6667048931121826 }, { "auxiliary_loss_clip": 0.01101334, "auxiliary_loss_mlp": 0.01043985, "balance_loss_clip": 1.04298568, "balance_loss_mlp": 1.02731967, "epoch": 0.3914023748684804, "flos": 17018390551680.0, "grad_norm": 5.905968065437354, "language_loss": 0.82739937, "learning_rate": 2.779332635075825e-06, "loss": 0.84885252, "num_input_tokens_seen": 139782400, "step": 6510, "time_per_iteration": 2.933931589126587 }, { "auxiliary_loss_clip": 0.0112259, "auxiliary_loss_mlp": 0.01038677, "balance_loss_clip": 1.04712081, "balance_loss_mlp": 1.02406788, "epoch": 0.39146249812114836, "flos": 18405224709120.0, "grad_norm": 5.781106582003857, "language_loss": 0.76999253, "learning_rate": 2.7789739431858073e-06, "loss": 0.79160517, "num_input_tokens_seen": 139801435, "step": 6511, "time_per_iteration": 2.6926233768463135 }, { "auxiliary_loss_clip": 0.01035867, "auxiliary_loss_mlp": 0.01006458, "balance_loss_clip": 1.02583003, "balance_loss_mlp": 1.00515223, "epoch": 0.3915226213738163, "flos": 67637355442560.0, "grad_norm": 0.716551875912138, "language_loss": 0.57749176, "learning_rate": 2.7786152217570196e-06, "loss": 0.59791505, "num_input_tokens_seen": 139869700, "step": 6512, "time_per_iteration": 3.3695731163024902 }, { "auxiliary_loss_clip": 0.01135844, "auxiliary_loss_mlp": 0.01035397, "balance_loss_clip": 1.05013657, "balance_loss_mlp": 1.02001858, "epoch": 0.3915827446264843, "flos": 26359330181760.0, "grad_norm": 1.8014676974175234, "language_loss": 0.69625974, "learning_rate": 2.7782564708030647e-06, "loss": 0.71797216, "num_input_tokens_seen": 139890140, "step": 6513, "time_per_iteration": 2.8037526607513428 }, { "auxiliary_loss_clip": 0.01095461, "auxiliary_loss_mlp": 0.01038913, "balance_loss_clip": 1.04791474, "balance_loss_mlp": 1.02376771, "epoch": 0.39164286787915226, "flos": 21943897908480.0, "grad_norm": 8.577901504868834, "language_loss": 0.75566119, "learning_rate": 2.7778976903375464e-06, "loss": 0.77700496, "num_input_tokens_seen": 139908020, "step": 6514, "time_per_iteration": 2.8419485092163086 }, { "auxiliary_loss_clip": 0.01094835, "auxiliary_loss_mlp": 0.01040327, "balance_loss_clip": 1.04639578, "balance_loss_mlp": 1.02636766, "epoch": 0.3917029911318202, "flos": 16399829416320.0, "grad_norm": 2.188170768945522, "language_loss": 0.77334291, "learning_rate": 2.7775388803740693e-06, "loss": 0.79469454, "num_input_tokens_seen": 139926180, "step": 6515, "time_per_iteration": 2.7894155979156494 }, { "auxiliary_loss_clip": 0.01087017, "auxiliary_loss_mlp": 0.0105158, "balance_loss_clip": 1.03979194, "balance_loss_mlp": 1.03763223, "epoch": 0.3917631143844882, "flos": 26211701283840.0, "grad_norm": 1.5088395946363757, "language_loss": 0.79678488, "learning_rate": 2.7771800409262406e-06, "loss": 0.81817091, "num_input_tokens_seen": 139947420, "step": 6516, "time_per_iteration": 2.902660608291626 }, { "auxiliary_loss_clip": 0.01092649, "auxiliary_loss_mlp": 0.01042434, "balance_loss_clip": 1.04691982, "balance_loss_mlp": 1.02799749, "epoch": 0.39182323763715615, "flos": 18548364407040.0, "grad_norm": 1.9539461980584907, "language_loss": 0.70539331, "learning_rate": 2.7768211720076665e-06, "loss": 0.72674412, "num_input_tokens_seen": 139965800, "step": 6517, "time_per_iteration": 4.275412082672119 }, { "auxiliary_loss_clip": 0.0108795, "auxiliary_loss_mlp": 0.01045392, "balance_loss_clip": 1.04107618, "balance_loss_mlp": 1.03034759, "epoch": 0.3918833608898241, "flos": 34313543395200.0, "grad_norm": 1.7270068216094292, "language_loss": 0.72215492, "learning_rate": 2.776462273631956e-06, "loss": 0.74348831, "num_input_tokens_seen": 139988140, "step": 6518, "time_per_iteration": 4.390907287597656 }, { "auxiliary_loss_clip": 0.01124647, "auxiliary_loss_mlp": 0.0104138, "balance_loss_clip": 1.05179489, "balance_loss_mlp": 1.02679503, "epoch": 0.3919434841424921, "flos": 36939582812160.0, "grad_norm": 1.8265438315676477, "language_loss": 0.61835045, "learning_rate": 2.7761033458127177e-06, "loss": 0.64001071, "num_input_tokens_seen": 140010060, "step": 6519, "time_per_iteration": 4.281017780303955 }, { "auxiliary_loss_clip": 0.01142133, "auxiliary_loss_mlp": 0.01043415, "balance_loss_clip": 1.05199361, "balance_loss_mlp": 1.02807307, "epoch": 0.3920036073951601, "flos": 23508956373120.0, "grad_norm": 2.723028929016538, "language_loss": 0.67084813, "learning_rate": 2.775744388563563e-06, "loss": 0.6927036, "num_input_tokens_seen": 140029400, "step": 6520, "time_per_iteration": 2.6971800327301025 }, { "auxiliary_loss_clip": 0.01130641, "auxiliary_loss_mlp": 0.01040483, "balance_loss_clip": 1.04749501, "balance_loss_mlp": 1.02648759, "epoch": 0.39206373064782807, "flos": 18406086635520.0, "grad_norm": 1.8214273138880266, "language_loss": 0.78716481, "learning_rate": 2.775385401898104e-06, "loss": 0.80887604, "num_input_tokens_seen": 140048940, "step": 6521, "time_per_iteration": 2.69966459274292 }, { "auxiliary_loss_clip": 0.01128458, "auxiliary_loss_mlp": 0.01040156, "balance_loss_clip": 1.05050826, "balance_loss_mlp": 1.02289462, "epoch": 0.39212385390049603, "flos": 12313051608960.0, "grad_norm": 2.9673341059873897, "language_loss": 0.70119011, "learning_rate": 2.775026385829952e-06, "loss": 0.72287625, "num_input_tokens_seen": 140066380, "step": 6522, "time_per_iteration": 2.7100417613983154 }, { "auxiliary_loss_clip": 0.0110971, "auxiliary_loss_mlp": 0.01035612, "balance_loss_clip": 1.0467701, "balance_loss_mlp": 1.02100325, "epoch": 0.392183977153164, "flos": 19719160214400.0, "grad_norm": 2.0481488550445595, "language_loss": 0.76847959, "learning_rate": 2.774667340372722e-06, "loss": 0.78993279, "num_input_tokens_seen": 140085275, "step": 6523, "time_per_iteration": 4.336375713348389 }, { "auxiliary_loss_clip": 0.01111577, "auxiliary_loss_mlp": 0.01040964, "balance_loss_clip": 1.04617906, "balance_loss_mlp": 1.02597904, "epoch": 0.39224410040583196, "flos": 33144902403840.0, "grad_norm": 2.4064695780458254, "language_loss": 0.62052447, "learning_rate": 2.7743082655400293e-06, "loss": 0.64204991, "num_input_tokens_seen": 140105105, "step": 6524, "time_per_iteration": 2.861999750137329 }, { "auxiliary_loss_clip": 0.0113421, "auxiliary_loss_mlp": 0.01041444, "balance_loss_clip": 1.04792655, "balance_loss_mlp": 1.02591681, "epoch": 0.39230422365849993, "flos": 27782434097280.0, "grad_norm": 3.311294983146634, "language_loss": 0.74027938, "learning_rate": 2.773949161345489e-06, "loss": 0.76203597, "num_input_tokens_seen": 140125645, "step": 6525, "time_per_iteration": 2.6660265922546387 }, { "auxiliary_loss_clip": 0.01111123, "auxiliary_loss_mlp": 0.01038937, "balance_loss_clip": 1.04621911, "balance_loss_mlp": 1.02488267, "epoch": 0.3923643469111679, "flos": 17931634865280.0, "grad_norm": 1.9378599466790423, "language_loss": 0.81101322, "learning_rate": 2.773590027802719e-06, "loss": 0.83251387, "num_input_tokens_seen": 140141925, "step": 6526, "time_per_iteration": 2.6949198246002197 }, { "auxiliary_loss_clip": 0.01122115, "auxiliary_loss_mlp": 0.01043128, "balance_loss_clip": 1.04750228, "balance_loss_mlp": 1.02844119, "epoch": 0.39242447016383586, "flos": 24059539019520.0, "grad_norm": 2.21390394508072, "language_loss": 0.69860446, "learning_rate": 2.7732308649253383e-06, "loss": 0.72025692, "num_input_tokens_seen": 140160965, "step": 6527, "time_per_iteration": 2.648738384246826 }, { "auxiliary_loss_clip": 0.01093845, "auxiliary_loss_mlp": 0.01034532, "balance_loss_clip": 1.04563034, "balance_loss_mlp": 1.01990485, "epoch": 0.3924845934165038, "flos": 10664069016960.0, "grad_norm": 2.870547931880311, "language_loss": 0.82659566, "learning_rate": 2.772871672726965e-06, "loss": 0.84787941, "num_input_tokens_seen": 140177780, "step": 6528, "time_per_iteration": 2.7436537742614746 }, { "auxiliary_loss_clip": 0.01105744, "auxiliary_loss_mlp": 0.01032675, "balance_loss_clip": 1.04709864, "balance_loss_mlp": 1.01909113, "epoch": 0.3925447166691718, "flos": 31245910174080.0, "grad_norm": 1.7012894335018593, "language_loss": 0.68846285, "learning_rate": 2.7725124512212205e-06, "loss": 0.70984709, "num_input_tokens_seen": 140201660, "step": 6529, "time_per_iteration": 2.7932794094085693 }, { "auxiliary_loss_clip": 0.01112194, "auxiliary_loss_mlp": 0.01035865, "balance_loss_clip": 1.04500198, "balance_loss_mlp": 1.02043366, "epoch": 0.39260483992183975, "flos": 29415040087680.0, "grad_norm": 2.4176127237752145, "language_loss": 0.80461496, "learning_rate": 2.7721532004217267e-06, "loss": 0.82609558, "num_input_tokens_seen": 140218585, "step": 6530, "time_per_iteration": 2.7094242572784424 }, { "auxiliary_loss_clip": 0.01119536, "auxiliary_loss_mlp": 0.0104093, "balance_loss_clip": 1.04586959, "balance_loss_mlp": 1.0264107, "epoch": 0.3926649631745077, "flos": 22857788666880.0, "grad_norm": 1.6828565274400475, "language_loss": 0.75680822, "learning_rate": 2.7717939203421063e-06, "loss": 0.77841288, "num_input_tokens_seen": 140239905, "step": 6531, "time_per_iteration": 2.7238411903381348 }, { "auxiliary_loss_clip": 0.01058847, "auxiliary_loss_mlp": 0.01008064, "balance_loss_clip": 1.03009987, "balance_loss_mlp": 1.00663972, "epoch": 0.3927250864271757, "flos": 63893881872000.0, "grad_norm": 0.8211432271778524, "language_loss": 0.60317427, "learning_rate": 2.7714346109959822e-06, "loss": 0.62384337, "num_input_tokens_seen": 140293820, "step": 6532, "time_per_iteration": 3.047954797744751 }, { "auxiliary_loss_clip": 0.01037233, "auxiliary_loss_mlp": 0.01004719, "balance_loss_clip": 1.02873898, "balance_loss_mlp": 1.00334251, "epoch": 0.3927852096798437, "flos": 68909741890560.0, "grad_norm": 0.7803139799058858, "language_loss": 0.55459583, "learning_rate": 2.771075272396981e-06, "loss": 0.57501537, "num_input_tokens_seen": 140360420, "step": 6533, "time_per_iteration": 3.306561231613159 }, { "auxiliary_loss_clip": 0.01112553, "auxiliary_loss_mlp": 0.01040733, "balance_loss_clip": 1.04983759, "balance_loss_mlp": 1.02614141, "epoch": 0.39284533293251167, "flos": 29715972232320.0, "grad_norm": 2.2467248181922232, "language_loss": 0.75955313, "learning_rate": 2.7707159045587284e-06, "loss": 0.78108597, "num_input_tokens_seen": 140381950, "step": 6534, "time_per_iteration": 2.7788329124450684 }, { "auxiliary_loss_clip": 0.0112134, "auxiliary_loss_mlp": 0.01045716, "balance_loss_clip": 1.04698312, "balance_loss_mlp": 1.02866912, "epoch": 0.39290545618517964, "flos": 18552027594240.0, "grad_norm": 2.2080736338994944, "language_loss": 0.78123498, "learning_rate": 2.770356507494851e-06, "loss": 0.80290556, "num_input_tokens_seen": 140399410, "step": 6535, "time_per_iteration": 2.6949005126953125 }, { "auxiliary_loss_clip": 0.0109337, "auxiliary_loss_mlp": 0.0103265, "balance_loss_clip": 1.04779291, "balance_loss_mlp": 1.01950169, "epoch": 0.3929655794378476, "flos": 26249479413120.0, "grad_norm": 1.9769476518607105, "language_loss": 0.686719, "learning_rate": 2.769997081218978e-06, "loss": 0.7079792, "num_input_tokens_seen": 140419055, "step": 6536, "time_per_iteration": 2.7684245109558105 }, { "auxiliary_loss_clip": 0.01104946, "auxiliary_loss_mlp": 0.01037851, "balance_loss_clip": 1.04767156, "balance_loss_mlp": 1.02469027, "epoch": 0.39302570269051557, "flos": 29277933874560.0, "grad_norm": 1.8012856746153256, "language_loss": 0.69048655, "learning_rate": 2.769637625744738e-06, "loss": 0.71191454, "num_input_tokens_seen": 140438800, "step": 6537, "time_per_iteration": 2.7638440132141113 }, { "auxiliary_loss_clip": 0.01122897, "auxiliary_loss_mlp": 0.01040751, "balance_loss_clip": 1.05155134, "balance_loss_mlp": 1.02624357, "epoch": 0.39308582594318353, "flos": 17347440067200.0, "grad_norm": 1.7514361880438423, "language_loss": 0.78990901, "learning_rate": 2.769278141085763e-06, "loss": 0.81154549, "num_input_tokens_seen": 140456880, "step": 6538, "time_per_iteration": 2.635075807571411 }, { "auxiliary_loss_clip": 0.01003397, "auxiliary_loss_mlp": 0.01017351, "balance_loss_clip": 1.02259159, "balance_loss_mlp": 1.01596797, "epoch": 0.3931459491958515, "flos": 61007094650880.0, "grad_norm": 0.8098068956453415, "language_loss": 0.6190061, "learning_rate": 2.768918627255683e-06, "loss": 0.63921356, "num_input_tokens_seen": 140507510, "step": 6539, "time_per_iteration": 3.0673203468322754 }, { "auxiliary_loss_clip": 0.01104217, "auxiliary_loss_mlp": 0.0103537, "balance_loss_clip": 1.04730296, "balance_loss_mlp": 1.0206002, "epoch": 0.39320607244851946, "flos": 39016009249920.0, "grad_norm": 3.0347619755245248, "language_loss": 0.68405032, "learning_rate": 2.7685590842681315e-06, "loss": 0.70544618, "num_input_tokens_seen": 140528740, "step": 6540, "time_per_iteration": 2.7993643283843994 }, { "auxiliary_loss_clip": 0.01105128, "auxiliary_loss_mlp": 0.01030736, "balance_loss_clip": 1.04439306, "balance_loss_mlp": 1.01638293, "epoch": 0.3932661957011874, "flos": 24679752180480.0, "grad_norm": 1.8325322608278536, "language_loss": 0.7276125, "learning_rate": 2.7681995121367433e-06, "loss": 0.74897116, "num_input_tokens_seen": 140547560, "step": 6541, "time_per_iteration": 2.659224510192871 }, { "auxiliary_loss_clip": 0.01054751, "auxiliary_loss_mlp": 0.01009472, "balance_loss_clip": 1.02648139, "balance_loss_mlp": 1.0080775, "epoch": 0.3933263189538554, "flos": 70096552185600.0, "grad_norm": 0.8313029932067456, "language_loss": 0.60319722, "learning_rate": 2.7678399108751516e-06, "loss": 0.6238395, "num_input_tokens_seen": 140601175, "step": 6542, "time_per_iteration": 2.968062400817871 }, { "auxiliary_loss_clip": 0.01121623, "auxiliary_loss_mlp": 0.01038302, "balance_loss_clip": 1.04764903, "balance_loss_mlp": 1.0243547, "epoch": 0.39338644220652336, "flos": 22929071207040.0, "grad_norm": 1.6209695943494522, "language_loss": 0.82034504, "learning_rate": 2.7674802804969947e-06, "loss": 0.84194422, "num_input_tokens_seen": 140622200, "step": 6543, "time_per_iteration": 2.638796806335449 }, { "auxiliary_loss_clip": 0.01103923, "auxiliary_loss_mlp": 0.01034902, "balance_loss_clip": 1.04355097, "balance_loss_mlp": 1.02045417, "epoch": 0.3934465654591913, "flos": 30848163897600.0, "grad_norm": 3.743075543188527, "language_loss": 0.69100285, "learning_rate": 2.767120621015908e-06, "loss": 0.71239114, "num_input_tokens_seen": 140643125, "step": 6544, "time_per_iteration": 2.7180936336517334 }, { "auxiliary_loss_clip": 0.01112442, "auxiliary_loss_mlp": 0.01047198, "balance_loss_clip": 1.04659534, "balance_loss_mlp": 1.0316174, "epoch": 0.3935066887118593, "flos": 29236528471680.0, "grad_norm": 2.0996268311737976, "language_loss": 0.76072371, "learning_rate": 2.76676093244553e-06, "loss": 0.78232014, "num_input_tokens_seen": 140662500, "step": 6545, "time_per_iteration": 2.7429869174957275 }, { "auxiliary_loss_clip": 0.01091051, "auxiliary_loss_mlp": 0.01033724, "balance_loss_clip": 1.04633403, "balance_loss_mlp": 1.02104044, "epoch": 0.3935668119645273, "flos": 19135288638720.0, "grad_norm": 1.7673371756448844, "language_loss": 0.74672133, "learning_rate": 2.7664012147995015e-06, "loss": 0.76796907, "num_input_tokens_seen": 140681960, "step": 6546, "time_per_iteration": 2.6785295009613037 }, { "auxiliary_loss_clip": 0.01109428, "auxiliary_loss_mlp": 0.0103425, "balance_loss_clip": 1.04903293, "balance_loss_mlp": 1.01946843, "epoch": 0.3936269352171953, "flos": 18516116972160.0, "grad_norm": 1.9230817449169166, "language_loss": 0.81627518, "learning_rate": 2.7660414680914617e-06, "loss": 0.83771199, "num_input_tokens_seen": 140699170, "step": 6547, "time_per_iteration": 2.638214588165283 }, { "auxiliary_loss_clip": 0.01114598, "auxiliary_loss_mlp": 0.00772919, "balance_loss_clip": 1.04404151, "balance_loss_mlp": 1.00032711, "epoch": 0.39368705846986324, "flos": 15632813370240.0, "grad_norm": 1.9821442562566327, "language_loss": 0.84406352, "learning_rate": 2.7656816923350525e-06, "loss": 0.86293864, "num_input_tokens_seen": 140714920, "step": 6548, "time_per_iteration": 2.6490747928619385 }, { "auxiliary_loss_clip": 0.01118074, "auxiliary_loss_mlp": 0.00771091, "balance_loss_clip": 1.04686236, "balance_loss_mlp": 1.00034189, "epoch": 0.3937471817225312, "flos": 21325839563520.0, "grad_norm": 1.7617733187332765, "language_loss": 0.7311933, "learning_rate": 2.7653218875439174e-06, "loss": 0.75008494, "num_input_tokens_seen": 140734595, "step": 6549, "time_per_iteration": 2.635380983352661 }, { "auxiliary_loss_clip": 0.01071621, "auxiliary_loss_mlp": 0.01042928, "balance_loss_clip": 1.0444963, "balance_loss_mlp": 1.0259527, "epoch": 0.39380730497519917, "flos": 20776693461120.0, "grad_norm": 2.774519883144605, "language_loss": 0.77592897, "learning_rate": 2.764962053731699e-06, "loss": 0.7970745, "num_input_tokens_seen": 140754050, "step": 6550, "time_per_iteration": 2.733921527862549 }, { "auxiliary_loss_clip": 0.01095205, "auxiliary_loss_mlp": 0.01030728, "balance_loss_clip": 1.04455531, "balance_loss_mlp": 1.01674485, "epoch": 0.39386742822786713, "flos": 21609784575360.0, "grad_norm": 3.1837220930493517, "language_loss": 0.81144142, "learning_rate": 2.7646021909120434e-06, "loss": 0.83270073, "num_input_tokens_seen": 140771440, "step": 6551, "time_per_iteration": 2.851475238800049 }, { "auxiliary_loss_clip": 0.01117625, "auxiliary_loss_mlp": 0.01036299, "balance_loss_clip": 1.0443331, "balance_loss_mlp": 1.02188659, "epoch": 0.3939275514805351, "flos": 12414642249600.0, "grad_norm": 12.177431380433415, "language_loss": 0.80449802, "learning_rate": 2.764242299098596e-06, "loss": 0.82603723, "num_input_tokens_seen": 140786715, "step": 6552, "time_per_iteration": 2.667344570159912 }, { "auxiliary_loss_clip": 0.01133223, "auxiliary_loss_mlp": 0.01043273, "balance_loss_clip": 1.04791522, "balance_loss_mlp": 1.02883697, "epoch": 0.39398767473320306, "flos": 18552027594240.0, "grad_norm": 2.002210962432939, "language_loss": 0.71199149, "learning_rate": 2.763882378305003e-06, "loss": 0.73375642, "num_input_tokens_seen": 140804950, "step": 6553, "time_per_iteration": 2.6329705715179443 }, { "auxiliary_loss_clip": 0.0111827, "auxiliary_loss_mlp": 0.0077145, "balance_loss_clip": 1.04818738, "balance_loss_mlp": 1.00036502, "epoch": 0.39404779798587103, "flos": 29308888419840.0, "grad_norm": 4.200797737547303, "language_loss": 0.64058566, "learning_rate": 2.7635224285449144e-06, "loss": 0.65948284, "num_input_tokens_seen": 140822800, "step": 6554, "time_per_iteration": 2.7190303802490234 }, { "auxiliary_loss_clip": 0.01109713, "auxiliary_loss_mlp": 0.01041117, "balance_loss_clip": 1.04655266, "balance_loss_mlp": 1.02747416, "epoch": 0.394107921238539, "flos": 34897055834880.0, "grad_norm": 2.186636266316066, "language_loss": 0.78957009, "learning_rate": 2.7631624498319796e-06, "loss": 0.81107843, "num_input_tokens_seen": 140842940, "step": 6555, "time_per_iteration": 2.7675819396972656 }, { "auxiliary_loss_clip": 0.01102424, "auxiliary_loss_mlp": 0.0104302, "balance_loss_clip": 1.04469514, "balance_loss_mlp": 1.02758873, "epoch": 0.39416804449120696, "flos": 25081413039360.0, "grad_norm": 1.7945119387028163, "language_loss": 0.71689165, "learning_rate": 2.7628024421798473e-06, "loss": 0.7383461, "num_input_tokens_seen": 140863060, "step": 6556, "time_per_iteration": 4.261122703552246 }, { "auxiliary_loss_clip": 0.01129248, "auxiliary_loss_mlp": 0.01031706, "balance_loss_clip": 1.0445503, "balance_loss_mlp": 1.01749015, "epoch": 0.3942281677438749, "flos": 32306639731200.0, "grad_norm": 1.7970895618407805, "language_loss": 0.84080362, "learning_rate": 2.7624424056021705e-06, "loss": 0.86241317, "num_input_tokens_seen": 140883795, "step": 6557, "time_per_iteration": 2.7031610012054443 }, { "auxiliary_loss_clip": 0.01116561, "auxiliary_loss_mlp": 0.01032116, "balance_loss_clip": 1.04790783, "balance_loss_mlp": 1.01810956, "epoch": 0.3942882909965429, "flos": 24936621315840.0, "grad_norm": 3.8501140650976238, "language_loss": 0.806759, "learning_rate": 2.7620823401126004e-06, "loss": 0.82824582, "num_input_tokens_seen": 140903055, "step": 6558, "time_per_iteration": 5.6523637771606445 }, { "auxiliary_loss_clip": 0.01130051, "auxiliary_loss_mlp": 0.01035884, "balance_loss_clip": 1.04807055, "balance_loss_mlp": 1.02238965, "epoch": 0.39434841424921085, "flos": 11874797769600.0, "grad_norm": 1.8974962376031472, "language_loss": 0.70930403, "learning_rate": 2.761722245724792e-06, "loss": 0.73096335, "num_input_tokens_seen": 140920685, "step": 6559, "time_per_iteration": 2.6645302772521973 }, { "auxiliary_loss_clip": 0.01113668, "auxiliary_loss_mlp": 0.0104073, "balance_loss_clip": 1.04660964, "balance_loss_mlp": 1.02452326, "epoch": 0.3944085375018789, "flos": 16361620323840.0, "grad_norm": 2.3002241644217865, "language_loss": 0.80355662, "learning_rate": 2.7613621224524003e-06, "loss": 0.82510054, "num_input_tokens_seen": 140937320, "step": 6560, "time_per_iteration": 2.8372745513916016 }, { "auxiliary_loss_clip": 0.01109469, "auxiliary_loss_mlp": 0.0103941, "balance_loss_clip": 1.04681468, "balance_loss_mlp": 1.02334619, "epoch": 0.39446866075454684, "flos": 10633365866880.0, "grad_norm": 2.2192317359233034, "language_loss": 0.828062, "learning_rate": 2.7610019703090803e-06, "loss": 0.84955078, "num_input_tokens_seen": 140954855, "step": 6561, "time_per_iteration": 2.6724014282226562 }, { "auxiliary_loss_clip": 0.01119263, "auxiliary_loss_mlp": 0.01043889, "balance_loss_clip": 1.04620779, "balance_loss_mlp": 1.02972126, "epoch": 0.3945287840072148, "flos": 18187498419840.0, "grad_norm": 2.478683034492453, "language_loss": 0.80985552, "learning_rate": 2.7606417893084887e-06, "loss": 0.83148706, "num_input_tokens_seen": 140973250, "step": 6562, "time_per_iteration": 4.211291074752808 }, { "auxiliary_loss_clip": 0.01100981, "auxiliary_loss_mlp": 0.01040375, "balance_loss_clip": 1.04367661, "balance_loss_mlp": 1.02568245, "epoch": 0.39458890725988277, "flos": 23039891642880.0, "grad_norm": 1.8396668644534004, "language_loss": 0.81574059, "learning_rate": 2.7602815794642853e-06, "loss": 0.83715415, "num_input_tokens_seen": 140993050, "step": 6563, "time_per_iteration": 2.6933205127716064 }, { "auxiliary_loss_clip": 0.01078578, "auxiliary_loss_mlp": 0.01052866, "balance_loss_clip": 1.03979552, "balance_loss_mlp": 1.03385234, "epoch": 0.39464903051255074, "flos": 17159052211200.0, "grad_norm": 2.4284687703059276, "language_loss": 0.69678622, "learning_rate": 2.759921340790127e-06, "loss": 0.71810067, "num_input_tokens_seen": 141010815, "step": 6564, "time_per_iteration": 2.7754619121551514 }, { "auxiliary_loss_clip": 0.01119553, "auxiliary_loss_mlp": 0.01037847, "balance_loss_clip": 1.04547322, "balance_loss_mlp": 1.02260029, "epoch": 0.3947091537652187, "flos": 15889000147200.0, "grad_norm": 2.342409184231709, "language_loss": 0.82842124, "learning_rate": 2.759561073299676e-06, "loss": 0.84999526, "num_input_tokens_seen": 141028720, "step": 6565, "time_per_iteration": 2.652029037475586 }, { "auxiliary_loss_clip": 0.01091527, "auxiliary_loss_mlp": 0.01044097, "balance_loss_clip": 1.04201448, "balance_loss_mlp": 1.02794445, "epoch": 0.39476927701788667, "flos": 18545491319040.0, "grad_norm": 1.8313066364371182, "language_loss": 0.83458865, "learning_rate": 2.7592007770065937e-06, "loss": 0.85594487, "num_input_tokens_seen": 141046025, "step": 6566, "time_per_iteration": 2.6853299140930176 }, { "auxiliary_loss_clip": 0.01137834, "auxiliary_loss_mlp": 0.01036947, "balance_loss_clip": 1.04882693, "balance_loss_mlp": 1.02146816, "epoch": 0.39482940027055463, "flos": 22275712771200.0, "grad_norm": 2.7953182854439973, "language_loss": 0.77462149, "learning_rate": 2.7588404519245403e-06, "loss": 0.79636931, "num_input_tokens_seen": 141066865, "step": 6567, "time_per_iteration": 2.6695878505706787 }, { "auxiliary_loss_clip": 0.01114738, "auxiliary_loss_mlp": 0.01037774, "balance_loss_clip": 1.04457474, "balance_loss_mlp": 1.0235877, "epoch": 0.3948895235232226, "flos": 14757634494720.0, "grad_norm": 3.2000391748281065, "language_loss": 0.80752146, "learning_rate": 2.758480098067182e-06, "loss": 0.82904655, "num_input_tokens_seen": 141084210, "step": 6568, "time_per_iteration": 2.6126980781555176 }, { "auxiliary_loss_clip": 0.01100656, "auxiliary_loss_mlp": 0.01035941, "balance_loss_clip": 1.04693437, "balance_loss_mlp": 1.02142143, "epoch": 0.39494964677589056, "flos": 22565763095040.0, "grad_norm": 3.155903507973262, "language_loss": 0.846977, "learning_rate": 2.7581197154481816e-06, "loss": 0.868343, "num_input_tokens_seen": 141103895, "step": 6569, "time_per_iteration": 2.731241464614868 }, { "auxiliary_loss_clip": 0.01076285, "auxiliary_loss_mlp": 0.01045444, "balance_loss_clip": 1.046417, "balance_loss_mlp": 1.03076911, "epoch": 0.3950097700285585, "flos": 22963186149120.0, "grad_norm": 2.966787083651573, "language_loss": 0.74931526, "learning_rate": 2.7577593040812066e-06, "loss": 0.77053261, "num_input_tokens_seen": 141124000, "step": 6570, "time_per_iteration": 2.816168785095215 }, { "auxiliary_loss_clip": 0.01093382, "auxiliary_loss_mlp": 0.01037489, "balance_loss_clip": 1.04271865, "balance_loss_mlp": 1.02224803, "epoch": 0.3950698932812265, "flos": 20595236929920.0, "grad_norm": 3.807643490882315, "language_loss": 0.80009687, "learning_rate": 2.757398863979922e-06, "loss": 0.82140559, "num_input_tokens_seen": 141142535, "step": 6571, "time_per_iteration": 2.7444143295288086 }, { "auxiliary_loss_clip": 0.0110309, "auxiliary_loss_mlp": 0.01042438, "balance_loss_clip": 1.046592, "balance_loss_mlp": 1.02792382, "epoch": 0.39513001653389446, "flos": 20375786787840.0, "grad_norm": 2.0513494110156105, "language_loss": 0.77667749, "learning_rate": 2.757038395157997e-06, "loss": 0.79813272, "num_input_tokens_seen": 141161575, "step": 6572, "time_per_iteration": 2.787951946258545 }, { "auxiliary_loss_clip": 0.01096298, "auxiliary_loss_mlp": 0.01039178, "balance_loss_clip": 1.04524946, "balance_loss_mlp": 1.02422285, "epoch": 0.3951901397865625, "flos": 26463650256000.0, "grad_norm": 2.2233910711840092, "language_loss": 0.74710405, "learning_rate": 2.7566778976291002e-06, "loss": 0.76845872, "num_input_tokens_seen": 141181150, "step": 6573, "time_per_iteration": 2.8065271377563477 }, { "auxiliary_loss_clip": 0.01119667, "auxiliary_loss_mlp": 0.01033875, "balance_loss_clip": 1.04583275, "balance_loss_mlp": 1.02073228, "epoch": 0.39525026303923044, "flos": 43838345767680.0, "grad_norm": 1.5702623893020402, "language_loss": 0.681665, "learning_rate": 2.7563173714069017e-06, "loss": 0.7032004, "num_input_tokens_seen": 141206310, "step": 6574, "time_per_iteration": 2.917938470840454 }, { "auxiliary_loss_clip": 0.01066027, "auxiliary_loss_mlp": 0.01046829, "balance_loss_clip": 1.03601551, "balance_loss_mlp": 1.02941298, "epoch": 0.3953103862918984, "flos": 18040803275520.0, "grad_norm": 11.51359836007049, "language_loss": 0.71934754, "learning_rate": 2.755956816505072e-06, "loss": 0.74047613, "num_input_tokens_seen": 141223925, "step": 6575, "time_per_iteration": 2.8125574588775635 }, { "auxiliary_loss_clip": 0.01106625, "auxiliary_loss_mlp": 0.01044084, "balance_loss_clip": 1.04328454, "balance_loss_mlp": 1.02871156, "epoch": 0.3953705095445664, "flos": 16976015481600.0, "grad_norm": 2.3082458130711276, "language_loss": 0.73497486, "learning_rate": 2.7555962329372845e-06, "loss": 0.75648189, "num_input_tokens_seen": 141239010, "step": 6576, "time_per_iteration": 2.7072994709014893 }, { "auxiliary_loss_clip": 0.01131853, "auxiliary_loss_mlp": 0.01038072, "balance_loss_clip": 1.04721868, "balance_loss_mlp": 1.02482772, "epoch": 0.39543063279723434, "flos": 17411144837760.0, "grad_norm": 2.584581612312142, "language_loss": 0.83806884, "learning_rate": 2.7552356207172124e-06, "loss": 0.85976809, "num_input_tokens_seen": 141252255, "step": 6577, "time_per_iteration": 2.673980236053467 }, { "auxiliary_loss_clip": 0.01108115, "auxiliary_loss_mlp": 0.01038249, "balance_loss_clip": 1.04473734, "balance_loss_mlp": 1.02394366, "epoch": 0.3954907560499023, "flos": 22784207656320.0, "grad_norm": 3.282604232183532, "language_loss": 0.90597945, "learning_rate": 2.75487497985853e-06, "loss": 0.92744309, "num_input_tokens_seen": 141269325, "step": 6578, "time_per_iteration": 2.8357715606689453 }, { "auxiliary_loss_clip": 0.01113431, "auxiliary_loss_mlp": 0.01038042, "balance_loss_clip": 1.04971015, "balance_loss_mlp": 1.0215559, "epoch": 0.39555087930257027, "flos": 21944400698880.0, "grad_norm": 1.9328811386040925, "language_loss": 0.77836883, "learning_rate": 2.7545143103749117e-06, "loss": 0.7998836, "num_input_tokens_seen": 141288505, "step": 6579, "time_per_iteration": 2.78900146484375 }, { "auxiliary_loss_clip": 0.01080071, "auxiliary_loss_mlp": 0.01037596, "balance_loss_clip": 1.04296517, "balance_loss_mlp": 1.02181292, "epoch": 0.39561100255523823, "flos": 20404622430720.0, "grad_norm": 2.0515288557813705, "language_loss": 0.68375254, "learning_rate": 2.754153612280037e-06, "loss": 0.70492923, "num_input_tokens_seen": 141303680, "step": 6580, "time_per_iteration": 2.796602249145508 }, { "auxiliary_loss_clip": 0.01119101, "auxiliary_loss_mlp": 0.01031775, "balance_loss_clip": 1.04687381, "balance_loss_mlp": 1.01770234, "epoch": 0.3956711258079062, "flos": 27964572986880.0, "grad_norm": 5.6192422063497425, "language_loss": 0.58592093, "learning_rate": 2.7537928855875797e-06, "loss": 0.60742974, "num_input_tokens_seen": 141324090, "step": 6581, "time_per_iteration": 2.738732099533081 }, { "auxiliary_loss_clip": 0.0110807, "auxiliary_loss_mlp": 0.01047889, "balance_loss_clip": 1.04554892, "balance_loss_mlp": 1.03111625, "epoch": 0.39573124906057416, "flos": 14428297670400.0, "grad_norm": 1.840388254222325, "language_loss": 0.69687581, "learning_rate": 2.7534321303112224e-06, "loss": 0.71843535, "num_input_tokens_seen": 141342235, "step": 6582, "time_per_iteration": 2.74564790725708 }, { "auxiliary_loss_clip": 0.01132763, "auxiliary_loss_mlp": 0.0077198, "balance_loss_clip": 1.04670966, "balance_loss_mlp": 1.00066948, "epoch": 0.39579137231324213, "flos": 18733699607040.0, "grad_norm": 2.093309053458098, "language_loss": 0.76838243, "learning_rate": 2.753071346464642e-06, "loss": 0.78742981, "num_input_tokens_seen": 141361195, "step": 6583, "time_per_iteration": 2.6127665042877197 }, { "auxiliary_loss_clip": 0.01084294, "auxiliary_loss_mlp": 0.00772199, "balance_loss_clip": 1.04135418, "balance_loss_mlp": 1.00058353, "epoch": 0.3958514955659101, "flos": 17676417755520.0, "grad_norm": 2.422087879109688, "language_loss": 0.66005278, "learning_rate": 2.7527105340615207e-06, "loss": 0.67861772, "num_input_tokens_seen": 141378275, "step": 6584, "time_per_iteration": 2.8412790298461914 }, { "auxiliary_loss_clip": 0.0109769, "auxiliary_loss_mlp": 0.01042803, "balance_loss_clip": 1.04634333, "balance_loss_mlp": 1.02687716, "epoch": 0.39591161881857806, "flos": 29309103901440.0, "grad_norm": 7.452692779947077, "language_loss": 0.72775561, "learning_rate": 2.7523496931155413e-06, "loss": 0.74916053, "num_input_tokens_seen": 141396960, "step": 6585, "time_per_iteration": 2.8504436016082764 }, { "auxiliary_loss_clip": 0.0109915, "auxiliary_loss_mlp": 0.01041099, "balance_loss_clip": 1.04335117, "balance_loss_mlp": 1.02628136, "epoch": 0.3959717420712461, "flos": 25771831332480.0, "grad_norm": 1.8603715362450812, "language_loss": 0.73381901, "learning_rate": 2.7519888236403856e-06, "loss": 0.75522149, "num_input_tokens_seen": 141417320, "step": 6586, "time_per_iteration": 2.8426311016082764 }, { "auxiliary_loss_clip": 0.01101854, "auxiliary_loss_mlp": 0.0103792, "balance_loss_clip": 1.04255629, "balance_loss_mlp": 1.02266693, "epoch": 0.39603186532391405, "flos": 20923783655040.0, "grad_norm": 2.174728382433504, "language_loss": 0.71447468, "learning_rate": 2.7516279256497382e-06, "loss": 0.73587245, "num_input_tokens_seen": 141435985, "step": 6587, "time_per_iteration": 2.7798478603363037 }, { "auxiliary_loss_clip": 0.01007869, "auxiliary_loss_mlp": 0.01003214, "balance_loss_clip": 1.02249742, "balance_loss_mlp": 1.00195026, "epoch": 0.396091988576582, "flos": 54880986176640.0, "grad_norm": 0.9478406102040471, "language_loss": 0.61186492, "learning_rate": 2.751266999157285e-06, "loss": 0.63197577, "num_input_tokens_seen": 141486075, "step": 6588, "time_per_iteration": 3.1663742065429688 }, { "auxiliary_loss_clip": 0.0110963, "auxiliary_loss_mlp": 0.00772247, "balance_loss_clip": 1.04547548, "balance_loss_mlp": 1.0006907, "epoch": 0.39615211182925, "flos": 20702896968960.0, "grad_norm": 3.1004380492305206, "language_loss": 0.81686854, "learning_rate": 2.7509060441767115e-06, "loss": 0.8356874, "num_input_tokens_seen": 141505280, "step": 6589, "time_per_iteration": 2.7711055278778076 }, { "auxiliary_loss_clip": 0.01106228, "auxiliary_loss_mlp": 0.01038149, "balance_loss_clip": 1.04562962, "balance_loss_mlp": 1.02241325, "epoch": 0.39621223508191794, "flos": 20994312009600.0, "grad_norm": 2.2429889858322802, "language_loss": 0.69913912, "learning_rate": 2.7505450607217057e-06, "loss": 0.72058284, "num_input_tokens_seen": 141523930, "step": 6590, "time_per_iteration": 2.793330669403076 }, { "auxiliary_loss_clip": 0.01117633, "auxiliary_loss_mlp": 0.01056421, "balance_loss_clip": 1.04669666, "balance_loss_mlp": 1.03980339, "epoch": 0.3962723583345859, "flos": 23368833417600.0, "grad_norm": 1.772211549409949, "language_loss": 0.75809395, "learning_rate": 2.750184048805956e-06, "loss": 0.77983451, "num_input_tokens_seen": 141541320, "step": 6591, "time_per_iteration": 2.7317981719970703 }, { "auxiliary_loss_clip": 0.01043506, "auxiliary_loss_mlp": 0.01049181, "balance_loss_clip": 1.03802264, "balance_loss_mlp": 1.03364813, "epoch": 0.39633248158725387, "flos": 25115599808640.0, "grad_norm": 2.064980952243903, "language_loss": 0.78466719, "learning_rate": 2.749823008443152e-06, "loss": 0.80559409, "num_input_tokens_seen": 141561880, "step": 6592, "time_per_iteration": 3.192194700241089 }, { "auxiliary_loss_clip": 0.01059924, "auxiliary_loss_mlp": 0.01033868, "balance_loss_clip": 1.03984666, "balance_loss_mlp": 1.01872826, "epoch": 0.39639260483992184, "flos": 39787622236800.0, "grad_norm": 1.9568402514544967, "language_loss": 0.69690341, "learning_rate": 2.7494619396469843e-06, "loss": 0.71784127, "num_input_tokens_seen": 141586460, "step": 6593, "time_per_iteration": 3.365752696990967 }, { "auxiliary_loss_clip": 0.01059564, "auxiliary_loss_mlp": 0.01046377, "balance_loss_clip": 1.03668404, "balance_loss_mlp": 1.03035569, "epoch": 0.3964527280925898, "flos": 17347045017600.0, "grad_norm": 1.624713370756075, "language_loss": 0.77905881, "learning_rate": 2.7491008424311452e-06, "loss": 0.80011821, "num_input_tokens_seen": 141605955, "step": 6594, "time_per_iteration": 2.890626907348633 }, { "auxiliary_loss_clip": 0.01025812, "auxiliary_loss_mlp": 0.01003509, "balance_loss_clip": 1.02550435, "balance_loss_mlp": 1.00200129, "epoch": 0.39651285134525777, "flos": 71717848369920.0, "grad_norm": 0.9363100872746896, "language_loss": 0.6304667, "learning_rate": 2.7487397168093265e-06, "loss": 0.65075988, "num_input_tokens_seen": 141673140, "step": 6595, "time_per_iteration": 3.3955094814300537 }, { "auxiliary_loss_clip": 0.01096586, "auxiliary_loss_mlp": 0.01055368, "balance_loss_clip": 1.0442034, "balance_loss_mlp": 1.03774858, "epoch": 0.39657297459792573, "flos": 25775710001280.0, "grad_norm": 2.5609780352809732, "language_loss": 0.63787287, "learning_rate": 2.748378562795223e-06, "loss": 0.65939242, "num_input_tokens_seen": 141692955, "step": 6596, "time_per_iteration": 4.60092568397522 }, { "auxiliary_loss_clip": 0.01120147, "auxiliary_loss_mlp": 0.010422, "balance_loss_clip": 1.04657853, "balance_loss_mlp": 1.02747798, "epoch": 0.3966330978505937, "flos": 20266115587200.0, "grad_norm": 2.0315739566024567, "language_loss": 0.79006839, "learning_rate": 2.7480173804025293e-06, "loss": 0.81169188, "num_input_tokens_seen": 141710680, "step": 6597, "time_per_iteration": 5.807824373245239 }, { "auxiliary_loss_clip": 0.01099639, "auxiliary_loss_mlp": 0.00773402, "balance_loss_clip": 1.04352951, "balance_loss_mlp": 1.00076032, "epoch": 0.39669322110326166, "flos": 20631183465600.0, "grad_norm": 2.966898609781474, "language_loss": 0.6772182, "learning_rate": 2.747656169644941e-06, "loss": 0.69594866, "num_input_tokens_seen": 141729860, "step": 6598, "time_per_iteration": 2.786884307861328 }, { "auxiliary_loss_clip": 0.01129462, "auxiliary_loss_mlp": 0.01041455, "balance_loss_clip": 1.04473436, "balance_loss_mlp": 1.02785325, "epoch": 0.3967533443559297, "flos": 21726063878400.0, "grad_norm": 2.1804433985902247, "language_loss": 0.79342777, "learning_rate": 2.747294930536157e-06, "loss": 0.81513697, "num_input_tokens_seen": 141749060, "step": 6599, "time_per_iteration": 2.6758370399475098 }, { "auxiliary_loss_clip": 0.01091573, "auxiliary_loss_mlp": 0.01041619, "balance_loss_clip": 1.04314208, "balance_loss_mlp": 1.02487051, "epoch": 0.39681346760859765, "flos": 25484151306240.0, "grad_norm": 2.279505844275463, "language_loss": 0.72878486, "learning_rate": 2.7469336630898737e-06, "loss": 0.75011677, "num_input_tokens_seen": 141769860, "step": 6600, "time_per_iteration": 2.7616889476776123 }, { "auxiliary_loss_clip": 0.01083152, "auxiliary_loss_mlp": 0.01037422, "balance_loss_clip": 1.03626251, "balance_loss_mlp": 1.0220201, "epoch": 0.3968735908612656, "flos": 20959586536320.0, "grad_norm": 2.0515710245603938, "language_loss": 0.85973942, "learning_rate": 2.746572367319791e-06, "loss": 0.88094509, "num_input_tokens_seen": 141788465, "step": 6601, "time_per_iteration": 2.755791664123535 }, { "auxiliary_loss_clip": 0.01095713, "auxiliary_loss_mlp": 0.01041964, "balance_loss_clip": 1.0429877, "balance_loss_mlp": 1.02468467, "epoch": 0.3969337141139336, "flos": 10707090531840.0, "grad_norm": 2.240549855963289, "language_loss": 0.70372766, "learning_rate": 2.7462110432396095e-06, "loss": 0.72510445, "num_input_tokens_seen": 141804955, "step": 6602, "time_per_iteration": 4.643726348876953 }, { "auxiliary_loss_clip": 0.01133428, "auxiliary_loss_mlp": 0.01047809, "balance_loss_clip": 1.04658508, "balance_loss_mlp": 1.03230548, "epoch": 0.39699383736660154, "flos": 17593714690560.0, "grad_norm": 3.7711392584572896, "language_loss": 0.83248609, "learning_rate": 2.7458496908630305e-06, "loss": 0.85429847, "num_input_tokens_seen": 141820025, "step": 6603, "time_per_iteration": 2.8909716606140137 }, { "auxiliary_loss_clip": 0.01112282, "auxiliary_loss_mlp": 0.01034339, "balance_loss_clip": 1.04651403, "balance_loss_mlp": 1.02003431, "epoch": 0.3970539606192695, "flos": 17785945301760.0, "grad_norm": 1.9508498227264648, "language_loss": 0.73302728, "learning_rate": 2.7454883102037563e-06, "loss": 0.75449347, "num_input_tokens_seen": 141838735, "step": 6604, "time_per_iteration": 2.828908920288086 }, { "auxiliary_loss_clip": 0.01105132, "auxiliary_loss_mlp": 0.01038476, "balance_loss_clip": 1.04384422, "balance_loss_mlp": 1.02364659, "epoch": 0.3971140838719375, "flos": 24789495208320.0, "grad_norm": 1.769953580879433, "language_loss": 0.82582277, "learning_rate": 2.745126901275491e-06, "loss": 0.84725887, "num_input_tokens_seen": 141858090, "step": 6605, "time_per_iteration": 2.6773502826690674 }, { "auxiliary_loss_clip": 0.01128613, "auxiliary_loss_mlp": 0.01033129, "balance_loss_clip": 1.04549098, "balance_loss_mlp": 1.01968801, "epoch": 0.39717420712460544, "flos": 24243581329920.0, "grad_norm": 1.4871941413504006, "language_loss": 0.73511499, "learning_rate": 2.7447654640919383e-06, "loss": 0.75673246, "num_input_tokens_seen": 141877540, "step": 6606, "time_per_iteration": 2.632805347442627 }, { "auxiliary_loss_clip": 0.01089285, "auxiliary_loss_mlp": 0.01048599, "balance_loss_clip": 1.0436089, "balance_loss_mlp": 1.03198171, "epoch": 0.3972343303772734, "flos": 25884698843520.0, "grad_norm": 2.092571399644939, "language_loss": 0.74296981, "learning_rate": 2.744403998666805e-06, "loss": 0.76434863, "num_input_tokens_seen": 141897315, "step": 6607, "time_per_iteration": 2.7277770042419434 }, { "auxiliary_loss_clip": 0.01124169, "auxiliary_loss_mlp": 0.01037393, "balance_loss_clip": 1.04697132, "balance_loss_mlp": 1.02267027, "epoch": 0.39729445362994137, "flos": 45623716300800.0, "grad_norm": 1.5196847129379933, "language_loss": 0.6787042, "learning_rate": 2.744042505013797e-06, "loss": 0.70031989, "num_input_tokens_seen": 141919580, "step": 6608, "time_per_iteration": 2.8229119777679443 }, { "auxiliary_loss_clip": 0.01094928, "auxiliary_loss_mlp": 0.01054175, "balance_loss_clip": 1.04091311, "balance_loss_mlp": 1.03580451, "epoch": 0.39735457688260933, "flos": 20193971120640.0, "grad_norm": 7.314681050409252, "language_loss": 0.74670005, "learning_rate": 2.7436809831466233e-06, "loss": 0.7681911, "num_input_tokens_seen": 141937045, "step": 6609, "time_per_iteration": 2.7502245903015137 }, { "auxiliary_loss_clip": 0.01107217, "auxiliary_loss_mlp": 0.01036058, "balance_loss_clip": 1.04354, "balance_loss_mlp": 1.02056026, "epoch": 0.3974147001352773, "flos": 23331163029120.0, "grad_norm": 1.742454501323656, "language_loss": 0.713238, "learning_rate": 2.7433194330789927e-06, "loss": 0.73467076, "num_input_tokens_seen": 141956695, "step": 6610, "time_per_iteration": 2.7225286960601807 }, { "auxiliary_loss_clip": 0.01105851, "auxiliary_loss_mlp": 0.01030068, "balance_loss_clip": 1.03818822, "balance_loss_mlp": 1.01509547, "epoch": 0.39747482338794526, "flos": 21688644885120.0, "grad_norm": 1.7960063460415152, "language_loss": 0.78151029, "learning_rate": 2.7429578548246133e-06, "loss": 0.8028695, "num_input_tokens_seen": 141975935, "step": 6611, "time_per_iteration": 2.6464622020721436 }, { "auxiliary_loss_clip": 0.01121213, "auxiliary_loss_mlp": 0.01038273, "balance_loss_clip": 1.04614162, "balance_loss_mlp": 1.0235095, "epoch": 0.3975349466406133, "flos": 30988717816320.0, "grad_norm": 1.7937788130001704, "language_loss": 0.7921629, "learning_rate": 2.7425962483971985e-06, "loss": 0.81375778, "num_input_tokens_seen": 141995750, "step": 6612, "time_per_iteration": 2.734950304031372 }, { "auxiliary_loss_clip": 0.01018209, "auxiliary_loss_mlp": 0.0100828, "balance_loss_clip": 1.02113628, "balance_loss_mlp": 1.00702214, "epoch": 0.39759506989328125, "flos": 63683948833920.0, "grad_norm": 0.8423760762856193, "language_loss": 0.64935088, "learning_rate": 2.742234613810459e-06, "loss": 0.66961575, "num_input_tokens_seen": 142057655, "step": 6613, "time_per_iteration": 3.1294105052948 }, { "auxiliary_loss_clip": 0.01097901, "auxiliary_loss_mlp": 0.01042526, "balance_loss_clip": 1.03916883, "balance_loss_mlp": 1.02507401, "epoch": 0.3976551931459492, "flos": 23695835857920.0, "grad_norm": 3.0444472336295636, "language_loss": 0.71508956, "learning_rate": 2.741872951078109e-06, "loss": 0.73649383, "num_input_tokens_seen": 142076020, "step": 6614, "time_per_iteration": 2.6479976177215576 }, { "auxiliary_loss_clip": 0.01116106, "auxiliary_loss_mlp": 0.01035284, "balance_loss_clip": 1.04503131, "balance_loss_mlp": 1.02034712, "epoch": 0.3977153163986172, "flos": 15669657745920.0, "grad_norm": 2.2927333729520885, "language_loss": 0.81362098, "learning_rate": 2.741511260213862e-06, "loss": 0.83513486, "num_input_tokens_seen": 142093790, "step": 6615, "time_per_iteration": 2.6567723751068115 }, { "auxiliary_loss_clip": 0.01094954, "auxiliary_loss_mlp": 0.01034601, "balance_loss_clip": 1.04491544, "balance_loss_mlp": 1.02023649, "epoch": 0.39777543965128515, "flos": 14064702249600.0, "grad_norm": 2.01024859105405, "language_loss": 0.67510247, "learning_rate": 2.741149541231434e-06, "loss": 0.69639802, "num_input_tokens_seen": 142110545, "step": 6616, "time_per_iteration": 2.6675400733947754 }, { "auxiliary_loss_clip": 0.01133654, "auxiliary_loss_mlp": 0.01043633, "balance_loss_clip": 1.04658771, "balance_loss_mlp": 1.02765918, "epoch": 0.3978355629039531, "flos": 23367468700800.0, "grad_norm": 2.3086733420735785, "language_loss": 0.83678514, "learning_rate": 2.740787794144541e-06, "loss": 0.85855806, "num_input_tokens_seen": 142128695, "step": 6617, "time_per_iteration": 2.5879552364349365 }, { "auxiliary_loss_clip": 0.01126085, "auxiliary_loss_mlp": 0.01039432, "balance_loss_clip": 1.04570735, "balance_loss_mlp": 1.02563334, "epoch": 0.3978956861566211, "flos": 19062785036160.0, "grad_norm": 1.7795732635152253, "language_loss": 0.72766519, "learning_rate": 2.7404260189669e-06, "loss": 0.74932027, "num_input_tokens_seen": 142148375, "step": 6618, "time_per_iteration": 2.613162040710449 }, { "auxiliary_loss_clip": 0.01111951, "auxiliary_loss_mlp": 0.01041983, "balance_loss_clip": 1.04827428, "balance_loss_mlp": 1.02544832, "epoch": 0.39795580940928904, "flos": 30227699341440.0, "grad_norm": 1.6960793445061386, "language_loss": 0.65858316, "learning_rate": 2.740064215712231e-06, "loss": 0.68012249, "num_input_tokens_seen": 142169735, "step": 6619, "time_per_iteration": 2.7474000453948975 }, { "auxiliary_loss_clip": 0.01052495, "auxiliary_loss_mlp": 0.01004058, "balance_loss_clip": 1.0230546, "balance_loss_mlp": 1.00270545, "epoch": 0.398015932661957, "flos": 69847224906240.0, "grad_norm": 0.7704475067145287, "language_loss": 0.58246851, "learning_rate": 2.7397023843942527e-06, "loss": 0.60303402, "num_input_tokens_seen": 142229520, "step": 6620, "time_per_iteration": 3.1400091648101807 }, { "auxiliary_loss_clip": 0.01113547, "auxiliary_loss_mlp": 0.0103675, "balance_loss_clip": 1.04998422, "balance_loss_mlp": 1.02314794, "epoch": 0.39807605591462497, "flos": 20157773189760.0, "grad_norm": 1.821199996328267, "language_loss": 0.7925806, "learning_rate": 2.739340525026686e-06, "loss": 0.81408358, "num_input_tokens_seen": 142247660, "step": 6621, "time_per_iteration": 2.7389161586761475 }, { "auxiliary_loss_clip": 0.0110802, "auxiliary_loss_mlp": 0.01034956, "balance_loss_clip": 1.04590595, "balance_loss_mlp": 1.02088952, "epoch": 0.39813617916729294, "flos": 21141761339520.0, "grad_norm": 1.899291394170355, "language_loss": 0.77800381, "learning_rate": 2.738978637623252e-06, "loss": 0.79943347, "num_input_tokens_seen": 142266990, "step": 6622, "time_per_iteration": 2.7175779342651367 }, { "auxiliary_loss_clip": 0.01101638, "auxiliary_loss_mlp": 0.01038721, "balance_loss_clip": 1.04108417, "balance_loss_mlp": 1.02377844, "epoch": 0.3981963024199609, "flos": 18988485753600.0, "grad_norm": 1.6278701941081761, "language_loss": 0.7497921, "learning_rate": 2.738616722197674e-06, "loss": 0.77119565, "num_input_tokens_seen": 142287170, "step": 6623, "time_per_iteration": 2.682567596435547 }, { "auxiliary_loss_clip": 0.01088304, "auxiliary_loss_mlp": 0.01040759, "balance_loss_clip": 1.04280734, "balance_loss_mlp": 1.02590537, "epoch": 0.39825642567262887, "flos": 16575108808320.0, "grad_norm": 2.4968757127264465, "language_loss": 0.79733497, "learning_rate": 2.7382547787636766e-06, "loss": 0.81862563, "num_input_tokens_seen": 142305405, "step": 6624, "time_per_iteration": 2.6878697872161865 }, { "auxiliary_loss_clip": 0.01135858, "auxiliary_loss_mlp": 0.01043783, "balance_loss_clip": 1.04792297, "balance_loss_mlp": 1.0270462, "epoch": 0.39831654892529683, "flos": 22199833290240.0, "grad_norm": 2.0557211895564884, "language_loss": 0.83616954, "learning_rate": 2.7378928073349832e-06, "loss": 0.85796595, "num_input_tokens_seen": 142322710, "step": 6625, "time_per_iteration": 2.5847036838531494 }, { "auxiliary_loss_clip": 0.011152, "auxiliary_loss_mlp": 0.01044436, "balance_loss_clip": 1.04585958, "balance_loss_mlp": 1.02948713, "epoch": 0.39837667217796485, "flos": 10487963612160.0, "grad_norm": 2.4120237094780377, "language_loss": 0.87324822, "learning_rate": 2.737530807925321e-06, "loss": 0.89484465, "num_input_tokens_seen": 142338535, "step": 6626, "time_per_iteration": 2.5845320224761963 }, { "auxiliary_loss_clip": 0.01067442, "auxiliary_loss_mlp": 0.00775778, "balance_loss_clip": 1.03995085, "balance_loss_mlp": 1.00066137, "epoch": 0.3984367954306328, "flos": 17965282930560.0, "grad_norm": 2.3324132294494797, "language_loss": 0.83462882, "learning_rate": 2.737168780548417e-06, "loss": 0.85306096, "num_input_tokens_seen": 142354570, "step": 6627, "time_per_iteration": 2.854428291320801 }, { "auxiliary_loss_clip": 0.01087071, "auxiliary_loss_mlp": 0.00771611, "balance_loss_clip": 1.04081798, "balance_loss_mlp": 1.00056684, "epoch": 0.3984969186833008, "flos": 22711057608960.0, "grad_norm": 1.4575889504047923, "language_loss": 0.82904339, "learning_rate": 2.736806725217998e-06, "loss": 0.84763026, "num_input_tokens_seen": 142374395, "step": 6628, "time_per_iteration": 2.772620916366577 }, { "auxiliary_loss_clip": 0.01092039, "auxiliary_loss_mlp": 0.01062711, "balance_loss_clip": 1.04402328, "balance_loss_mlp": 1.04652882, "epoch": 0.39855704193596875, "flos": 23405785534080.0, "grad_norm": 1.6631347026103094, "language_loss": 0.71145642, "learning_rate": 2.7364446419477945e-06, "loss": 0.73300385, "num_input_tokens_seen": 142396040, "step": 6629, "time_per_iteration": 2.681969165802002 }, { "auxiliary_loss_clip": 0.01097676, "auxiliary_loss_mlp": 0.01035809, "balance_loss_clip": 1.04695797, "balance_loss_mlp": 1.02136111, "epoch": 0.3986171651886367, "flos": 21251935330560.0, "grad_norm": 1.757569665448266, "language_loss": 0.80513418, "learning_rate": 2.7360825307515366e-06, "loss": 0.82646906, "num_input_tokens_seen": 142415495, "step": 6630, "time_per_iteration": 2.7747275829315186 }, { "auxiliary_loss_clip": 0.01072778, "auxiliary_loss_mlp": 0.01032526, "balance_loss_clip": 1.04495096, "balance_loss_mlp": 1.01805389, "epoch": 0.3986772884413047, "flos": 12458705258880.0, "grad_norm": 2.3833222910170857, "language_loss": 0.74846494, "learning_rate": 2.7357203916429555e-06, "loss": 0.76951796, "num_input_tokens_seen": 142431865, "step": 6631, "time_per_iteration": 2.8098866939544678 }, { "auxiliary_loss_clip": 0.01095184, "auxiliary_loss_mlp": 0.01040404, "balance_loss_clip": 1.04248333, "balance_loss_mlp": 1.02500248, "epoch": 0.39873741169397264, "flos": 19646117907840.0, "grad_norm": 2.096728163981437, "language_loss": 0.7160908, "learning_rate": 2.735358224635783e-06, "loss": 0.73744667, "num_input_tokens_seen": 142450595, "step": 6632, "time_per_iteration": 2.81479811668396 }, { "auxiliary_loss_clip": 0.01063774, "auxiliary_loss_mlp": 0.00771132, "balance_loss_clip": 1.04164338, "balance_loss_mlp": 1.00057721, "epoch": 0.3987975349466406, "flos": 21684766216320.0, "grad_norm": 2.0680050346702945, "language_loss": 0.7479074, "learning_rate": 2.7349960297437533e-06, "loss": 0.76625645, "num_input_tokens_seen": 142466650, "step": 6633, "time_per_iteration": 2.9533073902130127 }, { "auxiliary_loss_clip": 0.01105798, "auxiliary_loss_mlp": 0.01028668, "balance_loss_clip": 1.0465138, "balance_loss_mlp": 1.01509583, "epoch": 0.3988576581993086, "flos": 23914064937600.0, "grad_norm": 1.7626671777587215, "language_loss": 0.81420207, "learning_rate": 2.7346338069806e-06, "loss": 0.83554673, "num_input_tokens_seen": 142486165, "step": 6634, "time_per_iteration": 2.760012626647949 }, { "auxiliary_loss_clip": 0.0110458, "auxiliary_loss_mlp": 0.0103091, "balance_loss_clip": 1.04739153, "balance_loss_mlp": 1.01618731, "epoch": 0.39891778145197654, "flos": 18149899858560.0, "grad_norm": 2.495702621722643, "language_loss": 0.74914795, "learning_rate": 2.7342715563600597e-06, "loss": 0.77050287, "num_input_tokens_seen": 142505035, "step": 6635, "time_per_iteration": 4.225152015686035 }, { "auxiliary_loss_clip": 0.01101511, "auxiliary_loss_mlp": 0.01039121, "balance_loss_clip": 1.04791617, "balance_loss_mlp": 1.02265239, "epoch": 0.3989779047046445, "flos": 22595281096320.0, "grad_norm": 28.19463582214486, "language_loss": 0.66373086, "learning_rate": 2.733909277895868e-06, "loss": 0.68513715, "num_input_tokens_seen": 142521870, "step": 6636, "time_per_iteration": 4.455794811248779 }, { "auxiliary_loss_clip": 0.01118899, "auxiliary_loss_mlp": 0.01041724, "balance_loss_clip": 1.04681683, "balance_loss_mlp": 1.02687669, "epoch": 0.39903802795731247, "flos": 18077216688000.0, "grad_norm": 2.0422591411720723, "language_loss": 0.81318372, "learning_rate": 2.733546971601763e-06, "loss": 0.83478993, "num_input_tokens_seen": 142540455, "step": 6637, "time_per_iteration": 4.3843090534210205 }, { "auxiliary_loss_clip": 0.0102804, "auxiliary_loss_mlp": 0.01018728, "balance_loss_clip": 1.02743387, "balance_loss_mlp": 1.01694012, "epoch": 0.39909815120998043, "flos": 70441367771520.0, "grad_norm": 0.719892771757815, "language_loss": 0.53119934, "learning_rate": 2.733184637491484e-06, "loss": 0.55166698, "num_input_tokens_seen": 142599665, "step": 6638, "time_per_iteration": 3.2910361289978027 }, { "auxiliary_loss_clip": 0.01112783, "auxiliary_loss_mlp": 0.00772668, "balance_loss_clip": 1.04786587, "balance_loss_mlp": 1.00065207, "epoch": 0.39915827446264845, "flos": 18549262247040.0, "grad_norm": 1.6115719033099838, "language_loss": 0.75487578, "learning_rate": 2.732822275578769e-06, "loss": 0.77373028, "num_input_tokens_seen": 142618845, "step": 6639, "time_per_iteration": 2.7083969116210938 }, { "auxiliary_loss_clip": 0.0105821, "auxiliary_loss_mlp": 0.01036909, "balance_loss_clip": 1.03856301, "balance_loss_mlp": 1.022264, "epoch": 0.3992183977153164, "flos": 29897249195520.0, "grad_norm": 2.505539025941121, "language_loss": 0.76163709, "learning_rate": 2.7324598858773603e-06, "loss": 0.78258824, "num_input_tokens_seen": 142640885, "step": 6640, "time_per_iteration": 2.8801841735839844 }, { "auxiliary_loss_clip": 0.01102565, "auxiliary_loss_mlp": 0.01038995, "balance_loss_clip": 1.04663992, "balance_loss_mlp": 1.02430892, "epoch": 0.3992785209679844, "flos": 22565080736640.0, "grad_norm": 2.779199402703341, "language_loss": 0.81995392, "learning_rate": 2.7320974684009996e-06, "loss": 0.84136951, "num_input_tokens_seen": 142659340, "step": 6641, "time_per_iteration": 4.346608638763428 }, { "auxiliary_loss_clip": 0.01136449, "auxiliary_loss_mlp": 0.01038781, "balance_loss_clip": 1.05189252, "balance_loss_mlp": 1.02393353, "epoch": 0.39933864422065235, "flos": 19682674974720.0, "grad_norm": 2.1545130527280985, "language_loss": 0.76744998, "learning_rate": 2.7317350231634288e-06, "loss": 0.78920233, "num_input_tokens_seen": 142677085, "step": 6642, "time_per_iteration": 2.656057596206665 }, { "auxiliary_loss_clip": 0.01106418, "auxiliary_loss_mlp": 0.01034072, "balance_loss_clip": 1.04871511, "balance_loss_mlp": 1.0196898, "epoch": 0.3993987674733203, "flos": 23038491012480.0, "grad_norm": 2.1744041926742788, "language_loss": 0.72387367, "learning_rate": 2.731372550178393e-06, "loss": 0.7452786, "num_input_tokens_seen": 142694595, "step": 6643, "time_per_iteration": 2.680995225906372 }, { "auxiliary_loss_clip": 0.01123145, "auxiliary_loss_mlp": 0.01040337, "balance_loss_clip": 1.04840899, "balance_loss_mlp": 1.02565074, "epoch": 0.3994588907259883, "flos": 19390828970880.0, "grad_norm": 1.7059817149479597, "language_loss": 0.6665355, "learning_rate": 2.7310100494596375e-06, "loss": 0.68817025, "num_input_tokens_seen": 142714175, "step": 6644, "time_per_iteration": 2.6378324031829834 }, { "auxiliary_loss_clip": 0.01130779, "auxiliary_loss_mlp": 0.0103839, "balance_loss_clip": 1.04629064, "balance_loss_mlp": 1.02349472, "epoch": 0.39951901397865625, "flos": 13734395758080.0, "grad_norm": 2.1425296608964937, "language_loss": 0.78164649, "learning_rate": 2.730647521020907e-06, "loss": 0.80333817, "num_input_tokens_seen": 142730955, "step": 6645, "time_per_iteration": 2.6268746852874756 }, { "auxiliary_loss_clip": 0.0112116, "auxiliary_loss_mlp": 0.01037104, "balance_loss_clip": 1.04624033, "balance_loss_mlp": 1.02252507, "epoch": 0.3995791372313242, "flos": 23586451966080.0, "grad_norm": 1.7492924628724136, "language_loss": 0.69861412, "learning_rate": 2.73028496487595e-06, "loss": 0.72019678, "num_input_tokens_seen": 142751200, "step": 6646, "time_per_iteration": 2.7350409030914307 }, { "auxiliary_loss_clip": 0.0107684, "auxiliary_loss_mlp": 0.01037342, "balance_loss_clip": 1.03799927, "balance_loss_mlp": 1.02223825, "epoch": 0.3996392604839922, "flos": 21355896268800.0, "grad_norm": 1.7623657715359762, "language_loss": 0.72017872, "learning_rate": 2.729922381038513e-06, "loss": 0.74132061, "num_input_tokens_seen": 142770170, "step": 6647, "time_per_iteration": 2.7607529163360596 }, { "auxiliary_loss_clip": 0.01093143, "auxiliary_loss_mlp": 0.01043089, "balance_loss_clip": 1.04529011, "balance_loss_mlp": 1.02973795, "epoch": 0.39969938373666014, "flos": 26032255914240.0, "grad_norm": 1.4563496549616326, "language_loss": 0.74217343, "learning_rate": 2.7295597695223463e-06, "loss": 0.7635358, "num_input_tokens_seen": 142792680, "step": 6648, "time_per_iteration": 2.8048219680786133 }, { "auxiliary_loss_clip": 0.01133606, "auxiliary_loss_mlp": 0.01037616, "balance_loss_clip": 1.04912674, "balance_loss_mlp": 1.02281022, "epoch": 0.3997595069893281, "flos": 20116367786880.0, "grad_norm": 2.0433040752683578, "language_loss": 0.6589973, "learning_rate": 2.7291971303412006e-06, "loss": 0.6807096, "num_input_tokens_seen": 142810510, "step": 6649, "time_per_iteration": 2.6976583003997803 }, { "auxiliary_loss_clip": 0.01103049, "auxiliary_loss_mlp": 0.01042133, "balance_loss_clip": 1.04713392, "balance_loss_mlp": 1.02803016, "epoch": 0.39981963024199607, "flos": 27783403764480.0, "grad_norm": 1.7319771659085785, "language_loss": 0.75106388, "learning_rate": 2.728834463508826e-06, "loss": 0.77251565, "num_input_tokens_seen": 142832455, "step": 6650, "time_per_iteration": 2.7441325187683105 }, { "auxiliary_loss_clip": 0.01132922, "auxiliary_loss_mlp": 0.01042591, "balance_loss_clip": 1.04873252, "balance_loss_mlp": 1.02803564, "epoch": 0.39987975349466404, "flos": 21944436612480.0, "grad_norm": 1.5673208577322473, "language_loss": 0.72102094, "learning_rate": 2.728471769038975e-06, "loss": 0.74277604, "num_input_tokens_seen": 142852590, "step": 6651, "time_per_iteration": 2.6027066707611084 }, { "auxiliary_loss_clip": 0.01132958, "auxiliary_loss_mlp": 0.01045235, "balance_loss_clip": 1.04850328, "balance_loss_mlp": 1.03093004, "epoch": 0.39993987674733206, "flos": 20704405340160.0, "grad_norm": 1.8492457158027382, "language_loss": 0.73126423, "learning_rate": 2.728109046945403e-06, "loss": 0.75304615, "num_input_tokens_seen": 142870595, "step": 6652, "time_per_iteration": 2.5880327224731445 }, { "auxiliary_loss_clip": 0.01029168, "auxiliary_loss_mlp": 0.01002764, "balance_loss_clip": 1.02822125, "balance_loss_mlp": 1.00134552, "epoch": 0.4, "flos": 61525429862400.0, "grad_norm": 0.8458278780382239, "language_loss": 0.60614997, "learning_rate": 2.727746297241862e-06, "loss": 0.62646931, "num_input_tokens_seen": 142925805, "step": 6653, "time_per_iteration": 3.1626622676849365 }, { "auxiliary_loss_clip": 0.01093219, "auxiliary_loss_mlp": 0.01039197, "balance_loss_clip": 1.04810715, "balance_loss_mlp": 1.02577376, "epoch": 0.400060123252668, "flos": 14502309644160.0, "grad_norm": 3.0617453661279788, "language_loss": 0.66701174, "learning_rate": 2.7273835199421085e-06, "loss": 0.6883359, "num_input_tokens_seen": 142943145, "step": 6654, "time_per_iteration": 2.696179151535034 }, { "auxiliary_loss_clip": 0.01119303, "auxiliary_loss_mlp": 0.01043738, "balance_loss_clip": 1.04738593, "balance_loss_mlp": 1.03145993, "epoch": 0.40012024650533595, "flos": 19093308618240.0, "grad_norm": 2.461149956206156, "language_loss": 0.89818919, "learning_rate": 2.7270207150599e-06, "loss": 0.91981959, "num_input_tokens_seen": 142956925, "step": 6655, "time_per_iteration": 2.601891279220581 }, { "auxiliary_loss_clip": 0.01100614, "auxiliary_loss_mlp": 0.01040322, "balance_loss_clip": 1.04367936, "balance_loss_mlp": 1.02693462, "epoch": 0.4001803697580039, "flos": 29351012094720.0, "grad_norm": 1.7913118709828861, "language_loss": 0.73551166, "learning_rate": 2.7266578826089917e-06, "loss": 0.75692105, "num_input_tokens_seen": 142978040, "step": 6656, "time_per_iteration": 2.705662727355957 }, { "auxiliary_loss_clip": 0.01131953, "auxiliary_loss_mlp": 0.01046856, "balance_loss_clip": 1.04838896, "balance_loss_mlp": 1.03224063, "epoch": 0.4002404930106719, "flos": 20920048640640.0, "grad_norm": 1.6512050463613386, "language_loss": 0.73344004, "learning_rate": 2.726295022603144e-06, "loss": 0.75522816, "num_input_tokens_seen": 142998390, "step": 6657, "time_per_iteration": 2.7595558166503906 }, { "auxiliary_loss_clip": 0.0113267, "auxiliary_loss_mlp": 0.01046679, "balance_loss_clip": 1.04887247, "balance_loss_mlp": 1.03145635, "epoch": 0.40030061626333985, "flos": 28405735827840.0, "grad_norm": 1.7318374723338787, "language_loss": 0.79715288, "learning_rate": 2.725932135056117e-06, "loss": 0.81894636, "num_input_tokens_seen": 143021505, "step": 6658, "time_per_iteration": 2.6718270778656006 }, { "auxiliary_loss_clip": 0.01115521, "auxiliary_loss_mlp": 0.01042275, "balance_loss_clip": 1.04249525, "balance_loss_mlp": 1.02865553, "epoch": 0.4003607395160078, "flos": 25921615046400.0, "grad_norm": 2.0999446343296317, "language_loss": 0.77464151, "learning_rate": 2.72556921998167e-06, "loss": 0.79621947, "num_input_tokens_seen": 143041375, "step": 6659, "time_per_iteration": 2.7160539627075195 }, { "auxiliary_loss_clip": 0.01118822, "auxiliary_loss_mlp": 0.01028418, "balance_loss_clip": 1.04276848, "balance_loss_mlp": 1.01649117, "epoch": 0.4004208627686758, "flos": 20768648814720.0, "grad_norm": 1.6781351315554156, "language_loss": 0.72410327, "learning_rate": 2.7252062773935662e-06, "loss": 0.74557567, "num_input_tokens_seen": 143058725, "step": 6660, "time_per_iteration": 2.636833429336548 }, { "auxiliary_loss_clip": 0.01101229, "auxiliary_loss_mlp": 0.01041317, "balance_loss_clip": 1.04196119, "balance_loss_mlp": 1.02828765, "epoch": 0.40048098602134374, "flos": 24681224638080.0, "grad_norm": 1.813091564393644, "language_loss": 0.71008015, "learning_rate": 2.7248433073055674e-06, "loss": 0.73150557, "num_input_tokens_seen": 143076995, "step": 6661, "time_per_iteration": 2.6956517696380615 }, { "auxiliary_loss_clip": 0.0113437, "auxiliary_loss_mlp": 0.01042051, "balance_loss_clip": 1.0506804, "balance_loss_mlp": 1.02825832, "epoch": 0.4005411092740117, "flos": 23185688947200.0, "grad_norm": 1.8086148623568068, "language_loss": 0.75526643, "learning_rate": 2.724480309731437e-06, "loss": 0.77703071, "num_input_tokens_seen": 143096780, "step": 6662, "time_per_iteration": 2.6232621669769287 }, { "auxiliary_loss_clip": 0.01115634, "auxiliary_loss_mlp": 0.01036997, "balance_loss_clip": 1.04385805, "balance_loss_mlp": 1.02194118, "epoch": 0.4006012325266797, "flos": 17522324409600.0, "grad_norm": 2.00646115239694, "language_loss": 0.66450548, "learning_rate": 2.7241172846849417e-06, "loss": 0.68603182, "num_input_tokens_seen": 143112590, "step": 6663, "time_per_iteration": 2.622520923614502 }, { "auxiliary_loss_clip": 0.01112804, "auxiliary_loss_mlp": 0.01042686, "balance_loss_clip": 1.04327071, "balance_loss_mlp": 1.02767718, "epoch": 0.40066135577934764, "flos": 19857200181120.0, "grad_norm": 2.069962140682172, "language_loss": 0.86383915, "learning_rate": 2.7237542321798455e-06, "loss": 0.88539398, "num_input_tokens_seen": 143130220, "step": 6664, "time_per_iteration": 2.575124979019165 }, { "auxiliary_loss_clip": 0.01119355, "auxiliary_loss_mlp": 0.01036584, "balance_loss_clip": 1.04696763, "balance_loss_mlp": 1.0227679, "epoch": 0.40072147903201566, "flos": 18150007599360.0, "grad_norm": 16.441358853078547, "language_loss": 0.84723455, "learning_rate": 2.723391152229917e-06, "loss": 0.86879396, "num_input_tokens_seen": 143147160, "step": 6665, "time_per_iteration": 2.671715259552002 }, { "auxiliary_loss_clip": 0.01119739, "auxiliary_loss_mlp": 0.01037355, "balance_loss_clip": 1.04976356, "balance_loss_mlp": 1.02249575, "epoch": 0.4007816022846836, "flos": 18661267831680.0, "grad_norm": 1.8896907519127706, "language_loss": 0.78118432, "learning_rate": 2.7230280448489236e-06, "loss": 0.80275524, "num_input_tokens_seen": 143164605, "step": 6666, "time_per_iteration": 2.606566905975342 }, { "auxiliary_loss_clip": 0.01120664, "auxiliary_loss_mlp": 0.01038255, "balance_loss_clip": 1.0485028, "balance_loss_mlp": 1.02380657, "epoch": 0.4008417255373516, "flos": 25703170485120.0, "grad_norm": 1.7955817814438895, "language_loss": 0.73301423, "learning_rate": 2.7226649100506333e-06, "loss": 0.75460339, "num_input_tokens_seen": 143183965, "step": 6667, "time_per_iteration": 2.652503490447998 }, { "auxiliary_loss_clip": 0.0111465, "auxiliary_loss_mlp": 0.01054818, "balance_loss_clip": 1.04516435, "balance_loss_mlp": 1.03899896, "epoch": 0.40090184879001955, "flos": 22858614679680.0, "grad_norm": 1.708550182183753, "language_loss": 0.76022822, "learning_rate": 2.7223017478488183e-06, "loss": 0.78192288, "num_input_tokens_seen": 143204965, "step": 6668, "time_per_iteration": 2.6797566413879395 }, { "auxiliary_loss_clip": 0.01096645, "auxiliary_loss_mlp": 0.01046849, "balance_loss_clip": 1.04792619, "balance_loss_mlp": 1.0321629, "epoch": 0.4009619720426875, "flos": 29059848449280.0, "grad_norm": 2.335244314112793, "language_loss": 0.8221435, "learning_rate": 2.721938558257248e-06, "loss": 0.84357846, "num_input_tokens_seen": 143225015, "step": 6669, "time_per_iteration": 2.7661361694335938 }, { "auxiliary_loss_clip": 0.010311, "auxiliary_loss_mlp": 0.01009516, "balance_loss_clip": 1.02684975, "balance_loss_mlp": 1.00805604, "epoch": 0.4010220952953555, "flos": 66059763131520.0, "grad_norm": 0.69994773813092, "language_loss": 0.53312683, "learning_rate": 2.721575341289695e-06, "loss": 0.55353302, "num_input_tokens_seen": 143294925, "step": 6670, "time_per_iteration": 3.5547046661376953 }, { "auxiliary_loss_clip": 0.01083638, "auxiliary_loss_mlp": 0.01041448, "balance_loss_clip": 1.04546833, "balance_loss_mlp": 1.02720881, "epoch": 0.40108221854802345, "flos": 29642822184960.0, "grad_norm": 1.626307597556219, "language_loss": 0.88544351, "learning_rate": 2.7212120969599333e-06, "loss": 0.90669441, "num_input_tokens_seen": 143314170, "step": 6671, "time_per_iteration": 2.9112329483032227 }, { "auxiliary_loss_clip": 0.01119533, "auxiliary_loss_mlp": 0.01036462, "balance_loss_clip": 1.04568124, "balance_loss_mlp": 1.02137589, "epoch": 0.4011423418006914, "flos": 19929560129280.0, "grad_norm": 3.0264857347014993, "language_loss": 0.79105932, "learning_rate": 2.720848825281736e-06, "loss": 0.81261927, "num_input_tokens_seen": 143330050, "step": 6672, "time_per_iteration": 2.789889335632324 }, { "auxiliary_loss_clip": 0.01096186, "auxiliary_loss_mlp": 0.01045513, "balance_loss_clip": 1.04610085, "balance_loss_mlp": 1.03012288, "epoch": 0.4012024650533594, "flos": 20084299920000.0, "grad_norm": 4.192283777131793, "language_loss": 0.6293034, "learning_rate": 2.72048552626888e-06, "loss": 0.65072036, "num_input_tokens_seen": 143348650, "step": 6673, "time_per_iteration": 2.796834945678711 }, { "auxiliary_loss_clip": 0.011055, "auxiliary_loss_mlp": 0.00771502, "balance_loss_clip": 1.04474831, "balance_loss_mlp": 1.00076985, "epoch": 0.40126258830602735, "flos": 21695719864320.0, "grad_norm": 1.5776272245666931, "language_loss": 0.79948354, "learning_rate": 2.7201221999351402e-06, "loss": 0.81825352, "num_input_tokens_seen": 143370275, "step": 6674, "time_per_iteration": 4.298279523849487 }, { "auxiliary_loss_clip": 0.0108893, "auxiliary_loss_mlp": 0.01040876, "balance_loss_clip": 1.04919565, "balance_loss_mlp": 1.02610552, "epoch": 0.4013227115586953, "flos": 12020379592320.0, "grad_norm": 6.494329221896898, "language_loss": 0.82218468, "learning_rate": 2.719758846294294e-06, "loss": 0.84348273, "num_input_tokens_seen": 143385390, "step": 6675, "time_per_iteration": 2.7607553005218506 }, { "auxiliary_loss_clip": 0.01116053, "auxiliary_loss_mlp": 0.01038994, "balance_loss_clip": 1.04261947, "balance_loss_mlp": 1.02364039, "epoch": 0.4013828348113633, "flos": 25447522412160.0, "grad_norm": 2.205024073964141, "language_loss": 0.93500578, "learning_rate": 2.71939546536012e-06, "loss": 0.95655626, "num_input_tokens_seen": 143404215, "step": 6676, "time_per_iteration": 5.81420373916626 }, { "auxiliary_loss_clip": 0.01126662, "auxiliary_loss_mlp": 0.01041717, "balance_loss_clip": 1.04832482, "balance_loss_mlp": 1.02589226, "epoch": 0.40144295806403124, "flos": 18582946225920.0, "grad_norm": 2.1287377468959727, "language_loss": 0.79300511, "learning_rate": 2.719032057146399e-06, "loss": 0.81468892, "num_input_tokens_seen": 143422245, "step": 6677, "time_per_iteration": 2.6485939025878906 }, { "auxiliary_loss_clip": 0.01107812, "auxiliary_loss_mlp": 0.01035756, "balance_loss_clip": 1.04743207, "balance_loss_mlp": 1.02122426, "epoch": 0.4015030813166992, "flos": 22930220442240.0, "grad_norm": 2.404301700652251, "language_loss": 0.83507645, "learning_rate": 2.71866862166691e-06, "loss": 0.85651207, "num_input_tokens_seen": 143443130, "step": 6678, "time_per_iteration": 2.749229907989502 }, { "auxiliary_loss_clip": 0.01127798, "auxiliary_loss_mlp": 0.01039278, "balance_loss_clip": 1.04660463, "balance_loss_mlp": 1.02481759, "epoch": 0.4015632045693672, "flos": 20595057361920.0, "grad_norm": 2.137342142944676, "language_loss": 0.63547456, "learning_rate": 2.718305158935434e-06, "loss": 0.65714526, "num_input_tokens_seen": 143461385, "step": 6679, "time_per_iteration": 4.272741794586182 }, { "auxiliary_loss_clip": 0.01100371, "auxiliary_loss_mlp": 0.01032462, "balance_loss_clip": 1.04277802, "balance_loss_mlp": 1.01852596, "epoch": 0.4016233278220352, "flos": 23438930808960.0, "grad_norm": 2.2420809209281582, "language_loss": 0.78955674, "learning_rate": 2.7179416689657554e-06, "loss": 0.81088501, "num_input_tokens_seen": 143481750, "step": 6680, "time_per_iteration": 2.6541543006896973 }, { "auxiliary_loss_clip": 0.01099744, "auxiliary_loss_mlp": 0.00773185, "balance_loss_clip": 1.04565692, "balance_loss_mlp": 1.0009259, "epoch": 0.40168345107470316, "flos": 21431057477760.0, "grad_norm": 1.5474671150398438, "language_loss": 0.75901389, "learning_rate": 2.7175781517716556e-06, "loss": 0.77774316, "num_input_tokens_seen": 143501540, "step": 6681, "time_per_iteration": 2.747549295425415 }, { "auxiliary_loss_clip": 0.01092334, "auxiliary_loss_mlp": 0.01031295, "balance_loss_clip": 1.04743123, "balance_loss_mlp": 1.01728785, "epoch": 0.4017435743273711, "flos": 22857214049280.0, "grad_norm": 1.9537198932922564, "language_loss": 0.64593118, "learning_rate": 2.7172146073669213e-06, "loss": 0.66716748, "num_input_tokens_seen": 143520530, "step": 6682, "time_per_iteration": 2.764676094055176 }, { "auxiliary_loss_clip": 0.01084656, "auxiliary_loss_mlp": 0.01040039, "balance_loss_clip": 1.04031992, "balance_loss_mlp": 1.025424, "epoch": 0.4018036975800391, "flos": 28622312881920.0, "grad_norm": 8.033606907615594, "language_loss": 0.72794902, "learning_rate": 2.716851035765337e-06, "loss": 0.74919599, "num_input_tokens_seen": 143540210, "step": 6683, "time_per_iteration": 2.9106507301330566 }, { "auxiliary_loss_clip": 0.01116481, "auxiliary_loss_mlp": 0.01043307, "balance_loss_clip": 1.04472065, "balance_loss_mlp": 1.02844119, "epoch": 0.40186382083270705, "flos": 26651212099200.0, "grad_norm": 1.6079104273266733, "language_loss": 0.73560667, "learning_rate": 2.7164874369806896e-06, "loss": 0.75720453, "num_input_tokens_seen": 143560940, "step": 6684, "time_per_iteration": 2.814746141433716 }, { "auxiliary_loss_clip": 0.01038178, "auxiliary_loss_mlp": 0.01003165, "balance_loss_clip": 1.02248073, "balance_loss_mlp": 1.00177026, "epoch": 0.401923944085375, "flos": 59259969123840.0, "grad_norm": 0.8040960642815781, "language_loss": 0.6037817, "learning_rate": 2.716123811026767e-06, "loss": 0.6241951, "num_input_tokens_seen": 143624015, "step": 6685, "time_per_iteration": 3.3159523010253906 }, { "auxiliary_loss_clip": 0.01121727, "auxiliary_loss_mlp": 0.0103265, "balance_loss_clip": 1.04626095, "balance_loss_mlp": 1.01806533, "epoch": 0.401984067338043, "flos": 16982803152000.0, "grad_norm": 2.1640557725493563, "language_loss": 0.69947135, "learning_rate": 2.715760157917357e-06, "loss": 0.7210151, "num_input_tokens_seen": 143642750, "step": 6686, "time_per_iteration": 2.7339890003204346 }, { "auxiliary_loss_clip": 0.01109024, "auxiliary_loss_mlp": 0.01036336, "balance_loss_clip": 1.04641056, "balance_loss_mlp": 1.02213836, "epoch": 0.40204419059071095, "flos": 24972496024320.0, "grad_norm": 1.482832144271372, "language_loss": 0.74904519, "learning_rate": 2.7153964776662504e-06, "loss": 0.77049881, "num_input_tokens_seen": 143664515, "step": 6687, "time_per_iteration": 2.7403111457824707 }, { "auxiliary_loss_clip": 0.01110823, "auxiliary_loss_mlp": 0.01036282, "balance_loss_clip": 1.04890549, "balance_loss_mlp": 1.02179182, "epoch": 0.4021043138433789, "flos": 23477463123840.0, "grad_norm": 1.9109413621033529, "language_loss": 0.71165651, "learning_rate": 2.7150327702872385e-06, "loss": 0.73312759, "num_input_tokens_seen": 143683135, "step": 6688, "time_per_iteration": 2.7349321842193604 }, { "auxiliary_loss_clip": 0.01105847, "auxiliary_loss_mlp": 0.01043979, "balance_loss_clip": 1.0426929, "balance_loss_mlp": 1.02785039, "epoch": 0.4021644370960469, "flos": 25995806588160.0, "grad_norm": 2.0144045301965248, "language_loss": 0.64289308, "learning_rate": 2.7146690357941112e-06, "loss": 0.66439128, "num_input_tokens_seen": 143703985, "step": 6689, "time_per_iteration": 2.740938186645508 }, { "auxiliary_loss_clip": 0.0112261, "auxiliary_loss_mlp": 0.01032805, "balance_loss_clip": 1.04519129, "balance_loss_mlp": 1.01838636, "epoch": 0.40222456034871484, "flos": 13587987922560.0, "grad_norm": 2.8658666003554147, "language_loss": 0.7358911, "learning_rate": 2.7143052742006632e-06, "loss": 0.75744528, "num_input_tokens_seen": 143719245, "step": 6690, "time_per_iteration": 2.622920513153076 }, { "auxiliary_loss_clip": 0.01099316, "auxiliary_loss_mlp": 0.01037502, "balance_loss_clip": 1.04444623, "balance_loss_mlp": 1.0230422, "epoch": 0.4022846836013828, "flos": 24278019494400.0, "grad_norm": 1.7112869735009542, "language_loss": 0.74805617, "learning_rate": 2.7139414855206872e-06, "loss": 0.76942438, "num_input_tokens_seen": 143739575, "step": 6691, "time_per_iteration": 2.704138994216919 }, { "auxiliary_loss_clip": 0.0111344, "auxiliary_loss_mlp": 0.01040266, "balance_loss_clip": 1.0485332, "balance_loss_mlp": 1.02509689, "epoch": 0.40234480685405083, "flos": 20151596050560.0, "grad_norm": 1.5633314974955987, "language_loss": 0.7267946, "learning_rate": 2.7135776697679785e-06, "loss": 0.74833167, "num_input_tokens_seen": 143758515, "step": 6692, "time_per_iteration": 2.6782071590423584 }, { "auxiliary_loss_clip": 0.01081716, "auxiliary_loss_mlp": 0.0103731, "balance_loss_clip": 1.04122448, "balance_loss_mlp": 1.02274227, "epoch": 0.4024049301067188, "flos": 22930220442240.0, "grad_norm": 2.743543242099247, "language_loss": 0.84403068, "learning_rate": 2.7132138269563333e-06, "loss": 0.8652209, "num_input_tokens_seen": 143776770, "step": 6693, "time_per_iteration": 2.746689558029175 }, { "auxiliary_loss_clip": 0.01092043, "auxiliary_loss_mlp": 0.0104876, "balance_loss_clip": 1.04803836, "balance_loss_mlp": 1.03265464, "epoch": 0.40246505335938676, "flos": 36028421487360.0, "grad_norm": 2.4363636021200716, "language_loss": 0.70996636, "learning_rate": 2.7128499570995483e-06, "loss": 0.73137438, "num_input_tokens_seen": 143798450, "step": 6694, "time_per_iteration": 2.8071961402893066 }, { "auxiliary_loss_clip": 0.01104186, "auxiliary_loss_mlp": 0.01044295, "balance_loss_clip": 1.04619551, "balance_loss_mlp": 1.0292511, "epoch": 0.4025251766120547, "flos": 20594303176320.0, "grad_norm": 2.4336892369471976, "language_loss": 0.67823637, "learning_rate": 2.7124860602114212e-06, "loss": 0.6997211, "num_input_tokens_seen": 143816995, "step": 6695, "time_per_iteration": 2.628509283065796 }, { "auxiliary_loss_clip": 0.01100807, "auxiliary_loss_mlp": 0.01043105, "balance_loss_clip": 1.04269171, "balance_loss_mlp": 1.0272975, "epoch": 0.4025852998647227, "flos": 64523932381440.0, "grad_norm": 2.090135381279502, "language_loss": 0.79316044, "learning_rate": 2.7121221363057515e-06, "loss": 0.81459951, "num_input_tokens_seen": 143842090, "step": 6696, "time_per_iteration": 3.065619707107544 }, { "auxiliary_loss_clip": 0.01107424, "auxiliary_loss_mlp": 0.0105453, "balance_loss_clip": 1.04772997, "balance_loss_mlp": 1.03700638, "epoch": 0.40264542311739066, "flos": 20886292834560.0, "grad_norm": 2.0469796766510164, "language_loss": 0.71048194, "learning_rate": 2.7117581853963393e-06, "loss": 0.73210156, "num_input_tokens_seen": 143860800, "step": 6697, "time_per_iteration": 2.732112169265747 }, { "auxiliary_loss_clip": 0.01119834, "auxiliary_loss_mlp": 0.01045848, "balance_loss_clip": 1.04644823, "balance_loss_mlp": 1.03167999, "epoch": 0.4027055463700586, "flos": 26250197685120.0, "grad_norm": 2.1595912992700725, "language_loss": 0.6184175, "learning_rate": 2.711394207496984e-06, "loss": 0.64007437, "num_input_tokens_seen": 143878950, "step": 6698, "time_per_iteration": 2.6853909492492676 }, { "auxiliary_loss_clip": 0.01122685, "auxiliary_loss_mlp": 0.01038255, "balance_loss_clip": 1.04787982, "balance_loss_mlp": 1.02309155, "epoch": 0.4027656696227266, "flos": 20631398947200.0, "grad_norm": 2.043260848719272, "language_loss": 0.76455128, "learning_rate": 2.711030202621491e-06, "loss": 0.78616071, "num_input_tokens_seen": 143898385, "step": 6699, "time_per_iteration": 2.6033456325531006 }, { "auxiliary_loss_clip": 0.01093615, "auxiliary_loss_mlp": 0.01030999, "balance_loss_clip": 1.04446507, "balance_loss_mlp": 1.01700354, "epoch": 0.40282579287539455, "flos": 22346277039360.0, "grad_norm": 1.6890007857677205, "language_loss": 0.80442715, "learning_rate": 2.7106661707836605e-06, "loss": 0.82567334, "num_input_tokens_seen": 143918795, "step": 6700, "time_per_iteration": 2.777510404586792 }, { "auxiliary_loss_clip": 0.01112643, "auxiliary_loss_mlp": 0.01045016, "balance_loss_clip": 1.04943717, "balance_loss_mlp": 1.02808821, "epoch": 0.4028859161280625, "flos": 29274988959360.0, "grad_norm": 2.176323872107602, "language_loss": 0.74529326, "learning_rate": 2.7103021119972977e-06, "loss": 0.7668699, "num_input_tokens_seen": 143938245, "step": 6701, "time_per_iteration": 2.7424893379211426 }, { "auxiliary_loss_clip": 0.01099003, "auxiliary_loss_mlp": 0.01037912, "balance_loss_clip": 1.04379773, "balance_loss_mlp": 1.02355886, "epoch": 0.4029460393807305, "flos": 28622312881920.0, "grad_norm": 1.8130604516939894, "language_loss": 0.66064012, "learning_rate": 2.709938026276208e-06, "loss": 0.68200922, "num_input_tokens_seen": 143960995, "step": 6702, "time_per_iteration": 2.7448410987854004 }, { "auxiliary_loss_clip": 0.01105222, "auxiliary_loss_mlp": 0.01045108, "balance_loss_clip": 1.0470736, "balance_loss_mlp": 1.02900267, "epoch": 0.40300616263339845, "flos": 22601925112320.0, "grad_norm": 1.86356350955038, "language_loss": 0.66031915, "learning_rate": 2.7095739136341964e-06, "loss": 0.68182242, "num_input_tokens_seen": 143979910, "step": 6703, "time_per_iteration": 2.679979085922241 }, { "auxiliary_loss_clip": 0.01060539, "auxiliary_loss_mlp": 0.01041946, "balance_loss_clip": 1.04386449, "balance_loss_mlp": 1.02445817, "epoch": 0.4030662858860664, "flos": 25520313323520.0, "grad_norm": 2.0398618821746304, "language_loss": 0.82689512, "learning_rate": 2.709209774085071e-06, "loss": 0.84792, "num_input_tokens_seen": 144000095, "step": 6704, "time_per_iteration": 2.9296765327453613 }, { "auxiliary_loss_clip": 0.01112771, "auxiliary_loss_mlp": 0.01040131, "balance_loss_clip": 1.04960763, "balance_loss_mlp": 1.02517009, "epoch": 0.40312640913873443, "flos": 23586703361280.0, "grad_norm": 1.6638111373196858, "language_loss": 0.73759186, "learning_rate": 2.7088456076426407e-06, "loss": 0.75912088, "num_input_tokens_seen": 144019695, "step": 6705, "time_per_iteration": 3.0039970874786377 }, { "auxiliary_loss_clip": 0.0111798, "auxiliary_loss_mlp": 0.01039471, "balance_loss_clip": 1.04735386, "balance_loss_mlp": 1.02541077, "epoch": 0.4031865323914024, "flos": 20011042131840.0, "grad_norm": 1.7718881662691552, "language_loss": 0.65816283, "learning_rate": 2.708481414320713e-06, "loss": 0.67973745, "num_input_tokens_seen": 144038525, "step": 6706, "time_per_iteration": 2.6920299530029297 }, { "auxiliary_loss_clip": 0.01123098, "auxiliary_loss_mlp": 0.01039977, "balance_loss_clip": 1.05084229, "balance_loss_mlp": 1.02508759, "epoch": 0.40324665564407036, "flos": 21871430219520.0, "grad_norm": 1.5916886093016338, "language_loss": 0.71493578, "learning_rate": 2.7081171941330992e-06, "loss": 0.73656654, "num_input_tokens_seen": 144059485, "step": 6707, "time_per_iteration": 2.6424286365509033 }, { "auxiliary_loss_clip": 0.01104664, "auxiliary_loss_mlp": 0.0103554, "balance_loss_clip": 1.04652226, "balance_loss_mlp": 1.0201261, "epoch": 0.4033067788967383, "flos": 23878728933120.0, "grad_norm": 1.6049010195706548, "language_loss": 0.79860801, "learning_rate": 2.707752947093611e-06, "loss": 0.82001007, "num_input_tokens_seen": 144080265, "step": 6708, "time_per_iteration": 2.7476210594177246 }, { "auxiliary_loss_clip": 0.01081311, "auxiliary_loss_mlp": 0.01041497, "balance_loss_clip": 1.04192591, "balance_loss_mlp": 1.0254873, "epoch": 0.4033669021494063, "flos": 17419907756160.0, "grad_norm": 2.2092970812397823, "language_loss": 0.82527256, "learning_rate": 2.70738867321606e-06, "loss": 0.84650064, "num_input_tokens_seen": 144098040, "step": 6709, "time_per_iteration": 2.6981422901153564 }, { "auxiliary_loss_clip": 0.01126319, "auxiliary_loss_mlp": 0.01037264, "balance_loss_clip": 1.052701, "balance_loss_mlp": 1.02168322, "epoch": 0.40342702540207426, "flos": 29600554855680.0, "grad_norm": 3.462855853005799, "language_loss": 0.71349508, "learning_rate": 2.70702437251426e-06, "loss": 0.73513091, "num_input_tokens_seen": 144118265, "step": 6710, "time_per_iteration": 2.745234727859497 }, { "auxiliary_loss_clip": 0.01100277, "auxiliary_loss_mlp": 0.01040518, "balance_loss_clip": 1.0461812, "balance_loss_mlp": 1.02506852, "epoch": 0.4034871486547422, "flos": 11284605400320.0, "grad_norm": 2.0008015592173285, "language_loss": 0.8497777, "learning_rate": 2.7066600450020236e-06, "loss": 0.8711856, "num_input_tokens_seen": 144133865, "step": 6711, "time_per_iteration": 2.6388518810272217 }, { "auxiliary_loss_clip": 0.01124865, "auxiliary_loss_mlp": 0.01037377, "balance_loss_clip": 1.04873466, "balance_loss_mlp": 1.02192783, "epoch": 0.4035472719074102, "flos": 15552839738880.0, "grad_norm": 1.9288958482484087, "language_loss": 0.76210845, "learning_rate": 2.706295690693168e-06, "loss": 0.78373086, "num_input_tokens_seen": 144150125, "step": 6712, "time_per_iteration": 2.617612838745117 }, { "auxiliary_loss_clip": 0.0110296, "auxiliary_loss_mlp": 0.01042297, "balance_loss_clip": 1.0465771, "balance_loss_mlp": 1.02682328, "epoch": 0.40360739516007815, "flos": 24674365140480.0, "grad_norm": 2.8401310029686284, "language_loss": 0.79334903, "learning_rate": 2.7059313096015096e-06, "loss": 0.81480157, "num_input_tokens_seen": 144169295, "step": 6713, "time_per_iteration": 4.2229533195495605 }, { "auxiliary_loss_clip": 0.01096327, "auxiliary_loss_mlp": 0.01040909, "balance_loss_clip": 1.04259837, "balance_loss_mlp": 1.02437484, "epoch": 0.4036675184127461, "flos": 17304095329920.0, "grad_norm": 2.4269881355691867, "language_loss": 0.88230258, "learning_rate": 2.705566901740865e-06, "loss": 0.90367496, "num_input_tokens_seen": 144185790, "step": 6714, "time_per_iteration": 2.6861040592193604 }, { "auxiliary_loss_clip": 0.0112277, "auxiliary_loss_mlp": 0.01042461, "balance_loss_clip": 1.04913116, "balance_loss_mlp": 1.02755439, "epoch": 0.4037276416654141, "flos": 19864023765120.0, "grad_norm": 1.685218394347131, "language_loss": 0.69355965, "learning_rate": 2.7052024671250527e-06, "loss": 0.71521199, "num_input_tokens_seen": 144205190, "step": 6715, "time_per_iteration": 6.05805778503418 }, { "auxiliary_loss_clip": 0.01085368, "auxiliary_loss_mlp": 0.01039127, "balance_loss_clip": 1.03982067, "balance_loss_mlp": 1.02422547, "epoch": 0.40378776491808205, "flos": 18296271780480.0, "grad_norm": 2.4042590138214543, "language_loss": 0.7738142, "learning_rate": 2.704838005767892e-06, "loss": 0.7950592, "num_input_tokens_seen": 144222705, "step": 6716, "time_per_iteration": 2.874701738357544 }, { "auxiliary_loss_clip": 0.01084201, "auxiliary_loss_mlp": 0.01039901, "balance_loss_clip": 1.04515779, "balance_loss_mlp": 1.02554834, "epoch": 0.40384788817075, "flos": 15049372757760.0, "grad_norm": 1.8822370621315767, "language_loss": 0.7590825, "learning_rate": 2.7044735176832037e-06, "loss": 0.78032351, "num_input_tokens_seen": 144239545, "step": 6717, "time_per_iteration": 2.806605339050293 }, { "auxiliary_loss_clip": 0.01034573, "auxiliary_loss_mlp": 0.01006348, "balance_loss_clip": 1.03120637, "balance_loss_mlp": 1.00481057, "epoch": 0.40390801142341803, "flos": 61929927895680.0, "grad_norm": 0.9365934623644069, "language_loss": 0.60732949, "learning_rate": 2.7041090028848084e-06, "loss": 0.62773865, "num_input_tokens_seen": 144288145, "step": 6718, "time_per_iteration": 4.683047771453857 }, { "auxiliary_loss_clip": 0.01137275, "auxiliary_loss_mlp": 0.01039366, "balance_loss_clip": 1.04942691, "balance_loss_mlp": 1.02322555, "epoch": 0.403968134676086, "flos": 22738779930240.0, "grad_norm": 2.360676977629441, "language_loss": 0.74748445, "learning_rate": 2.7037444613865306e-06, "loss": 0.76925087, "num_input_tokens_seen": 144302315, "step": 6719, "time_per_iteration": 2.6020865440368652 }, { "auxiliary_loss_clip": 0.01122679, "auxiliary_loss_mlp": 0.01042794, "balance_loss_clip": 1.04766619, "balance_loss_mlp": 1.02643895, "epoch": 0.40402825792875396, "flos": 19784409269760.0, "grad_norm": 2.123342604077105, "language_loss": 0.81516802, "learning_rate": 2.7033798932021906e-06, "loss": 0.83682275, "num_input_tokens_seen": 144318990, "step": 6720, "time_per_iteration": 2.6707048416137695 }, { "auxiliary_loss_clip": 0.01106407, "auxiliary_loss_mlp": 0.01033364, "balance_loss_clip": 1.04365981, "balance_loss_mlp": 1.01866555, "epoch": 0.40408838118142193, "flos": 19609273532160.0, "grad_norm": 2.786601864332057, "language_loss": 0.77150661, "learning_rate": 2.7030152983456153e-06, "loss": 0.79290426, "num_input_tokens_seen": 144335765, "step": 6721, "time_per_iteration": 2.648050546646118 }, { "auxiliary_loss_clip": 0.01091711, "auxiliary_loss_mlp": 0.0102956, "balance_loss_clip": 1.04391122, "balance_loss_mlp": 1.01643503, "epoch": 0.4041485044340899, "flos": 24426043441920.0, "grad_norm": 2.012609049395132, "language_loss": 0.72214961, "learning_rate": 2.7026506768306304e-06, "loss": 0.74336231, "num_input_tokens_seen": 144355825, "step": 6722, "time_per_iteration": 2.7598764896392822 }, { "auxiliary_loss_clip": 0.01117849, "auxiliary_loss_mlp": 0.01035294, "balance_loss_clip": 1.04649532, "balance_loss_mlp": 1.02137017, "epoch": 0.40420862768675786, "flos": 16760192613120.0, "grad_norm": 2.003025152561758, "language_loss": 0.66099858, "learning_rate": 2.7022860286710602e-06, "loss": 0.68252993, "num_input_tokens_seen": 144374320, "step": 6723, "time_per_iteration": 2.6525375843048096 }, { "auxiliary_loss_clip": 0.0111764, "auxiliary_loss_mlp": 0.01047962, "balance_loss_clip": 1.04678059, "balance_loss_mlp": 1.03247619, "epoch": 0.4042687509394258, "flos": 22491571553280.0, "grad_norm": 1.6479262490520643, "language_loss": 0.73566139, "learning_rate": 2.701921353880734e-06, "loss": 0.75731742, "num_input_tokens_seen": 144394325, "step": 6724, "time_per_iteration": 2.6602234840393066 }, { "auxiliary_loss_clip": 0.01096943, "auxiliary_loss_mlp": 0.01034012, "balance_loss_clip": 1.04471684, "balance_loss_mlp": 1.02009475, "epoch": 0.4043288741920938, "flos": 30336149479680.0, "grad_norm": 1.8514955948130458, "language_loss": 0.74733102, "learning_rate": 2.7015566524734787e-06, "loss": 0.76864064, "num_input_tokens_seen": 144412765, "step": 6725, "time_per_iteration": 2.7086737155914307 }, { "auxiliary_loss_clip": 0.01116531, "auxiliary_loss_mlp": 0.01035939, "balance_loss_clip": 1.04757476, "balance_loss_mlp": 1.02062047, "epoch": 0.40438899744476176, "flos": 46348321363200.0, "grad_norm": 2.3229573968410766, "language_loss": 0.76987183, "learning_rate": 2.701191924463126e-06, "loss": 0.7913965, "num_input_tokens_seen": 144435400, "step": 6726, "time_per_iteration": 2.880244493484497 }, { "auxiliary_loss_clip": 0.01102844, "auxiliary_loss_mlp": 0.00775301, "balance_loss_clip": 1.04148483, "balance_loss_mlp": 1.00105536, "epoch": 0.4044491206974297, "flos": 13333524998400.0, "grad_norm": 2.125548317574291, "language_loss": 0.8180182, "learning_rate": 2.7008271698635054e-06, "loss": 0.83679968, "num_input_tokens_seen": 144452925, "step": 6727, "time_per_iteration": 2.6953587532043457 }, { "auxiliary_loss_clip": 0.01128783, "auxiliary_loss_mlp": 0.01036901, "balance_loss_clip": 1.04577255, "balance_loss_mlp": 1.02264905, "epoch": 0.4045092439500977, "flos": 12093745121280.0, "grad_norm": 2.0701087852414504, "language_loss": 0.85462439, "learning_rate": 2.700462388688447e-06, "loss": 0.87628114, "num_input_tokens_seen": 144470195, "step": 6728, "time_per_iteration": 2.5963056087493896 }, { "auxiliary_loss_clip": 0.01095663, "auxiliary_loss_mlp": 0.01043865, "balance_loss_clip": 1.04611719, "balance_loss_mlp": 1.029351, "epoch": 0.40456936720276565, "flos": 21179683123200.0, "grad_norm": 1.739738235535384, "language_loss": 0.81606215, "learning_rate": 2.700097580951786e-06, "loss": 0.83745748, "num_input_tokens_seen": 144490320, "step": 6729, "time_per_iteration": 2.8157620429992676 }, { "auxiliary_loss_clip": 0.01105665, "auxiliary_loss_mlp": 0.01043945, "balance_loss_clip": 1.0443244, "balance_loss_mlp": 1.02993762, "epoch": 0.4046294904554336, "flos": 23915286000000.0, "grad_norm": 1.917482865643355, "language_loss": 0.73375344, "learning_rate": 2.6997327466673533e-06, "loss": 0.75524956, "num_input_tokens_seen": 144508990, "step": 6730, "time_per_iteration": 2.67053484916687 }, { "auxiliary_loss_clip": 0.01113781, "auxiliary_loss_mlp": 0.01041271, "balance_loss_clip": 1.04319108, "balance_loss_mlp": 1.02674532, "epoch": 0.4046896137081016, "flos": 38071235773440.0, "grad_norm": 2.5953767613834673, "language_loss": 0.67485142, "learning_rate": 2.699367885848985e-06, "loss": 0.69640195, "num_input_tokens_seen": 144529550, "step": 6731, "time_per_iteration": 2.8106632232666016 }, { "auxiliary_loss_clip": 0.01128909, "auxiliary_loss_mlp": 0.01038653, "balance_loss_clip": 1.04689097, "balance_loss_mlp": 1.02531338, "epoch": 0.4047497369607696, "flos": 23617262856960.0, "grad_norm": 1.5691591770044138, "language_loss": 0.74245793, "learning_rate": 2.699002998510517e-06, "loss": 0.76413357, "num_input_tokens_seen": 144549310, "step": 6732, "time_per_iteration": 2.6608641147613525 }, { "auxiliary_loss_clip": 0.0110044, "auxiliary_loss_mlp": 0.00770096, "balance_loss_clip": 1.04635525, "balance_loss_mlp": 1.00099349, "epoch": 0.40480986021343757, "flos": 12823593569280.0, "grad_norm": 1.738611378800115, "language_loss": 0.77579916, "learning_rate": 2.6986380846657852e-06, "loss": 0.79450446, "num_input_tokens_seen": 144567430, "step": 6733, "time_per_iteration": 2.648707151412964 }, { "auxiliary_loss_clip": 0.01102753, "auxiliary_loss_mlp": 0.01043236, "balance_loss_clip": 1.04195142, "balance_loss_mlp": 1.0276798, "epoch": 0.40486998346610553, "flos": 23768770423680.0, "grad_norm": 1.875618790304424, "language_loss": 0.76887047, "learning_rate": 2.698273144328627e-06, "loss": 0.79033035, "num_input_tokens_seen": 144585975, "step": 6734, "time_per_iteration": 2.7222812175750732 }, { "auxiliary_loss_clip": 0.01110956, "auxiliary_loss_mlp": 0.01032993, "balance_loss_clip": 1.04893517, "balance_loss_mlp": 1.01923609, "epoch": 0.4049301067187735, "flos": 22856818999680.0, "grad_norm": 2.463703641644531, "language_loss": 0.64536786, "learning_rate": 2.6979081775128805e-06, "loss": 0.66680741, "num_input_tokens_seen": 144605225, "step": 6735, "time_per_iteration": 2.682111978530884 }, { "auxiliary_loss_clip": 0.01088904, "auxiliary_loss_mlp": 0.01039113, "balance_loss_clip": 1.04142201, "balance_loss_mlp": 1.0247122, "epoch": 0.40499022997144146, "flos": 22783992174720.0, "grad_norm": 1.9621030422141739, "language_loss": 0.83120507, "learning_rate": 2.697543184232387e-06, "loss": 0.85248524, "num_input_tokens_seen": 144624145, "step": 6736, "time_per_iteration": 2.737946033477783 }, { "auxiliary_loss_clip": 0.01103471, "auxiliary_loss_mlp": 0.00773133, "balance_loss_clip": 1.04903114, "balance_loss_mlp": 1.00089931, "epoch": 0.4050503532241094, "flos": 23039352938880.0, "grad_norm": 1.950757015883091, "language_loss": 0.75173002, "learning_rate": 2.6971781645009863e-06, "loss": 0.77049613, "num_input_tokens_seen": 144644470, "step": 6737, "time_per_iteration": 2.7009494304656982 }, { "auxiliary_loss_clip": 0.01119674, "auxiliary_loss_mlp": 0.01042697, "balance_loss_clip": 1.04876637, "balance_loss_mlp": 1.02858806, "epoch": 0.4051104764767774, "flos": 16647756065280.0, "grad_norm": 3.18955375846042, "language_loss": 0.72142565, "learning_rate": 2.696813118332519e-06, "loss": 0.74304938, "num_input_tokens_seen": 144661055, "step": 6738, "time_per_iteration": 2.63269305229187 }, { "auxiliary_loss_clip": 0.01094776, "auxiliary_loss_mlp": 0.01033432, "balance_loss_clip": 1.04453516, "balance_loss_mlp": 1.02065849, "epoch": 0.40517059972944536, "flos": 16358962717440.0, "grad_norm": 1.9585661201522753, "language_loss": 0.75113159, "learning_rate": 2.696448045740828e-06, "loss": 0.77241367, "num_input_tokens_seen": 144677935, "step": 6739, "time_per_iteration": 2.678330421447754 }, { "auxiliary_loss_clip": 0.01092708, "auxiliary_loss_mlp": 0.01036695, "balance_loss_clip": 1.04475963, "balance_loss_mlp": 1.02244925, "epoch": 0.4052307229821133, "flos": 28803374363520.0, "grad_norm": 2.0151914408481066, "language_loss": 0.73516095, "learning_rate": 2.6960829467397576e-06, "loss": 0.75645494, "num_input_tokens_seen": 144697725, "step": 6740, "time_per_iteration": 2.821165084838867 }, { "auxiliary_loss_clip": 0.01111182, "auxiliary_loss_mlp": 0.01032908, "balance_loss_clip": 1.04380143, "balance_loss_mlp": 1.01927674, "epoch": 0.4052908462347813, "flos": 21397876289280.0, "grad_norm": 1.5447802431592257, "language_loss": 0.77149022, "learning_rate": 2.695717821343153e-06, "loss": 0.79293114, "num_input_tokens_seen": 144718805, "step": 6741, "time_per_iteration": 2.639744758605957 }, { "auxiliary_loss_clip": 0.01132474, "auxiliary_loss_mlp": 0.01039418, "balance_loss_clip": 1.04797888, "balance_loss_mlp": 1.02415919, "epoch": 0.40535096948744925, "flos": 22419067950720.0, "grad_norm": 2.3470472177782584, "language_loss": 0.71132898, "learning_rate": 2.6953526695648577e-06, "loss": 0.73304784, "num_input_tokens_seen": 144737105, "step": 6742, "time_per_iteration": 2.566246509552002 }, { "auxiliary_loss_clip": 0.01132445, "auxiliary_loss_mlp": 0.01031417, "balance_loss_clip": 1.04941666, "balance_loss_mlp": 1.01739824, "epoch": 0.4054110927401172, "flos": 17010776868480.0, "grad_norm": 2.3285032794047966, "language_loss": 0.71915448, "learning_rate": 2.6949874914187202e-06, "loss": 0.74079311, "num_input_tokens_seen": 144751350, "step": 6743, "time_per_iteration": 2.7150700092315674 }, { "auxiliary_loss_clip": 0.01109405, "auxiliary_loss_mlp": 0.01036045, "balance_loss_clip": 1.04626715, "balance_loss_mlp": 1.0209291, "epoch": 0.4054712159927852, "flos": 21614848392960.0, "grad_norm": 2.2533363543989053, "language_loss": 0.70529258, "learning_rate": 2.694622286918588e-06, "loss": 0.72674704, "num_input_tokens_seen": 144770030, "step": 6744, "time_per_iteration": 2.715900421142578 }, { "auxiliary_loss_clip": 0.01118115, "auxiliary_loss_mlp": 0.01036188, "balance_loss_clip": 1.04826701, "balance_loss_mlp": 1.02316439, "epoch": 0.4055313392454532, "flos": 25812554376960.0, "grad_norm": 1.8071994834567642, "language_loss": 0.80102956, "learning_rate": 2.6942570560783076e-06, "loss": 0.82257259, "num_input_tokens_seen": 144790965, "step": 6745, "time_per_iteration": 2.6989259719848633 }, { "auxiliary_loss_clip": 0.01108583, "auxiliary_loss_mlp": 0.01034972, "balance_loss_clip": 1.04861784, "balance_loss_mlp": 1.02049959, "epoch": 0.40559146249812117, "flos": 14137098111360.0, "grad_norm": 1.8906308851954157, "language_loss": 0.66942173, "learning_rate": 2.693891798911731e-06, "loss": 0.69085735, "num_input_tokens_seen": 144807755, "step": 6746, "time_per_iteration": 2.7211005687713623 }, { "auxiliary_loss_clip": 0.01092509, "auxiliary_loss_mlp": 0.01033925, "balance_loss_clip": 1.04508781, "balance_loss_mlp": 1.02044201, "epoch": 0.40565158575078913, "flos": 41355481962240.0, "grad_norm": 1.4960206584486848, "language_loss": 0.57240731, "learning_rate": 2.6935265154327075e-06, "loss": 0.59367168, "num_input_tokens_seen": 144832405, "step": 6747, "time_per_iteration": 2.8735926151275635 }, { "auxiliary_loss_clip": 0.0109681, "auxiliary_loss_mlp": 0.01043537, "balance_loss_clip": 1.04770565, "balance_loss_mlp": 1.03084731, "epoch": 0.4057117090034571, "flos": 28544529980160.0, "grad_norm": 1.7545120295248704, "language_loss": 0.8468259, "learning_rate": 2.693161205655089e-06, "loss": 0.86822933, "num_input_tokens_seen": 144853890, "step": 6748, "time_per_iteration": 2.7470786571502686 }, { "auxiliary_loss_clip": 0.01107762, "auxiliary_loss_mlp": 0.0104113, "balance_loss_clip": 1.05110598, "balance_loss_mlp": 1.02695, "epoch": 0.40577183225612506, "flos": 18004066640640.0, "grad_norm": 2.5881063547984398, "language_loss": 0.81445849, "learning_rate": 2.6927958695927287e-06, "loss": 0.83594739, "num_input_tokens_seen": 144871395, "step": 6749, "time_per_iteration": 2.677762746810913 }, { "auxiliary_loss_clip": 0.01119763, "auxiliary_loss_mlp": 0.00771508, "balance_loss_clip": 1.04914761, "balance_loss_mlp": 1.00084698, "epoch": 0.40583195550879303, "flos": 19536734016000.0, "grad_norm": 1.7422987888005266, "language_loss": 0.75235945, "learning_rate": 2.6924305072594784e-06, "loss": 0.77127212, "num_input_tokens_seen": 144890975, "step": 6750, "time_per_iteration": 2.6956052780151367 }, { "auxiliary_loss_clip": 0.0111553, "auxiliary_loss_mlp": 0.01041156, "balance_loss_clip": 1.04812646, "balance_loss_mlp": 1.02654123, "epoch": 0.405892078761461, "flos": 22309468577280.0, "grad_norm": 2.479262216129207, "language_loss": 0.73942888, "learning_rate": 2.692065118669195e-06, "loss": 0.76099575, "num_input_tokens_seen": 144908170, "step": 6751, "time_per_iteration": 2.6845548152923584 }, { "auxiliary_loss_clip": 0.01086462, "auxiliary_loss_mlp": 0.01042521, "balance_loss_clip": 1.04832053, "balance_loss_mlp": 1.02627254, "epoch": 0.40595220201412896, "flos": 25484402701440.0, "grad_norm": 1.7042707146701068, "language_loss": 0.66690767, "learning_rate": 2.6916997038357326e-06, "loss": 0.68819749, "num_input_tokens_seen": 144928020, "step": 6752, "time_per_iteration": 4.372137784957886 }, { "auxiliary_loss_clip": 0.01086822, "auxiliary_loss_mlp": 0.0104486, "balance_loss_clip": 1.04698646, "balance_loss_mlp": 1.02896988, "epoch": 0.4060123252667969, "flos": 49856004103680.0, "grad_norm": 2.0675680438490374, "language_loss": 0.7062583, "learning_rate": 2.691334262772948e-06, "loss": 0.72757506, "num_input_tokens_seen": 144951240, "step": 6753, "time_per_iteration": 2.954685688018799 }, { "auxiliary_loss_clip": 0.0110904, "auxiliary_loss_mlp": 0.01037036, "balance_loss_clip": 1.04630709, "balance_loss_mlp": 1.02162218, "epoch": 0.4060724485194649, "flos": 21135476459520.0, "grad_norm": 1.6674578897026393, "language_loss": 0.72053552, "learning_rate": 2.690968795494699e-06, "loss": 0.74199629, "num_input_tokens_seen": 144969100, "step": 6754, "time_per_iteration": 5.758596420288086 }, { "auxiliary_loss_clip": 0.01097183, "auxiliary_loss_mlp": 0.01040377, "balance_loss_clip": 1.04531932, "balance_loss_mlp": 1.02634573, "epoch": 0.40613257177213286, "flos": 21758059918080.0, "grad_norm": 1.655202233640458, "language_loss": 0.8301084, "learning_rate": 2.690603302014844e-06, "loss": 0.851484, "num_input_tokens_seen": 144987065, "step": 6755, "time_per_iteration": 2.7983388900756836 }, { "auxiliary_loss_clip": 0.01086578, "auxiliary_loss_mlp": 0.01041496, "balance_loss_clip": 1.04638743, "balance_loss_mlp": 1.02645206, "epoch": 0.4061926950248008, "flos": 25555074710400.0, "grad_norm": 1.5597680276021608, "language_loss": 0.71212381, "learning_rate": 2.6902377823472426e-06, "loss": 0.73340452, "num_input_tokens_seen": 145007310, "step": 6756, "time_per_iteration": 2.8140816688537598 }, { "auxiliary_loss_clip": 0.01071802, "auxiliary_loss_mlp": 0.00773633, "balance_loss_clip": 1.04193711, "balance_loss_mlp": 1.00074661, "epoch": 0.4062528182774688, "flos": 23695799944320.0, "grad_norm": 2.0528550033278075, "language_loss": 0.79103237, "learning_rate": 2.689872236505755e-06, "loss": 0.80948675, "num_input_tokens_seen": 145026210, "step": 6757, "time_per_iteration": 4.472316741943359 }, { "auxiliary_loss_clip": 0.01112634, "auxiliary_loss_mlp": 0.0103161, "balance_loss_clip": 1.05197811, "balance_loss_mlp": 1.01777542, "epoch": 0.4063129415301368, "flos": 21726027964800.0, "grad_norm": 1.8573345394429819, "language_loss": 0.78500074, "learning_rate": 2.6895066645042437e-06, "loss": 0.80644321, "num_input_tokens_seen": 145045475, "step": 6758, "time_per_iteration": 2.732006072998047 }, { "auxiliary_loss_clip": 0.01096195, "auxiliary_loss_mlp": 0.0103788, "balance_loss_clip": 1.05092061, "balance_loss_mlp": 1.02355731, "epoch": 0.40637306478280477, "flos": 12787575206400.0, "grad_norm": 2.1068114153090254, "language_loss": 0.89142424, "learning_rate": 2.6891410663565703e-06, "loss": 0.91276503, "num_input_tokens_seen": 145062260, "step": 6759, "time_per_iteration": 2.768120288848877 }, { "auxiliary_loss_clip": 0.0109872, "auxiliary_loss_mlp": 0.01036325, "balance_loss_clip": 1.04916096, "balance_loss_mlp": 1.02241302, "epoch": 0.40643318803547274, "flos": 24024490323840.0, "grad_norm": 1.8143975866028277, "language_loss": 0.64272439, "learning_rate": 2.688775442076598e-06, "loss": 0.66407484, "num_input_tokens_seen": 145082470, "step": 6760, "time_per_iteration": 2.724278211593628 }, { "auxiliary_loss_clip": 0.01120642, "auxiliary_loss_mlp": 0.01035863, "balance_loss_clip": 1.04679084, "balance_loss_mlp": 1.02100921, "epoch": 0.4064933112881407, "flos": 25592421876480.0, "grad_norm": 1.9958038926303674, "language_loss": 0.75134486, "learning_rate": 2.688409791678193e-06, "loss": 0.77290988, "num_input_tokens_seen": 145105685, "step": 6761, "time_per_iteration": 2.81839919090271 }, { "auxiliary_loss_clip": 0.01097139, "auxiliary_loss_mlp": 0.0103932, "balance_loss_clip": 1.04636633, "balance_loss_mlp": 1.02598023, "epoch": 0.40655343454080867, "flos": 22054323294720.0, "grad_norm": 1.6270794268543942, "language_loss": 0.70070893, "learning_rate": 2.6880441151752185e-06, "loss": 0.72207355, "num_input_tokens_seen": 145125590, "step": 6762, "time_per_iteration": 2.6583070755004883 }, { "auxiliary_loss_clip": 0.0111912, "auxiliary_loss_mlp": 0.01032723, "balance_loss_clip": 1.0519619, "balance_loss_mlp": 1.01906157, "epoch": 0.40661355779347663, "flos": 26468893641600.0, "grad_norm": 1.6183981098694702, "language_loss": 0.73523986, "learning_rate": 2.6876784125815433e-06, "loss": 0.75675833, "num_input_tokens_seen": 145146810, "step": 6763, "time_per_iteration": 2.674830198287964 }, { "auxiliary_loss_clip": 0.01090413, "auxiliary_loss_mlp": 0.01035278, "balance_loss_clip": 1.04031014, "balance_loss_mlp": 1.0199244, "epoch": 0.4066736810461446, "flos": 13261129136640.0, "grad_norm": 2.065371393903723, "language_loss": 0.68689919, "learning_rate": 2.687312683911033e-06, "loss": 0.70815611, "num_input_tokens_seen": 145163130, "step": 6764, "time_per_iteration": 2.7424631118774414 }, { "auxiliary_loss_clip": 0.01104645, "auxiliary_loss_mlp": 0.01045832, "balance_loss_clip": 1.0461781, "balance_loss_mlp": 1.02930999, "epoch": 0.40673380429881256, "flos": 28803625758720.0, "grad_norm": 2.4553121190783, "language_loss": 0.91144872, "learning_rate": 2.686946929177557e-06, "loss": 0.93295348, "num_input_tokens_seen": 145181420, "step": 6765, "time_per_iteration": 2.705754280090332 }, { "auxiliary_loss_clip": 0.01121713, "auxiliary_loss_mlp": 0.01044564, "balance_loss_clip": 1.04742265, "balance_loss_mlp": 1.02876294, "epoch": 0.4067939275514805, "flos": 12495334152960.0, "grad_norm": 3.5832481358362673, "language_loss": 0.78673786, "learning_rate": 2.6865811483949855e-06, "loss": 0.80840063, "num_input_tokens_seen": 145198545, "step": 6766, "time_per_iteration": 2.6291732788085938 }, { "auxiliary_loss_clip": 0.01137462, "auxiliary_loss_mlp": 0.01042218, "balance_loss_clip": 1.0502665, "balance_loss_mlp": 1.02767396, "epoch": 0.4068540508041485, "flos": 18770508069120.0, "grad_norm": 2.203846422574217, "language_loss": 0.763403, "learning_rate": 2.6862153415771867e-06, "loss": 0.78519982, "num_input_tokens_seen": 145215835, "step": 6767, "time_per_iteration": 2.583494186401367 }, { "auxiliary_loss_clip": 0.01124058, "auxiliary_loss_mlp": 0.01038086, "balance_loss_clip": 1.0510633, "balance_loss_mlp": 1.02363229, "epoch": 0.40691417405681646, "flos": 28512821249280.0, "grad_norm": 2.5264206630573827, "language_loss": 0.77474844, "learning_rate": 2.685849508738034e-06, "loss": 0.79636991, "num_input_tokens_seen": 145236555, "step": 6768, "time_per_iteration": 2.6851589679718018 }, { "auxiliary_loss_clip": 0.01134023, "auxiliary_loss_mlp": 0.01032915, "balance_loss_clip": 1.05076826, "balance_loss_mlp": 1.01887226, "epoch": 0.4069742973094844, "flos": 20814040627200.0, "grad_norm": 1.8984102670150322, "language_loss": 0.87523651, "learning_rate": 2.6854836498913995e-06, "loss": 0.8969059, "num_input_tokens_seen": 145254595, "step": 6769, "time_per_iteration": 2.7267651557922363 }, { "auxiliary_loss_clip": 0.01105045, "auxiliary_loss_mlp": 0.0104448, "balance_loss_clip": 1.04947972, "balance_loss_mlp": 1.03028178, "epoch": 0.4070344205621524, "flos": 21470272151040.0, "grad_norm": 3.1498546640216234, "language_loss": 0.80951393, "learning_rate": 2.685117765051156e-06, "loss": 0.83100921, "num_input_tokens_seen": 145274005, "step": 6770, "time_per_iteration": 2.7272839546203613 }, { "auxiliary_loss_clip": 0.01136551, "auxiliary_loss_mlp": 0.0103334, "balance_loss_clip": 1.05021751, "balance_loss_mlp": 1.01781273, "epoch": 0.4070945438148204, "flos": 26830046937600.0, "grad_norm": 1.9062828764414554, "language_loss": 0.80237663, "learning_rate": 2.6847518542311783e-06, "loss": 0.82407558, "num_input_tokens_seen": 145294850, "step": 6771, "time_per_iteration": 2.5958163738250732 }, { "auxiliary_loss_clip": 0.01097968, "auxiliary_loss_mlp": 0.01044728, "balance_loss_clip": 1.04523098, "balance_loss_mlp": 1.02995801, "epoch": 0.4071546670674884, "flos": 26354158623360.0, "grad_norm": 1.4305431390081056, "language_loss": 0.76077241, "learning_rate": 2.6843859174453417e-06, "loss": 0.78219938, "num_input_tokens_seen": 145317050, "step": 6772, "time_per_iteration": 2.79603910446167 }, { "auxiliary_loss_clip": 0.01110195, "auxiliary_loss_mlp": 0.01043051, "balance_loss_clip": 1.04724109, "balance_loss_mlp": 1.0283401, "epoch": 0.40721479032015634, "flos": 17895401020800.0, "grad_norm": 1.8845179488175254, "language_loss": 0.81205189, "learning_rate": 2.6840199547075218e-06, "loss": 0.83358431, "num_input_tokens_seen": 145334480, "step": 6773, "time_per_iteration": 2.699221611022949 }, { "auxiliary_loss_clip": 0.01044722, "auxiliary_loss_mlp": 0.01025696, "balance_loss_clip": 1.03283918, "balance_loss_mlp": 1.02369332, "epoch": 0.4072749135728243, "flos": 49854570537600.0, "grad_norm": 0.9856620885651128, "language_loss": 0.64339805, "learning_rate": 2.683653966031597e-06, "loss": 0.6641022, "num_input_tokens_seen": 145388695, "step": 6774, "time_per_iteration": 3.147400140762329 }, { "auxiliary_loss_clip": 0.01089769, "auxiliary_loss_mlp": 0.01034709, "balance_loss_clip": 1.04686499, "balance_loss_mlp": 1.02041602, "epoch": 0.40733503682549227, "flos": 27563630400000.0, "grad_norm": 2.273542425652267, "language_loss": 0.72560251, "learning_rate": 2.683287951431446e-06, "loss": 0.74684727, "num_input_tokens_seen": 145408240, "step": 6775, "time_per_iteration": 2.787423849105835 }, { "auxiliary_loss_clip": 0.01105468, "auxiliary_loss_mlp": 0.00773431, "balance_loss_clip": 1.04828203, "balance_loss_mlp": 1.00090027, "epoch": 0.40739516007816023, "flos": 22126970551680.0, "grad_norm": 1.407391884450963, "language_loss": 0.77802348, "learning_rate": 2.6829219109209474e-06, "loss": 0.79681242, "num_input_tokens_seen": 145428395, "step": 6776, "time_per_iteration": 2.682548761367798 }, { "auxiliary_loss_clip": 0.01126451, "auxiliary_loss_mlp": 0.0104142, "balance_loss_clip": 1.05063748, "balance_loss_mlp": 1.02654302, "epoch": 0.4074552833308282, "flos": 23842243693440.0, "grad_norm": 2.817997105966, "language_loss": 0.79558617, "learning_rate": 2.682555844513981e-06, "loss": 0.81726491, "num_input_tokens_seen": 145448290, "step": 6777, "time_per_iteration": 2.7163336277008057 }, { "auxiliary_loss_clip": 0.01058602, "auxiliary_loss_mlp": 0.01001315, "balance_loss_clip": 1.02913916, "balance_loss_mlp": 0.99987298, "epoch": 0.40751540658349616, "flos": 58000008781440.0, "grad_norm": 0.6823534121540719, "language_loss": 0.5315339, "learning_rate": 2.6821897522244286e-06, "loss": 0.55213308, "num_input_tokens_seen": 145509785, "step": 6778, "time_per_iteration": 3.1687095165252686 }, { "auxiliary_loss_clip": 0.01135647, "auxiliary_loss_mlp": 0.00772948, "balance_loss_clip": 1.05136371, "balance_loss_mlp": 1.0008893, "epoch": 0.40757552983616413, "flos": 21214659991680.0, "grad_norm": 2.33935347330558, "language_loss": 0.82312328, "learning_rate": 2.6818236340661718e-06, "loss": 0.84220922, "num_input_tokens_seen": 145528620, "step": 6779, "time_per_iteration": 2.584343194961548 }, { "auxiliary_loss_clip": 0.0112113, "auxiliary_loss_mlp": 0.01036902, "balance_loss_clip": 1.04708278, "balance_loss_mlp": 1.02178645, "epoch": 0.4076356530888321, "flos": 26833530556800.0, "grad_norm": 1.5589663074171618, "language_loss": 0.76523471, "learning_rate": 2.6814574900530957e-06, "loss": 0.78681505, "num_input_tokens_seen": 145547775, "step": 6780, "time_per_iteration": 2.6672446727752686 }, { "auxiliary_loss_clip": 0.01117549, "auxiliary_loss_mlp": 0.01034773, "balance_loss_clip": 1.04889798, "balance_loss_mlp": 1.0212667, "epoch": 0.40769577634150006, "flos": 12203021272320.0, "grad_norm": 2.1749592638145123, "language_loss": 0.65482175, "learning_rate": 2.6810913201990827e-06, "loss": 0.67634493, "num_input_tokens_seen": 145564465, "step": 6781, "time_per_iteration": 2.612326145172119 }, { "auxiliary_loss_clip": 0.01107362, "auxiliary_loss_mlp": 0.01034378, "balance_loss_clip": 1.04472542, "balance_loss_mlp": 1.01922643, "epoch": 0.407755899594168, "flos": 33655264796160.0, "grad_norm": 1.5476514756078803, "language_loss": 0.71028459, "learning_rate": 2.6807251245180183e-06, "loss": 0.73170209, "num_input_tokens_seen": 145585965, "step": 6782, "time_per_iteration": 2.7483837604522705 }, { "auxiliary_loss_clip": 0.01124897, "auxiliary_loss_mlp": 0.01032941, "balance_loss_clip": 1.04813361, "balance_loss_mlp": 1.01833797, "epoch": 0.407816022846836, "flos": 20157342226560.0, "grad_norm": 1.9402515659282311, "language_loss": 0.82272756, "learning_rate": 2.6803589030237897e-06, "loss": 0.84430599, "num_input_tokens_seen": 145605000, "step": 6783, "time_per_iteration": 2.6157009601593018 }, { "auxiliary_loss_clip": 0.01117034, "auxiliary_loss_mlp": 0.01038578, "balance_loss_clip": 1.04744446, "balance_loss_mlp": 1.0235455, "epoch": 0.40787614609950396, "flos": 21178821196800.0, "grad_norm": 1.6713842677384587, "language_loss": 0.81044209, "learning_rate": 2.679992655730283e-06, "loss": 0.83199817, "num_input_tokens_seen": 145623740, "step": 6784, "time_per_iteration": 2.6054811477661133 }, { "auxiliary_loss_clip": 0.01107175, "auxiliary_loss_mlp": 0.01044009, "balance_loss_clip": 1.05123401, "balance_loss_mlp": 1.02725959, "epoch": 0.407936269352172, "flos": 20520650338560.0, "grad_norm": 2.1708514595694655, "language_loss": 0.65653902, "learning_rate": 2.679626382651386e-06, "loss": 0.67805088, "num_input_tokens_seen": 145643515, "step": 6785, "time_per_iteration": 2.816330671310425 }, { "auxiliary_loss_clip": 0.01115764, "auxiliary_loss_mlp": 0.01038413, "balance_loss_clip": 1.04758108, "balance_loss_mlp": 1.02347052, "epoch": 0.40799639260483994, "flos": 20118809911680.0, "grad_norm": 1.8523263348252557, "language_loss": 0.79567587, "learning_rate": 2.679260083800989e-06, "loss": 0.81721765, "num_input_tokens_seen": 145660890, "step": 6786, "time_per_iteration": 2.629009962081909 }, { "auxiliary_loss_clip": 0.01132323, "auxiliary_loss_mlp": 0.01042176, "balance_loss_clip": 1.04911721, "balance_loss_mlp": 1.02866411, "epoch": 0.4080565158575079, "flos": 20997328752000.0, "grad_norm": 1.716981063220771, "language_loss": 0.81870878, "learning_rate": 2.678893759192982e-06, "loss": 0.84045374, "num_input_tokens_seen": 145680070, "step": 6787, "time_per_iteration": 2.6304709911346436 }, { "auxiliary_loss_clip": 0.01117339, "auxiliary_loss_mlp": 0.0103421, "balance_loss_clip": 1.04705477, "balance_loss_mlp": 1.02019691, "epoch": 0.40811663911017587, "flos": 19317714837120.0, "grad_norm": 1.8408166150848957, "language_loss": 0.67954206, "learning_rate": 2.678527408841255e-06, "loss": 0.70105749, "num_input_tokens_seen": 145698010, "step": 6788, "time_per_iteration": 2.6314821243286133 }, { "auxiliary_loss_clip": 0.01102044, "auxiliary_loss_mlp": 0.01047553, "balance_loss_clip": 1.04318452, "balance_loss_mlp": 1.03095889, "epoch": 0.40817676236284384, "flos": 40625382119040.0, "grad_norm": 2.0355882471601014, "language_loss": 0.66265976, "learning_rate": 2.678161032759701e-06, "loss": 0.6841557, "num_input_tokens_seen": 145722215, "step": 6789, "time_per_iteration": 2.8329808712005615 }, { "auxiliary_loss_clip": 0.01084234, "auxiliary_loss_mlp": 0.01036407, "balance_loss_clip": 1.04236126, "balance_loss_mlp": 1.021101, "epoch": 0.4082368856155118, "flos": 20522086882560.0, "grad_norm": 1.7612282198179636, "language_loss": 0.60220939, "learning_rate": 2.6777946309622123e-06, "loss": 0.62341583, "num_input_tokens_seen": 145741090, "step": 6790, "time_per_iteration": 2.705007791519165 }, { "auxiliary_loss_clip": 0.01115221, "auxiliary_loss_mlp": 0.0104035, "balance_loss_clip": 1.04814339, "balance_loss_mlp": 1.02482939, "epoch": 0.40829700886817977, "flos": 11427745098240.0, "grad_norm": 2.946877052856992, "language_loss": 0.69406867, "learning_rate": 2.677428203462683e-06, "loss": 0.71562433, "num_input_tokens_seen": 145754985, "step": 6791, "time_per_iteration": 2.629746675491333 }, { "auxiliary_loss_clip": 0.01047663, "auxiliary_loss_mlp": 0.01005727, "balance_loss_clip": 1.02732182, "balance_loss_mlp": 1.00409365, "epoch": 0.40835713212084773, "flos": 67330677121920.0, "grad_norm": 0.7512190297569652, "language_loss": 0.59569383, "learning_rate": 2.6770617502750093e-06, "loss": 0.61622775, "num_input_tokens_seen": 145815260, "step": 6792, "time_per_iteration": 4.680825710296631 }, { "auxiliary_loss_clip": 0.0113903, "auxiliary_loss_mlp": 0.01043884, "balance_loss_clip": 1.05271673, "balance_loss_mlp": 1.02787423, "epoch": 0.4084172553735157, "flos": 21762010414080.0, "grad_norm": 1.9475859316217028, "language_loss": 0.80324817, "learning_rate": 2.6766952714130857e-06, "loss": 0.8250773, "num_input_tokens_seen": 145832665, "step": 6793, "time_per_iteration": 4.095003128051758 }, { "auxiliary_loss_clip": 0.01124776, "auxiliary_loss_mlp": 0.01044052, "balance_loss_clip": 1.04916334, "balance_loss_mlp": 1.02811408, "epoch": 0.40847737862618366, "flos": 27417258478080.0, "grad_norm": 1.8631596367030567, "language_loss": 0.84994531, "learning_rate": 2.6763287668908094e-06, "loss": 0.87163359, "num_input_tokens_seen": 145850240, "step": 6794, "time_per_iteration": 4.198231935501099 }, { "auxiliary_loss_clip": 0.01100105, "auxiliary_loss_mlp": 0.01040677, "balance_loss_clip": 1.0469923, "balance_loss_mlp": 1.02570391, "epoch": 0.4085375018788516, "flos": 18587255857920.0, "grad_norm": 2.862264995792616, "language_loss": 0.7989887, "learning_rate": 2.6759622367220788e-06, "loss": 0.82039654, "num_input_tokens_seen": 145869545, "step": 6795, "time_per_iteration": 2.7477807998657227 }, { "auxiliary_loss_clip": 0.01121705, "auxiliary_loss_mlp": 0.01039831, "balance_loss_clip": 1.04831719, "balance_loss_mlp": 1.02385116, "epoch": 0.4085976251315196, "flos": 15411783029760.0, "grad_norm": 2.7561951150254633, "language_loss": 0.70605052, "learning_rate": 2.675595680920792e-06, "loss": 0.7276659, "num_input_tokens_seen": 145884025, "step": 6796, "time_per_iteration": 4.261413335800171 }, { "auxiliary_loss_clip": 0.01116135, "auxiliary_loss_mlp": 0.0077634, "balance_loss_clip": 1.04606998, "balance_loss_mlp": 1.00082135, "epoch": 0.40865774838418756, "flos": 21252222639360.0, "grad_norm": 1.6356766399676357, "language_loss": 0.78218019, "learning_rate": 2.6752290995008498e-06, "loss": 0.80110496, "num_input_tokens_seen": 145903210, "step": 6797, "time_per_iteration": 2.6453776359558105 }, { "auxiliary_loss_clip": 0.01121906, "auxiliary_loss_mlp": 0.0105076, "balance_loss_clip": 1.04562223, "balance_loss_mlp": 1.03619301, "epoch": 0.4087178716368556, "flos": 13772245714560.0, "grad_norm": 2.2166943768421534, "language_loss": 0.86117017, "learning_rate": 2.6748624924761523e-06, "loss": 0.8828969, "num_input_tokens_seen": 145920985, "step": 6798, "time_per_iteration": 2.67480731010437 }, { "auxiliary_loss_clip": 0.01130307, "auxiliary_loss_mlp": 0.01042728, "balance_loss_clip": 1.04780984, "balance_loss_mlp": 1.02931094, "epoch": 0.40877799488952354, "flos": 23621752056960.0, "grad_norm": 1.473518761352831, "language_loss": 0.84252232, "learning_rate": 2.674495859860601e-06, "loss": 0.86425269, "num_input_tokens_seen": 145940350, "step": 6799, "time_per_iteration": 2.6273906230926514 }, { "auxiliary_loss_clip": 0.01093085, "auxiliary_loss_mlp": 0.01052249, "balance_loss_clip": 1.04557848, "balance_loss_mlp": 1.03427255, "epoch": 0.4088381181421915, "flos": 20918791664640.0, "grad_norm": 2.1256660898165913, "language_loss": 0.83567548, "learning_rate": 2.6741292016681e-06, "loss": 0.85712886, "num_input_tokens_seen": 145957460, "step": 6800, "time_per_iteration": 2.7064268589019775 }, { "auxiliary_loss_clip": 0.01119062, "auxiliary_loss_mlp": 0.01043239, "balance_loss_clip": 1.04534221, "balance_loss_mlp": 1.02778912, "epoch": 0.4088982413948595, "flos": 13297578462720.0, "grad_norm": 2.1612690472856353, "language_loss": 0.74336559, "learning_rate": 2.6737625179125514e-06, "loss": 0.76498854, "num_input_tokens_seen": 145975285, "step": 6801, "time_per_iteration": 2.631030321121216 }, { "auxiliary_loss_clip": 0.01122834, "auxiliary_loss_mlp": 0.0104231, "balance_loss_clip": 1.04511952, "balance_loss_mlp": 1.02699137, "epoch": 0.40895836464752744, "flos": 15267673664640.0, "grad_norm": 2.1715684147319907, "language_loss": 0.80430126, "learning_rate": 2.673395808607861e-06, "loss": 0.82595277, "num_input_tokens_seen": 145989150, "step": 6802, "time_per_iteration": 2.5802509784698486 }, { "auxiliary_loss_clip": 0.0112096, "auxiliary_loss_mlp": 0.01044934, "balance_loss_clip": 1.04893684, "balance_loss_mlp": 1.02843595, "epoch": 0.4090184879001954, "flos": 14501411804160.0, "grad_norm": 2.2436343912353283, "language_loss": 0.75734484, "learning_rate": 2.673029073767934e-06, "loss": 0.77900374, "num_input_tokens_seen": 146006980, "step": 6803, "time_per_iteration": 2.609602689743042 }, { "auxiliary_loss_clip": 0.0106898, "auxiliary_loss_mlp": 0.00773774, "balance_loss_clip": 1.04085743, "balance_loss_mlp": 1.00086641, "epoch": 0.40907861115286337, "flos": 13881593692800.0, "grad_norm": 1.8843395194203503, "language_loss": 0.78824151, "learning_rate": 2.6726623134066764e-06, "loss": 0.806669, "num_input_tokens_seen": 146025125, "step": 6804, "time_per_iteration": 2.7654101848602295 }, { "auxiliary_loss_clip": 0.01137979, "auxiliary_loss_mlp": 0.01045985, "balance_loss_clip": 1.04858065, "balance_loss_mlp": 1.03147769, "epoch": 0.40913873440553133, "flos": 28037615293440.0, "grad_norm": 2.2298994676504225, "language_loss": 0.75672269, "learning_rate": 2.672295527537998e-06, "loss": 0.77856231, "num_input_tokens_seen": 146044990, "step": 6805, "time_per_iteration": 2.680368185043335 }, { "auxiliary_loss_clip": 0.01089569, "auxiliary_loss_mlp": 0.01047964, "balance_loss_clip": 1.04342198, "balance_loss_mlp": 1.03309822, "epoch": 0.4091988576581993, "flos": 21618188357760.0, "grad_norm": 1.8743994628433338, "language_loss": 0.79440027, "learning_rate": 2.671928716175804e-06, "loss": 0.81577563, "num_input_tokens_seen": 146066045, "step": 6806, "time_per_iteration": 2.8212954998016357 }, { "auxiliary_loss_clip": 0.01126847, "auxiliary_loss_mlp": 0.01038318, "balance_loss_clip": 1.04977083, "balance_loss_mlp": 1.02272499, "epoch": 0.40925898091086726, "flos": 25224085860480.0, "grad_norm": 1.915245819215902, "language_loss": 0.71779263, "learning_rate": 2.671561879334007e-06, "loss": 0.73944426, "num_input_tokens_seen": 146086280, "step": 6807, "time_per_iteration": 2.7223496437072754 }, { "auxiliary_loss_clip": 0.01034248, "auxiliary_loss_mlp": 0.01005874, "balance_loss_clip": 1.0356338, "balance_loss_mlp": 1.00364494, "epoch": 0.40931910416353523, "flos": 68930568800640.0, "grad_norm": 0.8232207365722912, "language_loss": 0.58807027, "learning_rate": 2.6711950170265155e-06, "loss": 0.60847151, "num_input_tokens_seen": 146148840, "step": 6808, "time_per_iteration": 3.2951159477233887 }, { "auxiliary_loss_clip": 0.01113663, "auxiliary_loss_mlp": 0.01048693, "balance_loss_clip": 1.04732299, "balance_loss_mlp": 1.03419733, "epoch": 0.4093792274162032, "flos": 20189553747840.0, "grad_norm": 1.705790136999867, "language_loss": 0.54954052, "learning_rate": 2.670828129267242e-06, "loss": 0.57116413, "num_input_tokens_seen": 146166195, "step": 6809, "time_per_iteration": 2.663210868835449 }, { "auxiliary_loss_clip": 0.01108384, "auxiliary_loss_mlp": 0.01031551, "balance_loss_clip": 1.0446471, "balance_loss_mlp": 1.01682281, "epoch": 0.40943935066887116, "flos": 25228754628480.0, "grad_norm": 1.7788203343455933, "language_loss": 0.83185786, "learning_rate": 2.6704612160700983e-06, "loss": 0.85325718, "num_input_tokens_seen": 146185045, "step": 6810, "time_per_iteration": 2.683969020843506 }, { "auxiliary_loss_clip": 0.01105454, "auxiliary_loss_mlp": 0.01053382, "balance_loss_clip": 1.0451473, "balance_loss_mlp": 1.03608489, "epoch": 0.4094994739215392, "flos": 23255319461760.0, "grad_norm": 2.954085357706404, "language_loss": 0.77419919, "learning_rate": 2.670094277448999e-06, "loss": 0.79578757, "num_input_tokens_seen": 146204655, "step": 6811, "time_per_iteration": 2.6727347373962402 }, { "auxiliary_loss_clip": 0.01135893, "auxiliary_loss_mlp": 0.01036603, "balance_loss_clip": 1.04917455, "balance_loss_mlp": 1.02042687, "epoch": 0.40955959717420715, "flos": 17382165540480.0, "grad_norm": 1.6058461501005727, "language_loss": 0.70272696, "learning_rate": 2.669727313417857e-06, "loss": 0.72445196, "num_input_tokens_seen": 146222000, "step": 6812, "time_per_iteration": 2.6267693042755127 }, { "auxiliary_loss_clip": 0.01132783, "auxiliary_loss_mlp": 0.01048088, "balance_loss_clip": 1.04780114, "balance_loss_mlp": 1.03210163, "epoch": 0.4096197204268751, "flos": 25082418620160.0, "grad_norm": 1.9378136524882912, "language_loss": 0.66298044, "learning_rate": 2.6693603239905872e-06, "loss": 0.68478918, "num_input_tokens_seen": 146242630, "step": 6813, "time_per_iteration": 2.6447062492370605 }, { "auxiliary_loss_clip": 0.01117463, "auxiliary_loss_mlp": 0.00774455, "balance_loss_clip": 1.04784274, "balance_loss_mlp": 1.0009681, "epoch": 0.4096798436795431, "flos": 30586769648640.0, "grad_norm": 1.8922051995482987, "language_loss": 0.73949504, "learning_rate": 2.6689933091811087e-06, "loss": 0.75841421, "num_input_tokens_seen": 146263070, "step": 6814, "time_per_iteration": 2.7325870990753174 }, { "auxiliary_loss_clip": 0.0108334, "auxiliary_loss_mlp": 0.01038435, "balance_loss_clip": 1.04231858, "balance_loss_mlp": 1.02281821, "epoch": 0.40973996693221104, "flos": 24133622820480.0, "grad_norm": 2.0095509453801728, "language_loss": 0.65957761, "learning_rate": 2.6686262690033357e-06, "loss": 0.68079543, "num_input_tokens_seen": 146282890, "step": 6815, "time_per_iteration": 2.780668258666992 }, { "auxiliary_loss_clip": 0.01122383, "auxiliary_loss_mlp": 0.01045791, "balance_loss_clip": 1.05130887, "balance_loss_mlp": 1.03100336, "epoch": 0.409800090184879, "flos": 23988974751360.0, "grad_norm": 1.5903260932613887, "language_loss": 0.76872814, "learning_rate": 2.668259203471188e-06, "loss": 0.79040992, "num_input_tokens_seen": 146301755, "step": 6816, "time_per_iteration": 2.6901748180389404 }, { "auxiliary_loss_clip": 0.01118517, "auxiliary_loss_mlp": 0.0104269, "balance_loss_clip": 1.05008173, "balance_loss_mlp": 1.02716875, "epoch": 0.40986021343754697, "flos": 16143678552960.0, "grad_norm": 2.2788575244766966, "language_loss": 0.81621635, "learning_rate": 2.6678921125985843e-06, "loss": 0.8378284, "num_input_tokens_seen": 146316835, "step": 6817, "time_per_iteration": 2.6194167137145996 }, { "auxiliary_loss_clip": 0.01114033, "auxiliary_loss_mlp": 0.01046853, "balance_loss_clip": 1.04633307, "balance_loss_mlp": 1.02987719, "epoch": 0.40992033669021494, "flos": 24790824011520.0, "grad_norm": 2.698849637369061, "language_loss": 0.8016938, "learning_rate": 2.667524996399444e-06, "loss": 0.82330263, "num_input_tokens_seen": 146336650, "step": 6818, "time_per_iteration": 2.8449223041534424 }, { "auxiliary_loss_clip": 0.0111157, "auxiliary_loss_mlp": 0.01039221, "balance_loss_clip": 1.05212271, "balance_loss_mlp": 1.02459419, "epoch": 0.4099804599428829, "flos": 29641888431360.0, "grad_norm": 1.781955605236185, "language_loss": 0.66531783, "learning_rate": 2.66715785488769e-06, "loss": 0.68682575, "num_input_tokens_seen": 146357640, "step": 6819, "time_per_iteration": 2.8016393184661865 }, { "auxiliary_loss_clip": 0.01118061, "auxiliary_loss_mlp": 0.01052321, "balance_loss_clip": 1.05068922, "balance_loss_mlp": 1.03429687, "epoch": 0.41004058319555087, "flos": 24826590979200.0, "grad_norm": 1.7017427969889725, "language_loss": 0.85438228, "learning_rate": 2.6667906880772428e-06, "loss": 0.87608612, "num_input_tokens_seen": 146379325, "step": 6820, "time_per_iteration": 2.7182726860046387 }, { "auxiliary_loss_clip": 0.01127803, "auxiliary_loss_mlp": 0.01035666, "balance_loss_clip": 1.05361152, "balance_loss_mlp": 1.02019835, "epoch": 0.41010070644821883, "flos": 25737464995200.0, "grad_norm": 1.8388824613750698, "language_loss": 0.71235943, "learning_rate": 2.6664234959820256e-06, "loss": 0.73399413, "num_input_tokens_seen": 146398635, "step": 6821, "time_per_iteration": 2.6716413497924805 }, { "auxiliary_loss_clip": 0.01123531, "auxiliary_loss_mlp": 0.01036959, "balance_loss_clip": 1.05253363, "balance_loss_mlp": 1.02228427, "epoch": 0.4101608297008868, "flos": 22346061557760.0, "grad_norm": 1.9657765704612085, "language_loss": 0.74500406, "learning_rate": 2.6660562786159634e-06, "loss": 0.76660895, "num_input_tokens_seen": 146417585, "step": 6822, "time_per_iteration": 2.652270793914795 }, { "auxiliary_loss_clip": 0.01118135, "auxiliary_loss_mlp": 0.01038075, "balance_loss_clip": 1.05201709, "balance_loss_mlp": 1.02313757, "epoch": 0.41022095295355476, "flos": 21945083057280.0, "grad_norm": 2.1947910409652116, "language_loss": 0.75539672, "learning_rate": 2.6656890359929796e-06, "loss": 0.77695882, "num_input_tokens_seen": 146437035, "step": 6823, "time_per_iteration": 2.767306327819824 }, { "auxiliary_loss_clip": 0.01095631, "auxiliary_loss_mlp": 0.01044283, "balance_loss_clip": 1.05394316, "balance_loss_mlp": 1.02697372, "epoch": 0.4102810762062228, "flos": 27450511493760.0, "grad_norm": 2.0691169068872086, "language_loss": 0.73186851, "learning_rate": 2.665321768127001e-06, "loss": 0.75326765, "num_input_tokens_seen": 146457370, "step": 6824, "time_per_iteration": 2.793712615966797 }, { "auxiliary_loss_clip": 0.01110429, "auxiliary_loss_mlp": 0.0103962, "balance_loss_clip": 1.05025351, "balance_loss_mlp": 1.02316904, "epoch": 0.41034119945889075, "flos": 24499265316480.0, "grad_norm": 2.036284375586757, "language_loss": 0.72426587, "learning_rate": 2.6649544750319548e-06, "loss": 0.7457664, "num_input_tokens_seen": 146478105, "step": 6825, "time_per_iteration": 2.764977216720581 }, { "auxiliary_loss_clip": 0.01097265, "auxiliary_loss_mlp": 0.01045464, "balance_loss_clip": 1.04605746, "balance_loss_mlp": 1.03027654, "epoch": 0.4104013227115587, "flos": 24352641999360.0, "grad_norm": 1.8249289811640228, "language_loss": 0.85226274, "learning_rate": 2.664587156721768e-06, "loss": 0.87369001, "num_input_tokens_seen": 146497835, "step": 6826, "time_per_iteration": 2.7680137157440186 }, { "auxiliary_loss_clip": 0.01115829, "auxiliary_loss_mlp": 0.00775051, "balance_loss_clip": 1.05372024, "balance_loss_mlp": 1.00099707, "epoch": 0.4104614459642267, "flos": 23729340268800.0, "grad_norm": 1.8772466232345664, "language_loss": 0.66074443, "learning_rate": 2.6642198132103696e-06, "loss": 0.67965323, "num_input_tokens_seen": 146517735, "step": 6827, "time_per_iteration": 2.791212797164917 }, { "auxiliary_loss_clip": 0.01113343, "auxiliary_loss_mlp": 0.01033945, "balance_loss_clip": 1.04942787, "balance_loss_mlp": 1.01910365, "epoch": 0.41052156921689464, "flos": 22127976132480.0, "grad_norm": 2.0535618692070914, "language_loss": 0.72474444, "learning_rate": 2.663852444511689e-06, "loss": 0.74621731, "num_input_tokens_seen": 146537640, "step": 6828, "time_per_iteration": 2.6675491333007812 }, { "auxiliary_loss_clip": 0.01111113, "auxiliary_loss_mlp": 0.01048054, "balance_loss_clip": 1.04920423, "balance_loss_mlp": 1.03068542, "epoch": 0.4105816924695626, "flos": 20084371747200.0, "grad_norm": 2.67524304617312, "language_loss": 0.83464897, "learning_rate": 2.6634850506396574e-06, "loss": 0.85624069, "num_input_tokens_seen": 146554695, "step": 6829, "time_per_iteration": 2.762298107147217 }, { "auxiliary_loss_clip": 0.01124628, "auxiliary_loss_mlp": 0.01039003, "balance_loss_clip": 1.05062759, "balance_loss_mlp": 1.02405417, "epoch": 0.4106418157222306, "flos": 18076785724800.0, "grad_norm": 1.5363498208464375, "language_loss": 0.89878875, "learning_rate": 2.663117631608206e-06, "loss": 0.92042506, "num_input_tokens_seen": 146573740, "step": 6830, "time_per_iteration": 2.7726032733917236 }, { "auxiliary_loss_clip": 0.01098336, "auxiliary_loss_mlp": 0.01034169, "balance_loss_clip": 1.04938424, "balance_loss_mlp": 1.01833797, "epoch": 0.41070193897489854, "flos": 21647850013440.0, "grad_norm": 1.7853690904757185, "language_loss": 0.65810287, "learning_rate": 2.662750187431268e-06, "loss": 0.67942798, "num_input_tokens_seen": 146592885, "step": 6831, "time_per_iteration": 4.213804244995117 }, { "auxiliary_loss_clip": 0.01137663, "auxiliary_loss_mlp": 0.01039058, "balance_loss_clip": 1.05280805, "balance_loss_mlp": 1.02361393, "epoch": 0.4107620622275665, "flos": 26648195356800.0, "grad_norm": 1.7075421510763598, "language_loss": 0.69710165, "learning_rate": 2.662382718122776e-06, "loss": 0.71886885, "num_input_tokens_seen": 146611995, "step": 6832, "time_per_iteration": 4.146309852600098 }, { "auxiliary_loss_clip": 0.01089843, "auxiliary_loss_mlp": 0.01042117, "balance_loss_clip": 1.05080116, "balance_loss_mlp": 1.02703142, "epoch": 0.41082218548023447, "flos": 18734310138240.0, "grad_norm": 2.3374205466797537, "language_loss": 0.73910743, "learning_rate": 2.662015223696666e-06, "loss": 0.760427, "num_input_tokens_seen": 146628045, "step": 6833, "time_per_iteration": 4.23652195930481 }, { "auxiliary_loss_clip": 0.01083988, "auxiliary_loss_mlp": 0.01045346, "balance_loss_clip": 1.04393578, "balance_loss_mlp": 1.02754784, "epoch": 0.41088230873290243, "flos": 22893771116160.0, "grad_norm": 1.56012293193972, "language_loss": 0.7299009, "learning_rate": 2.6616477041668713e-06, "loss": 0.75119424, "num_input_tokens_seen": 146648355, "step": 6834, "time_per_iteration": 2.72806453704834 }, { "auxiliary_loss_clip": 0.0113018, "auxiliary_loss_mlp": 0.01049062, "balance_loss_clip": 1.05203891, "balance_loss_mlp": 1.03320765, "epoch": 0.4109424319855704, "flos": 24276978000000.0, "grad_norm": 1.7978087117059114, "language_loss": 0.71254998, "learning_rate": 2.661280159547329e-06, "loss": 0.73434246, "num_input_tokens_seen": 146668370, "step": 6835, "time_per_iteration": 4.406278133392334 }, { "auxiliary_loss_clip": 0.01130021, "auxiliary_loss_mlp": 0.01043294, "balance_loss_clip": 1.05188155, "balance_loss_mlp": 1.02630687, "epoch": 0.41100255523823837, "flos": 12969139478400.0, "grad_norm": 1.9060780079348063, "language_loss": 0.87366456, "learning_rate": 2.660912589851978e-06, "loss": 0.89539772, "num_input_tokens_seen": 146686665, "step": 6836, "time_per_iteration": 2.6482133865356445 }, { "auxiliary_loss_clip": 0.0112613, "auxiliary_loss_mlp": 0.01040074, "balance_loss_clip": 1.05334806, "balance_loss_mlp": 1.02461267, "epoch": 0.4110626784909064, "flos": 23145648261120.0, "grad_norm": 6.565804686602276, "language_loss": 0.69167227, "learning_rate": 2.6605449950947547e-06, "loss": 0.71333432, "num_input_tokens_seen": 146706570, "step": 6837, "time_per_iteration": 2.682241916656494 }, { "auxiliary_loss_clip": 0.0114114, "auxiliary_loss_mlp": 0.01041377, "balance_loss_clip": 1.0544312, "balance_loss_mlp": 1.02540302, "epoch": 0.41112280174357435, "flos": 22747399194240.0, "grad_norm": 1.8671169017141842, "language_loss": 0.75408459, "learning_rate": 2.660177375289599e-06, "loss": 0.77590978, "num_input_tokens_seen": 146723425, "step": 6838, "time_per_iteration": 2.625422239303589 }, { "auxiliary_loss_clip": 0.0110141, "auxiliary_loss_mlp": 0.01042257, "balance_loss_clip": 1.0521034, "balance_loss_mlp": 1.02617598, "epoch": 0.4111829249962423, "flos": 21102403011840.0, "grad_norm": 2.061873935528421, "language_loss": 0.82113552, "learning_rate": 2.659809730450451e-06, "loss": 0.84257221, "num_input_tokens_seen": 146741640, "step": 6839, "time_per_iteration": 2.7850279808044434 }, { "auxiliary_loss_clip": 0.01135439, "auxiliary_loss_mlp": 0.0103927, "balance_loss_clip": 1.05122948, "balance_loss_mlp": 1.02421379, "epoch": 0.4112430482489103, "flos": 21505787723520.0, "grad_norm": 5.701831641175022, "language_loss": 0.80077577, "learning_rate": 2.6594420605912523e-06, "loss": 0.82252288, "num_input_tokens_seen": 146759195, "step": 6840, "time_per_iteration": 2.656494140625 }, { "auxiliary_loss_clip": 0.01120054, "auxiliary_loss_mlp": 0.01035027, "balance_loss_clip": 1.0487783, "balance_loss_mlp": 1.02117467, "epoch": 0.41130317150157825, "flos": 19570022945280.0, "grad_norm": 1.862146821875906, "language_loss": 0.6778084, "learning_rate": 2.6590743657259442e-06, "loss": 0.69935924, "num_input_tokens_seen": 146774990, "step": 6841, "time_per_iteration": 2.6612377166748047 }, { "auxiliary_loss_clip": 0.01055489, "auxiliary_loss_mlp": 0.01004436, "balance_loss_clip": 1.03532803, "balance_loss_mlp": 1.00270772, "epoch": 0.4113632947542462, "flos": 62383157706240.0, "grad_norm": 0.8163554776107808, "language_loss": 0.59717554, "learning_rate": 2.65870664586847e-06, "loss": 0.61777478, "num_input_tokens_seen": 146839610, "step": 6842, "time_per_iteration": 3.2157862186431885 }, { "auxiliary_loss_clip": 0.01120166, "auxiliary_loss_mlp": 0.01038325, "balance_loss_clip": 1.05330658, "balance_loss_mlp": 1.02400184, "epoch": 0.4114234180069142, "flos": 13918617636480.0, "grad_norm": 2.3538351775584156, "language_loss": 0.70293331, "learning_rate": 2.6583389010327742e-06, "loss": 0.72451818, "num_input_tokens_seen": 146857360, "step": 6843, "time_per_iteration": 2.6172597408294678 }, { "auxiliary_loss_clip": 0.01014929, "auxiliary_loss_mlp": 0.01002572, "balance_loss_clip": 1.01983762, "balance_loss_mlp": 1.00047398, "epoch": 0.41148354125958214, "flos": 64928505219840.0, "grad_norm": 0.7263883634768764, "language_loss": 0.53593683, "learning_rate": 2.6579711312328013e-06, "loss": 0.55611187, "num_input_tokens_seen": 146917055, "step": 6844, "time_per_iteration": 3.21069598197937 }, { "auxiliary_loss_clip": 0.01124589, "auxiliary_loss_mlp": 0.01041114, "balance_loss_clip": 1.05226612, "balance_loss_mlp": 1.02679706, "epoch": 0.4115436645122501, "flos": 18728779443840.0, "grad_norm": 1.870188515464334, "language_loss": 0.66065252, "learning_rate": 2.6576033364824967e-06, "loss": 0.68230951, "num_input_tokens_seen": 146935215, "step": 6845, "time_per_iteration": 2.6289329528808594 }, { "auxiliary_loss_clip": 0.01134084, "auxiliary_loss_mlp": 0.01038479, "balance_loss_clip": 1.05250192, "balance_loss_mlp": 1.02355433, "epoch": 0.41160378776491807, "flos": 16252918790400.0, "grad_norm": 2.0932374873894655, "language_loss": 0.70088863, "learning_rate": 2.657235516795808e-06, "loss": 0.72261429, "num_input_tokens_seen": 146951970, "step": 6846, "time_per_iteration": 2.578780174255371 }, { "auxiliary_loss_clip": 0.01111001, "auxiliary_loss_mlp": 0.01041074, "balance_loss_clip": 1.04926157, "balance_loss_mlp": 1.0254035, "epoch": 0.41166391101758604, "flos": 27970031854080.0, "grad_norm": 1.8006459441278344, "language_loss": 0.65271175, "learning_rate": 2.6568676721866826e-06, "loss": 0.67423248, "num_input_tokens_seen": 146975615, "step": 6847, "time_per_iteration": 2.7504281997680664 }, { "auxiliary_loss_clip": 0.01111807, "auxiliary_loss_mlp": 0.01046607, "balance_loss_clip": 1.04943776, "balance_loss_mlp": 1.03167558, "epoch": 0.411724034270254, "flos": 34131296764800.0, "grad_norm": 1.371398558221349, "language_loss": 0.70655453, "learning_rate": 2.656499802669069e-06, "loss": 0.72813869, "num_input_tokens_seen": 146998855, "step": 6848, "time_per_iteration": 2.7842190265655518 }, { "auxiliary_loss_clip": 0.01032604, "auxiliary_loss_mlp": 0.00753743, "balance_loss_clip": 1.02356267, "balance_loss_mlp": 1.00076866, "epoch": 0.41178415752292197, "flos": 67923670752000.0, "grad_norm": 0.9037714041830832, "language_loss": 0.5627954, "learning_rate": 2.6561319082569174e-06, "loss": 0.58065879, "num_input_tokens_seen": 147062710, "step": 6849, "time_per_iteration": 3.3100218772888184 }, { "auxiliary_loss_clip": 0.01115279, "auxiliary_loss_mlp": 0.0104026, "balance_loss_clip": 1.05035055, "balance_loss_mlp": 1.0254786, "epoch": 0.41184428077558993, "flos": 34313938444800.0, "grad_norm": 2.6235370790375767, "language_loss": 0.76318872, "learning_rate": 2.6557639889641783e-06, "loss": 0.78474414, "num_input_tokens_seen": 147086075, "step": 6850, "time_per_iteration": 2.879258632659912 }, { "auxiliary_loss_clip": 0.010812, "auxiliary_loss_mlp": 0.01037976, "balance_loss_clip": 1.0412885, "balance_loss_mlp": 1.02356339, "epoch": 0.41190440402825795, "flos": 35444118948480.0, "grad_norm": 1.5473555335002718, "language_loss": 0.68093288, "learning_rate": 2.6553960448048025e-06, "loss": 0.70212466, "num_input_tokens_seen": 147107590, "step": 6851, "time_per_iteration": 2.931530237197876 }, { "auxiliary_loss_clip": 0.01101431, "auxiliary_loss_mlp": 0.01049233, "balance_loss_clip": 1.0504117, "balance_loss_mlp": 1.03207839, "epoch": 0.4119645272809259, "flos": 20849879422080.0, "grad_norm": 2.1361960755807634, "language_loss": 0.79698718, "learning_rate": 2.655028075792743e-06, "loss": 0.81849384, "num_input_tokens_seen": 147123715, "step": 6852, "time_per_iteration": 2.6807408332824707 }, { "auxiliary_loss_clip": 0.01141214, "auxiliary_loss_mlp": 0.01043074, "balance_loss_clip": 1.05327845, "balance_loss_mlp": 1.02688491, "epoch": 0.4120246505335939, "flos": 27562050201600.0, "grad_norm": 1.901908158264802, "language_loss": 0.77750659, "learning_rate": 2.6546600819419537e-06, "loss": 0.79934943, "num_input_tokens_seen": 147144290, "step": 6853, "time_per_iteration": 2.699430227279663 }, { "auxiliary_loss_clip": 0.01126437, "auxiliary_loss_mlp": 0.01046106, "balance_loss_clip": 1.04821801, "balance_loss_mlp": 1.0298574, "epoch": 0.41208477378626185, "flos": 37815444046080.0, "grad_norm": 1.8090743517086876, "language_loss": 0.65556479, "learning_rate": 2.6542920632663883e-06, "loss": 0.6772902, "num_input_tokens_seen": 147166340, "step": 6854, "time_per_iteration": 2.8111729621887207 }, { "auxiliary_loss_clip": 0.01104516, "auxiliary_loss_mlp": 0.01052436, "balance_loss_clip": 1.04534888, "balance_loss_mlp": 1.03615212, "epoch": 0.4121448970389298, "flos": 23440762402560.0, "grad_norm": 2.1224683572406917, "language_loss": 0.8348515, "learning_rate": 2.6539240197800023e-06, "loss": 0.85642099, "num_input_tokens_seen": 147184025, "step": 6855, "time_per_iteration": 2.6698896884918213 }, { "auxiliary_loss_clip": 0.01117307, "auxiliary_loss_mlp": 0.01044081, "balance_loss_clip": 1.04969764, "balance_loss_mlp": 1.02976418, "epoch": 0.4122050202915978, "flos": 21325300859520.0, "grad_norm": 2.1069107949142554, "language_loss": 0.7929827, "learning_rate": 2.6535559514967517e-06, "loss": 0.81459653, "num_input_tokens_seen": 147202730, "step": 6856, "time_per_iteration": 2.6754775047302246 }, { "auxiliary_loss_clip": 0.01098846, "auxiliary_loss_mlp": 0.01042601, "balance_loss_clip": 1.04761338, "balance_loss_mlp": 1.02777684, "epoch": 0.41226514354426574, "flos": 17306286059520.0, "grad_norm": 2.5035417030553018, "language_loss": 0.80352724, "learning_rate": 2.6531878584305935e-06, "loss": 0.82494175, "num_input_tokens_seen": 147215315, "step": 6857, "time_per_iteration": 2.7415785789489746 }, { "auxiliary_loss_clip": 0.01123756, "auxiliary_loss_mlp": 0.0077359, "balance_loss_clip": 1.04799688, "balance_loss_mlp": 1.00088441, "epoch": 0.4123252667969337, "flos": 17638855107840.0, "grad_norm": 2.1785137319374575, "language_loss": 0.70367694, "learning_rate": 2.6528197405954873e-06, "loss": 0.72265041, "num_input_tokens_seen": 147233330, "step": 6858, "time_per_iteration": 2.6482796669006348 }, { "auxiliary_loss_clip": 0.01123125, "auxiliary_loss_mlp": 0.01046787, "balance_loss_clip": 1.04916668, "balance_loss_mlp": 1.03116488, "epoch": 0.4123853900496017, "flos": 46424811375360.0, "grad_norm": 2.660424997773602, "language_loss": 0.59025121, "learning_rate": 2.652451598005391e-06, "loss": 0.61195034, "num_input_tokens_seen": 147257780, "step": 6859, "time_per_iteration": 2.8688454627990723 }, { "auxiliary_loss_clip": 0.01132817, "auxiliary_loss_mlp": 0.0104458, "balance_loss_clip": 1.04658365, "balance_loss_mlp": 1.0293684, "epoch": 0.41244551330226964, "flos": 17675160779520.0, "grad_norm": 2.4672414929748863, "language_loss": 0.73583943, "learning_rate": 2.652083430674264e-06, "loss": 0.75761342, "num_input_tokens_seen": 147276055, "step": 6860, "time_per_iteration": 2.552107572555542 }, { "auxiliary_loss_clip": 0.01058973, "auxiliary_loss_mlp": 0.01038942, "balance_loss_clip": 1.04514742, "balance_loss_mlp": 1.024279, "epoch": 0.4125056365549376, "flos": 18693730748160.0, "grad_norm": 1.7024014286117355, "language_loss": 0.7499401, "learning_rate": 2.651715238616068e-06, "loss": 0.7709192, "num_input_tokens_seen": 147293200, "step": 6861, "time_per_iteration": 2.8850560188293457 }, { "auxiliary_loss_clip": 0.01110545, "auxiliary_loss_mlp": 0.01044439, "balance_loss_clip": 1.04591155, "balance_loss_mlp": 1.03024721, "epoch": 0.41256575980760557, "flos": 17895293280000.0, "grad_norm": 2.2415523494511467, "language_loss": 0.79298902, "learning_rate": 2.651347021844765e-06, "loss": 0.8145389, "num_input_tokens_seen": 147310640, "step": 6862, "time_per_iteration": 2.900341510772705 }, { "auxiliary_loss_clip": 0.01101386, "auxiliary_loss_mlp": 0.01041536, "balance_loss_clip": 1.04071999, "balance_loss_mlp": 1.02640843, "epoch": 0.41262588306027354, "flos": 21981316901760.0, "grad_norm": 1.8032442507418176, "language_loss": 0.7571404, "learning_rate": 2.650978780374318e-06, "loss": 0.77856958, "num_input_tokens_seen": 147329435, "step": 6863, "time_per_iteration": 2.653726100921631 }, { "auxiliary_loss_clip": 0.01042253, "auxiliary_loss_mlp": 0.0101594, "balance_loss_clip": 1.02186918, "balance_loss_mlp": 1.01400852, "epoch": 0.41268600631294156, "flos": 53350006740480.0, "grad_norm": 0.7071869047358454, "language_loss": 0.52727556, "learning_rate": 2.650610514218691e-06, "loss": 0.54785752, "num_input_tokens_seen": 147385805, "step": 6864, "time_per_iteration": 3.1097042560577393 }, { "auxiliary_loss_clip": 0.01138053, "auxiliary_loss_mlp": 0.01037208, "balance_loss_clip": 1.04946339, "balance_loss_mlp": 1.02124572, "epoch": 0.4127461295656095, "flos": 24385356311040.0, "grad_norm": 2.542549123445174, "language_loss": 0.72281235, "learning_rate": 2.6502422233918468e-06, "loss": 0.74456495, "num_input_tokens_seen": 147405160, "step": 6865, "time_per_iteration": 2.6489152908325195 }, { "auxiliary_loss_clip": 0.01052076, "auxiliary_loss_mlp": 0.01005202, "balance_loss_clip": 1.02275848, "balance_loss_mlp": 1.0035094, "epoch": 0.4128062528182775, "flos": 71705242696320.0, "grad_norm": 0.9209058739863084, "language_loss": 0.66585267, "learning_rate": 2.649873907907753e-06, "loss": 0.68642545, "num_input_tokens_seen": 147460245, "step": 6866, "time_per_iteration": 3.062208890914917 }, { "auxiliary_loss_clip": 0.01129627, "auxiliary_loss_mlp": 0.01039004, "balance_loss_clip": 1.04632759, "balance_loss_mlp": 1.02420402, "epoch": 0.41286637607094545, "flos": 17849111368320.0, "grad_norm": 2.3224691577841905, "language_loss": 0.8131212, "learning_rate": 2.649505567780375e-06, "loss": 0.83480746, "num_input_tokens_seen": 147476200, "step": 6867, "time_per_iteration": 2.6058406829833984 }, { "auxiliary_loss_clip": 0.01114316, "auxiliary_loss_mlp": 0.01036267, "balance_loss_clip": 1.04773378, "balance_loss_mlp": 1.02069843, "epoch": 0.4129264993236134, "flos": 25549544016000.0, "grad_norm": 2.2632029728217913, "language_loss": 0.78249037, "learning_rate": 2.6491372030236815e-06, "loss": 0.80399621, "num_input_tokens_seen": 147494315, "step": 6868, "time_per_iteration": 2.7882273197174072 }, { "auxiliary_loss_clip": 0.0104195, "auxiliary_loss_mlp": 0.01002347, "balance_loss_clip": 1.02322721, "balance_loss_mlp": 1.00078535, "epoch": 0.4129866225762814, "flos": 65414446364160.0, "grad_norm": 0.8559261941349585, "language_loss": 0.57746547, "learning_rate": 2.64876881365164e-06, "loss": 0.59790844, "num_input_tokens_seen": 147543665, "step": 6869, "time_per_iteration": 2.9020984172821045 }, { "auxiliary_loss_clip": 0.01116756, "auxiliary_loss_mlp": 0.01037209, "balance_loss_clip": 1.04666448, "balance_loss_mlp": 1.02235568, "epoch": 0.41304674582894935, "flos": 28876991287680.0, "grad_norm": 2.064989454661501, "language_loss": 0.74957705, "learning_rate": 2.64840039967822e-06, "loss": 0.77111673, "num_input_tokens_seen": 147564870, "step": 6870, "time_per_iteration": 4.271910667419434 }, { "auxiliary_loss_clip": 0.01102765, "auxiliary_loss_mlp": 0.01045795, "balance_loss_clip": 1.04849434, "balance_loss_mlp": 1.0301609, "epoch": 0.4131068690816173, "flos": 22891975436160.0, "grad_norm": 1.7132239618858751, "language_loss": 0.83188486, "learning_rate": 2.6480319611173912e-06, "loss": 0.85337055, "num_input_tokens_seen": 147584840, "step": 6871, "time_per_iteration": 2.7382373809814453 }, { "auxiliary_loss_clip": 0.01102249, "auxiliary_loss_mlp": 0.01042486, "balance_loss_clip": 1.04694879, "balance_loss_mlp": 1.02648854, "epoch": 0.4131669923342853, "flos": 26065185707520.0, "grad_norm": 1.8588331523997874, "language_loss": 0.68419731, "learning_rate": 2.6476634979831263e-06, "loss": 0.70564461, "num_input_tokens_seen": 147604635, "step": 6872, "time_per_iteration": 2.731513738632202 }, { "auxiliary_loss_clip": 0.01116452, "auxiliary_loss_mlp": 0.0103393, "balance_loss_clip": 1.0480907, "balance_loss_mlp": 1.01936865, "epoch": 0.41322711558695324, "flos": 19244564789760.0, "grad_norm": 2.0600406966329468, "language_loss": 0.75857317, "learning_rate": 2.6472950102893964e-06, "loss": 0.78007692, "num_input_tokens_seen": 147620700, "step": 6873, "time_per_iteration": 4.200350999832153 }, { "auxiliary_loss_clip": 0.0110667, "auxiliary_loss_mlp": 0.01041498, "balance_loss_clip": 1.04465103, "balance_loss_mlp": 1.02552366, "epoch": 0.4132872388396212, "flos": 22674464628480.0, "grad_norm": 2.335780539187462, "language_loss": 0.83409697, "learning_rate": 2.6469264980501746e-06, "loss": 0.85557866, "num_input_tokens_seen": 147639490, "step": 6874, "time_per_iteration": 2.677481174468994 }, { "auxiliary_loss_clip": 0.01095645, "auxiliary_loss_mlp": 0.01037651, "balance_loss_clip": 1.04236686, "balance_loss_mlp": 1.02203512, "epoch": 0.4133473620922892, "flos": 20150195420160.0, "grad_norm": 2.13686316676373, "language_loss": 0.71832943, "learning_rate": 2.646557961279436e-06, "loss": 0.73966241, "num_input_tokens_seen": 147657205, "step": 6875, "time_per_iteration": 4.490081548690796 }, { "auxiliary_loss_clip": 0.01099487, "auxiliary_loss_mlp": 0.0104606, "balance_loss_clip": 1.0442456, "balance_loss_mlp": 1.03144503, "epoch": 0.41340748534495714, "flos": 24242755317120.0, "grad_norm": 2.0421788997824164, "language_loss": 0.82396001, "learning_rate": 2.646189399991154e-06, "loss": 0.84541547, "num_input_tokens_seen": 147677005, "step": 6876, "time_per_iteration": 2.7446470260620117 }, { "auxiliary_loss_clip": 0.01120566, "auxiliary_loss_mlp": 0.01041258, "balance_loss_clip": 1.04677415, "balance_loss_mlp": 1.02511716, "epoch": 0.41346760859762516, "flos": 14392171566720.0, "grad_norm": 2.56742905987435, "language_loss": 0.64847958, "learning_rate": 2.6458208141993048e-06, "loss": 0.67009783, "num_input_tokens_seen": 147693435, "step": 6877, "time_per_iteration": 2.5988993644714355 }, { "auxiliary_loss_clip": 0.01117576, "auxiliary_loss_mlp": 0.01038622, "balance_loss_clip": 1.04535675, "balance_loss_mlp": 1.02366138, "epoch": 0.4135277318502931, "flos": 22492002516480.0, "grad_norm": 1.9690610536683542, "language_loss": 0.76823169, "learning_rate": 2.6454522039178668e-06, "loss": 0.78979367, "num_input_tokens_seen": 147714000, "step": 6878, "time_per_iteration": 2.6289098262786865 }, { "auxiliary_loss_clip": 0.01120186, "auxiliary_loss_mlp": 0.0077293, "balance_loss_clip": 1.04670906, "balance_loss_mlp": 1.00107956, "epoch": 0.4135878551029611, "flos": 22418744728320.0, "grad_norm": 1.7550266496384528, "language_loss": 0.80281323, "learning_rate": 2.6450835691608154e-06, "loss": 0.82174444, "num_input_tokens_seen": 147731010, "step": 6879, "time_per_iteration": 2.661945343017578 }, { "auxiliary_loss_clip": 0.01130865, "auxiliary_loss_mlp": 0.01039257, "balance_loss_clip": 1.04709899, "balance_loss_mlp": 1.02471972, "epoch": 0.41364797835562905, "flos": 27053232094080.0, "grad_norm": 2.4786614895541312, "language_loss": 0.84795272, "learning_rate": 2.6447149099421315e-06, "loss": 0.869654, "num_input_tokens_seen": 147750880, "step": 6880, "time_per_iteration": 2.6188430786132812 }, { "auxiliary_loss_clip": 0.01111764, "auxiliary_loss_mlp": 0.0102976, "balance_loss_clip": 1.04788852, "balance_loss_mlp": 1.01497793, "epoch": 0.413708101608297, "flos": 22967603521920.0, "grad_norm": 3.387576232567814, "language_loss": 0.70222247, "learning_rate": 2.6443462262757927e-06, "loss": 0.72363776, "num_input_tokens_seen": 147771360, "step": 6881, "time_per_iteration": 2.733462333679199 }, { "auxiliary_loss_clip": 0.0112877, "auxiliary_loss_mlp": 0.01037286, "balance_loss_clip": 1.04717231, "balance_loss_mlp": 1.02352309, "epoch": 0.413768224860965, "flos": 13333991875200.0, "grad_norm": 2.043279627081185, "language_loss": 0.81609744, "learning_rate": 2.6439775181757805e-06, "loss": 0.837758, "num_input_tokens_seen": 147787440, "step": 6882, "time_per_iteration": 2.6478219032287598 }, { "auxiliary_loss_clip": 0.01107335, "auxiliary_loss_mlp": 0.0104742, "balance_loss_clip": 1.04388988, "balance_loss_mlp": 1.02958596, "epoch": 0.41382834811363295, "flos": 20813968800000.0, "grad_norm": 2.1226762712951195, "language_loss": 0.69825858, "learning_rate": 2.643608785656077e-06, "loss": 0.71980608, "num_input_tokens_seen": 147805720, "step": 6883, "time_per_iteration": 2.7219526767730713 }, { "auxiliary_loss_clip": 0.01117809, "auxiliary_loss_mlp": 0.01042891, "balance_loss_clip": 1.04390156, "balance_loss_mlp": 1.02804899, "epoch": 0.4138884713663009, "flos": 20667130001280.0, "grad_norm": 1.778769139531053, "language_loss": 0.76219916, "learning_rate": 2.643240028730663e-06, "loss": 0.7838062, "num_input_tokens_seen": 147824605, "step": 6884, "time_per_iteration": 2.7255208492279053 }, { "auxiliary_loss_clip": 0.01095169, "auxiliary_loss_mlp": 0.01038756, "balance_loss_clip": 1.04337394, "balance_loss_mlp": 1.02405715, "epoch": 0.4139485946189689, "flos": 29056616225280.0, "grad_norm": 1.442860134230448, "language_loss": 0.75787425, "learning_rate": 2.642871247413523e-06, "loss": 0.77921343, "num_input_tokens_seen": 147845445, "step": 6885, "time_per_iteration": 2.759103775024414 }, { "auxiliary_loss_clip": 0.0113157, "auxiliary_loss_mlp": 0.01040383, "balance_loss_clip": 1.04593658, "balance_loss_mlp": 1.0249809, "epoch": 0.41400871787163684, "flos": 24425720219520.0, "grad_norm": 2.975461049679227, "language_loss": 0.70157146, "learning_rate": 2.6425024417186414e-06, "loss": 0.72329092, "num_input_tokens_seen": 147865580, "step": 6886, "time_per_iteration": 2.5969202518463135 }, { "auxiliary_loss_clip": 0.01130858, "auxiliary_loss_mlp": 0.00772578, "balance_loss_clip": 1.04714894, "balance_loss_mlp": 1.00082159, "epoch": 0.4140688411243048, "flos": 19464050845440.0, "grad_norm": 4.863732808232375, "language_loss": 0.75765413, "learning_rate": 2.642133611660002e-06, "loss": 0.77668852, "num_input_tokens_seen": 147885230, "step": 6887, "time_per_iteration": 2.6130294799804688 }, { "auxiliary_loss_clip": 0.01115226, "auxiliary_loss_mlp": 0.0103352, "balance_loss_clip": 1.04343033, "balance_loss_mlp": 1.01858318, "epoch": 0.4141289643769728, "flos": 19313656600320.0, "grad_norm": 1.960325409954457, "language_loss": 0.70337266, "learning_rate": 2.641764757251592e-06, "loss": 0.72486007, "num_input_tokens_seen": 147903035, "step": 6888, "time_per_iteration": 2.616093635559082 }, { "auxiliary_loss_clip": 0.01125875, "auxiliary_loss_mlp": 0.01041471, "balance_loss_clip": 1.04317069, "balance_loss_mlp": 1.02698743, "epoch": 0.41418908762964074, "flos": 16726903683840.0, "grad_norm": 2.06267801428711, "language_loss": 0.76650596, "learning_rate": 2.6413958785073976e-06, "loss": 0.7881794, "num_input_tokens_seen": 147918745, "step": 6889, "time_per_iteration": 2.5624022483825684 }, { "auxiliary_loss_clip": 0.01098507, "auxiliary_loss_mlp": 0.00771883, "balance_loss_clip": 1.05070317, "balance_loss_mlp": 1.00089312, "epoch": 0.41424921088230876, "flos": 25296840858240.0, "grad_norm": 2.7156921824995224, "language_loss": 0.80554968, "learning_rate": 2.6410269754414074e-06, "loss": 0.82425356, "num_input_tokens_seen": 147938265, "step": 6890, "time_per_iteration": 2.796128273010254 }, { "auxiliary_loss_clip": 0.0112736, "auxiliary_loss_mlp": 0.01038801, "balance_loss_clip": 1.04589438, "balance_loss_mlp": 1.0235126, "epoch": 0.4143093341349767, "flos": 20960520289920.0, "grad_norm": 1.7630713030967287, "language_loss": 0.74180973, "learning_rate": 2.6406580480676113e-06, "loss": 0.76347136, "num_input_tokens_seen": 147957320, "step": 6891, "time_per_iteration": 2.6974401473999023 }, { "auxiliary_loss_clip": 0.01092037, "auxiliary_loss_mlp": 0.01043425, "balance_loss_clip": 1.0482198, "balance_loss_mlp": 1.02647936, "epoch": 0.4143694573876447, "flos": 22017694400640.0, "grad_norm": 1.8611116210645706, "language_loss": 0.84570521, "learning_rate": 2.6402890963999963e-06, "loss": 0.86705983, "num_input_tokens_seen": 147977045, "step": 6892, "time_per_iteration": 2.8065037727355957 }, { "auxiliary_loss_clip": 0.01081139, "auxiliary_loss_mlp": 0.00774401, "balance_loss_clip": 1.04017556, "balance_loss_mlp": 1.00088513, "epoch": 0.41442958064031266, "flos": 35697396723840.0, "grad_norm": 1.7475313827364956, "language_loss": 0.70824122, "learning_rate": 2.6399201204525554e-06, "loss": 0.72679669, "num_input_tokens_seen": 147996905, "step": 6893, "time_per_iteration": 2.865112543106079 }, { "auxiliary_loss_clip": 0.01126872, "auxiliary_loss_mlp": 0.01033016, "balance_loss_clip": 1.04508913, "balance_loss_mlp": 1.01873493, "epoch": 0.4144897038929806, "flos": 28293766156800.0, "grad_norm": 1.5118367219903406, "language_loss": 0.72955495, "learning_rate": 2.639551120239279e-06, "loss": 0.75115383, "num_input_tokens_seen": 148017875, "step": 6894, "time_per_iteration": 2.6412105560302734 }, { "auxiliary_loss_clip": 0.0111867, "auxiliary_loss_mlp": 0.01032409, "balance_loss_clip": 1.0444473, "balance_loss_mlp": 1.01803279, "epoch": 0.4145498271456486, "flos": 11648093080320.0, "grad_norm": 2.8699191887217697, "language_loss": 0.63006961, "learning_rate": 2.63918209577416e-06, "loss": 0.65158045, "num_input_tokens_seen": 148032300, "step": 6895, "time_per_iteration": 2.6429762840270996 }, { "auxiliary_loss_clip": 0.01084496, "auxiliary_loss_mlp": 0.01047641, "balance_loss_clip": 1.04230917, "balance_loss_mlp": 1.03178644, "epoch": 0.41460995039831655, "flos": 27235622378880.0, "grad_norm": 1.395247516884051, "language_loss": 0.7072767, "learning_rate": 2.638813047071192e-06, "loss": 0.728598, "num_input_tokens_seen": 148053260, "step": 6896, "time_per_iteration": 2.754567861557007 }, { "auxiliary_loss_clip": 0.01125613, "auxiliary_loss_mlp": 0.0104596, "balance_loss_clip": 1.04233313, "balance_loss_mlp": 1.03083241, "epoch": 0.4146700736509845, "flos": 25922369232000.0, "grad_norm": 1.6183082189069362, "language_loss": 0.73234701, "learning_rate": 2.6384439741443696e-06, "loss": 0.75406271, "num_input_tokens_seen": 148072965, "step": 6897, "time_per_iteration": 2.737884759902954 }, { "auxiliary_loss_clip": 0.01114786, "auxiliary_loss_mlp": 0.01041831, "balance_loss_clip": 1.04562593, "balance_loss_mlp": 1.02713859, "epoch": 0.4147301969036525, "flos": 26833243248000.0, "grad_norm": 1.834097351521641, "language_loss": 0.84865111, "learning_rate": 2.6380748770076873e-06, "loss": 0.87021732, "num_input_tokens_seen": 148093240, "step": 6898, "time_per_iteration": 2.689467430114746 }, { "auxiliary_loss_clip": 0.01079261, "auxiliary_loss_mlp": 0.01035002, "balance_loss_clip": 1.03853178, "balance_loss_mlp": 1.02030301, "epoch": 0.41479032015632045, "flos": 20298291194880.0, "grad_norm": 1.6538444757930724, "language_loss": 0.74696559, "learning_rate": 2.6377057556751416e-06, "loss": 0.76810819, "num_input_tokens_seen": 148110925, "step": 6899, "time_per_iteration": 2.73575758934021 }, { "auxiliary_loss_clip": 0.0109529, "auxiliary_loss_mlp": 0.0104143, "balance_loss_clip": 1.04097557, "balance_loss_mlp": 1.02549219, "epoch": 0.4148504434089884, "flos": 25264988472960.0, "grad_norm": 2.0028183144746254, "language_loss": 0.75739181, "learning_rate": 2.6373366101607306e-06, "loss": 0.778759, "num_input_tokens_seen": 148130670, "step": 6900, "time_per_iteration": 2.7304093837738037 }, { "auxiliary_loss_clip": 0.01112354, "auxiliary_loss_mlp": 0.01038142, "balance_loss_clip": 1.04515111, "balance_loss_mlp": 1.02218616, "epoch": 0.4149105666616564, "flos": 12822300679680.0, "grad_norm": 37.61175094058464, "language_loss": 0.79667652, "learning_rate": 2.6369674404784503e-06, "loss": 0.81818151, "num_input_tokens_seen": 148148350, "step": 6901, "time_per_iteration": 2.6238512992858887 }, { "auxiliary_loss_clip": 0.01085977, "auxiliary_loss_mlp": 0.01046173, "balance_loss_clip": 1.03959978, "balance_loss_mlp": 1.0302825, "epoch": 0.41497068991432434, "flos": 16763891713920.0, "grad_norm": 1.6395274695924928, "language_loss": 0.69640017, "learning_rate": 2.6365982466423014e-06, "loss": 0.7177217, "num_input_tokens_seen": 148167550, "step": 6902, "time_per_iteration": 2.6854305267333984 }, { "auxiliary_loss_clip": 0.01097592, "auxiliary_loss_mlp": 0.00770925, "balance_loss_clip": 1.04278207, "balance_loss_mlp": 1.00099885, "epoch": 0.4150308131669923, "flos": 18000906243840.0, "grad_norm": 2.384025861502229, "language_loss": 0.83949161, "learning_rate": 2.6362290286662834e-06, "loss": 0.85817683, "num_input_tokens_seen": 148184740, "step": 6903, "time_per_iteration": 2.6454520225524902 }, { "auxiliary_loss_clip": 0.01133263, "auxiliary_loss_mlp": 0.01042035, "balance_loss_clip": 1.04633808, "balance_loss_mlp": 1.02569163, "epoch": 0.41509093641966033, "flos": 30044770352640.0, "grad_norm": 1.9553359330266324, "language_loss": 0.67639846, "learning_rate": 2.6358597865643968e-06, "loss": 0.69815147, "num_input_tokens_seen": 148204605, "step": 6904, "time_per_iteration": 2.7322065830230713 }, { "auxiliary_loss_clip": 0.01130567, "auxiliary_loss_mlp": 0.0077237, "balance_loss_clip": 1.04620719, "balance_loss_mlp": 1.00097251, "epoch": 0.4151510596723283, "flos": 24279994742400.0, "grad_norm": 1.8757192691258513, "language_loss": 0.77572656, "learning_rate": 2.635490520350643e-06, "loss": 0.79475594, "num_input_tokens_seen": 148224675, "step": 6905, "time_per_iteration": 2.648400068283081 }, { "auxiliary_loss_clip": 0.0113062, "auxiliary_loss_mlp": 0.01033001, "balance_loss_clip": 1.04648256, "balance_loss_mlp": 1.01869583, "epoch": 0.41521118292499626, "flos": 23476206147840.0, "grad_norm": 1.5608092182069806, "language_loss": 0.68316001, "learning_rate": 2.635121230039025e-06, "loss": 0.7047962, "num_input_tokens_seen": 148243375, "step": 6906, "time_per_iteration": 2.6084086894989014 }, { "auxiliary_loss_clip": 0.01104219, "auxiliary_loss_mlp": 0.0103582, "balance_loss_clip": 1.04238176, "balance_loss_mlp": 1.02167583, "epoch": 0.4152713061776642, "flos": 22125498094080.0, "grad_norm": 2.313429051291415, "language_loss": 0.67982537, "learning_rate": 2.6347519156435467e-06, "loss": 0.70122576, "num_input_tokens_seen": 148261140, "step": 6907, "time_per_iteration": 2.715506076812744 }, { "auxiliary_loss_clip": 0.01100263, "auxiliary_loss_mlp": 0.01038198, "balance_loss_clip": 1.0479455, "balance_loss_mlp": 1.02419686, "epoch": 0.4153314294303322, "flos": 21251396626560.0, "grad_norm": 2.133321939860832, "language_loss": 0.77338696, "learning_rate": 2.6343825771782123e-06, "loss": 0.79477155, "num_input_tokens_seen": 148279655, "step": 6908, "time_per_iteration": 2.699028253555298 }, { "auxiliary_loss_clip": 0.01035537, "auxiliary_loss_mlp": 0.01050035, "balance_loss_clip": 1.02502179, "balance_loss_mlp": 1.04800892, "epoch": 0.41539155268300015, "flos": 57920681594880.0, "grad_norm": 0.8023457423545532, "language_loss": 0.64889216, "learning_rate": 2.634013214657026e-06, "loss": 0.66974789, "num_input_tokens_seen": 148339005, "step": 6909, "time_per_iteration": 3.174577474594116 }, { "auxiliary_loss_clip": 0.01096348, "auxiliary_loss_mlp": 0.0103783, "balance_loss_clip": 1.04794037, "balance_loss_mlp": 1.02368009, "epoch": 0.4154516759356681, "flos": 21903677654400.0, "grad_norm": 3.1710005220016293, "language_loss": 0.8712942, "learning_rate": 2.633643828093996e-06, "loss": 0.89263594, "num_input_tokens_seen": 148358715, "step": 6910, "time_per_iteration": 4.24171257019043 }, { "auxiliary_loss_clip": 0.01040831, "auxiliary_loss_mlp": 0.01008541, "balance_loss_clip": 1.02141929, "balance_loss_mlp": 1.00702703, "epoch": 0.4155117991883361, "flos": 67833677226240.0, "grad_norm": 0.8180681021689019, "language_loss": 0.62115103, "learning_rate": 2.633274417503128e-06, "loss": 0.64164472, "num_input_tokens_seen": 148417280, "step": 6911, "time_per_iteration": 3.171510696411133 }, { "auxiliary_loss_clip": 0.01138851, "auxiliary_loss_mlp": 0.01037606, "balance_loss_clip": 1.05016613, "balance_loss_mlp": 1.0219059, "epoch": 0.41557192244100405, "flos": 14282679934080.0, "grad_norm": 2.4116200088670845, "language_loss": 0.87474132, "learning_rate": 2.6329049828984312e-06, "loss": 0.89650595, "num_input_tokens_seen": 148432610, "step": 6912, "time_per_iteration": 5.576058864593506 }, { "auxiliary_loss_clip": 0.01117561, "auxiliary_loss_mlp": 0.01034627, "balance_loss_clip": 1.04753387, "balance_loss_mlp": 1.02098989, "epoch": 0.415632045693672, "flos": 24461954064000.0, "grad_norm": 22.77173838310247, "language_loss": 0.63224173, "learning_rate": 2.632535524293914e-06, "loss": 0.65376365, "num_input_tokens_seen": 148451510, "step": 6913, "time_per_iteration": 2.702631711959839 }, { "auxiliary_loss_clip": 0.01102511, "auxiliary_loss_mlp": 0.00771597, "balance_loss_clip": 1.04298615, "balance_loss_mlp": 1.00093937, "epoch": 0.41569216894634, "flos": 20115290378880.0, "grad_norm": 1.7272855093915238, "language_loss": 0.74980754, "learning_rate": 2.632166041703586e-06, "loss": 0.76854861, "num_input_tokens_seen": 148469945, "step": 6914, "time_per_iteration": 4.340964078903198 }, { "auxiliary_loss_clip": 0.01077278, "auxiliary_loss_mlp": 0.01044004, "balance_loss_clip": 1.04201877, "balance_loss_mlp": 1.02906704, "epoch": 0.41575229219900794, "flos": 23798827128960.0, "grad_norm": 1.8325905436461942, "language_loss": 0.87653631, "learning_rate": 2.631796535141458e-06, "loss": 0.89774919, "num_input_tokens_seen": 148486655, "step": 6915, "time_per_iteration": 2.757596731185913 }, { "auxiliary_loss_clip": 0.0109973, "auxiliary_loss_mlp": 0.01041371, "balance_loss_clip": 1.04447317, "balance_loss_mlp": 1.02728081, "epoch": 0.4158124154516759, "flos": 23108229267840.0, "grad_norm": 3.0600667343253214, "language_loss": 0.70990372, "learning_rate": 2.6314270046215426e-06, "loss": 0.73131478, "num_input_tokens_seen": 148505035, "step": 6916, "time_per_iteration": 2.6894583702087402 }, { "auxiliary_loss_clip": 0.01135969, "auxiliary_loss_mlp": 0.01038621, "balance_loss_clip": 1.04934418, "balance_loss_mlp": 1.02361822, "epoch": 0.41587253870434393, "flos": 24242970798720.0, "grad_norm": 1.53910679789622, "language_loss": 0.71859491, "learning_rate": 2.631057450157852e-06, "loss": 0.74034083, "num_input_tokens_seen": 148525575, "step": 6917, "time_per_iteration": 2.560401439666748 }, { "auxiliary_loss_clip": 0.01104226, "auxiliary_loss_mlp": 0.01032177, "balance_loss_clip": 1.04427075, "balance_loss_mlp": 1.01856291, "epoch": 0.4159326619570119, "flos": 23881602021120.0, "grad_norm": 1.8609084037764254, "language_loss": 0.80841225, "learning_rate": 2.6306878717643988e-06, "loss": 0.82977629, "num_input_tokens_seen": 148547270, "step": 6918, "time_per_iteration": 2.71455979347229 }, { "auxiliary_loss_clip": 0.01122968, "auxiliary_loss_mlp": 0.01038479, "balance_loss_clip": 1.05033052, "balance_loss_mlp": 1.02306533, "epoch": 0.41599278520967986, "flos": 40626531354240.0, "grad_norm": 1.460873312199365, "language_loss": 0.70399261, "learning_rate": 2.6303182694551995e-06, "loss": 0.72560704, "num_input_tokens_seen": 148572100, "step": 6919, "time_per_iteration": 2.784090518951416 }, { "auxiliary_loss_clip": 0.01108371, "auxiliary_loss_mlp": 0.0104095, "balance_loss_clip": 1.04570937, "balance_loss_mlp": 1.0255723, "epoch": 0.4160529084623478, "flos": 18222942165120.0, "grad_norm": 1.8818708282287906, "language_loss": 0.81701922, "learning_rate": 2.6299486432442677e-06, "loss": 0.83851242, "num_input_tokens_seen": 148591245, "step": 6920, "time_per_iteration": 2.644867181777954 }, { "auxiliary_loss_clip": 0.01113217, "auxiliary_loss_mlp": 0.01042119, "balance_loss_clip": 1.04909408, "balance_loss_mlp": 1.02627623, "epoch": 0.4161130317150158, "flos": 13661963982720.0, "grad_norm": 2.168550443744471, "language_loss": 0.65408564, "learning_rate": 2.6295789931456195e-06, "loss": 0.67563891, "num_input_tokens_seen": 148607980, "step": 6921, "time_per_iteration": 2.647270441055298 }, { "auxiliary_loss_clip": 0.01108151, "auxiliary_loss_mlp": 0.01042421, "balance_loss_clip": 1.04479325, "balance_loss_mlp": 1.02768648, "epoch": 0.41617315496768376, "flos": 16178511767040.0, "grad_norm": 2.3873319200859004, "language_loss": 0.80806041, "learning_rate": 2.629209319173274e-06, "loss": 0.82956612, "num_input_tokens_seen": 148624490, "step": 6922, "time_per_iteration": 2.6521530151367188 }, { "auxiliary_loss_clip": 0.01107722, "auxiliary_loss_mlp": 0.01037357, "balance_loss_clip": 1.04645085, "balance_loss_mlp": 1.02304578, "epoch": 0.4162332782203517, "flos": 26213317395840.0, "grad_norm": 1.6600188367705673, "language_loss": 0.67455506, "learning_rate": 2.628839621341247e-06, "loss": 0.69600594, "num_input_tokens_seen": 148646490, "step": 6923, "time_per_iteration": 2.6982760429382324 }, { "auxiliary_loss_clip": 0.01100761, "auxiliary_loss_mlp": 0.01052569, "balance_loss_clip": 1.04614723, "balance_loss_mlp": 1.03649926, "epoch": 0.4162934014730197, "flos": 28183987215360.0, "grad_norm": 2.1905305361602676, "language_loss": 0.75802875, "learning_rate": 2.6284698996635593e-06, "loss": 0.77956206, "num_input_tokens_seen": 148668580, "step": 6924, "time_per_iteration": 2.746675491333008 }, { "auxiliary_loss_clip": 0.01134317, "auxiliary_loss_mlp": 0.01042613, "balance_loss_clip": 1.04869533, "balance_loss_mlp": 1.02842665, "epoch": 0.41635352472568765, "flos": 19865316654720.0, "grad_norm": 2.7384378444587774, "language_loss": 0.73572767, "learning_rate": 2.62810015415423e-06, "loss": 0.75749695, "num_input_tokens_seen": 148688410, "step": 6925, "time_per_iteration": 2.6443655490875244 }, { "auxiliary_loss_clip": 0.01107096, "auxiliary_loss_mlp": 0.01035039, "balance_loss_clip": 1.04328012, "balance_loss_mlp": 1.02092457, "epoch": 0.4164136479783556, "flos": 14935356011520.0, "grad_norm": 2.2965796841293487, "language_loss": 0.83732742, "learning_rate": 2.6277303848272792e-06, "loss": 0.85874879, "num_input_tokens_seen": 148704855, "step": 6926, "time_per_iteration": 2.688778877258301 }, { "auxiliary_loss_clip": 0.01101563, "auxiliary_loss_mlp": 0.0104323, "balance_loss_clip": 1.04851913, "balance_loss_mlp": 1.03019416, "epoch": 0.4164737712310236, "flos": 21757593041280.0, "grad_norm": 1.7122304152619183, "language_loss": 0.86459213, "learning_rate": 2.6273605916967302e-06, "loss": 0.88604003, "num_input_tokens_seen": 148723065, "step": 6927, "time_per_iteration": 2.6891677379608154 }, { "auxiliary_loss_clip": 0.01123007, "auxiliary_loss_mlp": 0.01048103, "balance_loss_clip": 1.04902172, "balance_loss_mlp": 1.03252852, "epoch": 0.41653389448369155, "flos": 20740136394240.0, "grad_norm": 2.2496180093698555, "language_loss": 0.72619522, "learning_rate": 2.626990774776604e-06, "loss": 0.74790633, "num_input_tokens_seen": 148741780, "step": 6928, "time_per_iteration": 2.6853785514831543 }, { "auxiliary_loss_clip": 0.01103421, "auxiliary_loss_mlp": 0.01037571, "balance_loss_clip": 1.04516923, "balance_loss_mlp": 1.02305102, "epoch": 0.4165940177363595, "flos": 24972891073920.0, "grad_norm": 2.3320684503004667, "language_loss": 0.781192, "learning_rate": 2.6266209340809254e-06, "loss": 0.80260193, "num_input_tokens_seen": 148759795, "step": 6929, "time_per_iteration": 2.675412893295288 }, { "auxiliary_loss_clip": 0.01130228, "auxiliary_loss_mlp": 0.01034459, "balance_loss_clip": 1.04634309, "balance_loss_mlp": 1.02042162, "epoch": 0.41665414098902753, "flos": 20521727746560.0, "grad_norm": 2.2076337971053897, "language_loss": 0.70941442, "learning_rate": 2.6262510696237182e-06, "loss": 0.73106134, "num_input_tokens_seen": 148778680, "step": 6930, "time_per_iteration": 2.5896191596984863 }, { "auxiliary_loss_clip": 0.0110378, "auxiliary_loss_mlp": 0.01040113, "balance_loss_clip": 1.04316616, "balance_loss_mlp": 1.02566469, "epoch": 0.4167142642416955, "flos": 19682926369920.0, "grad_norm": 1.7468000498396183, "language_loss": 0.81265134, "learning_rate": 2.625881181419007e-06, "loss": 0.83409023, "num_input_tokens_seen": 148796470, "step": 6931, "time_per_iteration": 2.693753719329834 }, { "auxiliary_loss_clip": 0.01073611, "auxiliary_loss_mlp": 0.01040047, "balance_loss_clip": 1.03671885, "balance_loss_mlp": 1.0253247, "epoch": 0.41677438749436346, "flos": 23763742519680.0, "grad_norm": 1.7136797301427433, "language_loss": 0.78969777, "learning_rate": 2.6255112694808193e-06, "loss": 0.81083435, "num_input_tokens_seen": 148815300, "step": 6932, "time_per_iteration": 2.900186061859131 }, { "auxiliary_loss_clip": 0.01110051, "auxiliary_loss_mlp": 0.00772641, "balance_loss_clip": 1.04659891, "balance_loss_mlp": 1.00109386, "epoch": 0.41683451074703143, "flos": 30410053712640.0, "grad_norm": 1.8812444225834188, "language_loss": 0.81995165, "learning_rate": 2.6251413338231813e-06, "loss": 0.83877861, "num_input_tokens_seen": 148834315, "step": 6933, "time_per_iteration": 2.815415143966675 }, { "auxiliary_loss_clip": 0.01135077, "auxiliary_loss_mlp": 0.01036525, "balance_loss_clip": 1.04731107, "balance_loss_mlp": 1.02077699, "epoch": 0.4168946339996994, "flos": 21506757390720.0, "grad_norm": 2.9283724451949236, "language_loss": 0.76852083, "learning_rate": 2.624771374460121e-06, "loss": 0.79023689, "num_input_tokens_seen": 148852420, "step": 6934, "time_per_iteration": 2.7175137996673584 }, { "auxiliary_loss_clip": 0.01122637, "auxiliary_loss_mlp": 0.01034712, "balance_loss_clip": 1.048594, "balance_loss_mlp": 1.02038264, "epoch": 0.41695475725236736, "flos": 17638675539840.0, "grad_norm": 1.7602525666099749, "language_loss": 0.67555362, "learning_rate": 2.624401391405668e-06, "loss": 0.6971271, "num_input_tokens_seen": 148869305, "step": 6935, "time_per_iteration": 2.740238666534424 }, { "auxiliary_loss_clip": 0.01106934, "auxiliary_loss_mlp": 0.01041015, "balance_loss_clip": 1.04740202, "balance_loss_mlp": 1.02606606, "epoch": 0.4170148805050353, "flos": 15668903560320.0, "grad_norm": 2.0770148597671834, "language_loss": 0.73310643, "learning_rate": 2.6240313846738513e-06, "loss": 0.75458586, "num_input_tokens_seen": 148886395, "step": 6936, "time_per_iteration": 2.71653413772583 }, { "auxiliary_loss_clip": 0.01115958, "auxiliary_loss_mlp": 0.01036656, "balance_loss_clip": 1.04845905, "balance_loss_mlp": 1.02274418, "epoch": 0.4170750037577033, "flos": 15159151699200.0, "grad_norm": 2.3408521316198794, "language_loss": 0.74009961, "learning_rate": 2.6236613542787024e-06, "loss": 0.76162577, "num_input_tokens_seen": 148905235, "step": 6937, "time_per_iteration": 2.627197265625 }, { "auxiliary_loss_clip": 0.01105318, "auxiliary_loss_mlp": 0.01038451, "balance_loss_clip": 1.04543686, "balance_loss_mlp": 1.02422357, "epoch": 0.41713512701037125, "flos": 28768289754240.0, "grad_norm": 2.1407867738666977, "language_loss": 0.84349155, "learning_rate": 2.6232913002342518e-06, "loss": 0.8649292, "num_input_tokens_seen": 148928130, "step": 6938, "time_per_iteration": 2.7512307167053223 }, { "auxiliary_loss_clip": 0.01107641, "auxiliary_loss_mlp": 0.01037692, "balance_loss_clip": 1.04718804, "balance_loss_mlp": 1.02217638, "epoch": 0.4171952502630392, "flos": 28256993608320.0, "grad_norm": 1.985550471698889, "language_loss": 0.7437641, "learning_rate": 2.6229212225545334e-06, "loss": 0.76521742, "num_input_tokens_seen": 148948790, "step": 6939, "time_per_iteration": 2.8480472564697266 }, { "auxiliary_loss_clip": 0.01121822, "auxiliary_loss_mlp": 0.01033365, "balance_loss_clip": 1.0470984, "balance_loss_mlp": 1.01803446, "epoch": 0.4172553735157072, "flos": 24571697091840.0, "grad_norm": 2.560264252806934, "language_loss": 0.74981248, "learning_rate": 2.622551121253579e-06, "loss": 0.77136433, "num_input_tokens_seen": 148967690, "step": 6940, "time_per_iteration": 2.707803249359131 }, { "auxiliary_loss_clip": 0.01132435, "auxiliary_loss_mlp": 0.01040605, "balance_loss_clip": 1.04839242, "balance_loss_mlp": 1.0266397, "epoch": 0.41731549676837515, "flos": 27045797978880.0, "grad_norm": 2.248952291582723, "language_loss": 0.71683985, "learning_rate": 2.622180996345424e-06, "loss": 0.73857027, "num_input_tokens_seen": 148987150, "step": 6941, "time_per_iteration": 2.6406352519989014 }, { "auxiliary_loss_clip": 0.01119657, "auxiliary_loss_mlp": 0.0103964, "balance_loss_clip": 1.04871619, "balance_loss_mlp": 1.02461994, "epoch": 0.4173756200210431, "flos": 28394063907840.0, "grad_norm": 2.929963903641068, "language_loss": 0.74062824, "learning_rate": 2.621810847844104e-06, "loss": 0.76222122, "num_input_tokens_seen": 149004895, "step": 6942, "time_per_iteration": 2.7269139289855957 }, { "auxiliary_loss_clip": 0.01096497, "auxiliary_loss_mlp": 0.01046649, "balance_loss_clip": 1.04605746, "balance_loss_mlp": 1.03079462, "epoch": 0.41743574327371114, "flos": 22521556431360.0, "grad_norm": 2.258418581580233, "language_loss": 0.72607493, "learning_rate": 2.6214406757636534e-06, "loss": 0.74750638, "num_input_tokens_seen": 149020970, "step": 6943, "time_per_iteration": 2.8146276473999023 }, { "auxiliary_loss_clip": 0.01100254, "auxiliary_loss_mlp": 0.00772502, "balance_loss_clip": 1.04520488, "balance_loss_mlp": 1.00081825, "epoch": 0.4174958665263791, "flos": 30113431200000.0, "grad_norm": 1.7970886758223585, "language_loss": 0.63763773, "learning_rate": 2.621070480118111e-06, "loss": 0.65636539, "num_input_tokens_seen": 149041795, "step": 6944, "time_per_iteration": 2.7709715366363525 }, { "auxiliary_loss_clip": 0.0109928, "auxiliary_loss_mlp": 0.01037535, "balance_loss_clip": 1.03980803, "balance_loss_mlp": 1.02262771, "epoch": 0.41755598977904707, "flos": 25263444188160.0, "grad_norm": 1.5620596317333308, "language_loss": 0.70201832, "learning_rate": 2.620700260921513e-06, "loss": 0.72338641, "num_input_tokens_seen": 149063700, "step": 6945, "time_per_iteration": 2.7668464183807373 }, { "auxiliary_loss_clip": 0.01086028, "auxiliary_loss_mlp": 0.01052164, "balance_loss_clip": 1.03888953, "balance_loss_mlp": 1.03434181, "epoch": 0.41761611303171503, "flos": 19828580019840.0, "grad_norm": 3.903492543127265, "language_loss": 0.81313473, "learning_rate": 2.620330018187899e-06, "loss": 0.8345167, "num_input_tokens_seen": 149082410, "step": 6946, "time_per_iteration": 2.7656164169311523 }, { "auxiliary_loss_clip": 0.0111906, "auxiliary_loss_mlp": 0.01033842, "balance_loss_clip": 1.04820168, "balance_loss_mlp": 1.01947689, "epoch": 0.417676236284383, "flos": 15523249910400.0, "grad_norm": 3.3237502950686997, "language_loss": 0.77819085, "learning_rate": 2.6199597519313086e-06, "loss": 0.79971987, "num_input_tokens_seen": 149098745, "step": 6947, "time_per_iteration": 2.6658904552459717 }, { "auxiliary_loss_clip": 0.01131014, "auxiliary_loss_mlp": 0.01035473, "balance_loss_clip": 1.04678917, "balance_loss_mlp": 1.020262, "epoch": 0.41773635953705096, "flos": 32524473761280.0, "grad_norm": 4.535535573323162, "language_loss": 0.72142154, "learning_rate": 2.6195894621657825e-06, "loss": 0.7430864, "num_input_tokens_seen": 149122255, "step": 6948, "time_per_iteration": 2.728604316711426 }, { "auxiliary_loss_clip": 0.0111373, "auxiliary_loss_mlp": 0.01035464, "balance_loss_clip": 1.04416013, "balance_loss_mlp": 1.02127814, "epoch": 0.4177964827897189, "flos": 23440941970560.0, "grad_norm": 1.752796472610303, "language_loss": 0.77020466, "learning_rate": 2.619219148905362e-06, "loss": 0.79169655, "num_input_tokens_seen": 149142845, "step": 6949, "time_per_iteration": 4.2494752407073975 }, { "auxiliary_loss_clip": 0.011131, "auxiliary_loss_mlp": 0.01040025, "balance_loss_clip": 1.05060196, "balance_loss_mlp": 1.02523708, "epoch": 0.4178566060423869, "flos": 22748907565440.0, "grad_norm": 1.637174584956538, "language_loss": 0.8214075, "learning_rate": 2.6188488121640888e-06, "loss": 0.84293878, "num_input_tokens_seen": 149163375, "step": 6950, "time_per_iteration": 2.7383689880371094 }, { "auxiliary_loss_clip": 0.01099413, "auxiliary_loss_mlp": 0.00770849, "balance_loss_clip": 1.04511857, "balance_loss_mlp": 1.00090635, "epoch": 0.41791672929505486, "flos": 26032794618240.0, "grad_norm": 1.501775844018401, "language_loss": 0.7649653, "learning_rate": 2.618478451956007e-06, "loss": 0.78366792, "num_input_tokens_seen": 149185610, "step": 6951, "time_per_iteration": 5.789496660232544 }, { "auxiliary_loss_clip": 0.01088001, "auxiliary_loss_mlp": 0.01034314, "balance_loss_clip": 1.04565978, "balance_loss_mlp": 1.01929939, "epoch": 0.4179768525477228, "flos": 19568694142080.0, "grad_norm": 1.8438034417752391, "language_loss": 0.73442549, "learning_rate": 2.61810806829516e-06, "loss": 0.75564867, "num_input_tokens_seen": 149203990, "step": 6952, "time_per_iteration": 2.762404680252075 }, { "auxiliary_loss_clip": 0.01116339, "auxiliary_loss_mlp": 0.01038971, "balance_loss_clip": 1.04836369, "balance_loss_mlp": 1.0251013, "epoch": 0.4180369758003908, "flos": 17783826399360.0, "grad_norm": 2.8847563198217667, "language_loss": 0.7161783, "learning_rate": 2.617737661195593e-06, "loss": 0.73773146, "num_input_tokens_seen": 149221385, "step": 6953, "time_per_iteration": 2.6514034271240234 }, { "auxiliary_loss_clip": 0.01118442, "auxiliary_loss_mlp": 0.01038634, "balance_loss_clip": 1.04711723, "balance_loss_mlp": 1.02363181, "epoch": 0.41809709905305875, "flos": 20960663944320.0, "grad_norm": 1.7834717110535325, "language_loss": 0.75982141, "learning_rate": 2.617367230671353e-06, "loss": 0.78139216, "num_input_tokens_seen": 149241175, "step": 6954, "time_per_iteration": 4.3135082721710205 }, { "auxiliary_loss_clip": 0.01092319, "auxiliary_loss_mlp": 0.01046188, "balance_loss_clip": 1.04647863, "balance_loss_mlp": 1.02979708, "epoch": 0.4181572223057267, "flos": 22017622573440.0, "grad_norm": 2.907950037168039, "language_loss": 0.84492826, "learning_rate": 2.616996776736485e-06, "loss": 0.86631334, "num_input_tokens_seen": 149259115, "step": 6955, "time_per_iteration": 2.7724356651306152 }, { "auxiliary_loss_clip": 0.01121525, "auxiliary_loss_mlp": 0.01040437, "balance_loss_clip": 1.04870594, "balance_loss_mlp": 1.02604234, "epoch": 0.4182173455583947, "flos": 26245528917120.0, "grad_norm": 1.6794559400644542, "language_loss": 0.83262718, "learning_rate": 2.616626299405037e-06, "loss": 0.8542468, "num_input_tokens_seen": 149278705, "step": 6956, "time_per_iteration": 2.7260353565216064 }, { "auxiliary_loss_clip": 0.01093652, "auxiliary_loss_mlp": 0.01039325, "balance_loss_clip": 1.04491091, "balance_loss_mlp": 1.02423358, "epoch": 0.4182774688110627, "flos": 14791605782400.0, "grad_norm": 2.3946498969788634, "language_loss": 0.71788859, "learning_rate": 2.616255798691059e-06, "loss": 0.73921835, "num_input_tokens_seen": 149294040, "step": 6957, "time_per_iteration": 2.6826114654541016 }, { "auxiliary_loss_clip": 0.01099548, "auxiliary_loss_mlp": 0.01043781, "balance_loss_clip": 1.0462482, "balance_loss_mlp": 1.02966106, "epoch": 0.41833759206373067, "flos": 20412020632320.0, "grad_norm": 2.4781797095716276, "language_loss": 0.75947559, "learning_rate": 2.6158852746085982e-06, "loss": 0.78090888, "num_input_tokens_seen": 149310385, "step": 6958, "time_per_iteration": 2.7528226375579834 }, { "auxiliary_loss_clip": 0.01083285, "auxiliary_loss_mlp": 0.00772338, "balance_loss_clip": 1.04087532, "balance_loss_mlp": 1.0007602, "epoch": 0.41839771531639863, "flos": 23656333875840.0, "grad_norm": 1.8764496083097535, "language_loss": 0.7693305, "learning_rate": 2.6155147271717066e-06, "loss": 0.78788674, "num_input_tokens_seen": 149328235, "step": 6959, "time_per_iteration": 2.7859151363372803 }, { "auxiliary_loss_clip": 0.01089374, "auxiliary_loss_mlp": 0.00772565, "balance_loss_clip": 1.04304624, "balance_loss_mlp": 1.00090861, "epoch": 0.4184578385690666, "flos": 19754137082880.0, "grad_norm": 2.1131068778060498, "language_loss": 0.77339065, "learning_rate": 2.6151441563944347e-06, "loss": 0.79201001, "num_input_tokens_seen": 149347465, "step": 6960, "time_per_iteration": 2.7497265338897705 }, { "auxiliary_loss_clip": 0.01098942, "auxiliary_loss_mlp": 0.01037539, "balance_loss_clip": 1.04735017, "balance_loss_mlp": 1.02385998, "epoch": 0.41851796182173456, "flos": 20193396503040.0, "grad_norm": 1.8404962312042226, "language_loss": 0.75842559, "learning_rate": 2.614773562290835e-06, "loss": 0.7797904, "num_input_tokens_seen": 149366685, "step": 6961, "time_per_iteration": 2.6800267696380615 }, { "auxiliary_loss_clip": 0.01038031, "auxiliary_loss_mlp": 0.01001682, "balance_loss_clip": 1.03925419, "balance_loss_mlp": 0.99970287, "epoch": 0.41857808507440253, "flos": 59018794231680.0, "grad_norm": 0.7827663866056928, "language_loss": 0.54655838, "learning_rate": 2.61440294487496e-06, "loss": 0.56695551, "num_input_tokens_seen": 149422925, "step": 6962, "time_per_iteration": 3.1537134647369385 }, { "auxiliary_loss_clip": 0.01120288, "auxiliary_loss_mlp": 0.0104634, "balance_loss_clip": 1.04961705, "balance_loss_mlp": 1.0318327, "epoch": 0.4186382083270705, "flos": 18478805719680.0, "grad_norm": 1.960507757786237, "language_loss": 0.85535777, "learning_rate": 2.614032304160864e-06, "loss": 0.87702405, "num_input_tokens_seen": 149440820, "step": 6963, "time_per_iteration": 2.5925374031066895 }, { "auxiliary_loss_clip": 0.01106535, "auxiliary_loss_mlp": 0.01041093, "balance_loss_clip": 1.04856253, "balance_loss_mlp": 1.02657938, "epoch": 0.41869833157973846, "flos": 21578758202880.0, "grad_norm": 1.6555227491445992, "language_loss": 0.70422602, "learning_rate": 2.6136616401626014e-06, "loss": 0.72570229, "num_input_tokens_seen": 149461060, "step": 6964, "time_per_iteration": 2.675595760345459 }, { "auxiliary_loss_clip": 0.01131013, "auxiliary_loss_mlp": 0.01048168, "balance_loss_clip": 1.04926276, "balance_loss_mlp": 1.03433418, "epoch": 0.4187584548324064, "flos": 35517412650240.0, "grad_norm": 2.107779734715906, "language_loss": 0.71486962, "learning_rate": 2.6132909528942273e-06, "loss": 0.73666137, "num_input_tokens_seen": 149483115, "step": 6965, "time_per_iteration": 2.728795289993286 }, { "auxiliary_loss_clip": 0.01081273, "auxiliary_loss_mlp": 0.01038276, "balance_loss_clip": 1.04315698, "balance_loss_mlp": 1.02465594, "epoch": 0.4188185780850744, "flos": 18655880791680.0, "grad_norm": 1.546256806673652, "language_loss": 0.71920437, "learning_rate": 2.6129202423697997e-06, "loss": 0.74039984, "num_input_tokens_seen": 149501495, "step": 6966, "time_per_iteration": 2.9000282287597656 }, { "auxiliary_loss_clip": 0.01127558, "auxiliary_loss_mlp": 0.01037127, "balance_loss_clip": 1.04965436, "balance_loss_mlp": 1.02194023, "epoch": 0.41887870133774235, "flos": 40333428374400.0, "grad_norm": 2.0539481091161664, "language_loss": 0.71188843, "learning_rate": 2.612549508603375e-06, "loss": 0.73353529, "num_input_tokens_seen": 149523170, "step": 6967, "time_per_iteration": 2.8494174480438232 }, { "auxiliary_loss_clip": 0.01059483, "auxiliary_loss_mlp": 0.01001432, "balance_loss_clip": 1.039819, "balance_loss_mlp": 0.99973947, "epoch": 0.4189388245904103, "flos": 61371336516480.0, "grad_norm": 0.6719582962825281, "language_loss": 0.46191829, "learning_rate": 2.612178751609011e-06, "loss": 0.48252743, "num_input_tokens_seen": 149583955, "step": 6968, "time_per_iteration": 3.2362303733825684 }, { "auxiliary_loss_clip": 0.01123461, "auxiliary_loss_mlp": 0.01043061, "balance_loss_clip": 1.04708195, "balance_loss_mlp": 1.02722979, "epoch": 0.4189989478430783, "flos": 28215624119040.0, "grad_norm": 2.2684151977061386, "language_loss": 0.75044996, "learning_rate": 2.6118079714007685e-06, "loss": 0.77211517, "num_input_tokens_seen": 149604440, "step": 6969, "time_per_iteration": 2.836956739425659 }, { "auxiliary_loss_clip": 0.01108551, "auxiliary_loss_mlp": 0.01045091, "balance_loss_clip": 1.0470643, "balance_loss_mlp": 1.03178096, "epoch": 0.4190590710957463, "flos": 24565879088640.0, "grad_norm": 1.9985372976124152, "language_loss": 0.8083396, "learning_rate": 2.611437167992705e-06, "loss": 0.82987607, "num_input_tokens_seen": 149623745, "step": 6970, "time_per_iteration": 2.7209956645965576 }, { "auxiliary_loss_clip": 0.01119916, "auxiliary_loss_mlp": 0.0104141, "balance_loss_clip": 1.04898238, "balance_loss_mlp": 1.02689075, "epoch": 0.41911919434841427, "flos": 21726027964800.0, "grad_norm": 2.2489196165322713, "language_loss": 0.82699662, "learning_rate": 2.6110663413988835e-06, "loss": 0.84860986, "num_input_tokens_seen": 149643025, "step": 6971, "time_per_iteration": 2.6844992637634277 }, { "auxiliary_loss_clip": 0.01105807, "auxiliary_loss_mlp": 0.01047014, "balance_loss_clip": 1.0493474, "balance_loss_mlp": 1.03207135, "epoch": 0.41917931760108224, "flos": 17601543855360.0, "grad_norm": 1.6553402405348427, "language_loss": 0.74262661, "learning_rate": 2.6106954916333648e-06, "loss": 0.76415479, "num_input_tokens_seen": 149660695, "step": 6972, "time_per_iteration": 2.6240105628967285 }, { "auxiliary_loss_clip": 0.01102199, "auxiliary_loss_mlp": 0.01040482, "balance_loss_clip": 1.0421176, "balance_loss_mlp": 1.02589083, "epoch": 0.4192394408537502, "flos": 37816701022080.0, "grad_norm": 1.5708676830874608, "language_loss": 0.72811258, "learning_rate": 2.610324618710212e-06, "loss": 0.74953938, "num_input_tokens_seen": 149682040, "step": 6973, "time_per_iteration": 2.8109309673309326 }, { "auxiliary_loss_clip": 0.01101478, "auxiliary_loss_mlp": 0.01038786, "balance_loss_clip": 1.05107093, "balance_loss_mlp": 1.02461183, "epoch": 0.41929956410641817, "flos": 23107726477440.0, "grad_norm": 1.8294609220169469, "language_loss": 0.74864107, "learning_rate": 2.609953722643489e-06, "loss": 0.77004373, "num_input_tokens_seen": 149700855, "step": 6974, "time_per_iteration": 2.7036855220794678 }, { "auxiliary_loss_clip": 0.01117361, "auxiliary_loss_mlp": 0.01037617, "balance_loss_clip": 1.04402697, "balance_loss_mlp": 1.02359784, "epoch": 0.41935968735908613, "flos": 22524537260160.0, "grad_norm": 1.843462386151443, "language_loss": 0.7271533, "learning_rate": 2.609582803447259e-06, "loss": 0.748703, "num_input_tokens_seen": 149717360, "step": 6975, "time_per_iteration": 2.632661819458008 }, { "auxiliary_loss_clip": 0.01113766, "auxiliary_loss_mlp": 0.01042513, "balance_loss_clip": 1.04679942, "balance_loss_mlp": 1.02849412, "epoch": 0.4194198106117541, "flos": 26870446759680.0, "grad_norm": 1.580698900699299, "language_loss": 0.80874467, "learning_rate": 2.6092118611355885e-06, "loss": 0.83030754, "num_input_tokens_seen": 149738975, "step": 6976, "time_per_iteration": 2.68833327293396 }, { "auxiliary_loss_clip": 0.01098184, "auxiliary_loss_mlp": 0.01042179, "balance_loss_clip": 1.04087496, "balance_loss_mlp": 1.02671123, "epoch": 0.41947993386442206, "flos": 19902412425600.0, "grad_norm": 4.6015574144833264, "language_loss": 0.6767152, "learning_rate": 2.6088408957225425e-06, "loss": 0.69811881, "num_input_tokens_seen": 149757055, "step": 6977, "time_per_iteration": 2.6453959941864014 }, { "auxiliary_loss_clip": 0.01122702, "auxiliary_loss_mlp": 0.0104277, "balance_loss_clip": 1.04980922, "balance_loss_mlp": 1.02926338, "epoch": 0.41954005711709, "flos": 17383889393280.0, "grad_norm": 2.3946463459425966, "language_loss": 0.80506754, "learning_rate": 2.6084699072221898e-06, "loss": 0.82672226, "num_input_tokens_seen": 149772885, "step": 6978, "time_per_iteration": 2.596269369125366 }, { "auxiliary_loss_clip": 0.01133146, "auxiliary_loss_mlp": 0.0103908, "balance_loss_clip": 1.04677558, "balance_loss_mlp": 1.02459598, "epoch": 0.419600180369758, "flos": 25003306915200.0, "grad_norm": 1.7226002389356767, "language_loss": 0.82708085, "learning_rate": 2.6080988956485964e-06, "loss": 0.84880304, "num_input_tokens_seen": 149791515, "step": 6979, "time_per_iteration": 2.588383197784424 }, { "auxiliary_loss_clip": 0.01129014, "auxiliary_loss_mlp": 0.01037351, "balance_loss_clip": 1.04659355, "balance_loss_mlp": 1.02302253, "epoch": 0.41966030362242596, "flos": 17383781652480.0, "grad_norm": 2.4214608222579206, "language_loss": 0.83723533, "learning_rate": 2.6077278610158325e-06, "loss": 0.85889894, "num_input_tokens_seen": 149807250, "step": 6980, "time_per_iteration": 2.5890002250671387 }, { "auxiliary_loss_clip": 0.01132913, "auxiliary_loss_mlp": 0.01043925, "balance_loss_clip": 1.04753232, "balance_loss_mlp": 1.02994215, "epoch": 0.4197204268750939, "flos": 22156165330560.0, "grad_norm": 2.919161771051539, "language_loss": 0.7951659, "learning_rate": 2.6073568033379665e-06, "loss": 0.81693423, "num_input_tokens_seen": 149821640, "step": 6981, "time_per_iteration": 2.6015915870666504 }, { "auxiliary_loss_clip": 0.01096505, "auxiliary_loss_mlp": 0.01037263, "balance_loss_clip": 1.04636097, "balance_loss_mlp": 1.02382243, "epoch": 0.4197805501277619, "flos": 22084128604800.0, "grad_norm": 2.285836698514787, "language_loss": 0.84386683, "learning_rate": 2.6069857226290696e-06, "loss": 0.86520445, "num_input_tokens_seen": 149840545, "step": 6982, "time_per_iteration": 2.755657434463501 }, { "auxiliary_loss_clip": 0.01120032, "auxiliary_loss_mlp": 0.01038775, "balance_loss_clip": 1.04708028, "balance_loss_mlp": 1.02419019, "epoch": 0.4198406733804299, "flos": 26432192920320.0, "grad_norm": 2.941579449236281, "language_loss": 0.57212174, "learning_rate": 2.606614618903214e-06, "loss": 0.59370977, "num_input_tokens_seen": 149860375, "step": 6983, "time_per_iteration": 2.699927568435669 }, { "auxiliary_loss_clip": 0.01120799, "auxiliary_loss_mlp": 0.01037958, "balance_loss_clip": 1.05017662, "balance_loss_mlp": 1.02513719, "epoch": 0.4199007966330979, "flos": 12531029293440.0, "grad_norm": 1.788715678149628, "language_loss": 0.82569104, "learning_rate": 2.606243492174471e-06, "loss": 0.84727859, "num_input_tokens_seen": 149877850, "step": 6984, "time_per_iteration": 2.6608574390411377 }, { "auxiliary_loss_clip": 0.01110821, "auxiliary_loss_mlp": 0.01031336, "balance_loss_clip": 1.04403567, "balance_loss_mlp": 1.01740074, "epoch": 0.41996091988576584, "flos": 21762944167680.0, "grad_norm": 1.8578510762238896, "language_loss": 0.79251826, "learning_rate": 2.605872342456914e-06, "loss": 0.81393987, "num_input_tokens_seen": 149896110, "step": 6985, "time_per_iteration": 2.6915009021759033 }, { "auxiliary_loss_clip": 0.01134356, "auxiliary_loss_mlp": 0.01037444, "balance_loss_clip": 1.04694271, "balance_loss_mlp": 1.02278078, "epoch": 0.4200210431384338, "flos": 26541935948160.0, "grad_norm": 1.6735394330256788, "language_loss": 0.78439772, "learning_rate": 2.6055011697646173e-06, "loss": 0.80611569, "num_input_tokens_seen": 149916495, "step": 6986, "time_per_iteration": 2.6553595066070557 }, { "auxiliary_loss_clip": 0.01108367, "auxiliary_loss_mlp": 0.01032308, "balance_loss_clip": 1.04705167, "balance_loss_mlp": 1.01957011, "epoch": 0.42008116639110177, "flos": 26795824254720.0, "grad_norm": 1.6966099884396408, "language_loss": 0.72624969, "learning_rate": 2.605129974111655e-06, "loss": 0.7476564, "num_input_tokens_seen": 149936445, "step": 6987, "time_per_iteration": 2.7428104877471924 }, { "auxiliary_loss_clip": 0.01105896, "auxiliary_loss_mlp": 0.00774749, "balance_loss_clip": 1.04440594, "balance_loss_mlp": 1.00098395, "epoch": 0.42014128964376973, "flos": 32087333243520.0, "grad_norm": 1.4394465087417463, "language_loss": 0.74992245, "learning_rate": 2.604758755512104e-06, "loss": 0.76872891, "num_input_tokens_seen": 149959430, "step": 6988, "time_per_iteration": 4.499454975128174 }, { "auxiliary_loss_clip": 0.01124153, "auxiliary_loss_mlp": 0.01040193, "balance_loss_clip": 1.04908502, "balance_loss_mlp": 1.02585781, "epoch": 0.4202014128964377, "flos": 26467133875200.0, "grad_norm": 1.6029470393888554, "language_loss": 0.73995304, "learning_rate": 2.60438751398004e-06, "loss": 0.76159656, "num_input_tokens_seen": 149980365, "step": 6989, "time_per_iteration": 2.6979968547821045 }, { "auxiliary_loss_clip": 0.01109104, "auxiliary_loss_mlp": 0.01037728, "balance_loss_clip": 1.04531431, "balance_loss_mlp": 1.02353013, "epoch": 0.42026153614910566, "flos": 13401216178560.0, "grad_norm": 2.8939358842188043, "language_loss": 0.70562875, "learning_rate": 2.6040162495295404e-06, "loss": 0.72709703, "num_input_tokens_seen": 149997375, "step": 6990, "time_per_iteration": 4.2269814014434814 }, { "auxiliary_loss_clip": 0.01052428, "auxiliary_loss_mlp": 0.00753318, "balance_loss_clip": 1.03888559, "balance_loss_mlp": 1.00109041, "epoch": 0.42032165940177363, "flos": 60250457635200.0, "grad_norm": 1.417771869116233, "language_loss": 0.60470819, "learning_rate": 2.603644962174685e-06, "loss": 0.62276566, "num_input_tokens_seen": 150051230, "step": 6991, "time_per_iteration": 4.600361585617065 }, { "auxiliary_loss_clip": 0.01135512, "auxiliary_loss_mlp": 0.01038632, "balance_loss_clip": 1.05044973, "balance_loss_mlp": 1.02417135, "epoch": 0.4203817826544416, "flos": 24535211852160.0, "grad_norm": 1.4766426515770763, "language_loss": 0.832901, "learning_rate": 2.6032736519295517e-06, "loss": 0.85464245, "num_input_tokens_seen": 150071135, "step": 6992, "time_per_iteration": 2.688693046569824 }, { "auxiliary_loss_clip": 0.01058225, "auxiliary_loss_mlp": 0.01016781, "balance_loss_clip": 1.02967906, "balance_loss_mlp": 1.01523161, "epoch": 0.42044190590710956, "flos": 58820781530880.0, "grad_norm": 0.8077151468791776, "language_loss": 0.65494478, "learning_rate": 2.6029023188082217e-06, "loss": 0.67569482, "num_input_tokens_seen": 150125220, "step": 6993, "time_per_iteration": 4.7132039070129395 }, { "auxiliary_loss_clip": 0.011371, "auxiliary_loss_mlp": 0.010386, "balance_loss_clip": 1.04959965, "balance_loss_mlp": 1.02267361, "epoch": 0.4205020291597775, "flos": 16436063260800.0, "grad_norm": 1.948890763784571, "language_loss": 0.83380342, "learning_rate": 2.6025309628247746e-06, "loss": 0.85556042, "num_input_tokens_seen": 150142300, "step": 6994, "time_per_iteration": 2.5883679389953613 }, { "auxiliary_loss_clip": 0.01120964, "auxiliary_loss_mlp": 0.00771063, "balance_loss_clip": 1.04939461, "balance_loss_mlp": 1.00095451, "epoch": 0.4205621524124455, "flos": 18405655672320.0, "grad_norm": 1.5483229522184627, "language_loss": 0.78529471, "learning_rate": 2.6021595839932934e-06, "loss": 0.80421495, "num_input_tokens_seen": 150161345, "step": 6995, "time_per_iteration": 2.716649055480957 }, { "auxiliary_loss_clip": 0.0109323, "auxiliary_loss_mlp": 0.01032227, "balance_loss_clip": 1.04375339, "balance_loss_mlp": 1.01855421, "epoch": 0.4206222756651135, "flos": 25520097841920.0, "grad_norm": 1.4060831947988737, "language_loss": 0.80397403, "learning_rate": 2.60178818232786e-06, "loss": 0.82522857, "num_input_tokens_seen": 150182420, "step": 6996, "time_per_iteration": 2.773655891418457 }, { "auxiliary_loss_clip": 0.01111456, "auxiliary_loss_mlp": 0.00771084, "balance_loss_clip": 1.0477984, "balance_loss_mlp": 1.00100029, "epoch": 0.4206823989177815, "flos": 15304338472320.0, "grad_norm": 1.9934224916744, "language_loss": 0.7558648, "learning_rate": 2.601416757842559e-06, "loss": 0.77469015, "num_input_tokens_seen": 150200175, "step": 6997, "time_per_iteration": 2.6486191749572754 }, { "auxiliary_loss_clip": 0.01130573, "auxiliary_loss_mlp": 0.01042531, "balance_loss_clip": 1.04606771, "balance_loss_mlp": 1.02835727, "epoch": 0.42074252217044944, "flos": 15554096714880.0, "grad_norm": 3.451993658012451, "language_loss": 0.75860173, "learning_rate": 2.6010453105514743e-06, "loss": 0.78033274, "num_input_tokens_seen": 150217100, "step": 6998, "time_per_iteration": 2.548783540725708 }, { "auxiliary_loss_clip": 0.01136566, "auxiliary_loss_mlp": 0.01042996, "balance_loss_clip": 1.05027032, "balance_loss_mlp": 1.02827394, "epoch": 0.4208026454231174, "flos": 26145877610880.0, "grad_norm": 1.6802908884651202, "language_loss": 0.76294345, "learning_rate": 2.60067384046869e-06, "loss": 0.78473908, "num_input_tokens_seen": 150239830, "step": 6999, "time_per_iteration": 2.6605780124664307 }, { "auxiliary_loss_clip": 0.01082307, "auxiliary_loss_mlp": 0.01039523, "balance_loss_clip": 1.04213417, "balance_loss_mlp": 1.02420449, "epoch": 0.42086276867578537, "flos": 23550110380800.0, "grad_norm": 2.828142255503796, "language_loss": 0.64361006, "learning_rate": 2.600302347608295e-06, "loss": 0.66482836, "num_input_tokens_seen": 150260690, "step": 7000, "time_per_iteration": 2.7295126914978027 }, { "auxiliary_loss_clip": 0.01089826, "auxiliary_loss_mlp": 0.01039051, "balance_loss_clip": 1.04259682, "balance_loss_mlp": 1.02433491, "epoch": 0.42092289192845334, "flos": 18113414618880.0, "grad_norm": 2.276209987309232, "language_loss": 0.76550955, "learning_rate": 2.5999308319843743e-06, "loss": 0.78679836, "num_input_tokens_seen": 150279885, "step": 7001, "time_per_iteration": 2.793407917022705 }, { "auxiliary_loss_clip": 0.01091534, "auxiliary_loss_mlp": 0.00771163, "balance_loss_clip": 1.04483819, "balance_loss_mlp": 1.00107491, "epoch": 0.4209830151811213, "flos": 20006588845440.0, "grad_norm": 1.4928891465725471, "language_loss": 0.86682802, "learning_rate": 2.5995592936110154e-06, "loss": 0.88545501, "num_input_tokens_seen": 150297390, "step": 7002, "time_per_iteration": 2.719127655029297 }, { "auxiliary_loss_clip": 0.0109333, "auxiliary_loss_mlp": 0.0103625, "balance_loss_clip": 1.04801917, "balance_loss_mlp": 1.02297568, "epoch": 0.42104313843378927, "flos": 21978946604160.0, "grad_norm": 1.8843999139097827, "language_loss": 0.67807466, "learning_rate": 2.5991877325023096e-06, "loss": 0.6993705, "num_input_tokens_seen": 150317390, "step": 7003, "time_per_iteration": 2.732848882675171 }, { "auxiliary_loss_clip": 0.01132341, "auxiliary_loss_mlp": 0.01035322, "balance_loss_clip": 1.04725492, "balance_loss_mlp": 1.02031398, "epoch": 0.42110326168645723, "flos": 25443966965760.0, "grad_norm": 1.9778982096910334, "language_loss": 0.77774739, "learning_rate": 2.598816148672344e-06, "loss": 0.79942405, "num_input_tokens_seen": 150337455, "step": 7004, "time_per_iteration": 2.630838394165039 }, { "auxiliary_loss_clip": 0.01129987, "auxiliary_loss_mlp": 0.0103854, "balance_loss_clip": 1.04988933, "balance_loss_mlp": 1.02351916, "epoch": 0.4211633849391252, "flos": 17822574195840.0, "grad_norm": 2.0674356984544557, "language_loss": 0.67855948, "learning_rate": 2.59844454213521e-06, "loss": 0.70024478, "num_input_tokens_seen": 150355385, "step": 7005, "time_per_iteration": 2.588533401489258 }, { "auxiliary_loss_clip": 0.01121703, "auxiliary_loss_mlp": 0.01033597, "balance_loss_clip": 1.0483923, "balance_loss_mlp": 1.01941752, "epoch": 0.42122350819179316, "flos": 16282436791680.0, "grad_norm": 1.9633544911967673, "language_loss": 0.72481513, "learning_rate": 2.5980729129049994e-06, "loss": 0.74636805, "num_input_tokens_seen": 150371750, "step": 7006, "time_per_iteration": 2.5879828929901123 }, { "auxiliary_loss_clip": 0.01133912, "auxiliary_loss_mlp": 0.01032205, "balance_loss_clip": 1.04963207, "balance_loss_mlp": 1.01787031, "epoch": 0.4212836314444611, "flos": 19645866512640.0, "grad_norm": 1.722108681435548, "language_loss": 0.70495522, "learning_rate": 2.5977012609958033e-06, "loss": 0.72661638, "num_input_tokens_seen": 150389955, "step": 7007, "time_per_iteration": 2.5199153423309326 }, { "auxiliary_loss_clip": 0.0110564, "auxiliary_loss_mlp": 0.00771949, "balance_loss_clip": 1.04377306, "balance_loss_mlp": 1.00098372, "epoch": 0.4213437546971291, "flos": 18369026778240.0, "grad_norm": 1.772679877033185, "language_loss": 0.82893503, "learning_rate": 2.5973295864217166e-06, "loss": 0.84771085, "num_input_tokens_seen": 150405780, "step": 7008, "time_per_iteration": 2.6636033058166504 }, { "auxiliary_loss_clip": 0.01089865, "auxiliary_loss_mlp": 0.01039598, "balance_loss_clip": 1.04483509, "balance_loss_mlp": 1.02535856, "epoch": 0.42140387794979706, "flos": 27704507541120.0, "grad_norm": 1.895033591472922, "language_loss": 0.72206765, "learning_rate": 2.596957889196831e-06, "loss": 0.74336231, "num_input_tokens_seen": 150425615, "step": 7009, "time_per_iteration": 2.738678216934204 }, { "auxiliary_loss_clip": 0.01132456, "auxiliary_loss_mlp": 0.01030814, "balance_loss_clip": 1.04812074, "balance_loss_mlp": 1.01674712, "epoch": 0.4214640012024651, "flos": 28147071012480.0, "grad_norm": 2.558025018080716, "language_loss": 0.66191494, "learning_rate": 2.596586169335243e-06, "loss": 0.68354768, "num_input_tokens_seen": 150445765, "step": 7010, "time_per_iteration": 2.6812071800231934 }, { "auxiliary_loss_clip": 0.01092262, "auxiliary_loss_mlp": 0.01032367, "balance_loss_clip": 1.0424943, "balance_loss_mlp": 1.01774001, "epoch": 0.42152412445513304, "flos": 22997265177600.0, "grad_norm": 2.024875050938184, "language_loss": 0.72456133, "learning_rate": 2.5962144268510477e-06, "loss": 0.74580765, "num_input_tokens_seen": 150464405, "step": 7011, "time_per_iteration": 2.741454601287842 }, { "auxiliary_loss_clip": 0.01046137, "auxiliary_loss_mlp": 0.01001201, "balance_loss_clip": 1.02718639, "balance_loss_mlp": 0.99971068, "epoch": 0.421584247707801, "flos": 63749592938880.0, "grad_norm": 0.7906258228604641, "language_loss": 0.54322207, "learning_rate": 2.5958426617583417e-06, "loss": 0.56369549, "num_input_tokens_seen": 150520430, "step": 7012, "time_per_iteration": 3.1284689903259277 }, { "auxiliary_loss_clip": 0.01123004, "auxiliary_loss_mlp": 0.01031037, "balance_loss_clip": 1.05000162, "balance_loss_mlp": 1.01663089, "epoch": 0.421644370960469, "flos": 24314612474880.0, "grad_norm": 1.3828895368097467, "language_loss": 0.78401852, "learning_rate": 2.5954708740712215e-06, "loss": 0.80555892, "num_input_tokens_seen": 150542610, "step": 7013, "time_per_iteration": 2.6729819774627686 }, { "auxiliary_loss_clip": 0.01133162, "auxiliary_loss_mlp": 0.01033094, "balance_loss_clip": 1.04858398, "balance_loss_mlp": 1.01826453, "epoch": 0.42170449421313694, "flos": 23440690575360.0, "grad_norm": 1.8094728177732207, "language_loss": 0.81603825, "learning_rate": 2.595099063803787e-06, "loss": 0.83770084, "num_input_tokens_seen": 150560970, "step": 7014, "time_per_iteration": 2.662652015686035 }, { "auxiliary_loss_clip": 0.01117627, "auxiliary_loss_mlp": 0.0103256, "balance_loss_clip": 1.04452634, "balance_loss_mlp": 1.01831448, "epoch": 0.4217646174658049, "flos": 23695476721920.0, "grad_norm": 1.7861369926261594, "language_loss": 0.77908784, "learning_rate": 2.5947272309701354e-06, "loss": 0.80058968, "num_input_tokens_seen": 150582615, "step": 7015, "time_per_iteration": 2.763761043548584 }, { "auxiliary_loss_clip": 0.01132815, "auxiliary_loss_mlp": 0.01036697, "balance_loss_clip": 1.04966104, "balance_loss_mlp": 1.02183151, "epoch": 0.42182474071847287, "flos": 24971562270720.0, "grad_norm": 1.3268186837565954, "language_loss": 0.82412994, "learning_rate": 2.594355375584368e-06, "loss": 0.84582508, "num_input_tokens_seen": 150603640, "step": 7016, "time_per_iteration": 2.771812677383423 }, { "auxiliary_loss_clip": 0.01091213, "auxiliary_loss_mlp": 0.0103466, "balance_loss_clip": 1.04072332, "balance_loss_mlp": 1.01999736, "epoch": 0.42188486397114083, "flos": 22856639431680.0, "grad_norm": 1.813350419138722, "language_loss": 0.68270308, "learning_rate": 2.593983497660586e-06, "loss": 0.70396179, "num_input_tokens_seen": 150622490, "step": 7017, "time_per_iteration": 2.703078508377075 }, { "auxiliary_loss_clip": 0.01045206, "auxiliary_loss_mlp": 0.01012048, "balance_loss_clip": 1.02663231, "balance_loss_mlp": 1.01053989, "epoch": 0.4219449872238088, "flos": 66975700965120.0, "grad_norm": 0.7659311952437052, "language_loss": 0.59381223, "learning_rate": 2.5936115972128895e-06, "loss": 0.61438477, "num_input_tokens_seen": 150689545, "step": 7018, "time_per_iteration": 3.2514843940734863 }, { "auxiliary_loss_clip": 0.01113322, "auxiliary_loss_mlp": 0.01033039, "balance_loss_clip": 1.04147184, "balance_loss_mlp": 1.01840591, "epoch": 0.42200511047647676, "flos": 13115367745920.0, "grad_norm": 2.3056993234384957, "language_loss": 0.75083554, "learning_rate": 2.593239674255382e-06, "loss": 0.77229911, "num_input_tokens_seen": 150707610, "step": 7019, "time_per_iteration": 2.6845014095306396 }, { "auxiliary_loss_clip": 0.01106969, "auxiliary_loss_mlp": 0.01035543, "balance_loss_clip": 1.04650903, "balance_loss_mlp": 1.02023685, "epoch": 0.42206523372914473, "flos": 13991193066240.0, "grad_norm": 1.8835929197669175, "language_loss": 0.69198954, "learning_rate": 2.592867728802166e-06, "loss": 0.71341467, "num_input_tokens_seen": 150724530, "step": 7020, "time_per_iteration": 2.635646343231201 }, { "auxiliary_loss_clip": 0.01107351, "auxiliary_loss_mlp": 0.00771638, "balance_loss_clip": 1.04847479, "balance_loss_mlp": 1.00088549, "epoch": 0.4221253569818127, "flos": 21942317710080.0, "grad_norm": 3.182010152232146, "language_loss": 0.81085485, "learning_rate": 2.592495760867347e-06, "loss": 0.82964474, "num_input_tokens_seen": 150742870, "step": 7021, "time_per_iteration": 2.712358236312866 }, { "auxiliary_loss_clip": 0.0105744, "auxiliary_loss_mlp": 0.01040763, "balance_loss_clip": 1.03628528, "balance_loss_mlp": 1.02439523, "epoch": 0.42218548023448066, "flos": 32192587071360.0, "grad_norm": 1.7516152237568758, "language_loss": 0.70298421, "learning_rate": 2.5921237704650293e-06, "loss": 0.72396624, "num_input_tokens_seen": 150765500, "step": 7022, "time_per_iteration": 2.9338343143463135 }, { "auxiliary_loss_clip": 0.01114774, "auxiliary_loss_mlp": 0.01028964, "balance_loss_clip": 1.0467478, "balance_loss_mlp": 1.01637506, "epoch": 0.4222456034871487, "flos": 30118961894400.0, "grad_norm": 1.5162864908148717, "language_loss": 0.67418218, "learning_rate": 2.5917517576093188e-06, "loss": 0.69561946, "num_input_tokens_seen": 150784945, "step": 7023, "time_per_iteration": 2.7014782428741455 }, { "auxiliary_loss_clip": 0.01101297, "auxiliary_loss_mlp": 0.01042754, "balance_loss_clip": 1.0460372, "balance_loss_mlp": 1.0259577, "epoch": 0.42230572673981664, "flos": 22127904305280.0, "grad_norm": 1.6579428625462107, "language_loss": 0.69768953, "learning_rate": 2.591379722314322e-06, "loss": 0.71913004, "num_input_tokens_seen": 150803120, "step": 7024, "time_per_iteration": 2.8669025897979736 }, { "auxiliary_loss_clip": 0.011321, "auxiliary_loss_mlp": 0.01035188, "balance_loss_clip": 1.04982734, "balance_loss_mlp": 1.02107334, "epoch": 0.4223658499924846, "flos": 22055077480320.0, "grad_norm": 1.7199232023790467, "language_loss": 0.76781225, "learning_rate": 2.591007664594147e-06, "loss": 0.7894851, "num_input_tokens_seen": 150823135, "step": 7025, "time_per_iteration": 2.696200132369995 }, { "auxiliary_loss_clip": 0.01097355, "auxiliary_loss_mlp": 0.01036622, "balance_loss_clip": 1.04367328, "balance_loss_mlp": 1.02268052, "epoch": 0.4224259732451526, "flos": 20410727742720.0, "grad_norm": 1.6766870979897237, "language_loss": 0.79664457, "learning_rate": 2.5906355844629024e-06, "loss": 0.81798434, "num_input_tokens_seen": 150842070, "step": 7026, "time_per_iteration": 2.7131056785583496 }, { "auxiliary_loss_clip": 0.01053, "auxiliary_loss_mlp": 0.00999983, "balance_loss_clip": 1.02519512, "balance_loss_mlp": 0.9985466, "epoch": 0.42248609649782054, "flos": 62846655828480.0, "grad_norm": 0.7210787168966012, "language_loss": 0.61874068, "learning_rate": 2.5902634819346966e-06, "loss": 0.63927048, "num_input_tokens_seen": 150907450, "step": 7027, "time_per_iteration": 3.2111167907714844 }, { "auxiliary_loss_clip": 0.01131577, "auxiliary_loss_mlp": 0.01038162, "balance_loss_clip": 1.05022967, "balance_loss_mlp": 1.02400613, "epoch": 0.4225462197504885, "flos": 26249946289920.0, "grad_norm": 1.8872379728212205, "language_loss": 0.71137869, "learning_rate": 2.5898913570236414e-06, "loss": 0.7330761, "num_input_tokens_seen": 150928040, "step": 7028, "time_per_iteration": 4.185323476791382 }, { "auxiliary_loss_clip": 0.01109127, "auxiliary_loss_mlp": 0.01041278, "balance_loss_clip": 1.04935491, "balance_loss_mlp": 1.02702022, "epoch": 0.42260634300315647, "flos": 20521943228160.0, "grad_norm": 3.7456767842675136, "language_loss": 0.82652044, "learning_rate": 2.589519209743846e-06, "loss": 0.84802449, "num_input_tokens_seen": 150945760, "step": 7029, "time_per_iteration": 2.617464542388916 }, { "auxiliary_loss_clip": 0.01086316, "auxiliary_loss_mlp": 0.01043345, "balance_loss_clip": 1.04393244, "balance_loss_mlp": 1.02826512, "epoch": 0.42266646625582444, "flos": 24316731377280.0, "grad_norm": 1.852504104659585, "language_loss": 0.75125468, "learning_rate": 2.589147040109424e-06, "loss": 0.7725513, "num_input_tokens_seen": 150965665, "step": 7030, "time_per_iteration": 5.787954807281494 }, { "auxiliary_loss_clip": 0.01129772, "auxiliary_loss_mlp": 0.01039193, "balance_loss_clip": 1.04772067, "balance_loss_mlp": 1.02368367, "epoch": 0.4227265895084924, "flos": 24204151175040.0, "grad_norm": 1.9107182577124318, "language_loss": 0.86337131, "learning_rate": 2.588774848134486e-06, "loss": 0.88506097, "num_input_tokens_seen": 150982260, "step": 7031, "time_per_iteration": 2.622174024581909 }, { "auxiliary_loss_clip": 0.01120469, "auxiliary_loss_mlp": 0.01038756, "balance_loss_clip": 1.04873753, "balance_loss_mlp": 1.0234381, "epoch": 0.42278671276116037, "flos": 16909760845440.0, "grad_norm": 1.9974648735142886, "language_loss": 0.73489487, "learning_rate": 2.5884026338331473e-06, "loss": 0.75648719, "num_input_tokens_seen": 150999990, "step": 7032, "time_per_iteration": 2.681155204772949 }, { "auxiliary_loss_clip": 0.01100841, "auxiliary_loss_mlp": 0.01044575, "balance_loss_clip": 1.04449272, "balance_loss_mlp": 1.029531, "epoch": 0.42284683601382833, "flos": 25411073086080.0, "grad_norm": 1.657781585480679, "language_loss": 0.70232797, "learning_rate": 2.5880303972195222e-06, "loss": 0.72378218, "num_input_tokens_seen": 151021105, "step": 7033, "time_per_iteration": 4.264399290084839 }, { "auxiliary_loss_clip": 0.01105188, "auxiliary_loss_mlp": 0.00773118, "balance_loss_clip": 1.04417682, "balance_loss_mlp": 1.00101566, "epoch": 0.4229069592664963, "flos": 23040322606080.0, "grad_norm": 2.084860036541982, "language_loss": 0.90209413, "learning_rate": 2.5876581383077256e-06, "loss": 0.92087722, "num_input_tokens_seen": 151040665, "step": 7034, "time_per_iteration": 2.6903390884399414 }, { "auxiliary_loss_clip": 0.01107447, "auxiliary_loss_mlp": 0.01038024, "balance_loss_clip": 1.04703283, "balance_loss_mlp": 1.02456498, "epoch": 0.42296708251916426, "flos": 26067448264320.0, "grad_norm": 1.854470548564886, "language_loss": 0.77645576, "learning_rate": 2.5872858571118723e-06, "loss": 0.79791045, "num_input_tokens_seen": 151061240, "step": 7035, "time_per_iteration": 2.839463233947754 }, { "auxiliary_loss_clip": 0.01118463, "auxiliary_loss_mlp": 0.01043438, "balance_loss_clip": 1.04904413, "balance_loss_mlp": 1.02879918, "epoch": 0.4230272057718323, "flos": 19458376496640.0, "grad_norm": 1.8047665428966375, "language_loss": 0.82544887, "learning_rate": 2.5869135536460817e-06, "loss": 0.84706789, "num_input_tokens_seen": 151076870, "step": 7036, "time_per_iteration": 2.7344322204589844 }, { "auxiliary_loss_clip": 0.01105244, "auxiliary_loss_mlp": 0.01037982, "balance_loss_clip": 1.04819334, "balance_loss_mlp": 1.02430892, "epoch": 0.42308732902450025, "flos": 22383300983040.0, "grad_norm": 1.7884357315749977, "language_loss": 0.70379841, "learning_rate": 2.58654122792447e-06, "loss": 0.72523069, "num_input_tokens_seen": 151095110, "step": 7037, "time_per_iteration": 2.7701706886291504 }, { "auxiliary_loss_clip": 0.01088589, "auxiliary_loss_mlp": 0.00773432, "balance_loss_clip": 1.04192328, "balance_loss_mlp": 1.00089622, "epoch": 0.4231474522771682, "flos": 20995425331200.0, "grad_norm": 1.6174527275157642, "language_loss": 0.78031301, "learning_rate": 2.586168879961155e-06, "loss": 0.79893327, "num_input_tokens_seen": 151114355, "step": 7038, "time_per_iteration": 2.7142980098724365 }, { "auxiliary_loss_clip": 0.01093843, "auxiliary_loss_mlp": 0.01045553, "balance_loss_clip": 1.04870033, "balance_loss_mlp": 1.02938843, "epoch": 0.4232075755298362, "flos": 14975863574400.0, "grad_norm": 2.472987059089125, "language_loss": 0.67238259, "learning_rate": 2.585796509770259e-06, "loss": 0.69377655, "num_input_tokens_seen": 151131505, "step": 7039, "time_per_iteration": 2.723700761795044 }, { "auxiliary_loss_clip": 0.01126742, "auxiliary_loss_mlp": 0.0103978, "balance_loss_clip": 1.04828668, "balance_loss_mlp": 1.02421153, "epoch": 0.42326769878250414, "flos": 24532661986560.0, "grad_norm": 2.3861719735257627, "language_loss": 0.75643921, "learning_rate": 2.5854241173658996e-06, "loss": 0.77810442, "num_input_tokens_seen": 151151555, "step": 7040, "time_per_iteration": 2.6909239292144775 }, { "auxiliary_loss_clip": 0.01120351, "auxiliary_loss_mlp": 0.01033687, "balance_loss_clip": 1.04682565, "balance_loss_mlp": 1.01907206, "epoch": 0.4233278220351721, "flos": 26870303105280.0, "grad_norm": 1.612614450493485, "language_loss": 0.6520682, "learning_rate": 2.5850517027621996e-06, "loss": 0.67360854, "num_input_tokens_seen": 151172385, "step": 7041, "time_per_iteration": 2.705819845199585 }, { "auxiliary_loss_clip": 0.01105037, "auxiliary_loss_mlp": 0.01044866, "balance_loss_clip": 1.04526758, "balance_loss_mlp": 1.02961886, "epoch": 0.4233879452878401, "flos": 42814927463040.0, "grad_norm": 1.8077043446733942, "language_loss": 0.74725586, "learning_rate": 2.5846792659732803e-06, "loss": 0.76875484, "num_input_tokens_seen": 151194930, "step": 7042, "time_per_iteration": 2.8701279163360596 }, { "auxiliary_loss_clip": 0.01118432, "auxiliary_loss_mlp": 0.01041709, "balance_loss_clip": 1.04900146, "balance_loss_mlp": 1.02783322, "epoch": 0.42344806854050804, "flos": 25229006023680.0, "grad_norm": 1.5999390710673906, "language_loss": 0.82543206, "learning_rate": 2.5843068070132643e-06, "loss": 0.84703344, "num_input_tokens_seen": 151217905, "step": 7043, "time_per_iteration": 2.7351741790771484 }, { "auxiliary_loss_clip": 0.01110906, "auxiliary_loss_mlp": 0.01054459, "balance_loss_clip": 1.04981089, "balance_loss_mlp": 1.0383476, "epoch": 0.423508191793176, "flos": 22778820616320.0, "grad_norm": 4.941461848597107, "language_loss": 0.64840907, "learning_rate": 2.5839343258962763e-06, "loss": 0.67006272, "num_input_tokens_seen": 151234580, "step": 7044, "time_per_iteration": 2.729717969894409 }, { "auxiliary_loss_clip": 0.01118394, "auxiliary_loss_mlp": 0.01056481, "balance_loss_clip": 1.04780793, "balance_loss_mlp": 1.04023242, "epoch": 0.42356831504584397, "flos": 34637493179520.0, "grad_norm": 4.901784512002612, "language_loss": 0.75249708, "learning_rate": 2.5835618226364393e-06, "loss": 0.77424586, "num_input_tokens_seen": 151254765, "step": 7045, "time_per_iteration": 2.768423557281494 }, { "auxiliary_loss_clip": 0.0109684, "auxiliary_loss_mlp": 0.0105935, "balance_loss_clip": 1.04820228, "balance_loss_mlp": 1.04277968, "epoch": 0.42362843829851193, "flos": 17596767346560.0, "grad_norm": 2.3365752409002027, "language_loss": 0.80862033, "learning_rate": 2.5831892972478797e-06, "loss": 0.83018219, "num_input_tokens_seen": 151269045, "step": 7046, "time_per_iteration": 2.778648614883423 }, { "auxiliary_loss_clip": 0.01050075, "auxiliary_loss_mlp": 0.01043729, "balance_loss_clip": 1.04536414, "balance_loss_mlp": 1.02847028, "epoch": 0.4236885615511799, "flos": 22565691267840.0, "grad_norm": 1.629581050390514, "language_loss": 0.76806176, "learning_rate": 2.5828167497447242e-06, "loss": 0.78899974, "num_input_tokens_seen": 151287530, "step": 7047, "time_per_iteration": 2.957385301589966 }, { "auxiliary_loss_clip": 0.01132762, "auxiliary_loss_mlp": 0.01044489, "balance_loss_clip": 1.05149937, "balance_loss_mlp": 1.03061271, "epoch": 0.42374868480384786, "flos": 26469216864000.0, "grad_norm": 2.0123660706562294, "language_loss": 0.68135488, "learning_rate": 2.582444180141098e-06, "loss": 0.70312738, "num_input_tokens_seen": 151308905, "step": 7048, "time_per_iteration": 2.976609468460083 }, { "auxiliary_loss_clip": 0.01119986, "auxiliary_loss_mlp": 0.0104419, "balance_loss_clip": 1.04684722, "balance_loss_mlp": 1.02822733, "epoch": 0.4238088080565159, "flos": 20370220179840.0, "grad_norm": 1.9442365727521234, "language_loss": 0.78292572, "learning_rate": 2.5820715884511307e-06, "loss": 0.80456746, "num_input_tokens_seen": 151326525, "step": 7049, "time_per_iteration": 2.7592408657073975 }, { "auxiliary_loss_clip": 0.01128638, "auxiliary_loss_mlp": 0.0105084, "balance_loss_clip": 1.05336547, "balance_loss_mlp": 1.03632045, "epoch": 0.42386893130918385, "flos": 21172105353600.0, "grad_norm": 1.9473547987347861, "language_loss": 0.82839847, "learning_rate": 2.5816989746889504e-06, "loss": 0.85019326, "num_input_tokens_seen": 151344675, "step": 7050, "time_per_iteration": 2.70487117767334 }, { "auxiliary_loss_clip": 0.01132896, "auxiliary_loss_mlp": 0.01042146, "balance_loss_clip": 1.04812455, "balance_loss_mlp": 1.02791238, "epoch": 0.4239290545618518, "flos": 17675627656320.0, "grad_norm": 2.6140682586064754, "language_loss": 0.73742986, "learning_rate": 2.581326338868687e-06, "loss": 0.75918031, "num_input_tokens_seen": 151360730, "step": 7051, "time_per_iteration": 2.6406943798065186 }, { "auxiliary_loss_clip": 0.01103657, "auxiliary_loss_mlp": 0.0104179, "balance_loss_clip": 1.05070043, "balance_loss_mlp": 1.02773547, "epoch": 0.4239891778145198, "flos": 24314504734080.0, "grad_norm": 1.6610077810318091, "language_loss": 0.86273873, "learning_rate": 2.5809536810044706e-06, "loss": 0.88419318, "num_input_tokens_seen": 151380445, "step": 7052, "time_per_iteration": 2.7759416103363037 }, { "auxiliary_loss_clip": 0.01106373, "auxiliary_loss_mlp": 0.01058935, "balance_loss_clip": 1.04475808, "balance_loss_mlp": 1.04325902, "epoch": 0.42404930106718774, "flos": 20558428467840.0, "grad_norm": 2.094212061505075, "language_loss": 0.72460884, "learning_rate": 2.5805810011104323e-06, "loss": 0.74626195, "num_input_tokens_seen": 151399325, "step": 7053, "time_per_iteration": 2.6969964504241943 }, { "auxiliary_loss_clip": 0.0110264, "auxiliary_loss_mlp": 0.00773448, "balance_loss_clip": 1.05001807, "balance_loss_mlp": 1.00098944, "epoch": 0.4241094243198557, "flos": 22308067946880.0, "grad_norm": 7.333766574531878, "language_loss": 0.82380986, "learning_rate": 2.580208299200704e-06, "loss": 0.84257072, "num_input_tokens_seen": 151417240, "step": 7054, "time_per_iteration": 2.71956205368042 }, { "auxiliary_loss_clip": 0.01052303, "auxiliary_loss_mlp": 0.01036407, "balance_loss_clip": 1.03336191, "balance_loss_mlp": 1.03490484, "epoch": 0.4241695475725237, "flos": 70612445272320.0, "grad_norm": 0.7897337987883358, "language_loss": 0.60378659, "learning_rate": 2.5798355752894183e-06, "loss": 0.62467366, "num_input_tokens_seen": 151476015, "step": 7055, "time_per_iteration": 3.155177116394043 }, { "auxiliary_loss_clip": 0.01136773, "auxiliary_loss_mlp": 0.01045155, "balance_loss_clip": 1.05100691, "balance_loss_mlp": 1.0298965, "epoch": 0.42422967082519164, "flos": 14027462824320.0, "grad_norm": 2.6219010938669998, "language_loss": 0.7752226, "learning_rate": 2.5794628293907107e-06, "loss": 0.79704189, "num_input_tokens_seen": 151492035, "step": 7056, "time_per_iteration": 2.5975699424743652 }, { "auxiliary_loss_clip": 0.01129986, "auxiliary_loss_mlp": 0.01042696, "balance_loss_clip": 1.05187988, "balance_loss_mlp": 1.02583957, "epoch": 0.4242897940778596, "flos": 22345522853760.0, "grad_norm": 2.481094371553488, "language_loss": 0.8406778, "learning_rate": 2.579090061518714e-06, "loss": 0.86240464, "num_input_tokens_seen": 151508970, "step": 7057, "time_per_iteration": 2.690188407897949 }, { "auxiliary_loss_clip": 0.01095967, "auxiliary_loss_mlp": 0.01043613, "balance_loss_clip": 1.04596114, "balance_loss_mlp": 1.02778184, "epoch": 0.42434991733052757, "flos": 22595855713920.0, "grad_norm": 2.565187046091263, "language_loss": 0.83179426, "learning_rate": 2.5787172716875642e-06, "loss": 0.85319012, "num_input_tokens_seen": 151525295, "step": 7058, "time_per_iteration": 2.9978904724121094 }, { "auxiliary_loss_clip": 0.01107732, "auxiliary_loss_mlp": 0.0077171, "balance_loss_clip": 1.04935992, "balance_loss_mlp": 1.000875, "epoch": 0.42441004058319554, "flos": 20011437181440.0, "grad_norm": 1.910708490679684, "language_loss": 0.80493343, "learning_rate": 2.5783444599113973e-06, "loss": 0.82372791, "num_input_tokens_seen": 151544435, "step": 7059, "time_per_iteration": 2.7227041721343994 }, { "auxiliary_loss_clip": 0.01137284, "auxiliary_loss_mlp": 0.01041284, "balance_loss_clip": 1.05036783, "balance_loss_mlp": 1.02469015, "epoch": 0.4244701638358635, "flos": 11144985235200.0, "grad_norm": 2.371195034517477, "language_loss": 0.70500332, "learning_rate": 2.57797162620435e-06, "loss": 0.726789, "num_input_tokens_seen": 151559520, "step": 7060, "time_per_iteration": 2.6058552265167236 }, { "auxiliary_loss_clip": 0.01128623, "auxiliary_loss_mlp": 0.01038609, "balance_loss_clip": 1.05295658, "balance_loss_mlp": 1.02370787, "epoch": 0.42453028708853147, "flos": 23987753688960.0, "grad_norm": 1.575928079295092, "language_loss": 0.7634182, "learning_rate": 2.577598770580562e-06, "loss": 0.78509057, "num_input_tokens_seen": 151579790, "step": 7061, "time_per_iteration": 2.6592459678649902 }, { "auxiliary_loss_clip": 0.01127164, "auxiliary_loss_mlp": 0.01039243, "balance_loss_clip": 1.05133295, "balance_loss_mlp": 1.02308464, "epoch": 0.42459041034119943, "flos": 18406338030720.0, "grad_norm": 2.3470563522902195, "language_loss": 0.73278493, "learning_rate": 2.5772258930541693e-06, "loss": 0.75444901, "num_input_tokens_seen": 151598285, "step": 7062, "time_per_iteration": 2.5925838947296143 }, { "auxiliary_loss_clip": 0.01110528, "auxiliary_loss_mlp": 0.01044189, "balance_loss_clip": 1.05038309, "balance_loss_mlp": 1.02934098, "epoch": 0.42465053359386745, "flos": 20958006337920.0, "grad_norm": 1.735369540351847, "language_loss": 0.66238403, "learning_rate": 2.5768529936393137e-06, "loss": 0.68393123, "num_input_tokens_seen": 151615430, "step": 7063, "time_per_iteration": 2.618459939956665 }, { "auxiliary_loss_clip": 0.0109746, "auxiliary_loss_mlp": 0.00773106, "balance_loss_clip": 1.04320812, "balance_loss_mlp": 1.0009284, "epoch": 0.4247106568465354, "flos": 33106190520960.0, "grad_norm": 1.673900676033667, "language_loss": 0.78570068, "learning_rate": 2.5764800723501354e-06, "loss": 0.80440634, "num_input_tokens_seen": 151637030, "step": 7064, "time_per_iteration": 2.7396399974823 }, { "auxiliary_loss_clip": 0.0113726, "auxiliary_loss_mlp": 0.01038466, "balance_loss_clip": 1.05053115, "balance_loss_mlp": 1.02317119, "epoch": 0.4247707800992034, "flos": 20046916840320.0, "grad_norm": 1.9847642008914126, "language_loss": 0.75471151, "learning_rate": 2.5761071292007736e-06, "loss": 0.77646875, "num_input_tokens_seen": 151655745, "step": 7065, "time_per_iteration": 2.532046318054199 }, { "auxiliary_loss_clip": 0.01124888, "auxiliary_loss_mlp": 0.01038463, "balance_loss_clip": 1.05094182, "balance_loss_mlp": 1.02257848, "epoch": 0.42483090335187135, "flos": 22385132576640.0, "grad_norm": 1.3355357629490912, "language_loss": 0.72402596, "learning_rate": 2.5757341642053725e-06, "loss": 0.74565947, "num_input_tokens_seen": 151678040, "step": 7066, "time_per_iteration": 2.5829319953918457 }, { "auxiliary_loss_clip": 0.01101493, "auxiliary_loss_mlp": 0.01036883, "balance_loss_clip": 1.04836977, "balance_loss_mlp": 1.02044368, "epoch": 0.4248910266045393, "flos": 21356830022400.0, "grad_norm": 2.4907013500628166, "language_loss": 0.80009657, "learning_rate": 2.5753611773780745e-06, "loss": 0.82148039, "num_input_tokens_seen": 151696410, "step": 7067, "time_per_iteration": 2.6051836013793945 }, { "auxiliary_loss_clip": 0.01053553, "auxiliary_loss_mlp": 0.01005501, "balance_loss_clip": 1.02524805, "balance_loss_mlp": 1.00387979, "epoch": 0.4249511498572073, "flos": 64008114099840.0, "grad_norm": 0.9135939410418532, "language_loss": 0.6341064, "learning_rate": 2.574988168733022e-06, "loss": 0.65469694, "num_input_tokens_seen": 151756365, "step": 7068, "time_per_iteration": 4.699309825897217 }, { "auxiliary_loss_clip": 0.0113454, "auxiliary_loss_mlp": 0.01036767, "balance_loss_clip": 1.04911804, "balance_loss_mlp": 1.02070904, "epoch": 0.42501127310987524, "flos": 19607046888960.0, "grad_norm": 1.9072894618048717, "language_loss": 0.72502887, "learning_rate": 2.574615138284361e-06, "loss": 0.74674189, "num_input_tokens_seen": 151775165, "step": 7069, "time_per_iteration": 5.814046382904053 }, { "auxiliary_loss_clip": 0.01136556, "auxiliary_loss_mlp": 0.01039486, "balance_loss_clip": 1.05074239, "balance_loss_mlp": 1.02286839, "epoch": 0.4250713963625432, "flos": 19462326992640.0, "grad_norm": 2.348420544652142, "language_loss": 0.79105788, "learning_rate": 2.5742420860462364e-06, "loss": 0.81281829, "num_input_tokens_seen": 151792620, "step": 7070, "time_per_iteration": 2.6242294311523438 }, { "auxiliary_loss_clip": 0.0112233, "auxiliary_loss_mlp": 0.01033288, "balance_loss_clip": 1.04764843, "balance_loss_mlp": 1.01816082, "epoch": 0.4251315196152112, "flos": 25337707557120.0, "grad_norm": 1.7541837021075046, "language_loss": 0.70184052, "learning_rate": 2.573869012032795e-06, "loss": 0.72339666, "num_input_tokens_seen": 151812850, "step": 7071, "time_per_iteration": 2.6695022583007812 }, { "auxiliary_loss_clip": 0.01134965, "auxiliary_loss_mlp": 0.01034152, "balance_loss_clip": 1.05002129, "balance_loss_mlp": 1.0191201, "epoch": 0.42519164286787914, "flos": 26359186527360.0, "grad_norm": 2.353956848857114, "language_loss": 0.71210682, "learning_rate": 2.5734959162581824e-06, "loss": 0.73379803, "num_input_tokens_seen": 151831785, "step": 7072, "time_per_iteration": 2.654045581817627 }, { "auxiliary_loss_clip": 0.01090703, "auxiliary_loss_mlp": 0.01042672, "balance_loss_clip": 1.04456139, "balance_loss_mlp": 1.02779484, "epoch": 0.4252517661205471, "flos": 26031070765440.0, "grad_norm": 1.5509538260814284, "language_loss": 0.81704801, "learning_rate": 2.5731227987365475e-06, "loss": 0.83838177, "num_input_tokens_seen": 151853885, "step": 7073, "time_per_iteration": 4.4267754554748535 }, { "auxiliary_loss_clip": 0.01117821, "auxiliary_loss_mlp": 0.01035489, "balance_loss_clip": 1.04660416, "balance_loss_mlp": 1.02130294, "epoch": 0.42531188937321507, "flos": 12713635059840.0, "grad_norm": 2.6569023186466914, "language_loss": 0.91360795, "learning_rate": 2.5727496594820386e-06, "loss": 0.93514109, "num_input_tokens_seen": 151871780, "step": 7074, "time_per_iteration": 2.655850887298584 }, { "auxiliary_loss_clip": 0.01128859, "auxiliary_loss_mlp": 0.00774468, "balance_loss_clip": 1.05061221, "balance_loss_mlp": 1.0009917, "epoch": 0.42537201262588303, "flos": 22091670460800.0, "grad_norm": 1.6066127617392931, "language_loss": 0.64610291, "learning_rate": 2.572376498508805e-06, "loss": 0.66513622, "num_input_tokens_seen": 151891600, "step": 7075, "time_per_iteration": 2.7072041034698486 }, { "auxiliary_loss_clip": 0.01097292, "auxiliary_loss_mlp": 0.01030165, "balance_loss_clip": 1.04872322, "balance_loss_mlp": 1.01664686, "epoch": 0.42543213587855105, "flos": 23003119094400.0, "grad_norm": 1.6801281915446873, "language_loss": 0.736256, "learning_rate": 2.5720033158309973e-06, "loss": 0.75753057, "num_input_tokens_seen": 151911330, "step": 7076, "time_per_iteration": 2.7376084327697754 }, { "auxiliary_loss_clip": 0.01107519, "auxiliary_loss_mlp": 0.01042827, "balance_loss_clip": 1.0442965, "balance_loss_mlp": 1.02684128, "epoch": 0.425492259131219, "flos": 25082454533760.0, "grad_norm": 2.293658429237098, "language_loss": 0.78658164, "learning_rate": 2.571630111462766e-06, "loss": 0.80808508, "num_input_tokens_seen": 151930355, "step": 7077, "time_per_iteration": 2.9069621562957764 }, { "auxiliary_loss_clip": 0.01105315, "auxiliary_loss_mlp": 0.01032074, "balance_loss_clip": 1.04497409, "balance_loss_mlp": 1.01881242, "epoch": 0.425552382383887, "flos": 22816850140800.0, "grad_norm": 1.6369769525688158, "language_loss": 0.73094088, "learning_rate": 2.571256885418265e-06, "loss": 0.75231481, "num_input_tokens_seen": 151949695, "step": 7078, "time_per_iteration": 2.728288173675537 }, { "auxiliary_loss_clip": 0.01104463, "auxiliary_loss_mlp": 0.01040077, "balance_loss_clip": 1.04849982, "balance_loss_mlp": 1.02651131, "epoch": 0.42561250563655495, "flos": 13553585671680.0, "grad_norm": 1.8849915988224846, "language_loss": 0.79555357, "learning_rate": 2.5708836377116445e-06, "loss": 0.81699896, "num_input_tokens_seen": 151967640, "step": 7079, "time_per_iteration": 2.6294121742248535 }, { "auxiliary_loss_clip": 0.01125077, "auxiliary_loss_mlp": 0.01035166, "balance_loss_clip": 1.05348229, "balance_loss_mlp": 1.02171898, "epoch": 0.4256726288892229, "flos": 46978303023360.0, "grad_norm": 1.3719098160070018, "language_loss": 0.71853465, "learning_rate": 2.5705103683570592e-06, "loss": 0.7401371, "num_input_tokens_seen": 151994020, "step": 7080, "time_per_iteration": 2.8506548404693604 }, { "auxiliary_loss_clip": 0.01130776, "auxiliary_loss_mlp": 0.01033872, "balance_loss_clip": 1.04765022, "balance_loss_mlp": 1.02025867, "epoch": 0.4257327521418909, "flos": 23586451966080.0, "grad_norm": 2.0309872529354283, "language_loss": 0.80102706, "learning_rate": 2.5701370773686646e-06, "loss": 0.82267356, "num_input_tokens_seen": 152013415, "step": 7081, "time_per_iteration": 2.698814868927002 }, { "auxiliary_loss_clip": 0.01100197, "auxiliary_loss_mlp": 0.01034532, "balance_loss_clip": 1.04303122, "balance_loss_mlp": 1.02065063, "epoch": 0.42579287539455885, "flos": 18989994124800.0, "grad_norm": 1.6770375884870488, "language_loss": 0.81524366, "learning_rate": 2.5697637647606138e-06, "loss": 0.83659089, "num_input_tokens_seen": 152030860, "step": 7082, "time_per_iteration": 2.6388967037200928 }, { "auxiliary_loss_clip": 0.01122609, "auxiliary_loss_mlp": 0.01038264, "balance_loss_clip": 1.05003822, "balance_loss_mlp": 1.02411938, "epoch": 0.4258529986472268, "flos": 25191910252800.0, "grad_norm": 2.777460036178925, "language_loss": 0.70476681, "learning_rate": 2.569390430547065e-06, "loss": 0.72637558, "num_input_tokens_seen": 152050395, "step": 7083, "time_per_iteration": 2.666609048843384 }, { "auxiliary_loss_clip": 0.01045638, "auxiliary_loss_mlp": 0.0101356, "balance_loss_clip": 1.02604496, "balance_loss_mlp": 1.01191545, "epoch": 0.4259131218998948, "flos": 69968280718080.0, "grad_norm": 0.8664420799088798, "language_loss": 0.6701948, "learning_rate": 2.569017074742173e-06, "loss": 0.69078678, "num_input_tokens_seen": 152113555, "step": 7084, "time_per_iteration": 3.25407075881958 }, { "auxiliary_loss_clip": 0.01120239, "auxiliary_loss_mlp": 0.01042728, "balance_loss_clip": 1.04841447, "balance_loss_mlp": 1.02757668, "epoch": 0.42597324515256274, "flos": 18004964480640.0, "grad_norm": 2.05020327260517, "language_loss": 0.78917986, "learning_rate": 2.5686436973600964e-06, "loss": 0.81080949, "num_input_tokens_seen": 152131575, "step": 7085, "time_per_iteration": 2.6294076442718506 }, { "auxiliary_loss_clip": 0.01123765, "auxiliary_loss_mlp": 0.01045859, "balance_loss_clip": 1.05045295, "balance_loss_mlp": 1.03036761, "epoch": 0.4260333684052307, "flos": 15158792563200.0, "grad_norm": 2.015450242409387, "language_loss": 0.76097858, "learning_rate": 2.568270298414995e-06, "loss": 0.78267479, "num_input_tokens_seen": 152149435, "step": 7086, "time_per_iteration": 2.606201648712158 }, { "auxiliary_loss_clip": 0.01107732, "auxiliary_loss_mlp": 0.01040875, "balance_loss_clip": 1.04528451, "balance_loss_mlp": 1.02682662, "epoch": 0.42609349165789867, "flos": 14939342421120.0, "grad_norm": 4.435400492712099, "language_loss": 0.80159658, "learning_rate": 2.5678968779210255e-06, "loss": 0.82308263, "num_input_tokens_seen": 152166860, "step": 7087, "time_per_iteration": 2.6517395973205566 }, { "auxiliary_loss_clip": 0.01113938, "auxiliary_loss_mlp": 0.0103375, "balance_loss_clip": 1.04980528, "balance_loss_mlp": 1.01878285, "epoch": 0.42615361491056664, "flos": 23731961961600.0, "grad_norm": 1.6700745034234148, "language_loss": 0.65982199, "learning_rate": 2.5675234358923505e-06, "loss": 0.68129885, "num_input_tokens_seen": 152187475, "step": 7088, "time_per_iteration": 2.6658740043640137 }, { "auxiliary_loss_clip": 0.01079891, "auxiliary_loss_mlp": 0.01038449, "balance_loss_clip": 1.04348373, "balance_loss_mlp": 1.02308249, "epoch": 0.42621373816323466, "flos": 24936441747840.0, "grad_norm": 2.4696048983575376, "language_loss": 0.68491185, "learning_rate": 2.56714997234313e-06, "loss": 0.70609522, "num_input_tokens_seen": 152207235, "step": 7089, "time_per_iteration": 2.816352128982544 }, { "auxiliary_loss_clip": 0.01083453, "auxiliary_loss_mlp": 0.01038038, "balance_loss_clip": 1.04270887, "balance_loss_mlp": 1.02359009, "epoch": 0.4262738614159026, "flos": 13552975140480.0, "grad_norm": 2.0671888191777623, "language_loss": 0.73030579, "learning_rate": 2.566776487287525e-06, "loss": 0.75152063, "num_input_tokens_seen": 152224240, "step": 7090, "time_per_iteration": 2.801116704940796 }, { "auxiliary_loss_clip": 0.01114766, "auxiliary_loss_mlp": 0.0104358, "balance_loss_clip": 1.0483079, "balance_loss_mlp": 1.02875018, "epoch": 0.4263339846685706, "flos": 29748794284800.0, "grad_norm": 1.7852421559677654, "language_loss": 0.75632602, "learning_rate": 2.5664029807396994e-06, "loss": 0.77790952, "num_input_tokens_seen": 152242595, "step": 7091, "time_per_iteration": 2.779731273651123 }, { "auxiliary_loss_clip": 0.01081578, "auxiliary_loss_mlp": 0.01031582, "balance_loss_clip": 1.04725623, "balance_loss_mlp": 1.01879716, "epoch": 0.42639410792123855, "flos": 16834204586880.0, "grad_norm": 2.1009795194853567, "language_loss": 0.82635152, "learning_rate": 2.5660294527138156e-06, "loss": 0.84748316, "num_input_tokens_seen": 152260840, "step": 7092, "time_per_iteration": 2.7296979427337646 }, { "auxiliary_loss_clip": 0.01113469, "auxiliary_loss_mlp": 0.0104261, "balance_loss_clip": 1.04653692, "balance_loss_mlp": 1.02812648, "epoch": 0.4264542311739065, "flos": 28763118195840.0, "grad_norm": 1.6936837646094385, "language_loss": 0.73936713, "learning_rate": 2.565655903224038e-06, "loss": 0.76092792, "num_input_tokens_seen": 152280580, "step": 7093, "time_per_iteration": 2.738494634628296 }, { "auxiliary_loss_clip": 0.01124772, "auxiliary_loss_mlp": 0.01037897, "balance_loss_clip": 1.05013132, "balance_loss_mlp": 1.02285314, "epoch": 0.4265143544265745, "flos": 24713615727360.0, "grad_norm": 2.248863473367437, "language_loss": 0.69831914, "learning_rate": 2.565282332284532e-06, "loss": 0.71994585, "num_input_tokens_seen": 152298455, "step": 7094, "time_per_iteration": 2.696377754211426 }, { "auxiliary_loss_clip": 0.01102522, "auxiliary_loss_mlp": 0.01035266, "balance_loss_clip": 1.05082488, "balance_loss_mlp": 1.02069819, "epoch": 0.42657447767924245, "flos": 21865971352320.0, "grad_norm": 1.593904094988334, "language_loss": 0.8160966, "learning_rate": 2.564908739909464e-06, "loss": 0.83747452, "num_input_tokens_seen": 152316995, "step": 7095, "time_per_iteration": 2.7906196117401123 }, { "auxiliary_loss_clip": 0.01135526, "auxiliary_loss_mlp": 0.01039866, "balance_loss_clip": 1.05080557, "balance_loss_mlp": 1.02575183, "epoch": 0.4266346009319104, "flos": 21470236237440.0, "grad_norm": 1.8045956329002426, "language_loss": 0.80642307, "learning_rate": 2.5645351261129996e-06, "loss": 0.82817698, "num_input_tokens_seen": 152334800, "step": 7096, "time_per_iteration": 2.7473361492156982 }, { "auxiliary_loss_clip": 0.01130201, "auxiliary_loss_mlp": 0.01033007, "balance_loss_clip": 1.05325663, "balance_loss_mlp": 1.0182128, "epoch": 0.4266947241845784, "flos": 25519379569920.0, "grad_norm": 2.602963129491376, "language_loss": 0.64982784, "learning_rate": 2.5641614909093066e-06, "loss": 0.67145991, "num_input_tokens_seen": 152355175, "step": 7097, "time_per_iteration": 2.683868408203125 }, { "auxiliary_loss_clip": 0.01103674, "auxiliary_loss_mlp": 0.01032153, "balance_loss_clip": 1.04987097, "balance_loss_mlp": 1.01799679, "epoch": 0.42675484743724634, "flos": 26541217676160.0, "grad_norm": 1.7913732947115202, "language_loss": 0.74682045, "learning_rate": 2.5637878343125535e-06, "loss": 0.76817876, "num_input_tokens_seen": 152377245, "step": 7098, "time_per_iteration": 2.7669501304626465 }, { "auxiliary_loss_clip": 0.0112361, "auxiliary_loss_mlp": 0.01029914, "balance_loss_clip": 1.05006361, "balance_loss_mlp": 1.0165925, "epoch": 0.4268149706899143, "flos": 23112718467840.0, "grad_norm": 1.7242280164199693, "language_loss": 0.75574845, "learning_rate": 2.5634141563369086e-06, "loss": 0.77728367, "num_input_tokens_seen": 152396985, "step": 7099, "time_per_iteration": 2.652024507522583 }, { "auxiliary_loss_clip": 0.01113615, "auxiliary_loss_mlp": 0.01044502, "balance_loss_clip": 1.04767907, "balance_loss_mlp": 1.02964246, "epoch": 0.4268750939425823, "flos": 22706532495360.0, "grad_norm": 2.4499059435945956, "language_loss": 0.82854998, "learning_rate": 2.5630404569965432e-06, "loss": 0.85013109, "num_input_tokens_seen": 152415590, "step": 7100, "time_per_iteration": 2.66955304145813 }, { "auxiliary_loss_clip": 0.01114994, "auxiliary_loss_mlp": 0.01038973, "balance_loss_clip": 1.05028403, "balance_loss_mlp": 1.0246973, "epoch": 0.42693521719525024, "flos": 25374875155200.0, "grad_norm": 1.3265740257801202, "language_loss": 0.81932402, "learning_rate": 2.562666736305627e-06, "loss": 0.8408637, "num_input_tokens_seen": 152436735, "step": 7101, "time_per_iteration": 2.734703540802002 }, { "auxiliary_loss_clip": 0.01139197, "auxiliary_loss_mlp": 0.01033271, "balance_loss_clip": 1.0521878, "balance_loss_mlp": 1.01856041, "epoch": 0.42699534044791826, "flos": 18150689957760.0, "grad_norm": 6.39201802797086, "language_loss": 0.72548246, "learning_rate": 2.5622929942783314e-06, "loss": 0.74720716, "num_input_tokens_seen": 152455685, "step": 7102, "time_per_iteration": 2.6193687915802 }, { "auxiliary_loss_clip": 0.01123058, "auxiliary_loss_mlp": 0.01031478, "balance_loss_clip": 1.05015755, "balance_loss_mlp": 1.01770973, "epoch": 0.4270554637005862, "flos": 13698413308800.0, "grad_norm": 2.0187490499372243, "language_loss": 0.83425319, "learning_rate": 2.5619192309288297e-06, "loss": 0.8557986, "num_input_tokens_seen": 152473500, "step": 7103, "time_per_iteration": 2.6151843070983887 }, { "auxiliary_loss_clip": 0.01108466, "auxiliary_loss_mlp": 0.01042825, "balance_loss_clip": 1.04559612, "balance_loss_mlp": 1.02617157, "epoch": 0.4271155869532542, "flos": 17493596507520.0, "grad_norm": 4.588723714988328, "language_loss": 0.74312592, "learning_rate": 2.561545446271294e-06, "loss": 0.76463884, "num_input_tokens_seen": 152491320, "step": 7104, "time_per_iteration": 2.686087131500244 }, { "auxiliary_loss_clip": 0.01118632, "auxiliary_loss_mlp": 0.01030826, "balance_loss_clip": 1.04769945, "balance_loss_mlp": 1.01652098, "epoch": 0.42717571020592215, "flos": 32452293381120.0, "grad_norm": 3.9751824788265226, "language_loss": 0.7515536, "learning_rate": 2.5611716403198987e-06, "loss": 0.77304816, "num_input_tokens_seen": 152511970, "step": 7105, "time_per_iteration": 2.69466495513916 }, { "auxiliary_loss_clip": 0.01138696, "auxiliary_loss_mlp": 0.01032922, "balance_loss_clip": 1.05365109, "balance_loss_mlp": 1.01949859, "epoch": 0.4272358334585901, "flos": 16253062444800.0, "grad_norm": 1.828100931914864, "language_loss": 0.77001148, "learning_rate": 2.560797813088819e-06, "loss": 0.79172766, "num_input_tokens_seen": 152530515, "step": 7106, "time_per_iteration": 2.7470526695251465 }, { "auxiliary_loss_clip": 0.01113386, "auxiliary_loss_mlp": 0.01032071, "balance_loss_clip": 1.05155849, "balance_loss_mlp": 1.01898193, "epoch": 0.4272959567112581, "flos": 24200092938240.0, "grad_norm": 2.105539726439896, "language_loss": 0.79606462, "learning_rate": 2.560423964592229e-06, "loss": 0.81751919, "num_input_tokens_seen": 152549295, "step": 7107, "time_per_iteration": 4.302187919616699 }, { "auxiliary_loss_clip": 0.01084956, "auxiliary_loss_mlp": 0.01035225, "balance_loss_clip": 1.04738021, "balance_loss_mlp": 1.02138472, "epoch": 0.42735607996392605, "flos": 27963495578880.0, "grad_norm": 1.6344878343023064, "language_loss": 0.67924458, "learning_rate": 2.5600500948443075e-06, "loss": 0.70044637, "num_input_tokens_seen": 152570725, "step": 7108, "time_per_iteration": 6.044403314590454 }, { "auxiliary_loss_clip": 0.01110243, "auxiliary_loss_mlp": 0.01038292, "balance_loss_clip": 1.05136764, "balance_loss_mlp": 1.02539325, "epoch": 0.427416203216594, "flos": 20295597674880.0, "grad_norm": 1.7692691179194058, "language_loss": 0.71223509, "learning_rate": 2.5596762038592294e-06, "loss": 0.73372042, "num_input_tokens_seen": 152588950, "step": 7109, "time_per_iteration": 2.6695122718811035 }, { "auxiliary_loss_clip": 0.01120979, "auxiliary_loss_mlp": 0.01033902, "balance_loss_clip": 1.048154, "balance_loss_mlp": 1.01738, "epoch": 0.427476326469262, "flos": 26943955943040.0, "grad_norm": 2.0357298685431595, "language_loss": 0.64665484, "learning_rate": 2.559302291651174e-06, "loss": 0.66820359, "num_input_tokens_seen": 152608965, "step": 7110, "time_per_iteration": 2.6609907150268555 }, { "auxiliary_loss_clip": 0.01132801, "auxiliary_loss_mlp": 0.00771481, "balance_loss_clip": 1.04796886, "balance_loss_mlp": 1.00075054, "epoch": 0.42753644972192995, "flos": 25702847262720.0, "grad_norm": 6.311104463147988, "language_loss": 0.76556361, "learning_rate": 2.5589283582343197e-06, "loss": 0.7846064, "num_input_tokens_seen": 152630220, "step": 7111, "time_per_iteration": 2.704688310623169 }, { "auxiliary_loss_clip": 0.01111143, "auxiliary_loss_mlp": 0.01033331, "balance_loss_clip": 1.05706656, "balance_loss_mlp": 1.01936615, "epoch": 0.4275965729745979, "flos": 18767419499520.0, "grad_norm": 2.0174435424847084, "language_loss": 0.72800988, "learning_rate": 2.558554403622845e-06, "loss": 0.74945462, "num_input_tokens_seen": 152648835, "step": 7112, "time_per_iteration": 4.39399790763855 }, { "auxiliary_loss_clip": 0.01107213, "auxiliary_loss_mlp": 0.01037359, "balance_loss_clip": 1.04838848, "balance_loss_mlp": 1.02366805, "epoch": 0.4276566962272659, "flos": 23764424878080.0, "grad_norm": 1.714295461522007, "language_loss": 0.71427524, "learning_rate": 2.5581804278309323e-06, "loss": 0.73572093, "num_input_tokens_seen": 152668375, "step": 7113, "time_per_iteration": 2.6834428310394287 }, { "auxiliary_loss_clip": 0.01126637, "auxiliary_loss_mlp": 0.01040655, "balance_loss_clip": 1.05207372, "balance_loss_mlp": 1.02700508, "epoch": 0.42771681947993384, "flos": 22492505306880.0, "grad_norm": 1.6108261365545002, "language_loss": 0.61758566, "learning_rate": 2.5578064308727617e-06, "loss": 0.63925862, "num_input_tokens_seen": 152689725, "step": 7114, "time_per_iteration": 2.7341814041137695 }, { "auxiliary_loss_clip": 0.01131369, "auxiliary_loss_mlp": 0.01042209, "balance_loss_clip": 1.05489218, "balance_loss_mlp": 1.02556777, "epoch": 0.42777694273260186, "flos": 25044712318080.0, "grad_norm": 1.6215320240925026, "language_loss": 0.649822, "learning_rate": 2.5574324127625153e-06, "loss": 0.67155778, "num_input_tokens_seen": 152709375, "step": 7115, "time_per_iteration": 2.6360361576080322 }, { "auxiliary_loss_clip": 0.01110467, "auxiliary_loss_mlp": 0.01037107, "balance_loss_clip": 1.04954565, "balance_loss_mlp": 1.02359438, "epoch": 0.4278370659852698, "flos": 18661519226880.0, "grad_norm": 1.8869093124336491, "language_loss": 0.74057275, "learning_rate": 2.5570583735143753e-06, "loss": 0.76204848, "num_input_tokens_seen": 152727510, "step": 7116, "time_per_iteration": 2.701413869857788 }, { "auxiliary_loss_clip": 0.01105537, "auxiliary_loss_mlp": 0.01041231, "balance_loss_clip": 1.04539752, "balance_loss_mlp": 1.02783155, "epoch": 0.4278971892379378, "flos": 27308269635840.0, "grad_norm": 1.8577367375008744, "language_loss": 0.69426787, "learning_rate": 2.5566843131425275e-06, "loss": 0.71573555, "num_input_tokens_seen": 152746670, "step": 7117, "time_per_iteration": 2.740729570388794 }, { "auxiliary_loss_clip": 0.01110879, "auxiliary_loss_mlp": 0.0103835, "balance_loss_clip": 1.05176735, "balance_loss_mlp": 1.02402163, "epoch": 0.42795731249060576, "flos": 12888698970240.0, "grad_norm": 2.8863290375892148, "language_loss": 0.69564569, "learning_rate": 2.5563102316611536e-06, "loss": 0.71713799, "num_input_tokens_seen": 152760545, "step": 7118, "time_per_iteration": 2.7086899280548096 }, { "auxiliary_loss_clip": 0.01092131, "auxiliary_loss_mlp": 0.0104544, "balance_loss_clip": 1.04521, "balance_loss_mlp": 1.03076482, "epoch": 0.4280174357432737, "flos": 33401448316800.0, "grad_norm": 2.453050871280299, "language_loss": 0.74826419, "learning_rate": 2.55593612908444e-06, "loss": 0.76963991, "num_input_tokens_seen": 152780970, "step": 7119, "time_per_iteration": 2.805619239807129 }, { "auxiliary_loss_clip": 0.01069167, "auxiliary_loss_mlp": 0.01038035, "balance_loss_clip": 1.0436008, "balance_loss_mlp": 1.02377188, "epoch": 0.4280775589959417, "flos": 18259104182400.0, "grad_norm": 1.842272720773601, "language_loss": 0.75238574, "learning_rate": 2.555562005426573e-06, "loss": 0.77345783, "num_input_tokens_seen": 152798475, "step": 7120, "time_per_iteration": 2.8678669929504395 }, { "auxiliary_loss_clip": 0.01112615, "auxiliary_loss_mlp": 0.00770364, "balance_loss_clip": 1.05290043, "balance_loss_mlp": 1.00063229, "epoch": 0.42813768224860965, "flos": 21471277731840.0, "grad_norm": 1.7037705311845839, "language_loss": 0.76884449, "learning_rate": 2.5551878607017385e-06, "loss": 0.78767425, "num_input_tokens_seen": 152817555, "step": 7121, "time_per_iteration": 2.776524305343628 }, { "auxiliary_loss_clip": 0.01114442, "auxiliary_loss_mlp": 0.01034456, "balance_loss_clip": 1.05325198, "balance_loss_mlp": 1.02162266, "epoch": 0.4281978055012776, "flos": 15669262696320.0, "grad_norm": 1.9187062544957278, "language_loss": 0.85698652, "learning_rate": 2.554813694924126e-06, "loss": 0.87847555, "num_input_tokens_seen": 152836295, "step": 7122, "time_per_iteration": 2.7109732627868652 }, { "auxiliary_loss_clip": 0.01083707, "auxiliary_loss_mlp": 0.01035803, "balance_loss_clip": 1.04868889, "balance_loss_mlp": 1.02191544, "epoch": 0.4282579287539456, "flos": 17712005155200.0, "grad_norm": 2.4146794334180632, "language_loss": 0.81251013, "learning_rate": 2.554439508107921e-06, "loss": 0.83370531, "num_input_tokens_seen": 152854950, "step": 7123, "time_per_iteration": 2.7866828441619873 }, { "auxiliary_loss_clip": 0.01090954, "auxiliary_loss_mlp": 0.01034043, "balance_loss_clip": 1.04922438, "balance_loss_mlp": 1.02011371, "epoch": 0.42831805200661355, "flos": 19281157770240.0, "grad_norm": 1.7481094896376608, "language_loss": 0.81089389, "learning_rate": 2.5540653002673153e-06, "loss": 0.8321439, "num_input_tokens_seen": 152873995, "step": 7124, "time_per_iteration": 2.733530044555664 }, { "auxiliary_loss_clip": 0.01125145, "auxiliary_loss_mlp": 0.01037816, "balance_loss_clip": 1.05205929, "balance_loss_mlp": 1.02334404, "epoch": 0.4283781752592815, "flos": 19792633484160.0, "grad_norm": 1.8145132685178345, "language_loss": 0.80230892, "learning_rate": 2.553691071416498e-06, "loss": 0.82393849, "num_input_tokens_seen": 152892925, "step": 7125, "time_per_iteration": 2.635104179382324 }, { "auxiliary_loss_clip": 0.01132021, "auxiliary_loss_mlp": 0.0076966, "balance_loss_clip": 1.05282855, "balance_loss_mlp": 1.00061083, "epoch": 0.4284382985119495, "flos": 16508064072960.0, "grad_norm": 1.8752935538071442, "language_loss": 0.74911773, "learning_rate": 2.553316821569659e-06, "loss": 0.76813453, "num_input_tokens_seen": 152910935, "step": 7126, "time_per_iteration": 2.605344772338867 }, { "auxiliary_loss_clip": 0.01124108, "auxiliary_loss_mlp": 0.01031402, "balance_loss_clip": 1.05336213, "balance_loss_mlp": 1.01742435, "epoch": 0.42849842176461744, "flos": 23330767979520.0, "grad_norm": 4.135943969267594, "language_loss": 0.80782413, "learning_rate": 2.5529425507409913e-06, "loss": 0.82937926, "num_input_tokens_seen": 152931030, "step": 7127, "time_per_iteration": 2.662910223007202 }, { "auxiliary_loss_clip": 0.01088729, "auxiliary_loss_mlp": 0.0104147, "balance_loss_clip": 1.04972112, "balance_loss_mlp": 1.02753484, "epoch": 0.4285585450172854, "flos": 17274433674240.0, "grad_norm": 2.1393882563291773, "language_loss": 0.76243544, "learning_rate": 2.5525682589446867e-06, "loss": 0.78373742, "num_input_tokens_seen": 152948085, "step": 7128, "time_per_iteration": 2.7230868339538574 }, { "auxiliary_loss_clip": 0.01089264, "auxiliary_loss_mlp": 0.01035924, "balance_loss_clip": 1.04796708, "balance_loss_mlp": 1.02163041, "epoch": 0.42861866826995343, "flos": 24279599692800.0, "grad_norm": 1.945213992632333, "language_loss": 0.74079603, "learning_rate": 2.552193946194937e-06, "loss": 0.76204789, "num_input_tokens_seen": 152966265, "step": 7129, "time_per_iteration": 2.775891065597534 }, { "auxiliary_loss_clip": 0.01127944, "auxiliary_loss_mlp": 0.00770117, "balance_loss_clip": 1.05684757, "balance_loss_mlp": 1.0005461, "epoch": 0.4286787915226214, "flos": 24353108876160.0, "grad_norm": 1.5710338967277158, "language_loss": 0.77974319, "learning_rate": 2.5518196125059394e-06, "loss": 0.79872382, "num_input_tokens_seen": 152986775, "step": 7130, "time_per_iteration": 2.6977498531341553 }, { "auxiliary_loss_clip": 0.01119463, "auxiliary_loss_mlp": 0.01035523, "balance_loss_clip": 1.05768883, "balance_loss_mlp": 1.02184367, "epoch": 0.42873891477528936, "flos": 15449992122240.0, "grad_norm": 2.320631391566952, "language_loss": 0.73168224, "learning_rate": 2.551445257891886e-06, "loss": 0.75323212, "num_input_tokens_seen": 153003595, "step": 7131, "time_per_iteration": 2.6973114013671875 }, { "auxiliary_loss_clip": 0.01116554, "auxiliary_loss_mlp": 0.0103667, "balance_loss_clip": 1.05293584, "balance_loss_mlp": 1.02260923, "epoch": 0.4287990380279573, "flos": 17639573379840.0, "grad_norm": 5.223518802520722, "language_loss": 0.77257997, "learning_rate": 2.551070882366973e-06, "loss": 0.79411221, "num_input_tokens_seen": 153021960, "step": 7132, "time_per_iteration": 2.644556999206543 }, { "auxiliary_loss_clip": 0.01097397, "auxiliary_loss_mlp": 0.00771143, "balance_loss_clip": 1.05195022, "balance_loss_mlp": 1.00064743, "epoch": 0.4288591612806253, "flos": 27162328677120.0, "grad_norm": 2.003525879431933, "language_loss": 0.78719372, "learning_rate": 2.550696485945397e-06, "loss": 0.80587912, "num_input_tokens_seen": 153042110, "step": 7133, "time_per_iteration": 2.7668325901031494 }, { "auxiliary_loss_clip": 0.01111172, "auxiliary_loss_mlp": 0.01034109, "balance_loss_clip": 1.05302238, "balance_loss_mlp": 1.02091813, "epoch": 0.42891928453329325, "flos": 17163182275200.0, "grad_norm": 1.850568768068126, "language_loss": 0.7449469, "learning_rate": 2.550322068641355e-06, "loss": 0.76639962, "num_input_tokens_seen": 153058925, "step": 7134, "time_per_iteration": 2.714893341064453 }, { "auxiliary_loss_clip": 0.01112422, "auxiliary_loss_mlp": 0.01035997, "balance_loss_clip": 1.04541016, "balance_loss_mlp": 1.02241349, "epoch": 0.4289794077859612, "flos": 18187031543040.0, "grad_norm": 1.9214467858451951, "language_loss": 0.84098607, "learning_rate": 2.5499476304690455e-06, "loss": 0.86247027, "num_input_tokens_seen": 153078070, "step": 7135, "time_per_iteration": 2.646799325942993 }, { "auxiliary_loss_clip": 0.01060089, "auxiliary_loss_mlp": 0.01040969, "balance_loss_clip": 1.04197621, "balance_loss_mlp": 1.02555561, "epoch": 0.4290395310386292, "flos": 28256885867520.0, "grad_norm": 2.1625216270915493, "language_loss": 0.75274026, "learning_rate": 2.549573171442666e-06, "loss": 0.77375078, "num_input_tokens_seen": 153096680, "step": 7136, "time_per_iteration": 2.809598207473755 }, { "auxiliary_loss_clip": 0.0112086, "auxiliary_loss_mlp": 0.0103731, "balance_loss_clip": 1.04999709, "balance_loss_mlp": 1.02323103, "epoch": 0.42909965429129715, "flos": 16216074414720.0, "grad_norm": 2.3663507288699743, "language_loss": 0.79031229, "learning_rate": 2.5491986915764175e-06, "loss": 0.81189406, "num_input_tokens_seen": 153113305, "step": 7137, "time_per_iteration": 2.5979957580566406 }, { "auxiliary_loss_clip": 0.01139951, "auxiliary_loss_mlp": 0.01034457, "balance_loss_clip": 1.05516219, "balance_loss_mlp": 1.02047372, "epoch": 0.4291597775439651, "flos": 23112862122240.0, "grad_norm": 2.7024255480814166, "language_loss": 0.76951313, "learning_rate": 2.548824190884499e-06, "loss": 0.7912572, "num_input_tokens_seen": 153132735, "step": 7138, "time_per_iteration": 2.659080982208252 }, { "auxiliary_loss_clip": 0.01053167, "auxiliary_loss_mlp": 0.01001874, "balance_loss_clip": 1.04265583, "balance_loss_mlp": 1.000193, "epoch": 0.4292199007966331, "flos": 67546212681600.0, "grad_norm": 0.770527259841848, "language_loss": 0.56189907, "learning_rate": 2.548449669381113e-06, "loss": 0.58244956, "num_input_tokens_seen": 153187925, "step": 7139, "time_per_iteration": 3.10082745552063 }, { "auxiliary_loss_clip": 0.0113097, "auxiliary_loss_mlp": 0.00769947, "balance_loss_clip": 1.05131912, "balance_loss_mlp": 1.00071657, "epoch": 0.42928002404930105, "flos": 22999850956800.0, "grad_norm": 2.111862554587806, "language_loss": 0.80871445, "learning_rate": 2.5480751270804595e-06, "loss": 0.82772362, "num_input_tokens_seen": 153206990, "step": 7140, "time_per_iteration": 2.795779228210449 }, { "auxiliary_loss_clip": 0.01122496, "auxiliary_loss_mlp": 0.01032486, "balance_loss_clip": 1.05028069, "balance_loss_mlp": 1.01853812, "epoch": 0.429340147301969, "flos": 11544922241280.0, "grad_norm": 1.8811141343222446, "language_loss": 0.82105601, "learning_rate": 2.5477005639967424e-06, "loss": 0.84260583, "num_input_tokens_seen": 153222345, "step": 7141, "time_per_iteration": 2.7634544372558594 }, { "auxiliary_loss_clip": 0.0112355, "auxiliary_loss_mlp": 0.0103971, "balance_loss_clip": 1.05177212, "balance_loss_mlp": 1.02569723, "epoch": 0.42940027055463703, "flos": 25264988472960.0, "grad_norm": 3.1732751781519566, "language_loss": 0.86466211, "learning_rate": 2.547325980144166e-06, "loss": 0.88629478, "num_input_tokens_seen": 153240570, "step": 7142, "time_per_iteration": 2.73675537109375 }, { "auxiliary_loss_clip": 0.01107323, "auxiliary_loss_mlp": 0.0103324, "balance_loss_clip": 1.05093384, "balance_loss_mlp": 1.02018034, "epoch": 0.429460393807305, "flos": 23805004268160.0, "grad_norm": 2.0666274749088704, "language_loss": 0.78651458, "learning_rate": 2.5469513755369323e-06, "loss": 0.80792016, "num_input_tokens_seen": 153259575, "step": 7143, "time_per_iteration": 2.704951047897339 }, { "auxiliary_loss_clip": 0.01085856, "auxiliary_loss_mlp": 0.01042533, "balance_loss_clip": 1.04870784, "balance_loss_mlp": 1.02862692, "epoch": 0.42952051705997296, "flos": 13918294414080.0, "grad_norm": 1.8720341937391007, "language_loss": 0.77237451, "learning_rate": 2.5465767501892484e-06, "loss": 0.79365838, "num_input_tokens_seen": 153276650, "step": 7144, "time_per_iteration": 2.8080482482910156 }, { "auxiliary_loss_clip": 0.01111048, "auxiliary_loss_mlp": 0.01029176, "balance_loss_clip": 1.05582607, "balance_loss_mlp": 1.01565719, "epoch": 0.4295806403126409, "flos": 26760380509440.0, "grad_norm": 2.7559580952375335, "language_loss": 0.73788631, "learning_rate": 2.54620210411532e-06, "loss": 0.75928855, "num_input_tokens_seen": 153298025, "step": 7145, "time_per_iteration": 2.876610040664673 }, { "auxiliary_loss_clip": 0.01124065, "auxiliary_loss_mlp": 0.01036309, "balance_loss_clip": 1.05205083, "balance_loss_mlp": 1.02291536, "epoch": 0.4296407635653089, "flos": 20952619297920.0, "grad_norm": 2.2535739124191623, "language_loss": 0.78997326, "learning_rate": 2.545827437329352e-06, "loss": 0.81157696, "num_input_tokens_seen": 153315775, "step": 7146, "time_per_iteration": 4.237323999404907 }, { "auxiliary_loss_clip": 0.01118325, "auxiliary_loss_mlp": 0.01033233, "balance_loss_clip": 1.04862475, "balance_loss_mlp": 1.02041841, "epoch": 0.42970088681797686, "flos": 15852335339520.0, "grad_norm": 2.134935554118882, "language_loss": 0.83125973, "learning_rate": 2.5454527498455532e-06, "loss": 0.85277522, "num_input_tokens_seen": 153332765, "step": 7147, "time_per_iteration": 4.170353412628174 }, { "auxiliary_loss_clip": 0.01120236, "auxiliary_loss_mlp": 0.01036768, "balance_loss_clip": 1.05321455, "balance_loss_mlp": 1.02217066, "epoch": 0.4297610100706448, "flos": 22382618624640.0, "grad_norm": 2.0255914888463837, "language_loss": 0.87308717, "learning_rate": 2.545078041678131e-06, "loss": 0.89465714, "num_input_tokens_seen": 153350760, "step": 7148, "time_per_iteration": 4.25404167175293 }, { "auxiliary_loss_clip": 0.01106949, "auxiliary_loss_mlp": 0.0103361, "balance_loss_clip": 1.0480504, "balance_loss_mlp": 1.02031255, "epoch": 0.4298211333233128, "flos": 27925681536000.0, "grad_norm": 1.5866853205406048, "language_loss": 0.77782673, "learning_rate": 2.5447033128412957e-06, "loss": 0.79923236, "num_input_tokens_seen": 153370765, "step": 7149, "time_per_iteration": 2.7506890296936035 }, { "auxiliary_loss_clip": 0.01089941, "auxiliary_loss_mlp": 0.01034354, "balance_loss_clip": 1.04399276, "balance_loss_mlp": 1.02023959, "epoch": 0.42988125657598075, "flos": 24425612478720.0, "grad_norm": 1.8521512589115583, "language_loss": 0.80214548, "learning_rate": 2.544328563349256e-06, "loss": 0.8233884, "num_input_tokens_seen": 153390725, "step": 7150, "time_per_iteration": 2.7500832080841064 }, { "auxiliary_loss_clip": 0.01129377, "auxiliary_loss_mlp": 0.01039727, "balance_loss_clip": 1.05486202, "balance_loss_mlp": 1.02441442, "epoch": 0.4299413798286487, "flos": 15850180523520.0, "grad_norm": 1.9985895227285218, "language_loss": 0.75273871, "learning_rate": 2.5439537932162222e-06, "loss": 0.7744298, "num_input_tokens_seen": 153408010, "step": 7151, "time_per_iteration": 5.016021251678467 }, { "auxiliary_loss_clip": 0.01085345, "auxiliary_loss_mlp": 0.01034438, "balance_loss_clip": 1.0429914, "balance_loss_mlp": 1.02001333, "epoch": 0.4300015030813167, "flos": 22309504490880.0, "grad_norm": 2.1817188720110954, "language_loss": 0.70050609, "learning_rate": 2.543579002456406e-06, "loss": 0.72170389, "num_input_tokens_seen": 153426865, "step": 7152, "time_per_iteration": 2.7800815105438232 }, { "auxiliary_loss_clip": 0.01111211, "auxiliary_loss_mlp": 0.01037662, "balance_loss_clip": 1.04997575, "balance_loss_mlp": 1.02443016, "epoch": 0.43006162633398465, "flos": 34897666366080.0, "grad_norm": 1.6446083910432685, "language_loss": 0.71179092, "learning_rate": 2.54320419108402e-06, "loss": 0.73327965, "num_input_tokens_seen": 153449410, "step": 7153, "time_per_iteration": 2.829648017883301 }, { "auxiliary_loss_clip": 0.01119902, "auxiliary_loss_mlp": 0.01033204, "balance_loss_clip": 1.0488553, "balance_loss_mlp": 1.01928604, "epoch": 0.4301217495866526, "flos": 15961575576960.0, "grad_norm": 1.892610527455045, "language_loss": 0.78175116, "learning_rate": 2.542829359113276e-06, "loss": 0.80328226, "num_input_tokens_seen": 153467910, "step": 7154, "time_per_iteration": 2.723484516143799 }, { "auxiliary_loss_clip": 0.01099683, "auxiliary_loss_mlp": 0.01040214, "balance_loss_clip": 1.04681695, "balance_loss_mlp": 1.02599812, "epoch": 0.43018187283932063, "flos": 18770364414720.0, "grad_norm": 1.5463056134535458, "language_loss": 0.78802991, "learning_rate": 2.542454506558389e-06, "loss": 0.80942887, "num_input_tokens_seen": 153487100, "step": 7155, "time_per_iteration": 2.7014451026916504 }, { "auxiliary_loss_clip": 0.01105109, "auxiliary_loss_mlp": 0.01032701, "balance_loss_clip": 1.04913473, "balance_loss_mlp": 1.01963592, "epoch": 0.4302419960919886, "flos": 20151703791360.0, "grad_norm": 1.7272401238355637, "language_loss": 0.88303947, "learning_rate": 2.5420796334335723e-06, "loss": 0.90441763, "num_input_tokens_seen": 153505565, "step": 7156, "time_per_iteration": 2.696967363357544 }, { "auxiliary_loss_clip": 0.01135167, "auxiliary_loss_mlp": 0.01033618, "balance_loss_clip": 1.05029023, "balance_loss_mlp": 1.01970661, "epoch": 0.43030211934465656, "flos": 26432731624320.0, "grad_norm": 1.8553568722970555, "language_loss": 0.82653069, "learning_rate": 2.541704739753042e-06, "loss": 0.84821856, "num_input_tokens_seen": 153526130, "step": 7157, "time_per_iteration": 2.706956148147583 }, { "auxiliary_loss_clip": 0.01138655, "auxiliary_loss_mlp": 0.01033446, "balance_loss_clip": 1.05253196, "balance_loss_mlp": 1.0191586, "epoch": 0.43036224259732453, "flos": 24389234979840.0, "grad_norm": 1.8412394525159426, "language_loss": 0.71535289, "learning_rate": 2.5413298255310132e-06, "loss": 0.73707396, "num_input_tokens_seen": 153546370, "step": 7158, "time_per_iteration": 2.717587471008301 }, { "auxiliary_loss_clip": 0.01122952, "auxiliary_loss_mlp": 0.01034381, "balance_loss_clip": 1.05053186, "balance_loss_mlp": 1.02094615, "epoch": 0.4304223658499925, "flos": 17201714590080.0, "grad_norm": 2.4063235591116, "language_loss": 0.82592964, "learning_rate": 2.5409548907817034e-06, "loss": 0.84750295, "num_input_tokens_seen": 153562800, "step": 7159, "time_per_iteration": 2.657625436782837 }, { "auxiliary_loss_clip": 0.01105982, "auxiliary_loss_mlp": 0.01034344, "balance_loss_clip": 1.04629135, "balance_loss_mlp": 1.02073002, "epoch": 0.43048248910266046, "flos": 14903000835840.0, "grad_norm": 2.253245664419059, "language_loss": 0.83222294, "learning_rate": 2.54057993551933e-06, "loss": 0.85362625, "num_input_tokens_seen": 153578395, "step": 7160, "time_per_iteration": 2.6994106769561768 }, { "auxiliary_loss_clip": 0.0112897, "auxiliary_loss_mlp": 0.01040585, "balance_loss_clip": 1.05215347, "balance_loss_mlp": 1.02446771, "epoch": 0.4305426123553284, "flos": 21579835610880.0, "grad_norm": 2.2814219127337236, "language_loss": 0.77506208, "learning_rate": 2.5402049597581116e-06, "loss": 0.79675758, "num_input_tokens_seen": 153596880, "step": 7161, "time_per_iteration": 2.819274425506592 }, { "auxiliary_loss_clip": 0.01120227, "auxiliary_loss_mlp": 0.0103714, "balance_loss_clip": 1.04739952, "balance_loss_mlp": 1.02265632, "epoch": 0.4306027356079964, "flos": 22601278667520.0, "grad_norm": 2.279224529598255, "language_loss": 0.73028505, "learning_rate": 2.5398299635122662e-06, "loss": 0.75185871, "num_input_tokens_seen": 153616570, "step": 7162, "time_per_iteration": 2.62280011177063 }, { "auxiliary_loss_clip": 0.01016488, "auxiliary_loss_mlp": 0.00753107, "balance_loss_clip": 1.02147388, "balance_loss_mlp": 1.00100327, "epoch": 0.43066285886066435, "flos": 70672091806080.0, "grad_norm": 0.7910606346239517, "language_loss": 0.58986276, "learning_rate": 2.5394549467960147e-06, "loss": 0.60755867, "num_input_tokens_seen": 153671450, "step": 7163, "time_per_iteration": 3.1325736045837402 }, { "auxiliary_loss_clip": 0.01104143, "auxiliary_loss_mlp": 0.01044649, "balance_loss_clip": 1.04593122, "balance_loss_mlp": 1.02948582, "epoch": 0.4307229821133323, "flos": 26720591218560.0, "grad_norm": 1.8311930089659938, "language_loss": 0.79205155, "learning_rate": 2.5390799096235783e-06, "loss": 0.81353945, "num_input_tokens_seen": 153691405, "step": 7164, "time_per_iteration": 2.753256320953369 }, { "auxiliary_loss_clip": 0.01138029, "auxiliary_loss_mlp": 0.01040201, "balance_loss_clip": 1.0510416, "balance_loss_mlp": 1.02608645, "epoch": 0.4307831053660003, "flos": 26177119464960.0, "grad_norm": 2.032413289263653, "language_loss": 0.67551947, "learning_rate": 2.538704852009177e-06, "loss": 0.69730175, "num_input_tokens_seen": 153711555, "step": 7165, "time_per_iteration": 2.719172477722168 }, { "auxiliary_loss_clip": 0.01106688, "auxiliary_loss_mlp": 0.00771886, "balance_loss_clip": 1.05042744, "balance_loss_mlp": 1.00068462, "epoch": 0.43084322861866825, "flos": 18910343715840.0, "grad_norm": 2.1027726489364436, "language_loss": 0.75451279, "learning_rate": 2.538329773967034e-06, "loss": 0.77329856, "num_input_tokens_seen": 153730095, "step": 7166, "time_per_iteration": 2.710304021835327 }, { "auxiliary_loss_clip": 0.01126475, "auxiliary_loss_mlp": 0.01036095, "balance_loss_clip": 1.05613852, "balance_loss_mlp": 1.02310109, "epoch": 0.4309033518713362, "flos": 26432911192320.0, "grad_norm": 1.6200122801673495, "language_loss": 0.71809006, "learning_rate": 2.537954675511372e-06, "loss": 0.7397157, "num_input_tokens_seen": 153749320, "step": 7167, "time_per_iteration": 2.676224946975708 }, { "auxiliary_loss_clip": 0.01104337, "auxiliary_loss_mlp": 0.00771035, "balance_loss_clip": 1.04866242, "balance_loss_mlp": 1.00059962, "epoch": 0.43096347512400424, "flos": 21213295274880.0, "grad_norm": 1.6573858575043368, "language_loss": 0.78183687, "learning_rate": 2.537579556656414e-06, "loss": 0.80059052, "num_input_tokens_seen": 153767825, "step": 7168, "time_per_iteration": 2.8030035495758057 }, { "auxiliary_loss_clip": 0.01111425, "auxiliary_loss_mlp": 0.0104262, "balance_loss_clip": 1.05006397, "balance_loss_mlp": 1.02867889, "epoch": 0.4310235983766722, "flos": 16540131939840.0, "grad_norm": 1.8701517899109106, "language_loss": 0.82348084, "learning_rate": 2.537204417416387e-06, "loss": 0.84502125, "num_input_tokens_seen": 153785350, "step": 7169, "time_per_iteration": 2.683119773864746 }, { "auxiliary_loss_clip": 0.01047083, "auxiliary_loss_mlp": 0.01001288, "balance_loss_clip": 1.03727269, "balance_loss_mlp": 0.99934483, "epoch": 0.43108372162934017, "flos": 64775704763520.0, "grad_norm": 0.7280845280825856, "language_loss": 0.60741472, "learning_rate": 2.5368292578055132e-06, "loss": 0.6278984, "num_input_tokens_seen": 153856400, "step": 7170, "time_per_iteration": 3.345574140548706 }, { "auxiliary_loss_clip": 0.01135698, "auxiliary_loss_mlp": 0.01037021, "balance_loss_clip": 1.05163968, "balance_loss_mlp": 1.02352667, "epoch": 0.43114384488200813, "flos": 13444094039040.0, "grad_norm": 1.7903297890514136, "language_loss": 0.75776696, "learning_rate": 2.536454077838021e-06, "loss": 0.77949417, "num_input_tokens_seen": 153875230, "step": 7171, "time_per_iteration": 2.612459897994995 }, { "auxiliary_loss_clip": 0.01120974, "auxiliary_loss_mlp": 0.01034567, "balance_loss_clip": 1.05036652, "balance_loss_mlp": 1.02106678, "epoch": 0.4312039681346761, "flos": 26286682924800.0, "grad_norm": 3.289099345654009, "language_loss": 0.77644551, "learning_rate": 2.5360788775281357e-06, "loss": 0.79800093, "num_input_tokens_seen": 153894740, "step": 7172, "time_per_iteration": 2.69909930229187 }, { "auxiliary_loss_clip": 0.01105721, "auxiliary_loss_mlp": 0.010481, "balance_loss_clip": 1.04574609, "balance_loss_mlp": 1.03119648, "epoch": 0.43126409138734406, "flos": 20376684627840.0, "grad_norm": 2.89880180493229, "language_loss": 0.76759243, "learning_rate": 2.535703656890086e-06, "loss": 0.78913063, "num_input_tokens_seen": 153913230, "step": 7173, "time_per_iteration": 2.6338369846343994 }, { "auxiliary_loss_clip": 0.01130423, "auxiliary_loss_mlp": 0.00772103, "balance_loss_clip": 1.04817533, "balance_loss_mlp": 1.00070202, "epoch": 0.431324214640012, "flos": 22123091882880.0, "grad_norm": 1.4474212501027515, "language_loss": 0.76933503, "learning_rate": 2.5353284159381e-06, "loss": 0.78836024, "num_input_tokens_seen": 153933250, "step": 7174, "time_per_iteration": 2.809385061264038 }, { "auxiliary_loss_clip": 0.01135393, "auxiliary_loss_mlp": 0.01035645, "balance_loss_clip": 1.0494926, "balance_loss_mlp": 1.02004063, "epoch": 0.43138433789268, "flos": 15231008856960.0, "grad_norm": 1.5868683627972313, "language_loss": 0.8226738, "learning_rate": 2.534953154686407e-06, "loss": 0.84438419, "num_input_tokens_seen": 153951325, "step": 7175, "time_per_iteration": 2.609368324279785 }, { "auxiliary_loss_clip": 0.01092364, "auxiliary_loss_mlp": 0.01052008, "balance_loss_clip": 1.0459013, "balance_loss_mlp": 1.03422189, "epoch": 0.43144446114534796, "flos": 18150294908160.0, "grad_norm": 2.243705003900615, "language_loss": 0.74261117, "learning_rate": 2.5345778731492366e-06, "loss": 0.76405489, "num_input_tokens_seen": 153966975, "step": 7176, "time_per_iteration": 2.680771827697754 }, { "auxiliary_loss_clip": 0.01122908, "auxiliary_loss_mlp": 0.01035728, "balance_loss_clip": 1.04637945, "balance_loss_mlp": 1.0215838, "epoch": 0.4315045843980159, "flos": 22929861306240.0, "grad_norm": 1.6403527990581428, "language_loss": 0.73309958, "learning_rate": 2.534202571340819e-06, "loss": 0.754686, "num_input_tokens_seen": 153986695, "step": 7177, "time_per_iteration": 2.760601758956909 }, { "auxiliary_loss_clip": 0.011222, "auxiliary_loss_mlp": 0.01043971, "balance_loss_clip": 1.05072641, "balance_loss_mlp": 1.02720976, "epoch": 0.4315647076506839, "flos": 22126862810880.0, "grad_norm": 1.7813773885441684, "language_loss": 0.81519645, "learning_rate": 2.533827249275387e-06, "loss": 0.83685815, "num_input_tokens_seen": 154004710, "step": 7178, "time_per_iteration": 2.6687469482421875 }, { "auxiliary_loss_clip": 0.01109607, "auxiliary_loss_mlp": 0.01033774, "balance_loss_clip": 1.04922378, "balance_loss_mlp": 1.02013087, "epoch": 0.43162483090335185, "flos": 26871129118080.0, "grad_norm": 32.445562208198496, "language_loss": 0.84143358, "learning_rate": 2.5334519069671725e-06, "loss": 0.86286741, "num_input_tokens_seen": 154024320, "step": 7179, "time_per_iteration": 2.696716547012329 }, { "auxiliary_loss_clip": 0.01108857, "auxiliary_loss_mlp": 0.010342, "balance_loss_clip": 1.04713559, "balance_loss_mlp": 1.0200026, "epoch": 0.4316849541560198, "flos": 13913122855680.0, "grad_norm": 1.7762155940538253, "language_loss": 0.75679082, "learning_rate": 2.5330765444304075e-06, "loss": 0.77822137, "num_input_tokens_seen": 154041755, "step": 7180, "time_per_iteration": 2.6832194328308105 }, { "auxiliary_loss_clip": 0.01104614, "auxiliary_loss_mlp": 0.00776174, "balance_loss_clip": 1.0417347, "balance_loss_mlp": 1.00057638, "epoch": 0.4317450774086878, "flos": 16435165420800.0, "grad_norm": 1.9971445999801452, "language_loss": 0.81773126, "learning_rate": 2.5327011616793274e-06, "loss": 0.83653915, "num_input_tokens_seen": 154056775, "step": 7181, "time_per_iteration": 2.6499931812286377 }, { "auxiliary_loss_clip": 0.01110303, "auxiliary_loss_mlp": 0.01040472, "balance_loss_clip": 1.04747176, "balance_loss_mlp": 1.02473664, "epoch": 0.4318052006613558, "flos": 20554980762240.0, "grad_norm": 1.7092925782952597, "language_loss": 0.89020073, "learning_rate": 2.532325758728165e-06, "loss": 0.91170847, "num_input_tokens_seen": 154075015, "step": 7182, "time_per_iteration": 2.6567654609680176 }, { "auxiliary_loss_clip": 0.01121856, "auxiliary_loss_mlp": 0.00772189, "balance_loss_clip": 1.05025744, "balance_loss_mlp": 1.00049865, "epoch": 0.43186532391402377, "flos": 22820046451200.0, "grad_norm": 1.602704996145881, "language_loss": 0.75739694, "learning_rate": 2.5319503355911566e-06, "loss": 0.77633733, "num_input_tokens_seen": 154095170, "step": 7183, "time_per_iteration": 2.6784613132476807 }, { "auxiliary_loss_clip": 0.01123979, "auxiliary_loss_mlp": 0.01032993, "balance_loss_clip": 1.05125499, "balance_loss_mlp": 1.01853919, "epoch": 0.43192544716669173, "flos": 25556583081600.0, "grad_norm": 1.538308227417617, "language_loss": 0.77589077, "learning_rate": 2.5315748922825393e-06, "loss": 0.7974605, "num_input_tokens_seen": 154116895, "step": 7184, "time_per_iteration": 2.6501550674438477 }, { "auxiliary_loss_clip": 0.01103086, "auxiliary_loss_mlp": 0.01037708, "balance_loss_clip": 1.04594743, "balance_loss_mlp": 1.02377832, "epoch": 0.4319855704193597, "flos": 30954674701440.0, "grad_norm": 1.7848849500644928, "language_loss": 0.73435313, "learning_rate": 2.5311994288165474e-06, "loss": 0.75576103, "num_input_tokens_seen": 154138395, "step": 7185, "time_per_iteration": 2.766298770904541 }, { "auxiliary_loss_clip": 0.01122479, "auxiliary_loss_mlp": 0.01042205, "balance_loss_clip": 1.05223203, "balance_loss_mlp": 1.02754247, "epoch": 0.43204569367202766, "flos": 24238732993920.0, "grad_norm": 3.4842964823639515, "language_loss": 0.75962853, "learning_rate": 2.530823945207421e-06, "loss": 0.78127533, "num_input_tokens_seen": 154156775, "step": 7186, "time_per_iteration": 4.334157705307007 }, { "auxiliary_loss_clip": 0.01099566, "auxiliary_loss_mlp": 0.0103912, "balance_loss_clip": 1.04762721, "balance_loss_mlp": 1.02477932, "epoch": 0.43210581692469563, "flos": 18406948561920.0, "grad_norm": 3.9729453010836218, "language_loss": 0.76471615, "learning_rate": 2.5304484414693962e-06, "loss": 0.78610301, "num_input_tokens_seen": 154177500, "step": 7187, "time_per_iteration": 5.956019401550293 }, { "auxiliary_loss_clip": 0.01025499, "auxiliary_loss_mlp": 0.01034135, "balance_loss_clip": 1.03011787, "balance_loss_mlp": 1.03272867, "epoch": 0.4321659401773636, "flos": 49832378910720.0, "grad_norm": 0.8609493763660439, "language_loss": 0.68115592, "learning_rate": 2.530072917616714e-06, "loss": 0.70175231, "num_input_tokens_seen": 154237110, "step": 7188, "time_per_iteration": 3.246208667755127 }, { "auxiliary_loss_clip": 0.01100014, "auxiliary_loss_mlp": 0.01038065, "balance_loss_clip": 1.0437665, "balance_loss_mlp": 1.02437973, "epoch": 0.43222606343003156, "flos": 17128564542720.0, "grad_norm": 1.9766532511253156, "language_loss": 0.77875316, "learning_rate": 2.529697373663614e-06, "loss": 0.80013394, "num_input_tokens_seen": 154253910, "step": 7189, "time_per_iteration": 2.681076765060425 }, { "auxiliary_loss_clip": 0.01083825, "auxiliary_loss_mlp": 0.01046889, "balance_loss_clip": 1.04553795, "balance_loss_mlp": 1.0314517, "epoch": 0.4322861866826995, "flos": 22749949059840.0, "grad_norm": 1.8062049350419371, "language_loss": 0.71379328, "learning_rate": 2.5293218096243364e-06, "loss": 0.73510039, "num_input_tokens_seen": 154274770, "step": 7190, "time_per_iteration": 2.785278081893921 }, { "auxiliary_loss_clip": 0.01109749, "auxiliary_loss_mlp": 0.01039244, "balance_loss_clip": 1.04681444, "balance_loss_mlp": 1.02500999, "epoch": 0.4323463099353675, "flos": 27891925729920.0, "grad_norm": 1.4390067860166444, "language_loss": 0.79639554, "learning_rate": 2.5289462255131223e-06, "loss": 0.81788546, "num_input_tokens_seen": 154295035, "step": 7191, "time_per_iteration": 4.571990728378296 }, { "auxiliary_loss_clip": 0.0108611, "auxiliary_loss_mlp": 0.01033322, "balance_loss_clip": 1.04733062, "balance_loss_mlp": 1.01954126, "epoch": 0.43240643318803546, "flos": 21614740652160.0, "grad_norm": 1.5570148329267672, "language_loss": 0.74904197, "learning_rate": 2.5285706213442146e-06, "loss": 0.77023631, "num_input_tokens_seen": 154314905, "step": 7192, "time_per_iteration": 2.7427282333374023 }, { "auxiliary_loss_clip": 0.01090847, "auxiliary_loss_mlp": 0.01047049, "balance_loss_clip": 1.04693365, "balance_loss_mlp": 1.03140879, "epoch": 0.4324665564407034, "flos": 17558378686080.0, "grad_norm": 2.028484656266998, "language_loss": 0.7934891, "learning_rate": 2.5281949971318557e-06, "loss": 0.81486803, "num_input_tokens_seen": 154331740, "step": 7193, "time_per_iteration": 2.708481550216675 }, { "auxiliary_loss_clip": 0.01114828, "auxiliary_loss_mlp": 0.0104506, "balance_loss_clip": 1.04726183, "balance_loss_mlp": 1.02971745, "epoch": 0.4325266796933714, "flos": 18402423448320.0, "grad_norm": 1.769737496980083, "language_loss": 0.75720823, "learning_rate": 2.5278193528902897e-06, "loss": 0.77880704, "num_input_tokens_seen": 154348740, "step": 7194, "time_per_iteration": 2.685701608657837 }, { "auxiliary_loss_clip": 0.01135356, "auxiliary_loss_mlp": 0.01041388, "balance_loss_clip": 1.05137146, "balance_loss_mlp": 1.02693963, "epoch": 0.4325868029460394, "flos": 22564793427840.0, "grad_norm": 3.855960133728433, "language_loss": 0.59479225, "learning_rate": 2.5274436886337613e-06, "loss": 0.61655968, "num_input_tokens_seen": 154368835, "step": 7195, "time_per_iteration": 2.634310483932495 }, { "auxiliary_loss_clip": 0.01112701, "auxiliary_loss_mlp": 0.01040238, "balance_loss_clip": 1.04618812, "balance_loss_mlp": 1.02434754, "epoch": 0.43264692619870737, "flos": 14605516396800.0, "grad_norm": 2.711649843090413, "language_loss": 0.65653574, "learning_rate": 2.527068004376515e-06, "loss": 0.67806506, "num_input_tokens_seen": 154384620, "step": 7196, "time_per_iteration": 2.608530044555664 }, { "auxiliary_loss_clip": 0.01141945, "auxiliary_loss_mlp": 0.01038972, "balance_loss_clip": 1.05338526, "balance_loss_mlp": 1.02316523, "epoch": 0.43270704945137534, "flos": 21501657659520.0, "grad_norm": 1.8654403969935065, "language_loss": 0.72525519, "learning_rate": 2.526692300132797e-06, "loss": 0.74706435, "num_input_tokens_seen": 154402865, "step": 7197, "time_per_iteration": 2.644087791442871 }, { "auxiliary_loss_clip": 0.01124491, "auxiliary_loss_mlp": 0.01040937, "balance_loss_clip": 1.05245936, "balance_loss_mlp": 1.02619135, "epoch": 0.4327671727040433, "flos": 25155891889920.0, "grad_norm": 1.511486884186769, "language_loss": 0.73146015, "learning_rate": 2.5263165759168547e-06, "loss": 0.75311446, "num_input_tokens_seen": 154423625, "step": 7198, "time_per_iteration": 2.7317864894866943 }, { "auxiliary_loss_clip": 0.0109556, "auxiliary_loss_mlp": 0.01034886, "balance_loss_clip": 1.04451466, "balance_loss_mlp": 1.02034283, "epoch": 0.43282729595671127, "flos": 25447163276160.0, "grad_norm": 1.539323937310933, "language_loss": 0.80887341, "learning_rate": 2.525940831742934e-06, "loss": 0.8301779, "num_input_tokens_seen": 154444775, "step": 7199, "time_per_iteration": 2.736016035079956 }, { "auxiliary_loss_clip": 0.01121231, "auxiliary_loss_mlp": 0.01034417, "balance_loss_clip": 1.05255413, "balance_loss_mlp": 1.0201118, "epoch": 0.43288741920937923, "flos": 24126116878080.0, "grad_norm": 2.6908376787400186, "language_loss": 0.68332666, "learning_rate": 2.525565067625286e-06, "loss": 0.70488322, "num_input_tokens_seen": 154460815, "step": 7200, "time_per_iteration": 2.688460350036621 }, { "auxiliary_loss_clip": 0.01114262, "auxiliary_loss_mlp": 0.00772856, "balance_loss_clip": 1.05025625, "balance_loss_mlp": 1.00067294, "epoch": 0.4329475424620472, "flos": 19204955066880.0, "grad_norm": 1.9560728888597885, "language_loss": 0.87379515, "learning_rate": 2.525189283578157e-06, "loss": 0.89266634, "num_input_tokens_seen": 154479145, "step": 7201, "time_per_iteration": 2.7547309398651123 }, { "auxiliary_loss_clip": 0.01086041, "auxiliary_loss_mlp": 0.01040787, "balance_loss_clip": 1.04952443, "balance_loss_mlp": 1.02395487, "epoch": 0.43300766571471516, "flos": 22638374438400.0, "grad_norm": 2.3345355752276706, "language_loss": 0.64547086, "learning_rate": 2.5248134796157974e-06, "loss": 0.66673917, "num_input_tokens_seen": 154498905, "step": 7202, "time_per_iteration": 2.878486156463623 }, { "auxiliary_loss_clip": 0.01082437, "auxiliary_loss_mlp": 0.01030304, "balance_loss_clip": 1.04730773, "balance_loss_mlp": 1.01676202, "epoch": 0.4330677889673831, "flos": 22121080721280.0, "grad_norm": 2.291722240352509, "language_loss": 0.81795621, "learning_rate": 2.5244376557524586e-06, "loss": 0.83908355, "num_input_tokens_seen": 154517270, "step": 7203, "time_per_iteration": 2.7338409423828125 }, { "auxiliary_loss_clip": 0.01102737, "auxiliary_loss_mlp": 0.01051208, "balance_loss_clip": 1.04656279, "balance_loss_mlp": 1.0357945, "epoch": 0.4331279122200511, "flos": 23221527742080.0, "grad_norm": 1.8864588919547398, "language_loss": 0.81453216, "learning_rate": 2.5240618120023912e-06, "loss": 0.83607161, "num_input_tokens_seen": 154535945, "step": 7204, "time_per_iteration": 2.7719802856445312 }, { "auxiliary_loss_clip": 0.01111895, "auxiliary_loss_mlp": 0.01038561, "balance_loss_clip": 1.04900229, "balance_loss_mlp": 1.02450609, "epoch": 0.43318803547271906, "flos": 18259750627200.0, "grad_norm": 2.1348551022614077, "language_loss": 0.73979616, "learning_rate": 2.5236859483798468e-06, "loss": 0.76130074, "num_input_tokens_seen": 154554935, "step": 7205, "time_per_iteration": 2.73463773727417 }, { "auxiliary_loss_clip": 0.01139834, "auxiliary_loss_mlp": 0.00772219, "balance_loss_clip": 1.05782342, "balance_loss_mlp": 1.00075722, "epoch": 0.433248158725387, "flos": 27418407713280.0, "grad_norm": 1.7497294767683989, "language_loss": 0.75183374, "learning_rate": 2.5233100648990803e-06, "loss": 0.77095425, "num_input_tokens_seen": 154576065, "step": 7206, "time_per_iteration": 2.712897300720215 }, { "auxiliary_loss_clip": 0.01082016, "auxiliary_loss_mlp": 0.01036402, "balance_loss_clip": 1.04904056, "balance_loss_mlp": 1.02218044, "epoch": 0.433308281978055, "flos": 23218008209280.0, "grad_norm": 5.825458886470942, "language_loss": 0.79041201, "learning_rate": 2.522934161574342e-06, "loss": 0.81159621, "num_input_tokens_seen": 154595110, "step": 7207, "time_per_iteration": 2.7708940505981445 }, { "auxiliary_loss_clip": 0.01104721, "auxiliary_loss_mlp": 0.01039597, "balance_loss_clip": 1.04836667, "balance_loss_mlp": 1.02374804, "epoch": 0.433368405230723, "flos": 15852407166720.0, "grad_norm": 1.8464623058117935, "language_loss": 0.81316662, "learning_rate": 2.5225582384198888e-06, "loss": 0.83460987, "num_input_tokens_seen": 154612255, "step": 7208, "time_per_iteration": 2.869554281234741 }, { "auxiliary_loss_clip": 0.01114033, "auxiliary_loss_mlp": 0.01033629, "balance_loss_clip": 1.04989004, "balance_loss_mlp": 1.01924682, "epoch": 0.433428528483391, "flos": 19026084314880.0, "grad_norm": 2.1101386955173154, "language_loss": 0.70337081, "learning_rate": 2.5221822954499744e-06, "loss": 0.72484744, "num_input_tokens_seen": 154630440, "step": 7209, "time_per_iteration": 2.692166805267334 }, { "auxiliary_loss_clip": 0.01122508, "auxiliary_loss_mlp": 0.0103772, "balance_loss_clip": 1.04924512, "balance_loss_mlp": 1.02234209, "epoch": 0.43348865173605894, "flos": 24718248581760.0, "grad_norm": 1.435580666015418, "language_loss": 0.81432891, "learning_rate": 2.5218063326788557e-06, "loss": 0.83593118, "num_input_tokens_seen": 154652515, "step": 7210, "time_per_iteration": 2.7368991374969482 }, { "auxiliary_loss_clip": 0.01111056, "auxiliary_loss_mlp": 0.01040693, "balance_loss_clip": 1.05043674, "balance_loss_mlp": 1.02690065, "epoch": 0.4335487749887269, "flos": 22090664880000.0, "grad_norm": 2.4268266327689005, "language_loss": 0.82382917, "learning_rate": 2.5214303501207885e-06, "loss": 0.84534657, "num_input_tokens_seen": 154670965, "step": 7211, "time_per_iteration": 2.6840522289276123 }, { "auxiliary_loss_clip": 0.01124683, "auxiliary_loss_mlp": 0.01036766, "balance_loss_clip": 1.04992187, "balance_loss_mlp": 1.02354002, "epoch": 0.43360889824139487, "flos": 22382941847040.0, "grad_norm": 1.7229238689988244, "language_loss": 0.74880648, "learning_rate": 2.521054347790029e-06, "loss": 0.77042103, "num_input_tokens_seen": 154689980, "step": 7212, "time_per_iteration": 2.6535651683807373 }, { "auxiliary_loss_clip": 0.01111992, "auxiliary_loss_mlp": 0.01035698, "balance_loss_clip": 1.05274439, "balance_loss_mlp": 1.0224421, "epoch": 0.43366902149406283, "flos": 17528286067200.0, "grad_norm": 1.7659929391516203, "language_loss": 0.76887298, "learning_rate": 2.5206783257008375e-06, "loss": 0.7903499, "num_input_tokens_seen": 154706570, "step": 7213, "time_per_iteration": 2.7639784812927246 }, { "auxiliary_loss_clip": 0.01127555, "auxiliary_loss_mlp": 0.01037213, "balance_loss_clip": 1.05343771, "balance_loss_mlp": 1.0235039, "epoch": 0.4337291447467308, "flos": 19022672522880.0, "grad_norm": 2.352447655586991, "language_loss": 0.64672804, "learning_rate": 2.520302283867471e-06, "loss": 0.66837579, "num_input_tokens_seen": 154725210, "step": 7214, "time_per_iteration": 2.6546545028686523 }, { "auxiliary_loss_clip": 0.01107197, "auxiliary_loss_mlp": 0.01037553, "balance_loss_clip": 1.04624152, "balance_loss_mlp": 1.02401102, "epoch": 0.43378926799939876, "flos": 27234042180480.0, "grad_norm": 1.8015946289097802, "language_loss": 0.71728516, "learning_rate": 2.519926222304191e-06, "loss": 0.73873264, "num_input_tokens_seen": 154745945, "step": 7215, "time_per_iteration": 2.7694337368011475 }, { "auxiliary_loss_clip": 0.01105367, "auxiliary_loss_mlp": 0.01038295, "balance_loss_clip": 1.04855013, "balance_loss_mlp": 1.02280354, "epoch": 0.43384939125206673, "flos": 15961108700160.0, "grad_norm": 2.003102925000143, "language_loss": 0.75037885, "learning_rate": 2.519550141025255e-06, "loss": 0.77181542, "num_input_tokens_seen": 154763580, "step": 7216, "time_per_iteration": 2.725843667984009 }, { "auxiliary_loss_clip": 0.01116821, "auxiliary_loss_mlp": 0.01045067, "balance_loss_clip": 1.05096495, "balance_loss_mlp": 1.02885413, "epoch": 0.4339095145047347, "flos": 21793216354560.0, "grad_norm": 2.430460894289381, "language_loss": 0.75723612, "learning_rate": 2.519174040044927e-06, "loss": 0.77885503, "num_input_tokens_seen": 154776825, "step": 7217, "time_per_iteration": 2.7089385986328125 }, { "auxiliary_loss_clip": 0.01100856, "auxiliary_loss_mlp": 0.01039272, "balance_loss_clip": 1.04884839, "balance_loss_mlp": 1.02414465, "epoch": 0.43396963775740266, "flos": 14209853109120.0, "grad_norm": 1.9588734650761437, "language_loss": 0.74091554, "learning_rate": 2.5187979193774664e-06, "loss": 0.76231682, "num_input_tokens_seen": 154794025, "step": 7218, "time_per_iteration": 2.6733574867248535 }, { "auxiliary_loss_clip": 0.01108387, "auxiliary_loss_mlp": 0.01033005, "balance_loss_clip": 1.05125904, "balance_loss_mlp": 1.01892698, "epoch": 0.4340297610100706, "flos": 19719052473600.0, "grad_norm": 1.867471044964119, "language_loss": 0.69258481, "learning_rate": 2.5184217790371367e-06, "loss": 0.71399873, "num_input_tokens_seen": 154813105, "step": 7219, "time_per_iteration": 2.6384527683258057 }, { "auxiliary_loss_clip": 0.01103251, "auxiliary_loss_mlp": 0.01039305, "balance_loss_clip": 1.04848611, "balance_loss_mlp": 1.02513123, "epoch": 0.4340898842627386, "flos": 18953508885120.0, "grad_norm": 2.2592610231798274, "language_loss": 0.77296734, "learning_rate": 2.518045619038202e-06, "loss": 0.79439294, "num_input_tokens_seen": 154833525, "step": 7220, "time_per_iteration": 2.693434476852417 }, { "auxiliary_loss_clip": 0.01068716, "auxiliary_loss_mlp": 0.01037568, "balance_loss_clip": 1.04492617, "balance_loss_mlp": 1.02248216, "epoch": 0.4341500075154066, "flos": 22018304931840.0, "grad_norm": 2.0152794755447183, "language_loss": 0.6924417, "learning_rate": 2.5176694393949243e-06, "loss": 0.71350455, "num_input_tokens_seen": 154853090, "step": 7221, "time_per_iteration": 2.8318276405334473 }, { "auxiliary_loss_clip": 0.01126059, "auxiliary_loss_mlp": 0.01040304, "balance_loss_clip": 1.04850173, "balance_loss_mlp": 1.02628446, "epoch": 0.4342101307680746, "flos": 23582465556480.0, "grad_norm": 2.7538415889200554, "language_loss": 0.65288424, "learning_rate": 2.51729324012157e-06, "loss": 0.67454779, "num_input_tokens_seen": 154872055, "step": 7222, "time_per_iteration": 2.6848082542419434 }, { "auxiliary_loss_clip": 0.01095727, "auxiliary_loss_mlp": 0.0103288, "balance_loss_clip": 1.04434943, "balance_loss_mlp": 1.01868868, "epoch": 0.43427025402074254, "flos": 17967976450560.0, "grad_norm": 2.2547341093747884, "language_loss": 0.72800291, "learning_rate": 2.5169170212324053e-06, "loss": 0.74928898, "num_input_tokens_seen": 154886645, "step": 7223, "time_per_iteration": 2.6691431999206543 }, { "auxiliary_loss_clip": 0.0113251, "auxiliary_loss_mlp": 0.01035818, "balance_loss_clip": 1.04656434, "balance_loss_mlp": 1.02130401, "epoch": 0.4343303772734105, "flos": 26286395616000.0, "grad_norm": 1.8756720282844566, "language_loss": 0.93602765, "learning_rate": 2.516540782741694e-06, "loss": 0.95771086, "num_input_tokens_seen": 154906775, "step": 7224, "time_per_iteration": 2.667450189590454 }, { "auxiliary_loss_clip": 0.01092783, "auxiliary_loss_mlp": 0.01039248, "balance_loss_clip": 1.04234195, "balance_loss_mlp": 1.02426362, "epoch": 0.43439050052607847, "flos": 26833961520000.0, "grad_norm": 1.4167248746748424, "language_loss": 0.61521256, "learning_rate": 2.5161645246637056e-06, "loss": 0.63653284, "num_input_tokens_seen": 154926990, "step": 7225, "time_per_iteration": 4.334634304046631 }, { "auxiliary_loss_clip": 0.01107023, "auxiliary_loss_mlp": 0.00773069, "balance_loss_clip": 1.04763186, "balance_loss_mlp": 1.00081611, "epoch": 0.43445062377874644, "flos": 21397660807680.0, "grad_norm": 1.859930915167877, "language_loss": 0.77928364, "learning_rate": 2.5157882470127054e-06, "loss": 0.79808456, "num_input_tokens_seen": 154946210, "step": 7226, "time_per_iteration": 5.937607765197754 }, { "auxiliary_loss_clip": 0.01118617, "auxiliary_loss_mlp": 0.0103402, "balance_loss_clip": 1.047508, "balance_loss_mlp": 1.02045417, "epoch": 0.4345107470314144, "flos": 19901945548800.0, "grad_norm": 1.6822192052663985, "language_loss": 0.84638822, "learning_rate": 2.515411949802964e-06, "loss": 0.86791462, "num_input_tokens_seen": 154964995, "step": 7227, "time_per_iteration": 2.6521942615509033 }, { "auxiliary_loss_clip": 0.01117348, "auxiliary_loss_mlp": 0.0103842, "balance_loss_clip": 1.04574108, "balance_loss_mlp": 1.02328634, "epoch": 0.43457087028408237, "flos": 26432623883520.0, "grad_norm": 1.9493500401331498, "language_loss": 0.76725572, "learning_rate": 2.5150356330487498e-06, "loss": 0.78881335, "num_input_tokens_seen": 154984775, "step": 7228, "time_per_iteration": 2.6870598793029785 }, { "auxiliary_loss_clip": 0.01089608, "auxiliary_loss_mlp": 0.01040618, "balance_loss_clip": 1.04927957, "balance_loss_mlp": 1.02599132, "epoch": 0.43463099353675033, "flos": 31868816855040.0, "grad_norm": 1.513481048537933, "language_loss": 0.80442667, "learning_rate": 2.5146592967643324e-06, "loss": 0.82572889, "num_input_tokens_seen": 155008125, "step": 7229, "time_per_iteration": 2.9437830448150635 }, { "auxiliary_loss_clip": 0.01121336, "auxiliary_loss_mlp": 0.01045576, "balance_loss_clip": 1.04673219, "balance_loss_mlp": 1.03047252, "epoch": 0.4346911167894183, "flos": 24571266128640.0, "grad_norm": 2.5474712737755016, "language_loss": 0.81467843, "learning_rate": 2.5142829409639834e-06, "loss": 0.83634758, "num_input_tokens_seen": 155027885, "step": 7230, "time_per_iteration": 4.6465747356414795 }, { "auxiliary_loss_clip": 0.0111898, "auxiliary_loss_mlp": 0.01049467, "balance_loss_clip": 1.04806113, "balance_loss_mlp": 1.03399396, "epoch": 0.43475124004208626, "flos": 17090678672640.0, "grad_norm": 2.126712012780947, "language_loss": 0.76608211, "learning_rate": 2.513906565661973e-06, "loss": 0.78776658, "num_input_tokens_seen": 155043375, "step": 7231, "time_per_iteration": 2.668262243270874 }, { "auxiliary_loss_clip": 0.01085236, "auxiliary_loss_mlp": 0.010365, "balance_loss_clip": 1.04462624, "balance_loss_mlp": 1.02319062, "epoch": 0.4348113632947542, "flos": 26104615862400.0, "grad_norm": 1.4622957052763208, "language_loss": 0.6875934, "learning_rate": 2.513530170872575e-06, "loss": 0.70881081, "num_input_tokens_seen": 155062930, "step": 7232, "time_per_iteration": 2.7392327785491943 }, { "auxiliary_loss_clip": 0.01098662, "auxiliary_loss_mlp": 0.01036589, "balance_loss_clip": 1.04562938, "balance_loss_mlp": 1.02119923, "epoch": 0.4348714865474222, "flos": 34200496316160.0, "grad_norm": 1.6380302947056737, "language_loss": 0.72123957, "learning_rate": 2.5131537566100605e-06, "loss": 0.74259216, "num_input_tokens_seen": 155084980, "step": 7233, "time_per_iteration": 2.8322300910949707 }, { "auxiliary_loss_clip": 0.01073793, "auxiliary_loss_mlp": 0.01045709, "balance_loss_clip": 1.04429805, "balance_loss_mlp": 1.02930558, "epoch": 0.43493160980009016, "flos": 31537468869120.0, "grad_norm": 1.5095585359817736, "language_loss": 0.74440682, "learning_rate": 2.5127773228887053e-06, "loss": 0.76560181, "num_input_tokens_seen": 155107260, "step": 7234, "time_per_iteration": 2.9071762561798096 }, { "auxiliary_loss_clip": 0.011103, "auxiliary_loss_mlp": 0.01043772, "balance_loss_clip": 1.04619622, "balance_loss_mlp": 1.02835774, "epoch": 0.4349917330527582, "flos": 24061334699520.0, "grad_norm": 2.005736270415063, "language_loss": 0.59333825, "learning_rate": 2.512400869722782e-06, "loss": 0.61487895, "num_input_tokens_seen": 155126720, "step": 7235, "time_per_iteration": 2.6738569736480713 }, { "auxiliary_loss_clip": 0.01064764, "auxiliary_loss_mlp": 0.01055431, "balance_loss_clip": 1.03919065, "balance_loss_mlp": 1.03892064, "epoch": 0.43505185630542614, "flos": 30519329863680.0, "grad_norm": 1.6349929491691664, "language_loss": 0.77779961, "learning_rate": 2.512024397126566e-06, "loss": 0.79900157, "num_input_tokens_seen": 155148640, "step": 7236, "time_per_iteration": 2.8045287132263184 }, { "auxiliary_loss_clip": 0.01129354, "auxiliary_loss_mlp": 0.01036773, "balance_loss_clip": 1.04843307, "balance_loss_mlp": 1.02221155, "epoch": 0.4351119795580941, "flos": 15735158196480.0, "grad_norm": 1.6962419767837338, "language_loss": 0.81330889, "learning_rate": 2.5116479051143345e-06, "loss": 0.83497024, "num_input_tokens_seen": 155165870, "step": 7237, "time_per_iteration": 2.648671865463257 }, { "auxiliary_loss_clip": 0.01115513, "auxiliary_loss_mlp": 0.0103661, "balance_loss_clip": 1.04350662, "balance_loss_mlp": 1.02228153, "epoch": 0.4351721028107621, "flos": 18731760272640.0, "grad_norm": 3.1516026268664485, "language_loss": 0.62781835, "learning_rate": 2.5112713937003623e-06, "loss": 0.64933956, "num_input_tokens_seen": 155185315, "step": 7238, "time_per_iteration": 2.708812713623047 }, { "auxiliary_loss_clip": 0.01093861, "auxiliary_loss_mlp": 0.00771839, "balance_loss_clip": 1.04551601, "balance_loss_mlp": 1.00081944, "epoch": 0.43523222606343004, "flos": 25226887121280.0, "grad_norm": 1.9011673436513334, "language_loss": 0.85935599, "learning_rate": 2.510894862898928e-06, "loss": 0.87801301, "num_input_tokens_seen": 155205790, "step": 7239, "time_per_iteration": 2.7664706707000732 }, { "auxiliary_loss_clip": 0.01108836, "auxiliary_loss_mlp": 0.01032451, "balance_loss_clip": 1.04520702, "balance_loss_mlp": 1.01814556, "epoch": 0.435292349316098, "flos": 22709190101760.0, "grad_norm": 1.536559176560054, "language_loss": 0.7257551, "learning_rate": 2.510518312724309e-06, "loss": 0.747168, "num_input_tokens_seen": 155226475, "step": 7240, "time_per_iteration": 2.7275354862213135 }, { "auxiliary_loss_clip": 0.01096929, "auxiliary_loss_mlp": 0.01033031, "balance_loss_clip": 1.04623103, "balance_loss_mlp": 1.01821971, "epoch": 0.43535247256876597, "flos": 25775889569280.0, "grad_norm": 2.0741794573690613, "language_loss": 0.8174212, "learning_rate": 2.5101417431907842e-06, "loss": 0.83872074, "num_input_tokens_seen": 155247110, "step": 7241, "time_per_iteration": 2.7412314414978027 }, { "auxiliary_loss_clip": 0.01104486, "auxiliary_loss_mlp": 0.00773075, "balance_loss_clip": 1.04755354, "balance_loss_mlp": 1.000664, "epoch": 0.43541259582143393, "flos": 17528142412800.0, "grad_norm": 2.5029472103375627, "language_loss": 0.7954601, "learning_rate": 2.5097651543126345e-06, "loss": 0.81423575, "num_input_tokens_seen": 155261335, "step": 7242, "time_per_iteration": 2.7832155227661133 }, { "auxiliary_loss_clip": 0.01105652, "auxiliary_loss_mlp": 0.01038715, "balance_loss_clip": 1.04170573, "balance_loss_mlp": 1.0224551, "epoch": 0.4354727190741019, "flos": 15195205975680.0, "grad_norm": 5.632863009629144, "language_loss": 0.68174016, "learning_rate": 2.509388546104138e-06, "loss": 0.70318383, "num_input_tokens_seen": 155278510, "step": 7243, "time_per_iteration": 2.731621742248535 }, { "auxiliary_loss_clip": 0.01070337, "auxiliary_loss_mlp": 0.01035269, "balance_loss_clip": 1.04518962, "balance_loss_mlp": 1.02096963, "epoch": 0.43553284232676986, "flos": 16649264436480.0, "grad_norm": 1.737599591064028, "language_loss": 0.81023276, "learning_rate": 2.5090119185795766e-06, "loss": 0.83128881, "num_input_tokens_seen": 155296450, "step": 7244, "time_per_iteration": 2.869999885559082 }, { "auxiliary_loss_clip": 0.0107405, "auxiliary_loss_mlp": 0.01033039, "balance_loss_clip": 1.04502463, "balance_loss_mlp": 1.01974106, "epoch": 0.43559296557943783, "flos": 23400865370880.0, "grad_norm": 1.7613354011100055, "language_loss": 0.73543227, "learning_rate": 2.508635271753234e-06, "loss": 0.75650311, "num_input_tokens_seen": 155316080, "step": 7245, "time_per_iteration": 2.8238213062286377 }, { "auxiliary_loss_clip": 0.01073655, "auxiliary_loss_mlp": 0.01040109, "balance_loss_clip": 1.042413, "balance_loss_mlp": 1.02626252, "epoch": 0.4356530888321058, "flos": 22419067950720.0, "grad_norm": 1.8556670419976653, "language_loss": 0.76651436, "learning_rate": 2.508258605639389e-06, "loss": 0.78765202, "num_input_tokens_seen": 155336765, "step": 7246, "time_per_iteration": 2.74566912651062 }, { "auxiliary_loss_clip": 0.01117733, "auxiliary_loss_mlp": 0.01046964, "balance_loss_clip": 1.04482377, "balance_loss_mlp": 1.03185987, "epoch": 0.43571321208477376, "flos": 21616141282560.0, "grad_norm": 1.8292531725431629, "language_loss": 0.85409153, "learning_rate": 2.5078819202523275e-06, "loss": 0.8757385, "num_input_tokens_seen": 155356440, "step": 7247, "time_per_iteration": 2.6183457374572754 }, { "auxiliary_loss_clip": 0.01130523, "auxiliary_loss_mlp": 0.01039193, "balance_loss_clip": 1.047526, "balance_loss_mlp": 1.02565122, "epoch": 0.4357733353374418, "flos": 23987358639360.0, "grad_norm": 1.611147300467871, "language_loss": 0.72544634, "learning_rate": 2.507505215606333e-06, "loss": 0.74714351, "num_input_tokens_seen": 155377070, "step": 7248, "time_per_iteration": 2.614370822906494 }, { "auxiliary_loss_clip": 0.01120332, "auxiliary_loss_mlp": 0.01038636, "balance_loss_clip": 1.0502224, "balance_loss_mlp": 1.0246768, "epoch": 0.43583345859010975, "flos": 25264737077760.0, "grad_norm": 1.6765876969892934, "language_loss": 0.87089729, "learning_rate": 2.5071284917156893e-06, "loss": 0.89248699, "num_input_tokens_seen": 155398415, "step": 7249, "time_per_iteration": 2.6826605796813965 }, { "auxiliary_loss_clip": 0.01113045, "auxiliary_loss_mlp": 0.01045156, "balance_loss_clip": 1.04740214, "balance_loss_mlp": 1.03150034, "epoch": 0.4358935818427777, "flos": 23696302734720.0, "grad_norm": 2.0541786270405495, "language_loss": 0.81998801, "learning_rate": 2.506751748594683e-06, "loss": 0.84157008, "num_input_tokens_seen": 155415625, "step": 7250, "time_per_iteration": 2.6470022201538086 }, { "auxiliary_loss_clip": 0.01124271, "auxiliary_loss_mlp": 0.01035047, "balance_loss_clip": 1.05197597, "balance_loss_mlp": 1.02089727, "epoch": 0.4359537050954457, "flos": 29532827761920.0, "grad_norm": 1.9267289360456135, "language_loss": 0.84933323, "learning_rate": 2.5063749862575988e-06, "loss": 0.87092638, "num_input_tokens_seen": 155435505, "step": 7251, "time_per_iteration": 2.665776014328003 }, { "auxiliary_loss_clip": 0.01108984, "auxiliary_loss_mlp": 0.01043132, "balance_loss_clip": 1.04255629, "balance_loss_mlp": 1.02783751, "epoch": 0.43601382834811364, "flos": 22711273090560.0, "grad_norm": 2.7582881981862335, "language_loss": 0.69538188, "learning_rate": 2.5059982047187245e-06, "loss": 0.71690303, "num_input_tokens_seen": 155455425, "step": 7252, "time_per_iteration": 2.644498825073242 }, { "auxiliary_loss_clip": 0.01102038, "auxiliary_loss_mlp": 0.01039697, "balance_loss_clip": 1.04452658, "balance_loss_mlp": 1.02410412, "epoch": 0.4360739516007816, "flos": 19098731571840.0, "grad_norm": 2.1859211403409717, "language_loss": 0.83621645, "learning_rate": 2.505621403992348e-06, "loss": 0.85763383, "num_input_tokens_seen": 155474250, "step": 7253, "time_per_iteration": 2.662623882293701 }, { "auxiliary_loss_clip": 0.01119158, "auxiliary_loss_mlp": 0.01041761, "balance_loss_clip": 1.04809666, "balance_loss_mlp": 1.0271399, "epoch": 0.43613407485344957, "flos": 23404420817280.0, "grad_norm": 1.5459938146205512, "language_loss": 0.70561367, "learning_rate": 2.505244584092757e-06, "loss": 0.7272228, "num_input_tokens_seen": 155494685, "step": 7254, "time_per_iteration": 2.677427053451538 }, { "auxiliary_loss_clip": 0.01106538, "auxiliary_loss_mlp": 0.01041179, "balance_loss_clip": 1.04567051, "balance_loss_mlp": 1.02734506, "epoch": 0.43619419810611754, "flos": 22637799820800.0, "grad_norm": 1.8056505398017555, "language_loss": 0.812729, "learning_rate": 2.5048677450342406e-06, "loss": 0.83420616, "num_input_tokens_seen": 155513040, "step": 7255, "time_per_iteration": 2.7150163650512695 }, { "auxiliary_loss_clip": 0.01132135, "auxiliary_loss_mlp": 0.01040806, "balance_loss_clip": 1.04807031, "balance_loss_mlp": 1.02626252, "epoch": 0.4362543213587855, "flos": 20047958334720.0, "grad_norm": 1.9676871720710198, "language_loss": 0.7780782, "learning_rate": 2.504490886831089e-06, "loss": 0.79980761, "num_input_tokens_seen": 155530100, "step": 7256, "time_per_iteration": 2.551403522491455 }, { "auxiliary_loss_clip": 0.0112974, "auxiliary_loss_mlp": 0.01041375, "balance_loss_clip": 1.04864502, "balance_loss_mlp": 1.02721334, "epoch": 0.43631444461145347, "flos": 21361319222400.0, "grad_norm": 1.9475980639851616, "language_loss": 0.76180404, "learning_rate": 2.5041140094975922e-06, "loss": 0.78351521, "num_input_tokens_seen": 155549375, "step": 7257, "time_per_iteration": 2.6217384338378906 }, { "auxiliary_loss_clip": 0.01120044, "auxiliary_loss_mlp": 0.01042182, "balance_loss_clip": 1.04656029, "balance_loss_mlp": 1.02711391, "epoch": 0.43637456786412143, "flos": 22418529246720.0, "grad_norm": 1.6554456872207661, "language_loss": 0.73254454, "learning_rate": 2.5037371130480417e-06, "loss": 0.75416678, "num_input_tokens_seen": 155569395, "step": 7258, "time_per_iteration": 2.7399442195892334 }, { "auxiliary_loss_clip": 0.01107425, "auxiliary_loss_mlp": 0.01034778, "balance_loss_clip": 1.0456903, "balance_loss_mlp": 1.02084827, "epoch": 0.4364346911167894, "flos": 28548839612160.0, "grad_norm": 2.059273423297749, "language_loss": 0.76950562, "learning_rate": 2.5033601974967297e-06, "loss": 0.79092765, "num_input_tokens_seen": 155589090, "step": 7259, "time_per_iteration": 2.814030647277832 }, { "auxiliary_loss_clip": 0.01025258, "auxiliary_loss_mlp": 0.01002872, "balance_loss_clip": 1.02231717, "balance_loss_mlp": 1.0011797, "epoch": 0.43649481436945736, "flos": 62659345380480.0, "grad_norm": 0.7406116287283647, "language_loss": 0.56990582, "learning_rate": 2.5029832628579483e-06, "loss": 0.59018713, "num_input_tokens_seen": 155648660, "step": 7260, "time_per_iteration": 3.184105396270752 }, { "auxiliary_loss_clip": 0.01114574, "auxiliary_loss_mlp": 0.01046133, "balance_loss_clip": 1.04780877, "balance_loss_mlp": 1.03077888, "epoch": 0.4365549376221254, "flos": 30592120775040.0, "grad_norm": 2.4789338774629024, "language_loss": 0.71279275, "learning_rate": 2.5026063091459907e-06, "loss": 0.73439986, "num_input_tokens_seen": 155669945, "step": 7261, "time_per_iteration": 2.781569242477417 }, { "auxiliary_loss_clip": 0.01084597, "auxiliary_loss_mlp": 0.01054365, "balance_loss_clip": 1.04558206, "balance_loss_mlp": 1.0377475, "epoch": 0.43661506087479335, "flos": 17165875795200.0, "grad_norm": 1.8767730803011844, "language_loss": 0.69520628, "learning_rate": 2.5022293363751522e-06, "loss": 0.71659589, "num_input_tokens_seen": 155688555, "step": 7262, "time_per_iteration": 2.73209810256958 }, { "auxiliary_loss_clip": 0.0106364, "auxiliary_loss_mlp": 0.01034084, "balance_loss_clip": 1.04300487, "balance_loss_mlp": 1.02154875, "epoch": 0.4366751841274613, "flos": 22047499710720.0, "grad_norm": 1.5954483681391127, "language_loss": 0.79909682, "learning_rate": 2.501852344559726e-06, "loss": 0.82007402, "num_input_tokens_seen": 155705370, "step": 7263, "time_per_iteration": 2.7780513763427734 }, { "auxiliary_loss_clip": 0.01093795, "auxiliary_loss_mlp": 0.01046831, "balance_loss_clip": 1.0481534, "balance_loss_mlp": 1.03220403, "epoch": 0.4367353073801293, "flos": 15997306631040.0, "grad_norm": 1.6219151151282696, "language_loss": 0.7545082, "learning_rate": 2.50147533371401e-06, "loss": 0.77591443, "num_input_tokens_seen": 155721890, "step": 7264, "time_per_iteration": 4.158029079437256 }, { "auxiliary_loss_clip": 0.01079604, "auxiliary_loss_mlp": 0.01037561, "balance_loss_clip": 1.04681587, "balance_loss_mlp": 1.02243328, "epoch": 0.43679543063279724, "flos": 38217535868160.0, "grad_norm": 2.5655359697781854, "language_loss": 0.61799812, "learning_rate": 2.501098303852298e-06, "loss": 0.63916975, "num_input_tokens_seen": 155743970, "step": 7265, "time_per_iteration": 4.454209804534912 }, { "auxiliary_loss_clip": 0.01105521, "auxiliary_loss_mlp": 0.01031338, "balance_loss_clip": 1.04521823, "balance_loss_mlp": 1.01762891, "epoch": 0.4368555538854652, "flos": 15193230727680.0, "grad_norm": 2.0447032285328004, "language_loss": 0.72610664, "learning_rate": 2.5007212549888884e-06, "loss": 0.74747527, "num_input_tokens_seen": 155761830, "step": 7266, "time_per_iteration": 4.213090181350708 }, { "auxiliary_loss_clip": 0.0110385, "auxiliary_loss_mlp": 0.01037912, "balance_loss_clip": 1.04488015, "balance_loss_mlp": 1.02356541, "epoch": 0.4369156771381332, "flos": 23069086421760.0, "grad_norm": 1.8602157597317315, "language_loss": 0.82307518, "learning_rate": 2.5003441871380794e-06, "loss": 0.84449285, "num_input_tokens_seen": 155779610, "step": 7267, "time_per_iteration": 2.6675074100494385 }, { "auxiliary_loss_clip": 0.01126927, "auxiliary_loss_mlp": 0.01028504, "balance_loss_clip": 1.04546976, "balance_loss_mlp": 1.01499796, "epoch": 0.43697580039080114, "flos": 23441085624960.0, "grad_norm": 2.021840044875845, "language_loss": 0.74740797, "learning_rate": 2.4999671003141674e-06, "loss": 0.76896226, "num_input_tokens_seen": 155798765, "step": 7268, "time_per_iteration": 2.6228766441345215 }, { "auxiliary_loss_clip": 0.01135364, "auxiliary_loss_mlp": 0.01041324, "balance_loss_clip": 1.04851401, "balance_loss_mlp": 1.02567148, "epoch": 0.4370359236434691, "flos": 18514680428160.0, "grad_norm": 2.5093195722714365, "language_loss": 0.80133688, "learning_rate": 2.499589994531454e-06, "loss": 0.82310379, "num_input_tokens_seen": 155817750, "step": 7269, "time_per_iteration": 4.289510726928711 }, { "auxiliary_loss_clip": 0.01110775, "auxiliary_loss_mlp": 0.01036898, "balance_loss_clip": 1.04814553, "balance_loss_mlp": 1.02253354, "epoch": 0.43709604689613707, "flos": 23222497409280.0, "grad_norm": 1.7772509501505356, "language_loss": 0.74977714, "learning_rate": 2.499212869804237e-06, "loss": 0.77125382, "num_input_tokens_seen": 155836490, "step": 7270, "time_per_iteration": 2.7519397735595703 }, { "auxiliary_loss_clip": 0.01068873, "auxiliary_loss_mlp": 0.01045139, "balance_loss_clip": 1.04005837, "balance_loss_mlp": 1.02886677, "epoch": 0.43715617014880503, "flos": 23803711378560.0, "grad_norm": 1.9652522706029574, "language_loss": 0.79716229, "learning_rate": 2.4988357261468182e-06, "loss": 0.81830239, "num_input_tokens_seen": 155856225, "step": 7271, "time_per_iteration": 2.8002872467041016 }, { "auxiliary_loss_clip": 0.01036128, "auxiliary_loss_mlp": 0.01021454, "balance_loss_clip": 1.01824927, "balance_loss_mlp": 1.01974964, "epoch": 0.437216293401473, "flos": 61941204766080.0, "grad_norm": 0.7022630698763936, "language_loss": 0.54855651, "learning_rate": 2.4984585635734993e-06, "loss": 0.56913233, "num_input_tokens_seen": 155916770, "step": 7272, "time_per_iteration": 3.1893959045410156 }, { "auxiliary_loss_clip": 0.0113475, "auxiliary_loss_mlp": 0.01041916, "balance_loss_clip": 1.0497241, "balance_loss_mlp": 1.02704489, "epoch": 0.43727641665414096, "flos": 21982250655360.0, "grad_norm": 1.6582852351426143, "language_loss": 0.69981074, "learning_rate": 2.498081382098581e-06, "loss": 0.72157741, "num_input_tokens_seen": 155936490, "step": 7273, "time_per_iteration": 2.622006893157959 }, { "auxiliary_loss_clip": 0.01109468, "auxiliary_loss_mlp": 0.01050566, "balance_loss_clip": 1.04725552, "balance_loss_mlp": 1.03434145, "epoch": 0.437336539906809, "flos": 39530860842240.0, "grad_norm": 7.356047522605187, "language_loss": 0.75699592, "learning_rate": 2.497704181736367e-06, "loss": 0.77859622, "num_input_tokens_seen": 155957595, "step": 7274, "time_per_iteration": 2.850834846496582 }, { "auxiliary_loss_clip": 0.0111429, "auxiliary_loss_mlp": 0.0102741, "balance_loss_clip": 1.04778564, "balance_loss_mlp": 1.01473844, "epoch": 0.43739666315947695, "flos": 17457147181440.0, "grad_norm": 1.6567651402589496, "language_loss": 0.80280751, "learning_rate": 2.49732696250116e-06, "loss": 0.82422453, "num_input_tokens_seen": 155975710, "step": 7275, "time_per_iteration": 2.638493776321411 }, { "auxiliary_loss_clip": 0.01107442, "auxiliary_loss_mlp": 0.01039938, "balance_loss_clip": 1.04763556, "balance_loss_mlp": 1.02628231, "epoch": 0.4374567864121449, "flos": 16358747235840.0, "grad_norm": 1.960961081760492, "language_loss": 0.81285107, "learning_rate": 2.496949724407266e-06, "loss": 0.83432496, "num_input_tokens_seen": 155993090, "step": 7276, "time_per_iteration": 2.665069341659546 }, { "auxiliary_loss_clip": 0.01119385, "auxiliary_loss_mlp": 0.01033929, "balance_loss_clip": 1.05310118, "balance_loss_mlp": 1.01923609, "epoch": 0.4375169096648129, "flos": 30587523834240.0, "grad_norm": 1.9041346547019917, "language_loss": 0.7327143, "learning_rate": 2.496572467468988e-06, "loss": 0.75424743, "num_input_tokens_seen": 156013685, "step": 7277, "time_per_iteration": 2.7329320907592773 }, { "auxiliary_loss_clip": 0.01109724, "auxiliary_loss_mlp": 0.0077177, "balance_loss_clip": 1.04805493, "balance_loss_mlp": 1.00070667, "epoch": 0.43757703291748085, "flos": 30555599621760.0, "grad_norm": 1.7992627956176412, "language_loss": 0.73366892, "learning_rate": 2.4961951917006317e-06, "loss": 0.7524839, "num_input_tokens_seen": 156034300, "step": 7278, "time_per_iteration": 2.7531094551086426 }, { "auxiliary_loss_clip": 0.01094743, "auxiliary_loss_mlp": 0.0103965, "balance_loss_clip": 1.0471983, "balance_loss_mlp": 1.02677512, "epoch": 0.4376371561701488, "flos": 21397373498880.0, "grad_norm": 1.4932293615412522, "language_loss": 0.66024888, "learning_rate": 2.4958178971165046e-06, "loss": 0.68159282, "num_input_tokens_seen": 156053805, "step": 7279, "time_per_iteration": 2.671842336654663 }, { "auxiliary_loss_clip": 0.01139939, "auxiliary_loss_mlp": 0.01037875, "balance_loss_clip": 1.05298817, "balance_loss_mlp": 1.02337885, "epoch": 0.4376972794228168, "flos": 23404384903680.0, "grad_norm": 1.7693107777348598, "language_loss": 0.81793606, "learning_rate": 2.4954405837309126e-06, "loss": 0.83971423, "num_input_tokens_seen": 156073295, "step": 7280, "time_per_iteration": 2.588303565979004 }, { "auxiliary_loss_clip": 0.01106326, "auxiliary_loss_mlp": 0.01031835, "balance_loss_clip": 1.04587424, "balance_loss_mlp": 1.01867414, "epoch": 0.43775740267548474, "flos": 22892945103360.0, "grad_norm": 1.5627499875085749, "language_loss": 0.77005875, "learning_rate": 2.4950632515581653e-06, "loss": 0.79144037, "num_input_tokens_seen": 156094540, "step": 7281, "time_per_iteration": 2.6939706802368164 }, { "auxiliary_loss_clip": 0.011079, "auxiliary_loss_mlp": 0.01037268, "balance_loss_clip": 1.04824066, "balance_loss_mlp": 1.02360058, "epoch": 0.4378175259281527, "flos": 23294390480640.0, "grad_norm": 1.8010941727109018, "language_loss": 0.75983417, "learning_rate": 2.494685900612569e-06, "loss": 0.78128588, "num_input_tokens_seen": 156114070, "step": 7282, "time_per_iteration": 2.6834237575531006 }, { "auxiliary_loss_clip": 0.01092611, "auxiliary_loss_mlp": 0.01040673, "balance_loss_clip": 1.04627228, "balance_loss_mlp": 1.02654076, "epoch": 0.43787764918082067, "flos": 23876897339520.0, "grad_norm": 2.1500126437968925, "language_loss": 0.85044593, "learning_rate": 2.4943085309084333e-06, "loss": 0.87177879, "num_input_tokens_seen": 156132130, "step": 7283, "time_per_iteration": 2.7042722702026367 }, { "auxiliary_loss_clip": 0.01111303, "auxiliary_loss_mlp": 0.01037633, "balance_loss_clip": 1.04814124, "balance_loss_mlp": 1.02266598, "epoch": 0.43793777243348864, "flos": 23988148738560.0, "grad_norm": 14.144168664775597, "language_loss": 0.80311596, "learning_rate": 2.49393114246007e-06, "loss": 0.82460535, "num_input_tokens_seen": 156150820, "step": 7284, "time_per_iteration": 2.676689863204956 }, { "auxiliary_loss_clip": 0.01123026, "auxiliary_loss_mlp": 0.01038411, "balance_loss_clip": 1.04910016, "balance_loss_mlp": 1.02514315, "epoch": 0.4379978956861566, "flos": 18624064320000.0, "grad_norm": 2.0075840095153925, "language_loss": 0.80086255, "learning_rate": 2.493553735281787e-06, "loss": 0.82247692, "num_input_tokens_seen": 156170125, "step": 7285, "time_per_iteration": 2.6446423530578613 }, { "auxiliary_loss_clip": 0.01121831, "auxiliary_loss_mlp": 0.01030999, "balance_loss_clip": 1.04847312, "balance_loss_mlp": 1.0175761, "epoch": 0.43805801893882457, "flos": 21981388728960.0, "grad_norm": 2.1352627983894545, "language_loss": 0.7498579, "learning_rate": 2.493176309387897e-06, "loss": 0.77138615, "num_input_tokens_seen": 156187320, "step": 7286, "time_per_iteration": 2.6779184341430664 }, { "auxiliary_loss_clip": 0.01095439, "auxiliary_loss_mlp": 0.01032312, "balance_loss_clip": 1.04372525, "balance_loss_mlp": 1.0179832, "epoch": 0.43811814219149253, "flos": 26393337383040.0, "grad_norm": 1.5473009908217328, "language_loss": 0.73641115, "learning_rate": 2.492798864792712e-06, "loss": 0.75768864, "num_input_tokens_seen": 156207455, "step": 7287, "time_per_iteration": 2.867501735687256 }, { "auxiliary_loss_clip": 0.0111224, "auxiliary_loss_mlp": 0.01045008, "balance_loss_clip": 1.05047917, "balance_loss_mlp": 1.03040457, "epoch": 0.43817826544416055, "flos": 17493309198720.0, "grad_norm": 1.6804566494971647, "language_loss": 0.8243767, "learning_rate": 2.492421401510545e-06, "loss": 0.84594917, "num_input_tokens_seen": 156226560, "step": 7288, "time_per_iteration": 2.677922010421753 }, { "auxiliary_loss_clip": 0.01094679, "auxiliary_loss_mlp": 0.01031914, "balance_loss_clip": 1.04326773, "balance_loss_mlp": 1.01793718, "epoch": 0.4382383886968285, "flos": 21581020759680.0, "grad_norm": 1.441403002582157, "language_loss": 0.84301102, "learning_rate": 2.4920439195557093e-06, "loss": 0.86427689, "num_input_tokens_seen": 156246740, "step": 7289, "time_per_iteration": 2.8586435317993164 }, { "auxiliary_loss_clip": 0.0109844, "auxiliary_loss_mlp": 0.01052991, "balance_loss_clip": 1.04162121, "balance_loss_mlp": 1.03685021, "epoch": 0.4382985119494965, "flos": 27923742201600.0, "grad_norm": 1.6202567248687665, "language_loss": 0.78218126, "learning_rate": 2.4916664189425183e-06, "loss": 0.80369556, "num_input_tokens_seen": 156266440, "step": 7290, "time_per_iteration": 2.7211575508117676 }, { "auxiliary_loss_clip": 0.01132305, "auxiliary_loss_mlp": 0.01039679, "balance_loss_clip": 1.05053866, "balance_loss_mlp": 1.02617884, "epoch": 0.43835863520216445, "flos": 24936836797440.0, "grad_norm": 1.8734686520238957, "language_loss": 0.78314757, "learning_rate": 2.491288899685288e-06, "loss": 0.80486739, "num_input_tokens_seen": 156286900, "step": 7291, "time_per_iteration": 2.629904270172119 }, { "auxiliary_loss_clip": 0.0109159, "auxiliary_loss_mlp": 0.01033172, "balance_loss_clip": 1.04265332, "balance_loss_mlp": 1.0194335, "epoch": 0.4384187584548324, "flos": 33510293504640.0, "grad_norm": 1.5839432646062752, "language_loss": 0.6487931, "learning_rate": 2.4909113617983325e-06, "loss": 0.67004073, "num_input_tokens_seen": 156307690, "step": 7292, "time_per_iteration": 2.7952499389648438 }, { "auxiliary_loss_clip": 0.01112801, "auxiliary_loss_mlp": 0.01036982, "balance_loss_clip": 1.04319155, "balance_loss_mlp": 1.0226171, "epoch": 0.4384788817075004, "flos": 23951052967680.0, "grad_norm": 1.6336411060838572, "language_loss": 0.74232095, "learning_rate": 2.49053380529597e-06, "loss": 0.7638188, "num_input_tokens_seen": 156326620, "step": 7293, "time_per_iteration": 2.636462688446045 }, { "auxiliary_loss_clip": 0.01098755, "auxiliary_loss_mlp": 0.01037795, "balance_loss_clip": 1.0494585, "balance_loss_mlp": 1.02318609, "epoch": 0.43853900496016834, "flos": 19098516090240.0, "grad_norm": 4.136423906080754, "language_loss": 0.78758669, "learning_rate": 2.490156230192516e-06, "loss": 0.80895221, "num_input_tokens_seen": 156345495, "step": 7294, "time_per_iteration": 2.670069456100464 }, { "auxiliary_loss_clip": 0.01089917, "auxiliary_loss_mlp": 0.01038854, "balance_loss_clip": 1.04422832, "balance_loss_mlp": 1.02485299, "epoch": 0.4385991282128363, "flos": 13225362168960.0, "grad_norm": 1.7954692393859477, "language_loss": 0.7296086, "learning_rate": 2.4897786365022883e-06, "loss": 0.75089628, "num_input_tokens_seen": 156363155, "step": 7295, "time_per_iteration": 2.7159199714660645 }, { "auxiliary_loss_clip": 0.01090098, "auxiliary_loss_mlp": 0.01044926, "balance_loss_clip": 1.04397202, "balance_loss_mlp": 1.02860653, "epoch": 0.4386592514655043, "flos": 14319883445760.0, "grad_norm": 1.6136170201094728, "language_loss": 0.75463378, "learning_rate": 2.4894010242396063e-06, "loss": 0.77598405, "num_input_tokens_seen": 156380940, "step": 7296, "time_per_iteration": 2.7475438117980957 }, { "auxiliary_loss_clip": 0.01118725, "auxiliary_loss_mlp": 0.01032483, "balance_loss_clip": 1.04859519, "balance_loss_mlp": 1.0183568, "epoch": 0.43871937471817224, "flos": 22784423137920.0, "grad_norm": 1.7142829102326689, "language_loss": 0.69474953, "learning_rate": 2.4890233934187873e-06, "loss": 0.71626163, "num_input_tokens_seen": 156400415, "step": 7297, "time_per_iteration": 2.6689095497131348 }, { "auxiliary_loss_clip": 0.01111936, "auxiliary_loss_mlp": 0.01033453, "balance_loss_clip": 1.04589987, "balance_loss_mlp": 1.02004242, "epoch": 0.4387794979708402, "flos": 28072304853120.0, "grad_norm": 2.137486700340973, "language_loss": 0.70327055, "learning_rate": 2.4886457440541535e-06, "loss": 0.72472441, "num_input_tokens_seen": 156421120, "step": 7298, "time_per_iteration": 2.7896294593811035 }, { "auxiliary_loss_clip": 0.01117974, "auxiliary_loss_mlp": 0.0102859, "balance_loss_clip": 1.0481534, "balance_loss_mlp": 1.01508379, "epoch": 0.43883962122350817, "flos": 26249551240320.0, "grad_norm": 1.5518132007083414, "language_loss": 0.72407347, "learning_rate": 2.4882680761600238e-06, "loss": 0.74553907, "num_input_tokens_seen": 156441535, "step": 7299, "time_per_iteration": 2.724134922027588 }, { "auxiliary_loss_clip": 0.01100992, "auxiliary_loss_mlp": 0.00773554, "balance_loss_clip": 1.04556322, "balance_loss_mlp": 1.00063753, "epoch": 0.43889974447617613, "flos": 25883765089920.0, "grad_norm": 1.9116194577137513, "language_loss": 0.7702527, "learning_rate": 2.487890389750719e-06, "loss": 0.78899813, "num_input_tokens_seen": 156462015, "step": 7300, "time_per_iteration": 2.754582166671753 }, { "auxiliary_loss_clip": 0.01105938, "auxiliary_loss_mlp": 0.01033126, "balance_loss_clip": 1.04505253, "balance_loss_mlp": 1.01922047, "epoch": 0.43895986772884416, "flos": 25046615738880.0, "grad_norm": 1.6899733258560021, "language_loss": 0.70417237, "learning_rate": 2.4875126848405626e-06, "loss": 0.72556305, "num_input_tokens_seen": 156482165, "step": 7301, "time_per_iteration": 2.8213343620300293 }, { "auxiliary_loss_clip": 0.01082543, "auxiliary_loss_mlp": 0.01042943, "balance_loss_clip": 1.04282618, "balance_loss_mlp": 1.0270884, "epoch": 0.4390199909815121, "flos": 25994585525760.0, "grad_norm": 1.824867215084726, "language_loss": 0.70808041, "learning_rate": 2.4871349614438757e-06, "loss": 0.72933531, "num_input_tokens_seen": 156503170, "step": 7302, "time_per_iteration": 2.7875969409942627 }, { "auxiliary_loss_clip": 0.01107602, "auxiliary_loss_mlp": 0.01039104, "balance_loss_clip": 1.04878247, "balance_loss_mlp": 1.02599669, "epoch": 0.4390801142341801, "flos": 29022249888000.0, "grad_norm": 1.5936626078522842, "language_loss": 0.82381457, "learning_rate": 2.486757219574983e-06, "loss": 0.8452816, "num_input_tokens_seen": 156523005, "step": 7303, "time_per_iteration": 2.838871717453003 }, { "auxiliary_loss_clip": 0.01116821, "auxiliary_loss_mlp": 0.01046972, "balance_loss_clip": 1.04648411, "balance_loss_mlp": 1.03164792, "epoch": 0.43914023748684805, "flos": 33438544087680.0, "grad_norm": 10.027739157490931, "language_loss": 0.69036293, "learning_rate": 2.4863794592482067e-06, "loss": 0.71200085, "num_input_tokens_seen": 156544440, "step": 7304, "time_per_iteration": 5.9847636222839355 }, { "auxiliary_loss_clip": 0.01105223, "auxiliary_loss_mlp": 0.00770446, "balance_loss_clip": 1.04475939, "balance_loss_mlp": 1.0005337, "epoch": 0.439200360739516, "flos": 34531844302080.0, "grad_norm": 1.5264108649470638, "language_loss": 0.78100759, "learning_rate": 2.486001680477873e-06, "loss": 0.79976428, "num_input_tokens_seen": 156565410, "step": 7305, "time_per_iteration": 4.283693313598633 }, { "auxiliary_loss_clip": 0.01102752, "auxiliary_loss_mlp": 0.01034686, "balance_loss_clip": 1.04440284, "balance_loss_mlp": 1.02097106, "epoch": 0.439260483992184, "flos": 21907843632000.0, "grad_norm": 1.7445713343884877, "language_loss": 0.68756545, "learning_rate": 2.485623883278308e-06, "loss": 0.70893979, "num_input_tokens_seen": 156584210, "step": 7306, "time_per_iteration": 2.7069246768951416 }, { "auxiliary_loss_clip": 0.01089881, "auxiliary_loss_mlp": 0.01031704, "balance_loss_clip": 1.0450325, "balance_loss_mlp": 1.01757789, "epoch": 0.43932060724485195, "flos": 20996430912000.0, "grad_norm": 2.2471251539247428, "language_loss": 0.62507868, "learning_rate": 2.4852460676638344e-06, "loss": 0.64629447, "num_input_tokens_seen": 156602730, "step": 7307, "time_per_iteration": 2.719836950302124 }, { "auxiliary_loss_clip": 0.01130769, "auxiliary_loss_mlp": 0.01032376, "balance_loss_clip": 1.04645061, "balance_loss_mlp": 1.0188818, "epoch": 0.4393807304975199, "flos": 17747053850880.0, "grad_norm": 1.9621539490577573, "language_loss": 0.71752089, "learning_rate": 2.4848682336487828e-06, "loss": 0.73915237, "num_input_tokens_seen": 156619405, "step": 7308, "time_per_iteration": 4.218705892562866 }, { "auxiliary_loss_clip": 0.0110959, "auxiliary_loss_mlp": 0.01034032, "balance_loss_clip": 1.0438807, "balance_loss_mlp": 1.020859, "epoch": 0.4394408537501879, "flos": 22528523669760.0, "grad_norm": 1.855171270613647, "language_loss": 0.76671213, "learning_rate": 2.4844903812474787e-06, "loss": 0.78814828, "num_input_tokens_seen": 156638165, "step": 7309, "time_per_iteration": 2.726790428161621 }, { "auxiliary_loss_clip": 0.01111334, "auxiliary_loss_mlp": 0.01031706, "balance_loss_clip": 1.04383993, "balance_loss_mlp": 1.01888466, "epoch": 0.43950097700285584, "flos": 23440654661760.0, "grad_norm": 1.9900388133775502, "language_loss": 0.7067014, "learning_rate": 2.484112510474251e-06, "loss": 0.72813171, "num_input_tokens_seen": 156658845, "step": 7310, "time_per_iteration": 2.644737958908081 }, { "auxiliary_loss_clip": 0.01099363, "auxiliary_loss_mlp": 0.00771301, "balance_loss_clip": 1.04282653, "balance_loss_mlp": 1.00065351, "epoch": 0.4395611002555238, "flos": 23180696956800.0, "grad_norm": 2.0308560550957813, "language_loss": 0.76245713, "learning_rate": 2.483734621343429e-06, "loss": 0.78116381, "num_input_tokens_seen": 156677275, "step": 7311, "time_per_iteration": 2.676393985748291 }, { "auxiliary_loss_clip": 0.01118807, "auxiliary_loss_mlp": 0.01036972, "balance_loss_clip": 1.04605961, "balance_loss_mlp": 1.02365649, "epoch": 0.43962122350819177, "flos": 22127365601280.0, "grad_norm": 1.941188934607737, "language_loss": 0.81554043, "learning_rate": 2.483356713869341e-06, "loss": 0.83709824, "num_input_tokens_seen": 156695815, "step": 7312, "time_per_iteration": 2.734691858291626 }, { "auxiliary_loss_clip": 0.01099053, "auxiliary_loss_mlp": 0.01030855, "balance_loss_clip": 1.04661798, "balance_loss_mlp": 1.01802182, "epoch": 0.43968134676085974, "flos": 17420554200960.0, "grad_norm": 4.309677618927981, "language_loss": 0.85387003, "learning_rate": 2.482978788066318e-06, "loss": 0.8751691, "num_input_tokens_seen": 156714385, "step": 7313, "time_per_iteration": 2.7130918502807617 }, { "auxiliary_loss_clip": 0.01101603, "auxiliary_loss_mlp": 0.01034502, "balance_loss_clip": 1.04015613, "balance_loss_mlp": 1.02131104, "epoch": 0.43974147001352776, "flos": 18952646958720.0, "grad_norm": 1.7624997398560822, "language_loss": 0.67982185, "learning_rate": 2.4826008439486904e-06, "loss": 0.70118284, "num_input_tokens_seen": 156732615, "step": 7314, "time_per_iteration": 2.660019636154175 }, { "auxiliary_loss_clip": 0.01107647, "auxiliary_loss_mlp": 0.01029957, "balance_loss_clip": 1.04436517, "balance_loss_mlp": 1.01645088, "epoch": 0.4398015932661957, "flos": 18953508885120.0, "grad_norm": 1.864599678602129, "language_loss": 0.76799178, "learning_rate": 2.4822228815307915e-06, "loss": 0.78936785, "num_input_tokens_seen": 156750920, "step": 7315, "time_per_iteration": 2.6958022117614746 }, { "auxiliary_loss_clip": 0.01103713, "auxiliary_loss_mlp": 0.01033338, "balance_loss_clip": 1.04664755, "balance_loss_mlp": 1.02002192, "epoch": 0.4398617165188637, "flos": 24199913370240.0, "grad_norm": 2.581770130909348, "language_loss": 0.74439812, "learning_rate": 2.4818449008269523e-06, "loss": 0.76576865, "num_input_tokens_seen": 156768520, "step": 7316, "time_per_iteration": 2.7142746448516846 }, { "auxiliary_loss_clip": 0.01091829, "auxiliary_loss_mlp": 0.01038409, "balance_loss_clip": 1.04720306, "balance_loss_mlp": 1.02546883, "epoch": 0.43992183977153165, "flos": 22236677665920.0, "grad_norm": 2.381148700310756, "language_loss": 0.64676511, "learning_rate": 2.481466901851506e-06, "loss": 0.66806751, "num_input_tokens_seen": 156788700, "step": 7317, "time_per_iteration": 2.6647984981536865 }, { "auxiliary_loss_clip": 0.01100358, "auxiliary_loss_mlp": 0.01036318, "balance_loss_clip": 1.04315925, "balance_loss_mlp": 1.02252579, "epoch": 0.4399819630241996, "flos": 18697465762560.0, "grad_norm": 2.00656387252293, "language_loss": 0.79769003, "learning_rate": 2.4810888846187865e-06, "loss": 0.81905675, "num_input_tokens_seen": 156806470, "step": 7318, "time_per_iteration": 2.6569128036499023 }, { "auxiliary_loss_clip": 0.01085209, "auxiliary_loss_mlp": 0.0104302, "balance_loss_clip": 1.03973842, "balance_loss_mlp": 1.02808332, "epoch": 0.4400420862768676, "flos": 23879375377920.0, "grad_norm": 1.4911827600564649, "language_loss": 0.79173744, "learning_rate": 2.4807108491431283e-06, "loss": 0.81301975, "num_input_tokens_seen": 156825895, "step": 7319, "time_per_iteration": 2.7476212978363037 }, { "auxiliary_loss_clip": 0.01110516, "auxiliary_loss_mlp": 0.01041368, "balance_loss_clip": 1.0416882, "balance_loss_mlp": 1.02647328, "epoch": 0.44010220952953555, "flos": 28037615293440.0, "grad_norm": 1.9147413156076512, "language_loss": 0.80129063, "learning_rate": 2.4803327954388667e-06, "loss": 0.82280946, "num_input_tokens_seen": 156845990, "step": 7320, "time_per_iteration": 2.716813802719116 }, { "auxiliary_loss_clip": 0.01088202, "auxiliary_loss_mlp": 0.01041527, "balance_loss_clip": 1.04271483, "balance_loss_mlp": 1.02871788, "epoch": 0.4401623327822035, "flos": 23768985905280.0, "grad_norm": 3.0980421986856777, "language_loss": 0.69580001, "learning_rate": 2.4799547235203376e-06, "loss": 0.71709728, "num_input_tokens_seen": 156866685, "step": 7321, "time_per_iteration": 2.753053903579712 }, { "auxiliary_loss_clip": 0.01016924, "auxiliary_loss_mlp": 0.01013574, "balance_loss_clip": 1.02610326, "balance_loss_mlp": 1.01153517, "epoch": 0.4402224560348715, "flos": 70774583264640.0, "grad_norm": 0.8888992176827548, "language_loss": 0.56922823, "learning_rate": 2.4795766334018763e-06, "loss": 0.58953327, "num_input_tokens_seen": 156923450, "step": 7322, "time_per_iteration": 3.3513524532318115 }, { "auxiliary_loss_clip": 0.01073209, "auxiliary_loss_mlp": 0.01039777, "balance_loss_clip": 1.03671217, "balance_loss_mlp": 1.02677715, "epoch": 0.44028257928753944, "flos": 22891795868160.0, "grad_norm": 1.5589182914821764, "language_loss": 0.76272774, "learning_rate": 2.479198525097822e-06, "loss": 0.78385758, "num_input_tokens_seen": 156944795, "step": 7323, "time_per_iteration": 2.7524306774139404 }, { "auxiliary_loss_clip": 0.01119465, "auxiliary_loss_mlp": 0.01043388, "balance_loss_clip": 1.04591155, "balance_loss_mlp": 1.0296607, "epoch": 0.4403427025402074, "flos": 17895760156800.0, "grad_norm": 1.5124862196762965, "language_loss": 0.80590653, "learning_rate": 2.478820398622511e-06, "loss": 0.82753503, "num_input_tokens_seen": 156962755, "step": 7324, "time_per_iteration": 2.6558468341827393 }, { "auxiliary_loss_clip": 0.01025531, "auxiliary_loss_mlp": 0.0100492, "balance_loss_clip": 1.02356136, "balance_loss_mlp": 1.00322747, "epoch": 0.4404028257928754, "flos": 69562525708800.0, "grad_norm": 0.6843753140185513, "language_loss": 0.54592586, "learning_rate": 2.478442253990283e-06, "loss": 0.5662303, "num_input_tokens_seen": 157028095, "step": 7325, "time_per_iteration": 3.228588819503784 }, { "auxiliary_loss_clip": 0.01128033, "auxiliary_loss_mlp": 0.01028317, "balance_loss_clip": 1.04957604, "balance_loss_mlp": 1.0163784, "epoch": 0.44046294904554334, "flos": 20923675914240.0, "grad_norm": 1.4618535572581854, "language_loss": 0.70052326, "learning_rate": 2.4780640912154766e-06, "loss": 0.72208667, "num_input_tokens_seen": 157048365, "step": 7326, "time_per_iteration": 2.643843650817871 }, { "auxiliary_loss_clip": 0.01081906, "auxiliary_loss_mlp": 0.010325, "balance_loss_clip": 1.03812075, "balance_loss_mlp": 1.01949978, "epoch": 0.44052307229821136, "flos": 23623475909760.0, "grad_norm": 1.533904509031544, "language_loss": 0.76754719, "learning_rate": 2.477685910312432e-06, "loss": 0.78869128, "num_input_tokens_seen": 157069130, "step": 7327, "time_per_iteration": 2.7409613132476807 }, { "auxiliary_loss_clip": 0.01097799, "auxiliary_loss_mlp": 0.01038346, "balance_loss_clip": 1.04025364, "balance_loss_mlp": 1.0256505, "epoch": 0.4405831955508793, "flos": 17597665186560.0, "grad_norm": 1.9457575580966853, "language_loss": 0.8413341, "learning_rate": 2.4773077112954897e-06, "loss": 0.86269557, "num_input_tokens_seen": 157084940, "step": 7328, "time_per_iteration": 2.6578822135925293 }, { "auxiliary_loss_clip": 0.01102477, "auxiliary_loss_mlp": 0.01028668, "balance_loss_clip": 1.04432774, "balance_loss_mlp": 1.01576972, "epoch": 0.4406433188035473, "flos": 21463376739840.0, "grad_norm": 2.377465022226765, "language_loss": 0.77753079, "learning_rate": 2.4769294941789908e-06, "loss": 0.79884225, "num_input_tokens_seen": 157102770, "step": 7329, "time_per_iteration": 2.6732001304626465 }, { "auxiliary_loss_clip": 0.01114069, "auxiliary_loss_mlp": 0.01039308, "balance_loss_clip": 1.04399741, "balance_loss_mlp": 1.02568269, "epoch": 0.44070344205621526, "flos": 22673566788480.0, "grad_norm": 1.63533295854216, "language_loss": 0.73525596, "learning_rate": 2.476551258977278e-06, "loss": 0.75678968, "num_input_tokens_seen": 157122035, "step": 7330, "time_per_iteration": 2.6258528232574463 }, { "auxiliary_loss_clip": 0.01104463, "auxiliary_loss_mlp": 0.01039279, "balance_loss_clip": 1.04494476, "balance_loss_mlp": 1.02678585, "epoch": 0.4407635653088832, "flos": 23441193365760.0, "grad_norm": 1.852759340776506, "language_loss": 0.74862218, "learning_rate": 2.4761730057046936e-06, "loss": 0.77005959, "num_input_tokens_seen": 157142800, "step": 7331, "time_per_iteration": 2.767972469329834 }, { "auxiliary_loss_clip": 0.01075234, "auxiliary_loss_mlp": 0.01034744, "balance_loss_clip": 1.04043937, "balance_loss_mlp": 1.02114189, "epoch": 0.4408236885615512, "flos": 24021294013440.0, "grad_norm": 1.4106194210898035, "language_loss": 0.76326358, "learning_rate": 2.475794734375581e-06, "loss": 0.78436339, "num_input_tokens_seen": 157163295, "step": 7332, "time_per_iteration": 2.7810683250427246 }, { "auxiliary_loss_clip": 0.01099425, "auxiliary_loss_mlp": 0.01041411, "balance_loss_clip": 1.04447377, "balance_loss_mlp": 1.02958584, "epoch": 0.44088381181421915, "flos": 12676826597760.0, "grad_norm": 1.919719554260373, "language_loss": 0.73795688, "learning_rate": 2.475416445004285e-06, "loss": 0.75936526, "num_input_tokens_seen": 157180890, "step": 7333, "time_per_iteration": 2.661736488342285 }, { "auxiliary_loss_clip": 0.01086658, "auxiliary_loss_mlp": 0.01034222, "balance_loss_clip": 1.04458117, "balance_loss_mlp": 1.02134728, "epoch": 0.4409439350668871, "flos": 24569865498240.0, "grad_norm": 1.5776913121160454, "language_loss": 0.79113179, "learning_rate": 2.4750381376051493e-06, "loss": 0.81234062, "num_input_tokens_seen": 157200580, "step": 7334, "time_per_iteration": 2.8023018836975098 }, { "auxiliary_loss_clip": 0.01102091, "auxiliary_loss_mlp": 0.01039411, "balance_loss_clip": 1.04475522, "balance_loss_mlp": 1.02343714, "epoch": 0.4410040583195551, "flos": 22668574798080.0, "grad_norm": 2.426268589885391, "language_loss": 0.75184131, "learning_rate": 2.47465981219252e-06, "loss": 0.77325642, "num_input_tokens_seen": 157218345, "step": 7335, "time_per_iteration": 2.7240371704101562 }, { "auxiliary_loss_clip": 0.01101432, "auxiliary_loss_mlp": 0.0103515, "balance_loss_clip": 1.04350579, "balance_loss_mlp": 1.02189362, "epoch": 0.44106418157222305, "flos": 10852528700160.0, "grad_norm": 1.9825426915131346, "language_loss": 0.72498572, "learning_rate": 2.4742814687807423e-06, "loss": 0.74635154, "num_input_tokens_seen": 157234395, "step": 7336, "time_per_iteration": 2.6489880084991455 }, { "auxiliary_loss_clip": 0.01118861, "auxiliary_loss_mlp": 0.01040583, "balance_loss_clip": 1.04398608, "balance_loss_mlp": 1.02684367, "epoch": 0.441124304824891, "flos": 21726710323200.0, "grad_norm": 2.2630715311051617, "language_loss": 0.62847346, "learning_rate": 2.473903107384165e-06, "loss": 0.65006793, "num_input_tokens_seen": 157254805, "step": 7337, "time_per_iteration": 2.632335901260376 }, { "auxiliary_loss_clip": 0.01029242, "auxiliary_loss_mlp": 0.00753616, "balance_loss_clip": 1.0181427, "balance_loss_mlp": 1.00070596, "epoch": 0.441184428077559, "flos": 63220486625280.0, "grad_norm": 0.7364595311582042, "language_loss": 0.52639711, "learning_rate": 2.473524728017134e-06, "loss": 0.54422569, "num_input_tokens_seen": 157317870, "step": 7338, "time_per_iteration": 3.253746509552002 }, { "auxiliary_loss_clip": 0.01106453, "auxiliary_loss_mlp": 0.01046288, "balance_loss_clip": 1.04105973, "balance_loss_mlp": 1.03120804, "epoch": 0.44124455133022694, "flos": 21177959270400.0, "grad_norm": 2.22639682548465, "language_loss": 0.70776093, "learning_rate": 2.473146330693997e-06, "loss": 0.7292884, "num_input_tokens_seen": 157336505, "step": 7339, "time_per_iteration": 2.655733823776245 }, { "auxiliary_loss_clip": 0.01053755, "auxiliary_loss_mlp": 0.01042988, "balance_loss_clip": 1.03682137, "balance_loss_mlp": 1.02918971, "epoch": 0.4413046745828949, "flos": 17457865453440.0, "grad_norm": 1.5022359473102205, "language_loss": 0.70075929, "learning_rate": 2.472767915429105e-06, "loss": 0.72172678, "num_input_tokens_seen": 157354995, "step": 7340, "time_per_iteration": 2.767920970916748 }, { "auxiliary_loss_clip": 0.01030747, "auxiliary_loss_mlp": 0.01003789, "balance_loss_clip": 1.02245617, "balance_loss_mlp": 1.00190568, "epoch": 0.4413647978355629, "flos": 61586153804160.0, "grad_norm": 0.8827965218567749, "language_loss": 0.63983381, "learning_rate": 2.4723894822368054e-06, "loss": 0.66017926, "num_input_tokens_seen": 157404260, "step": 7341, "time_per_iteration": 3.049508810043335 }, { "auxiliary_loss_clip": 0.01091178, "auxiliary_loss_mlp": 0.01040152, "balance_loss_clip": 1.0418849, "balance_loss_mlp": 1.02682424, "epoch": 0.4414249210882309, "flos": 27527001505920.0, "grad_norm": 2.055823294856648, "language_loss": 0.73636287, "learning_rate": 2.47201103113145e-06, "loss": 0.75767612, "num_input_tokens_seen": 157423045, "step": 7342, "time_per_iteration": 2.795201063156128 }, { "auxiliary_loss_clip": 0.01125069, "auxiliary_loss_mlp": 0.01041127, "balance_loss_clip": 1.04345822, "balance_loss_mlp": 1.02709007, "epoch": 0.44148504434089886, "flos": 23513984277120.0, "grad_norm": 2.2044048255358515, "language_loss": 0.79979384, "learning_rate": 2.4716325621273886e-06, "loss": 0.82145584, "num_input_tokens_seen": 157441815, "step": 7343, "time_per_iteration": 5.804108142852783 }, { "auxiliary_loss_clip": 0.010937, "auxiliary_loss_mlp": 0.01034348, "balance_loss_clip": 1.04503846, "balance_loss_mlp": 1.02072287, "epoch": 0.4415451675935668, "flos": 21580589796480.0, "grad_norm": 2.707350721832692, "language_loss": 0.76721787, "learning_rate": 2.4712540752389725e-06, "loss": 0.78849834, "num_input_tokens_seen": 157460470, "step": 7344, "time_per_iteration": 2.7370471954345703 }, { "auxiliary_loss_clip": 0.01038191, "auxiliary_loss_mlp": 0.01020913, "balance_loss_clip": 1.0274384, "balance_loss_mlp": 1.01902914, "epoch": 0.4416052908462348, "flos": 59006368126080.0, "grad_norm": 0.7980536604903562, "language_loss": 0.63813043, "learning_rate": 2.470875570480556e-06, "loss": 0.65872145, "num_input_tokens_seen": 157512655, "step": 7345, "time_per_iteration": 4.502060890197754 }, { "auxiliary_loss_clip": 0.01130065, "auxiliary_loss_mlp": 0.01040621, "balance_loss_clip": 1.04656529, "balance_loss_mlp": 1.02670372, "epoch": 0.44166541409890275, "flos": 26357642242560.0, "grad_norm": 1.8234046338758734, "language_loss": 0.86094856, "learning_rate": 2.470497047866489e-06, "loss": 0.88265538, "num_input_tokens_seen": 157533700, "step": 7346, "time_per_iteration": 2.697648763656616 }, { "auxiliary_loss_clip": 0.01119294, "auxiliary_loss_mlp": 0.0104301, "balance_loss_clip": 1.04583025, "balance_loss_mlp": 1.02862179, "epoch": 0.4417255373515707, "flos": 20192678231040.0, "grad_norm": 1.7966519054380148, "language_loss": 0.80474353, "learning_rate": 2.470118507411128e-06, "loss": 0.8263666, "num_input_tokens_seen": 157551105, "step": 7347, "time_per_iteration": 4.3498101234436035 }, { "auxiliary_loss_clip": 0.01107859, "auxiliary_loss_mlp": 0.01035246, "balance_loss_clip": 1.04878783, "balance_loss_mlp": 1.02088118, "epoch": 0.4417856606042387, "flos": 17887895078400.0, "grad_norm": 1.7585337264872751, "language_loss": 0.83156574, "learning_rate": 2.4697399491288263e-06, "loss": 0.85299683, "num_input_tokens_seen": 157568285, "step": 7348, "time_per_iteration": 2.6866180896759033 }, { "auxiliary_loss_clip": 0.01119234, "auxiliary_loss_mlp": 0.01035311, "balance_loss_clip": 1.04732084, "balance_loss_mlp": 1.02139926, "epoch": 0.44184578385690665, "flos": 27964034282880.0, "grad_norm": 2.0657656881846505, "language_loss": 0.70507312, "learning_rate": 2.469361373033938e-06, "loss": 0.72661853, "num_input_tokens_seen": 157590405, "step": 7349, "time_per_iteration": 2.7241854667663574 }, { "auxiliary_loss_clip": 0.0109864, "auxiliary_loss_mlp": 0.01033665, "balance_loss_clip": 1.04184258, "balance_loss_mlp": 1.01935983, "epoch": 0.4419059071095746, "flos": 23367899664000.0, "grad_norm": 1.9069897602324009, "language_loss": 0.74060279, "learning_rate": 2.468982779140819e-06, "loss": 0.76192582, "num_input_tokens_seen": 157607420, "step": 7350, "time_per_iteration": 2.724295139312744 }, { "auxiliary_loss_clip": 0.01129716, "auxiliary_loss_mlp": 0.01036435, "balance_loss_clip": 1.04692149, "balance_loss_mlp": 1.02279782, "epoch": 0.4419660303622426, "flos": 15012169246080.0, "grad_norm": 4.28906993354027, "language_loss": 0.81133771, "learning_rate": 2.468604167463827e-06, "loss": 0.83299923, "num_input_tokens_seen": 157624990, "step": 7351, "time_per_iteration": 2.6151175498962402 }, { "auxiliary_loss_clip": 0.01077442, "auxiliary_loss_mlp": 0.00770493, "balance_loss_clip": 1.03664398, "balance_loss_mlp": 1.00027013, "epoch": 0.44202615361491054, "flos": 25371750672000.0, "grad_norm": 1.4842739809833707, "language_loss": 0.72872806, "learning_rate": 2.4682255380173176e-06, "loss": 0.7472074, "num_input_tokens_seen": 157645300, "step": 7352, "time_per_iteration": 2.822618007659912 }, { "auxiliary_loss_clip": 0.01105652, "auxiliary_loss_mlp": 0.01030051, "balance_loss_clip": 1.05031562, "balance_loss_mlp": 1.01625896, "epoch": 0.4420862768675785, "flos": 24681116897280.0, "grad_norm": 2.2734813659209316, "language_loss": 0.87014645, "learning_rate": 2.467846890815649e-06, "loss": 0.89150345, "num_input_tokens_seen": 157664060, "step": 7353, "time_per_iteration": 2.8141496181488037 }, { "auxiliary_loss_clip": 0.01131466, "auxiliary_loss_mlp": 0.01036857, "balance_loss_clip": 1.04851007, "balance_loss_mlp": 1.02385104, "epoch": 0.44214640012024653, "flos": 19528437974400.0, "grad_norm": 2.0005767830632464, "language_loss": 0.75907683, "learning_rate": 2.4674682258731795e-06, "loss": 0.78076005, "num_input_tokens_seen": 157680905, "step": 7354, "time_per_iteration": 2.6416475772857666 }, { "auxiliary_loss_clip": 0.01087376, "auxiliary_loss_mlp": 0.01035112, "balance_loss_clip": 1.04345286, "balance_loss_mlp": 1.02218962, "epoch": 0.4422065233729145, "flos": 47557434003840.0, "grad_norm": 1.702490286843937, "language_loss": 0.64954734, "learning_rate": 2.467089543204268e-06, "loss": 0.67077219, "num_input_tokens_seen": 157701980, "step": 7355, "time_per_iteration": 2.9349570274353027 }, { "auxiliary_loss_clip": 0.01133882, "auxiliary_loss_mlp": 0.01035511, "balance_loss_clip": 1.04775596, "balance_loss_mlp": 1.02121234, "epoch": 0.44226664662558246, "flos": 19281050029440.0, "grad_norm": 1.8300716428477437, "language_loss": 0.78527248, "learning_rate": 2.466710842823274e-06, "loss": 0.80696642, "num_input_tokens_seen": 157720555, "step": 7356, "time_per_iteration": 2.5932910442352295 }, { "auxiliary_loss_clip": 0.01109756, "auxiliary_loss_mlp": 0.00771729, "balance_loss_clip": 1.04629183, "balance_loss_mlp": 1.0004859, "epoch": 0.4423267698782504, "flos": 17821820010240.0, "grad_norm": 1.6708598029973696, "language_loss": 0.77472621, "learning_rate": 2.4663321247445577e-06, "loss": 0.79354107, "num_input_tokens_seen": 157739160, "step": 7357, "time_per_iteration": 2.7050111293792725 }, { "auxiliary_loss_clip": 0.01102733, "auxiliary_loss_mlp": 0.01037231, "balance_loss_clip": 1.04357672, "balance_loss_mlp": 1.02280128, "epoch": 0.4423868931309184, "flos": 29204424691200.0, "grad_norm": 1.492131344457668, "language_loss": 0.73277801, "learning_rate": 2.465953388982481e-06, "loss": 0.75417769, "num_input_tokens_seen": 157760020, "step": 7358, "time_per_iteration": 2.7339792251586914 }, { "auxiliary_loss_clip": 0.01108517, "auxiliary_loss_mlp": 0.01035507, "balance_loss_clip": 1.04953265, "balance_loss_mlp": 1.02198911, "epoch": 0.44244701638358636, "flos": 29713135057920.0, "grad_norm": 1.890703165597896, "language_loss": 0.75731266, "learning_rate": 2.465574635551405e-06, "loss": 0.77875292, "num_input_tokens_seen": 157780435, "step": 7359, "time_per_iteration": 2.7597005367279053 }, { "auxiliary_loss_clip": 0.01106411, "auxiliary_loss_mlp": 0.01037766, "balance_loss_clip": 1.04658461, "balance_loss_mlp": 1.02315068, "epoch": 0.4425071396362543, "flos": 22930040874240.0, "grad_norm": 1.6679218305876244, "language_loss": 0.69988406, "learning_rate": 2.4651958644656923e-06, "loss": 0.72132587, "num_input_tokens_seen": 157799420, "step": 7360, "time_per_iteration": 2.7118403911590576 }, { "auxiliary_loss_clip": 0.01104133, "auxiliary_loss_mlp": 0.01032941, "balance_loss_clip": 1.04686546, "balance_loss_mlp": 1.01859379, "epoch": 0.4425672628889223, "flos": 19792346175360.0, "grad_norm": 3.404305353939149, "language_loss": 0.69860107, "learning_rate": 2.4648170757397053e-06, "loss": 0.71997184, "num_input_tokens_seen": 157817025, "step": 7361, "time_per_iteration": 2.672388792037964 }, { "auxiliary_loss_clip": 0.01105237, "auxiliary_loss_mlp": 0.01040581, "balance_loss_clip": 1.04377937, "balance_loss_mlp": 1.02539372, "epoch": 0.44262738614159025, "flos": 13662215377920.0, "grad_norm": 2.0698565080434888, "language_loss": 0.82494795, "learning_rate": 2.464438269387809e-06, "loss": 0.84640616, "num_input_tokens_seen": 157834345, "step": 7362, "time_per_iteration": 2.6258609294891357 }, { "auxiliary_loss_clip": 0.01102915, "auxiliary_loss_mlp": 0.01040381, "balance_loss_clip": 1.04801464, "balance_loss_mlp": 1.02494311, "epoch": 0.4426875093942582, "flos": 14210212245120.0, "grad_norm": 1.7089384580193987, "language_loss": 0.74628377, "learning_rate": 2.464059445424366e-06, "loss": 0.76771677, "num_input_tokens_seen": 157852290, "step": 7363, "time_per_iteration": 2.7868857383728027 }, { "auxiliary_loss_clip": 0.01008645, "auxiliary_loss_mlp": 0.01003596, "balance_loss_clip": 1.02228582, "balance_loss_mlp": 1.0016526, "epoch": 0.4427476326469262, "flos": 70117525728000.0, "grad_norm": 0.6804595751696751, "language_loss": 0.55677116, "learning_rate": 2.463680603863743e-06, "loss": 0.57689351, "num_input_tokens_seen": 157923060, "step": 7364, "time_per_iteration": 3.3737823963165283 }, { "auxiliary_loss_clip": 0.01109131, "auxiliary_loss_mlp": 0.01040851, "balance_loss_clip": 1.04670477, "balance_loss_mlp": 1.02778566, "epoch": 0.44280775589959415, "flos": 25445080287360.0, "grad_norm": 1.640155581598939, "language_loss": 0.74618137, "learning_rate": 2.463301744720305e-06, "loss": 0.76768118, "num_input_tokens_seen": 157944110, "step": 7365, "time_per_iteration": 2.789905071258545 }, { "auxiliary_loss_clip": 0.01099825, "auxiliary_loss_mlp": 0.01043396, "balance_loss_clip": 1.04348397, "balance_loss_mlp": 1.0287931, "epoch": 0.4428678791522621, "flos": 22857214049280.0, "grad_norm": 1.5674103047703387, "language_loss": 0.74297303, "learning_rate": 2.4629228680084184e-06, "loss": 0.76440525, "num_input_tokens_seen": 157964295, "step": 7366, "time_per_iteration": 2.700286626815796 }, { "auxiliary_loss_clip": 0.01108412, "auxiliary_loss_mlp": 0.0103649, "balance_loss_clip": 1.04708481, "balance_loss_mlp": 1.02240598, "epoch": 0.44292800240493013, "flos": 25812446636160.0, "grad_norm": 3.271133633367276, "language_loss": 0.73245466, "learning_rate": 2.46254397374245e-06, "loss": 0.75390375, "num_input_tokens_seen": 157983970, "step": 7367, "time_per_iteration": 2.6946957111358643 }, { "auxiliary_loss_clip": 0.01130142, "auxiliary_loss_mlp": 0.01040167, "balance_loss_clip": 1.04803169, "balance_loss_mlp": 1.02645779, "epoch": 0.4429881256575981, "flos": 32416885549440.0, "grad_norm": 1.566124307945558, "language_loss": 0.73996794, "learning_rate": 2.4621650619367677e-06, "loss": 0.76167101, "num_input_tokens_seen": 158006515, "step": 7368, "time_per_iteration": 2.7544407844543457 }, { "auxiliary_loss_clip": 0.01100906, "auxiliary_loss_mlp": 0.01031006, "balance_loss_clip": 1.04302347, "balance_loss_mlp": 1.01735687, "epoch": 0.44304824891026606, "flos": 22163707186560.0, "grad_norm": 2.0120848529023334, "language_loss": 0.7961669, "learning_rate": 2.4617861326057403e-06, "loss": 0.81748605, "num_input_tokens_seen": 158025565, "step": 7369, "time_per_iteration": 2.697190046310425 }, { "auxiliary_loss_clip": 0.010901, "auxiliary_loss_mlp": 0.01035588, "balance_loss_clip": 1.04244113, "balance_loss_mlp": 1.02251637, "epoch": 0.443108372162934, "flos": 25338569483520.0, "grad_norm": 1.9393131166495303, "language_loss": 0.72057104, "learning_rate": 2.461407185763737e-06, "loss": 0.74182796, "num_input_tokens_seen": 158045620, "step": 7370, "time_per_iteration": 2.7959940433502197 }, { "auxiliary_loss_clip": 0.01129082, "auxiliary_loss_mlp": 0.0103749, "balance_loss_clip": 1.04668999, "balance_loss_mlp": 1.02349448, "epoch": 0.443168495415602, "flos": 23330947547520.0, "grad_norm": 1.8535232870502223, "language_loss": 0.70380038, "learning_rate": 2.461028221425126e-06, "loss": 0.72546607, "num_input_tokens_seen": 158063505, "step": 7371, "time_per_iteration": 2.677718162536621 }, { "auxiliary_loss_clip": 0.01119855, "auxiliary_loss_mlp": 0.01031238, "balance_loss_clip": 1.0492835, "balance_loss_mlp": 1.01867962, "epoch": 0.44322861866826996, "flos": 21871502046720.0, "grad_norm": 2.0883513439310577, "language_loss": 0.68410224, "learning_rate": 2.4606492396042786e-06, "loss": 0.70561314, "num_input_tokens_seen": 158080335, "step": 7372, "time_per_iteration": 2.6676101684570312 }, { "auxiliary_loss_clip": 0.01096245, "auxiliary_loss_mlp": 0.0103489, "balance_loss_clip": 1.04236257, "balance_loss_mlp": 1.0203104, "epoch": 0.4432887419209379, "flos": 20084407660800.0, "grad_norm": 1.830573306058503, "language_loss": 0.83560812, "learning_rate": 2.4602702403155664e-06, "loss": 0.85691947, "num_input_tokens_seen": 158098955, "step": 7373, "time_per_iteration": 2.706554651260376 }, { "auxiliary_loss_clip": 0.0103821, "auxiliary_loss_mlp": 0.0100315, "balance_loss_clip": 1.01858282, "balance_loss_mlp": 1.00125432, "epoch": 0.4433488651736059, "flos": 70035540935040.0, "grad_norm": 0.769882260063621, "language_loss": 0.55201387, "learning_rate": 2.4598912235733604e-06, "loss": 0.57242751, "num_input_tokens_seen": 158164110, "step": 7374, "time_per_iteration": 3.2373340129852295 }, { "auxiliary_loss_clip": 0.01078736, "auxiliary_loss_mlp": 0.01042384, "balance_loss_clip": 1.04519641, "balance_loss_mlp": 1.02773309, "epoch": 0.44340898842627385, "flos": 16282472705280.0, "grad_norm": 2.3490774090653592, "language_loss": 0.8289665, "learning_rate": 2.4595121893920327e-06, "loss": 0.85017765, "num_input_tokens_seen": 158179850, "step": 7375, "time_per_iteration": 2.7468464374542236 }, { "auxiliary_loss_clip": 0.01129641, "auxiliary_loss_mlp": 0.01034202, "balance_loss_clip": 1.04680073, "balance_loss_mlp": 1.02032566, "epoch": 0.4434691116789418, "flos": 16611989097600.0, "grad_norm": 1.9296092769688273, "language_loss": 0.84076023, "learning_rate": 2.4591331377859578e-06, "loss": 0.86239868, "num_input_tokens_seen": 158196590, "step": 7376, "time_per_iteration": 2.5597686767578125 }, { "auxiliary_loss_clip": 0.01105366, "auxiliary_loss_mlp": 0.01036479, "balance_loss_clip": 1.04541779, "balance_loss_mlp": 1.02299011, "epoch": 0.4435292349316098, "flos": 19063251912960.0, "grad_norm": 1.7983383352892115, "language_loss": 0.77172405, "learning_rate": 2.4587540687695077e-06, "loss": 0.79314244, "num_input_tokens_seen": 158216355, "step": 7377, "time_per_iteration": 2.7065727710723877 }, { "auxiliary_loss_clip": 0.01111732, "auxiliary_loss_mlp": 0.01032577, "balance_loss_clip": 1.04586828, "balance_loss_mlp": 1.01916027, "epoch": 0.44358935818427775, "flos": 21251324799360.0, "grad_norm": 2.2025516465061568, "language_loss": 0.76422131, "learning_rate": 2.458374982357057e-06, "loss": 0.78566432, "num_input_tokens_seen": 158235825, "step": 7378, "time_per_iteration": 2.6680550575256348 }, { "auxiliary_loss_clip": 0.01104625, "auxiliary_loss_mlp": 0.01055785, "balance_loss_clip": 1.04471672, "balance_loss_mlp": 1.0404191, "epoch": 0.4436494814369457, "flos": 12495298239360.0, "grad_norm": 1.9484405267541265, "language_loss": 0.69165838, "learning_rate": 2.457995878562982e-06, "loss": 0.7132625, "num_input_tokens_seen": 158254230, "step": 7379, "time_per_iteration": 2.6700775623321533 }, { "auxiliary_loss_clip": 0.01063579, "auxiliary_loss_mlp": 0.01045674, "balance_loss_clip": 1.03913927, "balance_loss_mlp": 1.0297358, "epoch": 0.44370960468961373, "flos": 23659853408640.0, "grad_norm": 2.073474855716146, "language_loss": 0.7288872, "learning_rate": 2.457616757401656e-06, "loss": 0.74997967, "num_input_tokens_seen": 158273400, "step": 7380, "time_per_iteration": 2.8017635345458984 }, { "auxiliary_loss_clip": 0.01110205, "auxiliary_loss_mlp": 0.01035159, "balance_loss_clip": 1.04831696, "balance_loss_mlp": 1.02124155, "epoch": 0.4437697279422817, "flos": 32416849635840.0, "grad_norm": 1.6338701103198854, "language_loss": 0.64961064, "learning_rate": 2.457237618887458e-06, "loss": 0.67106432, "num_input_tokens_seen": 158296840, "step": 7381, "time_per_iteration": 2.791595458984375 }, { "auxiliary_loss_clip": 0.01120176, "auxiliary_loss_mlp": 0.0104083, "balance_loss_clip": 1.04781485, "balance_loss_mlp": 1.02696049, "epoch": 0.44382985119494966, "flos": 18112875914880.0, "grad_norm": 5.151492667638541, "language_loss": 0.80450714, "learning_rate": 2.456858463034763e-06, "loss": 0.82611728, "num_input_tokens_seen": 158314935, "step": 7382, "time_per_iteration": 4.177164316177368 }, { "auxiliary_loss_clip": 0.0112542, "auxiliary_loss_mlp": 0.01039884, "balance_loss_clip": 1.05130458, "balance_loss_mlp": 1.02599657, "epoch": 0.44388997444761763, "flos": 30774151923840.0, "grad_norm": 1.842434773727105, "language_loss": 0.65955621, "learning_rate": 2.456479289857949e-06, "loss": 0.68120921, "num_input_tokens_seen": 158334620, "step": 7383, "time_per_iteration": 4.142000436782837 }, { "auxiliary_loss_clip": 0.01104406, "auxiliary_loss_mlp": 0.01036969, "balance_loss_clip": 1.04357898, "balance_loss_mlp": 1.02228832, "epoch": 0.4439500977002856, "flos": 20339157893760.0, "grad_norm": 2.431816949897044, "language_loss": 0.76046586, "learning_rate": 2.4561000993713953e-06, "loss": 0.78187954, "num_input_tokens_seen": 158350550, "step": 7384, "time_per_iteration": 4.309042453765869 }, { "auxiliary_loss_clip": 0.01132692, "auxiliary_loss_mlp": 0.01040021, "balance_loss_clip": 1.04878867, "balance_loss_mlp": 1.02595425, "epoch": 0.44401022095295356, "flos": 20371225760640.0, "grad_norm": 1.6001418974541146, "language_loss": 0.81145859, "learning_rate": 2.4557208915894796e-06, "loss": 0.83318579, "num_input_tokens_seen": 158369555, "step": 7385, "time_per_iteration": 2.6569409370422363 }, { "auxiliary_loss_clip": 0.01085589, "auxiliary_loss_mlp": 0.01035837, "balance_loss_clip": 1.04551208, "balance_loss_mlp": 1.02062619, "epoch": 0.4440703442056215, "flos": 20230635928320.0, "grad_norm": 1.8953258070837995, "language_loss": 0.81531972, "learning_rate": 2.455341666526582e-06, "loss": 0.8365339, "num_input_tokens_seen": 158388045, "step": 7386, "time_per_iteration": 2.757857084274292 }, { "auxiliary_loss_clip": 0.01092623, "auxiliary_loss_mlp": 0.01033697, "balance_loss_clip": 1.04583073, "balance_loss_mlp": 1.01829553, "epoch": 0.4441304674582895, "flos": 39494698824960.0, "grad_norm": 2.1898431457791827, "language_loss": 0.70026255, "learning_rate": 2.4549624241970832e-06, "loss": 0.72152579, "num_input_tokens_seen": 158410115, "step": 7387, "time_per_iteration": 4.4056620597839355 }, { "auxiliary_loss_clip": 0.01064296, "auxiliary_loss_mlp": 0.01040123, "balance_loss_clip": 1.04571772, "balance_loss_mlp": 1.02586579, "epoch": 0.44419059071095746, "flos": 14829671220480.0, "grad_norm": 1.9497255625781733, "language_loss": 0.71838999, "learning_rate": 2.4545831646153628e-06, "loss": 0.73943412, "num_input_tokens_seen": 158427765, "step": 7388, "time_per_iteration": 2.7504312992095947 }, { "auxiliary_loss_clip": 0.01120562, "auxiliary_loss_mlp": 0.01036757, "balance_loss_clip": 1.04769969, "balance_loss_mlp": 1.02277958, "epoch": 0.4442507139636254, "flos": 22637835734400.0, "grad_norm": 1.8353800507100826, "language_loss": 0.6930418, "learning_rate": 2.4542038877958044e-06, "loss": 0.71461499, "num_input_tokens_seen": 158446375, "step": 7389, "time_per_iteration": 2.620847702026367 }, { "auxiliary_loss_clip": 0.01119935, "auxiliary_loss_mlp": 0.01035558, "balance_loss_clip": 1.04713047, "balance_loss_mlp": 1.02149689, "epoch": 0.4443108372162934, "flos": 38290721829120.0, "grad_norm": 1.8033342781314554, "language_loss": 0.75145507, "learning_rate": 2.453824593752788e-06, "loss": 0.77301002, "num_input_tokens_seen": 158467260, "step": 7390, "time_per_iteration": 2.794739246368408 }, { "auxiliary_loss_clip": 0.01112569, "auxiliary_loss_mlp": 0.0104339, "balance_loss_clip": 1.04474115, "balance_loss_mlp": 1.0285244, "epoch": 0.44437096046896135, "flos": 17748993185280.0, "grad_norm": 2.757944013002859, "language_loss": 0.8139115, "learning_rate": 2.4534452825006988e-06, "loss": 0.83547109, "num_input_tokens_seen": 158486720, "step": 7391, "time_per_iteration": 2.62081241607666 }, { "auxiliary_loss_clip": 0.01100157, "auxiliary_loss_mlp": 0.01039531, "balance_loss_clip": 1.04489446, "balance_loss_mlp": 1.02436733, "epoch": 0.4444310837216293, "flos": 13732348682880.0, "grad_norm": 1.7057692393428199, "language_loss": 0.73885345, "learning_rate": 2.4530659540539185e-06, "loss": 0.76025033, "num_input_tokens_seen": 158502530, "step": 7392, "time_per_iteration": 2.619123935699463 }, { "auxiliary_loss_clip": 0.01116796, "auxiliary_loss_mlp": 0.01032995, "balance_loss_clip": 1.04451931, "balance_loss_mlp": 1.01976895, "epoch": 0.44449120697429734, "flos": 25010238240000.0, "grad_norm": 1.6244243517648933, "language_loss": 0.79316819, "learning_rate": 2.4526866084268313e-06, "loss": 0.81466603, "num_input_tokens_seen": 158522715, "step": 7393, "time_per_iteration": 2.761636257171631 }, { "auxiliary_loss_clip": 0.01123845, "auxiliary_loss_mlp": 0.01034263, "balance_loss_clip": 1.04784608, "balance_loss_mlp": 1.02036357, "epoch": 0.4445513302269653, "flos": 32671707609600.0, "grad_norm": 1.7936817608261026, "language_loss": 0.80767369, "learning_rate": 2.4523072456338226e-06, "loss": 0.82925481, "num_input_tokens_seen": 158543615, "step": 7394, "time_per_iteration": 2.731896162033081 }, { "auxiliary_loss_clip": 0.01101431, "auxiliary_loss_mlp": 0.010406, "balance_loss_clip": 1.04235363, "balance_loss_mlp": 1.02805972, "epoch": 0.44461145347963327, "flos": 11655814504320.0, "grad_norm": 2.5483522979722886, "language_loss": 0.79701138, "learning_rate": 2.4519278656892785e-06, "loss": 0.81843174, "num_input_tokens_seen": 158560330, "step": 7395, "time_per_iteration": 2.6799733638763428 }, { "auxiliary_loss_clip": 0.0110231, "auxiliary_loss_mlp": 0.01040027, "balance_loss_clip": 1.04210639, "balance_loss_mlp": 1.02630031, "epoch": 0.44467157673230123, "flos": 20886759711360.0, "grad_norm": 1.725775342310971, "language_loss": 0.68280721, "learning_rate": 2.451548468607584e-06, "loss": 0.70423067, "num_input_tokens_seen": 158579735, "step": 7396, "time_per_iteration": 2.7539262771606445 }, { "auxiliary_loss_clip": 0.01115853, "auxiliary_loss_mlp": 0.00771942, "balance_loss_clip": 1.04396296, "balance_loss_mlp": 1.00035286, "epoch": 0.4447316999849692, "flos": 18546137763840.0, "grad_norm": 1.749232481773879, "language_loss": 0.80780083, "learning_rate": 2.451169054403126e-06, "loss": 0.82667875, "num_input_tokens_seen": 158597075, "step": 7397, "time_per_iteration": 2.6620333194732666 }, { "auxiliary_loss_clip": 0.01119828, "auxiliary_loss_mlp": 0.01038203, "balance_loss_clip": 1.04740441, "balance_loss_mlp": 1.02525663, "epoch": 0.44479182323763716, "flos": 23769057732480.0, "grad_norm": 1.6626939297991263, "language_loss": 0.67383635, "learning_rate": 2.450789623090293e-06, "loss": 0.69541669, "num_input_tokens_seen": 158616650, "step": 7398, "time_per_iteration": 2.671193838119507 }, { "auxiliary_loss_clip": 0.01097104, "auxiliary_loss_mlp": 0.01040281, "balance_loss_clip": 1.04477727, "balance_loss_mlp": 1.0271976, "epoch": 0.44485194649030513, "flos": 16543831040640.0, "grad_norm": 1.7055478439146432, "language_loss": 0.69250667, "learning_rate": 2.450410174683472e-06, "loss": 0.71388054, "num_input_tokens_seen": 158634515, "step": 7399, "time_per_iteration": 2.6823384761810303 }, { "auxiliary_loss_clip": 0.01097596, "auxiliary_loss_mlp": 0.01035766, "balance_loss_clip": 1.04475021, "balance_loss_mlp": 1.0225575, "epoch": 0.4449120697429731, "flos": 22600955445120.0, "grad_norm": 1.8287170900617375, "language_loss": 0.72332168, "learning_rate": 2.4500307091970514e-06, "loss": 0.74465525, "num_input_tokens_seen": 158653760, "step": 7400, "time_per_iteration": 2.7227253913879395 }, { "auxiliary_loss_clip": 0.01076093, "auxiliary_loss_mlp": 0.00770024, "balance_loss_clip": 1.04184151, "balance_loss_mlp": 1.00039887, "epoch": 0.44497219299564106, "flos": 20004864992640.0, "grad_norm": 1.6814996958378423, "language_loss": 0.85252142, "learning_rate": 2.449651226645422e-06, "loss": 0.87098259, "num_input_tokens_seen": 158672190, "step": 7401, "time_per_iteration": 2.757293701171875 }, { "auxiliary_loss_clip": 0.01102171, "auxiliary_loss_mlp": 0.0103733, "balance_loss_clip": 1.04564703, "balance_loss_mlp": 1.02497375, "epoch": 0.445032316248309, "flos": 25594253470080.0, "grad_norm": 1.6805452055908299, "language_loss": 0.83201802, "learning_rate": 2.449271727042973e-06, "loss": 0.85341299, "num_input_tokens_seen": 158694115, "step": 7402, "time_per_iteration": 2.7132928371429443 }, { "auxiliary_loss_clip": 0.01107267, "auxiliary_loss_mlp": 0.01032822, "balance_loss_clip": 1.0461576, "balance_loss_mlp": 1.0188688, "epoch": 0.445092439500977, "flos": 21250426959360.0, "grad_norm": 1.9019306445781163, "language_loss": 0.7714172, "learning_rate": 2.4488922104040947e-06, "loss": 0.79281807, "num_input_tokens_seen": 158711000, "step": 7403, "time_per_iteration": 2.6282217502593994 }, { "auxiliary_loss_clip": 0.01023728, "auxiliary_loss_mlp": 0.01005808, "balance_loss_clip": 1.0202831, "balance_loss_mlp": 1.00413918, "epoch": 0.44515256275364495, "flos": 57764900309760.0, "grad_norm": 0.7456605721636542, "language_loss": 0.59988129, "learning_rate": 2.4485126767431793e-06, "loss": 0.62017667, "num_input_tokens_seen": 158769675, "step": 7404, "time_per_iteration": 3.173560619354248 }, { "auxiliary_loss_clip": 0.01105136, "auxiliary_loss_mlp": 0.01044638, "balance_loss_clip": 1.04419279, "balance_loss_mlp": 1.02934957, "epoch": 0.4452126860063129, "flos": 15596004908160.0, "grad_norm": 1.6768296122026118, "language_loss": 0.82246673, "learning_rate": 2.4481331260746177e-06, "loss": 0.8439644, "num_input_tokens_seen": 158788215, "step": 7405, "time_per_iteration": 2.6669278144836426 }, { "auxiliary_loss_clip": 0.01104648, "auxiliary_loss_mlp": 0.01029929, "balance_loss_clip": 1.04628932, "balance_loss_mlp": 1.01669657, "epoch": 0.4452728092589809, "flos": 21617398258560.0, "grad_norm": 4.56209401129754, "language_loss": 0.75126898, "learning_rate": 2.4477535584128036e-06, "loss": 0.77261472, "num_input_tokens_seen": 158809090, "step": 7406, "time_per_iteration": 2.6722404956817627 }, { "auxiliary_loss_clip": 0.01091029, "auxiliary_loss_mlp": 0.01030298, "balance_loss_clip": 1.0434047, "balance_loss_mlp": 1.01746488, "epoch": 0.4453329325116489, "flos": 29497491757440.0, "grad_norm": 1.6633570284980403, "language_loss": 0.6572476, "learning_rate": 2.447373973772129e-06, "loss": 0.67846084, "num_input_tokens_seen": 158828320, "step": 7407, "time_per_iteration": 2.819289207458496 }, { "auxiliary_loss_clip": 0.01102137, "auxiliary_loss_mlp": 0.01031486, "balance_loss_clip": 1.04499328, "balance_loss_mlp": 1.0179081, "epoch": 0.44539305576431687, "flos": 21361139654400.0, "grad_norm": 1.6186505097592758, "language_loss": 0.67861688, "learning_rate": 2.4469943721669887e-06, "loss": 0.69995308, "num_input_tokens_seen": 158847040, "step": 7408, "time_per_iteration": 2.6846649646759033 }, { "auxiliary_loss_clip": 0.01128678, "auxiliary_loss_mlp": 0.01035504, "balance_loss_clip": 1.04559541, "balance_loss_mlp": 1.02121043, "epoch": 0.44545317901698483, "flos": 41427626428800.0, "grad_norm": 1.4740715510068387, "language_loss": 0.72127414, "learning_rate": 2.4466147536117776e-06, "loss": 0.74291599, "num_input_tokens_seen": 158870490, "step": 7409, "time_per_iteration": 2.7701869010925293 }, { "auxiliary_loss_clip": 0.01107577, "auxiliary_loss_mlp": 0.010375, "balance_loss_clip": 1.04669523, "balance_loss_mlp": 1.02308798, "epoch": 0.4455133022696528, "flos": 22055005653120.0, "grad_norm": 1.9118661854704846, "language_loss": 0.65146017, "learning_rate": 2.4462351181208895e-06, "loss": 0.67291093, "num_input_tokens_seen": 158889920, "step": 7410, "time_per_iteration": 2.780905246734619 }, { "auxiliary_loss_clip": 0.01104956, "auxiliary_loss_mlp": 0.01038448, "balance_loss_clip": 1.04414868, "balance_loss_mlp": 1.02369618, "epoch": 0.44557342552232077, "flos": 23476960333440.0, "grad_norm": 2.076728084707015, "language_loss": 0.73772335, "learning_rate": 2.4458554657087217e-06, "loss": 0.75915742, "num_input_tokens_seen": 158909580, "step": 7411, "time_per_iteration": 2.745547294616699 }, { "auxiliary_loss_clip": 0.01061885, "auxiliary_loss_mlp": 0.01033363, "balance_loss_clip": 1.04457641, "balance_loss_mlp": 1.01967764, "epoch": 0.44563354877498873, "flos": 19134678107520.0, "grad_norm": 1.7330985507109689, "language_loss": 0.79373199, "learning_rate": 2.4454757963896695e-06, "loss": 0.81468445, "num_input_tokens_seen": 158924600, "step": 7412, "time_per_iteration": 2.76361346244812 }, { "auxiliary_loss_clip": 0.01108589, "auxiliary_loss_mlp": 0.01037974, "balance_loss_clip": 1.04357016, "balance_loss_mlp": 1.02453899, "epoch": 0.4456936720276567, "flos": 13621420506240.0, "grad_norm": 1.9356381581130233, "language_loss": 0.80161285, "learning_rate": 2.4450961101781304e-06, "loss": 0.82307845, "num_input_tokens_seen": 158939345, "step": 7413, "time_per_iteration": 2.619915008544922 }, { "auxiliary_loss_clip": 0.01113419, "auxiliary_loss_mlp": 0.0103316, "balance_loss_clip": 1.0433104, "balance_loss_mlp": 1.01962328, "epoch": 0.44575379528032466, "flos": 14713715139840.0, "grad_norm": 1.9889124982728665, "language_loss": 0.76648301, "learning_rate": 2.4447164070885026e-06, "loss": 0.78794879, "num_input_tokens_seen": 158955855, "step": 7414, "time_per_iteration": 2.5959794521331787 }, { "auxiliary_loss_clip": 0.01096052, "auxiliary_loss_mlp": 0.01040946, "balance_loss_clip": 1.0415467, "balance_loss_mlp": 1.02701616, "epoch": 0.4458139185329926, "flos": 24170682677760.0, "grad_norm": 1.6599120729875612, "language_loss": 0.83765483, "learning_rate": 2.4443366871351837e-06, "loss": 0.85902476, "num_input_tokens_seen": 158976315, "step": 7415, "time_per_iteration": 2.785512685775757 }, { "auxiliary_loss_clip": 0.01124247, "auxiliary_loss_mlp": 0.01043831, "balance_loss_clip": 1.04321933, "balance_loss_mlp": 1.03093266, "epoch": 0.4458740417856606, "flos": 21762225895680.0, "grad_norm": 2.1888037109264933, "language_loss": 0.84245199, "learning_rate": 2.4439569503325732e-06, "loss": 0.86413276, "num_input_tokens_seen": 158996725, "step": 7416, "time_per_iteration": 2.60307240486145 }, { "auxiliary_loss_clip": 0.01095417, "auxiliary_loss_mlp": 0.01034003, "balance_loss_clip": 1.04398692, "balance_loss_mlp": 1.01991272, "epoch": 0.44593416503832856, "flos": 21068790860160.0, "grad_norm": 1.494230693182331, "language_loss": 0.81091261, "learning_rate": 2.4435771966950706e-06, "loss": 0.83220685, "num_input_tokens_seen": 159017255, "step": 7417, "time_per_iteration": 2.7423362731933594 }, { "auxiliary_loss_clip": 0.01105133, "auxiliary_loss_mlp": 0.01040379, "balance_loss_clip": 1.04227042, "balance_loss_mlp": 1.02650881, "epoch": 0.4459942882909965, "flos": 22600488568320.0, "grad_norm": 2.47121292521638, "language_loss": 0.81035185, "learning_rate": 2.443197426237077e-06, "loss": 0.8318069, "num_input_tokens_seen": 159035010, "step": 7418, "time_per_iteration": 2.67476487159729 }, { "auxiliary_loss_clip": 0.01120234, "auxiliary_loss_mlp": 0.007712, "balance_loss_clip": 1.04618478, "balance_loss_mlp": 1.00049162, "epoch": 0.4460544115436645, "flos": 26505486622080.0, "grad_norm": 2.084312717643635, "language_loss": 0.77342117, "learning_rate": 2.442817638972991e-06, "loss": 0.79233551, "num_input_tokens_seen": 159055345, "step": 7419, "time_per_iteration": 2.760847806930542 }, { "auxiliary_loss_clip": 0.0108993, "auxiliary_loss_mlp": 0.0103388, "balance_loss_clip": 1.03954124, "balance_loss_mlp": 1.02063632, "epoch": 0.4461145347963325, "flos": 17604021893760.0, "grad_norm": 1.824664612180611, "language_loss": 0.72570968, "learning_rate": 2.4424378349172176e-06, "loss": 0.74694777, "num_input_tokens_seen": 159074225, "step": 7420, "time_per_iteration": 2.6990244388580322 }, { "auxiliary_loss_clip": 0.01104512, "auxiliary_loss_mlp": 0.01032052, "balance_loss_clip": 1.0432508, "balance_loss_mlp": 1.01793802, "epoch": 0.44617465804900047, "flos": 27268193036160.0, "grad_norm": 1.5590654083825235, "language_loss": 0.75280499, "learning_rate": 2.442058014084156e-06, "loss": 0.77417064, "num_input_tokens_seen": 159095415, "step": 7421, "time_per_iteration": 2.751757860183716 }, { "auxiliary_loss_clip": 0.01059239, "auxiliary_loss_mlp": 0.01037453, "balance_loss_clip": 1.03808808, "balance_loss_mlp": 1.02374959, "epoch": 0.44623478130166844, "flos": 17786412178560.0, "grad_norm": 1.7359325284030627, "language_loss": 0.75753498, "learning_rate": 2.44167817648821e-06, "loss": 0.77850193, "num_input_tokens_seen": 159114615, "step": 7422, "time_per_iteration": 4.3189520835876465 }, { "auxiliary_loss_clip": 0.01125756, "auxiliary_loss_mlp": 0.01033879, "balance_loss_clip": 1.04443765, "balance_loss_mlp": 1.02083755, "epoch": 0.4462949045543364, "flos": 23003011353600.0, "grad_norm": 1.436007196155178, "language_loss": 0.65393054, "learning_rate": 2.441298322143784e-06, "loss": 0.67552686, "num_input_tokens_seen": 159134370, "step": 7423, "time_per_iteration": 4.272382020950317 }, { "auxiliary_loss_clip": 0.01096555, "auxiliary_loss_mlp": 0.01034422, "balance_loss_clip": 1.04093194, "balance_loss_mlp": 1.02195287, "epoch": 0.44635502780700437, "flos": 17820096157440.0, "grad_norm": 1.6490570846190094, "language_loss": 0.79002917, "learning_rate": 2.4409184510652807e-06, "loss": 0.8113389, "num_input_tokens_seen": 159152540, "step": 7424, "time_per_iteration": 2.6641786098480225 }, { "auxiliary_loss_clip": 0.01109138, "auxiliary_loss_mlp": 0.01031872, "balance_loss_clip": 1.04272473, "balance_loss_mlp": 1.01960564, "epoch": 0.44641515105967233, "flos": 26688020561280.0, "grad_norm": 1.5476168372337398, "language_loss": 0.80515361, "learning_rate": 2.4405385632671063e-06, "loss": 0.82656378, "num_input_tokens_seen": 159173425, "step": 7425, "time_per_iteration": 2.677921772003174 }, { "auxiliary_loss_clip": 0.01111593, "auxiliary_loss_mlp": 0.01030626, "balance_loss_clip": 1.04249597, "balance_loss_mlp": 1.01805556, "epoch": 0.4464752743123403, "flos": 18913324544640.0, "grad_norm": 1.7505920916906397, "language_loss": 0.77314126, "learning_rate": 2.4401586587636655e-06, "loss": 0.79456341, "num_input_tokens_seen": 159191210, "step": 7426, "time_per_iteration": 4.264745712280273 }, { "auxiliary_loss_clip": 0.01098153, "auxiliary_loss_mlp": 0.00770786, "balance_loss_clip": 1.04180968, "balance_loss_mlp": 1.00042045, "epoch": 0.44653539756500826, "flos": 29570318582400.0, "grad_norm": 2.512425150903693, "language_loss": 0.64678168, "learning_rate": 2.4397787375693634e-06, "loss": 0.66547108, "num_input_tokens_seen": 159211755, "step": 7427, "time_per_iteration": 2.746807336807251 }, { "auxiliary_loss_clip": 0.01114285, "auxiliary_loss_mlp": 0.01032662, "balance_loss_clip": 1.04756093, "balance_loss_mlp": 1.01968026, "epoch": 0.44659552081767623, "flos": 21468979261440.0, "grad_norm": 1.6794687580888963, "language_loss": 0.7564522, "learning_rate": 2.439398799698608e-06, "loss": 0.77792168, "num_input_tokens_seen": 159230315, "step": 7428, "time_per_iteration": 2.675830364227295 }, { "auxiliary_loss_clip": 0.01089417, "auxiliary_loss_mlp": 0.0103803, "balance_loss_clip": 1.03992331, "balance_loss_mlp": 1.0244813, "epoch": 0.4466556440703442, "flos": 17931886260480.0, "grad_norm": 2.160723316992149, "language_loss": 0.77906388, "learning_rate": 2.439018845165806e-06, "loss": 0.80033839, "num_input_tokens_seen": 159249810, "step": 7429, "time_per_iteration": 2.6864819526672363 }, { "auxiliary_loss_clip": 0.01117759, "auxiliary_loss_mlp": 0.01036133, "balance_loss_clip": 1.04573584, "balance_loss_mlp": 1.02222157, "epoch": 0.44671576732301216, "flos": 21107430915840.0, "grad_norm": 1.6783165407459442, "language_loss": 0.91421354, "learning_rate": 2.438638873985366e-06, "loss": 0.93575251, "num_input_tokens_seen": 159271715, "step": 7430, "time_per_iteration": 2.6472880840301514 }, { "auxiliary_loss_clip": 0.01105427, "auxiliary_loss_mlp": 0.00772764, "balance_loss_clip": 1.04418826, "balance_loss_mlp": 1.000386, "epoch": 0.4467758905756801, "flos": 23508920459520.0, "grad_norm": 1.918378394995702, "language_loss": 0.79452366, "learning_rate": 2.4382588861716954e-06, "loss": 0.8133055, "num_input_tokens_seen": 159290690, "step": 7431, "time_per_iteration": 2.7096598148345947 }, { "auxiliary_loss_clip": 0.01108777, "auxiliary_loss_mlp": 0.01036954, "balance_loss_clip": 1.04568875, "balance_loss_mlp": 1.02245188, "epoch": 0.4468360138283481, "flos": 18734022829440.0, "grad_norm": 1.6794320575098944, "language_loss": 0.79817986, "learning_rate": 2.437878881739204e-06, "loss": 0.81963724, "num_input_tokens_seen": 159309400, "step": 7432, "time_per_iteration": 2.676522970199585 }, { "auxiliary_loss_clip": 0.01094927, "auxiliary_loss_mlp": 0.01040483, "balance_loss_clip": 1.04654121, "balance_loss_mlp": 1.02803755, "epoch": 0.4468961370810161, "flos": 23477139901440.0, "grad_norm": 1.8261946877850768, "language_loss": 0.76878047, "learning_rate": 2.437498860702301e-06, "loss": 0.79013455, "num_input_tokens_seen": 159327425, "step": 7433, "time_per_iteration": 2.6820082664489746 }, { "auxiliary_loss_clip": 0.01106089, "auxiliary_loss_mlp": 0.01034932, "balance_loss_clip": 1.04236984, "balance_loss_mlp": 1.02372587, "epoch": 0.4469562603336841, "flos": 30075042539520.0, "grad_norm": 1.6244691365264956, "language_loss": 0.77377415, "learning_rate": 2.437118823075398e-06, "loss": 0.79518431, "num_input_tokens_seen": 159345805, "step": 7434, "time_per_iteration": 2.7471024990081787 }, { "auxiliary_loss_clip": 0.01118898, "auxiliary_loss_mlp": 0.01031979, "balance_loss_clip": 1.04707336, "balance_loss_mlp": 1.01909828, "epoch": 0.44701638358635204, "flos": 22456415116800.0, "grad_norm": 1.6740796261727897, "language_loss": 0.64705265, "learning_rate": 2.436738768872905e-06, "loss": 0.6685614, "num_input_tokens_seen": 159364595, "step": 7435, "time_per_iteration": 2.649425983428955 }, { "auxiliary_loss_clip": 0.01112389, "auxiliary_loss_mlp": 0.01029389, "balance_loss_clip": 1.04875195, "balance_loss_mlp": 1.01587653, "epoch": 0.44707650683902, "flos": 24057851080320.0, "grad_norm": 1.6005542791240868, "language_loss": 0.83477545, "learning_rate": 2.4363586981092346e-06, "loss": 0.85619318, "num_input_tokens_seen": 159385265, "step": 7436, "time_per_iteration": 2.6727020740509033 }, { "auxiliary_loss_clip": 0.01073439, "auxiliary_loss_mlp": 0.01045352, "balance_loss_clip": 1.0402267, "balance_loss_mlp": 1.02884197, "epoch": 0.44713663009168797, "flos": 23766938830080.0, "grad_norm": 2.1717582772549995, "language_loss": 0.79815632, "learning_rate": 2.435978610798798e-06, "loss": 0.81934428, "num_input_tokens_seen": 159405080, "step": 7437, "time_per_iteration": 2.7589898109436035 }, { "auxiliary_loss_clip": 0.01079969, "auxiliary_loss_mlp": 0.01037183, "balance_loss_clip": 1.0433023, "balance_loss_mlp": 1.02375364, "epoch": 0.44719675334435594, "flos": 24499265316480.0, "grad_norm": 1.7231807337022225, "language_loss": 0.71860999, "learning_rate": 2.435598506956009e-06, "loss": 0.7397815, "num_input_tokens_seen": 159424595, "step": 7438, "time_per_iteration": 2.794978380203247 }, { "auxiliary_loss_clip": 0.01084835, "auxiliary_loss_mlp": 0.01035733, "balance_loss_clip": 1.04564655, "balance_loss_mlp": 1.02180314, "epoch": 0.4472568765970239, "flos": 29781759991680.0, "grad_norm": 1.556366888574876, "language_loss": 0.67619812, "learning_rate": 2.4352183865952808e-06, "loss": 0.69740379, "num_input_tokens_seen": 159443865, "step": 7439, "time_per_iteration": 2.9251644611358643 }, { "auxiliary_loss_clip": 0.01102346, "auxiliary_loss_mlp": 0.01039634, "balance_loss_clip": 1.0403614, "balance_loss_mlp": 1.02436376, "epoch": 0.44731699984969187, "flos": 24643123286400.0, "grad_norm": 1.714649831944237, "language_loss": 0.73915118, "learning_rate": 2.4348382497310285e-06, "loss": 0.760571, "num_input_tokens_seen": 159464525, "step": 7440, "time_per_iteration": 2.773106813430786 }, { "auxiliary_loss_clip": 0.01072825, "auxiliary_loss_mlp": 0.01042282, "balance_loss_clip": 1.03706956, "balance_loss_mlp": 1.02789354, "epoch": 0.44737712310235983, "flos": 29455691304960.0, "grad_norm": 1.740924989183362, "language_loss": 0.74161476, "learning_rate": 2.4344580963776655e-06, "loss": 0.76276582, "num_input_tokens_seen": 159486385, "step": 7441, "time_per_iteration": 2.9042701721191406 }, { "auxiliary_loss_clip": 0.01096694, "auxiliary_loss_mlp": 0.01036467, "balance_loss_clip": 1.04596698, "balance_loss_mlp": 1.0220542, "epoch": 0.4474372463550278, "flos": 24896832024960.0, "grad_norm": 1.9641422641471569, "language_loss": 0.75060695, "learning_rate": 2.4340779265496082e-06, "loss": 0.77193856, "num_input_tokens_seen": 159503880, "step": 7442, "time_per_iteration": 2.776219129562378 }, { "auxiliary_loss_clip": 0.01131095, "auxiliary_loss_mlp": 0.01033925, "balance_loss_clip": 1.04641354, "balance_loss_mlp": 1.01900017, "epoch": 0.44749736960769576, "flos": 33181603125120.0, "grad_norm": 1.741320347682455, "language_loss": 0.74572098, "learning_rate": 2.433697740261273e-06, "loss": 0.76737112, "num_input_tokens_seen": 159522980, "step": 7443, "time_per_iteration": 2.783189058303833 }, { "auxiliary_loss_clip": 0.01099877, "auxiliary_loss_mlp": 0.0103204, "balance_loss_clip": 1.03843653, "balance_loss_mlp": 1.01699591, "epoch": 0.4475574928603637, "flos": 21071807602560.0, "grad_norm": 1.581803518054495, "language_loss": 0.77928406, "learning_rate": 2.4333175375270748e-06, "loss": 0.80060327, "num_input_tokens_seen": 159543340, "step": 7444, "time_per_iteration": 2.750493049621582 }, { "auxiliary_loss_clip": 0.01108777, "auxiliary_loss_mlp": 0.01033259, "balance_loss_clip": 1.04501557, "balance_loss_mlp": 1.01988959, "epoch": 0.4476176161130317, "flos": 21862523646720.0, "grad_norm": 2.5006881318170917, "language_loss": 0.85238421, "learning_rate": 2.4329373183614333e-06, "loss": 0.87380457, "num_input_tokens_seen": 159558210, "step": 7445, "time_per_iteration": 2.6802477836608887 }, { "auxiliary_loss_clip": 0.01087309, "auxiliary_loss_mlp": 0.0104165, "balance_loss_clip": 1.04073787, "balance_loss_mlp": 1.02471042, "epoch": 0.4476777393656997, "flos": 22528667324160.0, "grad_norm": 3.110631371373827, "language_loss": 0.63355798, "learning_rate": 2.432557082778765e-06, "loss": 0.65484762, "num_input_tokens_seen": 159577920, "step": 7446, "time_per_iteration": 2.746697187423706 }, { "auxiliary_loss_clip": 0.01039011, "auxiliary_loss_mlp": 0.01002627, "balance_loss_clip": 1.02036047, "balance_loss_mlp": 1.00081527, "epoch": 0.4477378626183677, "flos": 49017133877760.0, "grad_norm": 0.738380684617154, "language_loss": 0.50261772, "learning_rate": 2.4321768307934884e-06, "loss": 0.5230341, "num_input_tokens_seen": 159632295, "step": 7447, "time_per_iteration": 3.0176138877868652 }, { "auxiliary_loss_clip": 0.01047805, "auxiliary_loss_mlp": 0.0099926, "balance_loss_clip": 1.0195471, "balance_loss_mlp": 0.9976145, "epoch": 0.44779798587103564, "flos": 56542179392640.0, "grad_norm": 0.7822716011451579, "language_loss": 0.59427667, "learning_rate": 2.4317965624200235e-06, "loss": 0.61474735, "num_input_tokens_seen": 159698435, "step": 7448, "time_per_iteration": 3.1922085285186768 }, { "auxiliary_loss_clip": 0.01093955, "auxiliary_loss_mlp": 0.01032649, "balance_loss_clip": 1.04417181, "balance_loss_mlp": 1.01983976, "epoch": 0.4478581091237036, "flos": 46498536040320.0, "grad_norm": 1.6983811072489297, "language_loss": 0.58952618, "learning_rate": 2.431416277672789e-06, "loss": 0.61079222, "num_input_tokens_seen": 159722150, "step": 7449, "time_per_iteration": 2.9170258045196533 }, { "auxiliary_loss_clip": 0.01096033, "auxiliary_loss_mlp": 0.01031648, "balance_loss_clip": 1.04244077, "balance_loss_mlp": 1.01851141, "epoch": 0.4479182323763716, "flos": 20814363849600.0, "grad_norm": 2.0305308033418497, "language_loss": 0.8022064, "learning_rate": 2.4310359765662065e-06, "loss": 0.82348317, "num_input_tokens_seen": 159740550, "step": 7450, "time_per_iteration": 2.640101671218872 }, { "auxiliary_loss_clip": 0.01128944, "auxiliary_loss_mlp": 0.0103919, "balance_loss_clip": 1.04747844, "balance_loss_mlp": 1.02609515, "epoch": 0.44797835562903954, "flos": 14245979212800.0, "grad_norm": 2.0706353062233878, "language_loss": 0.79404807, "learning_rate": 2.430655659114697e-06, "loss": 0.81572944, "num_input_tokens_seen": 159758245, "step": 7451, "time_per_iteration": 2.6094324588775635 }, { "auxiliary_loss_clip": 0.01008441, "auxiliary_loss_mlp": 0.01004662, "balance_loss_clip": 1.02162147, "balance_loss_mlp": 1.00313568, "epoch": 0.4480384788817075, "flos": 63534560169600.0, "grad_norm": 0.8263901394620045, "language_loss": 0.62780499, "learning_rate": 2.430275325332681e-06, "loss": 0.64793605, "num_input_tokens_seen": 159826790, "step": 7452, "time_per_iteration": 3.3816721439361572 }, { "auxiliary_loss_clip": 0.01128154, "auxiliary_loss_mlp": 0.01034079, "balance_loss_clip": 1.04587567, "balance_loss_mlp": 1.01958907, "epoch": 0.44809860213437547, "flos": 21652626522240.0, "grad_norm": 1.717773614702603, "language_loss": 0.62656605, "learning_rate": 2.429894975234582e-06, "loss": 0.64818835, "num_input_tokens_seen": 159845805, "step": 7453, "time_per_iteration": 2.6495423316955566 }, { "auxiliary_loss_clip": 0.0102644, "auxiliary_loss_mlp": 0.01007957, "balance_loss_clip": 1.01617622, "balance_loss_mlp": 1.00627661, "epoch": 0.44815872538704343, "flos": 69190634246400.0, "grad_norm": 0.7452851567935764, "language_loss": 0.57032764, "learning_rate": 2.4295146088348224e-06, "loss": 0.59067166, "num_input_tokens_seen": 159898860, "step": 7454, "time_per_iteration": 3.0483179092407227 }, { "auxiliary_loss_clip": 0.0110232, "auxiliary_loss_mlp": 0.0104097, "balance_loss_clip": 1.04301405, "balance_loss_mlp": 1.02651, "epoch": 0.4482188486397114, "flos": 12598289510400.0, "grad_norm": 2.1814246614415795, "language_loss": 0.75516129, "learning_rate": 2.4291342261478255e-06, "loss": 0.77659416, "num_input_tokens_seen": 159911555, "step": 7455, "time_per_iteration": 2.639425039291382 }, { "auxiliary_loss_clip": 0.01103634, "auxiliary_loss_mlp": 0.0103636, "balance_loss_clip": 1.0440948, "balance_loss_mlp": 1.02343822, "epoch": 0.44827897189237936, "flos": 34058182631040.0, "grad_norm": 1.8295063999245702, "language_loss": 0.75630772, "learning_rate": 2.428753827188016e-06, "loss": 0.7777077, "num_input_tokens_seen": 159931470, "step": 7456, "time_per_iteration": 2.809356451034546 }, { "auxiliary_loss_clip": 0.01130195, "auxiliary_loss_mlp": 0.01036439, "balance_loss_clip": 1.05033028, "balance_loss_mlp": 1.02355289, "epoch": 0.44833909514504733, "flos": 25147416280320.0, "grad_norm": 60.5899352460765, "language_loss": 0.76306677, "learning_rate": 2.428373411969818e-06, "loss": 0.78473306, "num_input_tokens_seen": 159946115, "step": 7457, "time_per_iteration": 2.632532835006714 }, { "auxiliary_loss_clip": 0.01111792, "auxiliary_loss_mlp": 0.01031449, "balance_loss_clip": 1.04215193, "balance_loss_mlp": 1.01695263, "epoch": 0.4483992183977153, "flos": 16179984224640.0, "grad_norm": 2.8627685619088203, "language_loss": 0.68479908, "learning_rate": 2.4279929805076576e-06, "loss": 0.70623147, "num_input_tokens_seen": 159963915, "step": 7458, "time_per_iteration": 2.6376359462738037 }, { "auxiliary_loss_clip": 0.01091284, "auxiliary_loss_mlp": 0.01033162, "balance_loss_clip": 1.04267764, "balance_loss_mlp": 1.018332, "epoch": 0.44845934165038326, "flos": 17746048270080.0, "grad_norm": 1.5800915665139277, "language_loss": 0.71851492, "learning_rate": 2.427612532815961e-06, "loss": 0.73975933, "num_input_tokens_seen": 159982140, "step": 7459, "time_per_iteration": 2.713164806365967 }, { "auxiliary_loss_clip": 0.01108578, "auxiliary_loss_mlp": 0.01036526, "balance_loss_clip": 1.04210949, "balance_loss_mlp": 1.02282834, "epoch": 0.4485194649030513, "flos": 21835914647040.0, "grad_norm": 1.672173614468041, "language_loss": 0.70216429, "learning_rate": 2.427232068909154e-06, "loss": 0.72361535, "num_input_tokens_seen": 160002280, "step": 7460, "time_per_iteration": 2.6243271827697754 }, { "auxiliary_loss_clip": 0.01129261, "auxiliary_loss_mlp": 0.01038736, "balance_loss_clip": 1.04698896, "balance_loss_mlp": 1.02463329, "epoch": 0.44857958815571924, "flos": 20084515401600.0, "grad_norm": 1.9532472719910148, "language_loss": 0.77566743, "learning_rate": 2.4268515888016635e-06, "loss": 0.79734743, "num_input_tokens_seen": 160020260, "step": 7461, "time_per_iteration": 4.114460468292236 }, { "auxiliary_loss_clip": 0.01128704, "auxiliary_loss_mlp": 0.01034261, "balance_loss_clip": 1.0455538, "balance_loss_mlp": 1.02091575, "epoch": 0.4486397114083872, "flos": 27053519402880.0, "grad_norm": 1.943200777150693, "language_loss": 0.67738903, "learning_rate": 2.4264710925079184e-06, "loss": 0.69901872, "num_input_tokens_seen": 160040240, "step": 7462, "time_per_iteration": 5.671550035476685 }, { "auxiliary_loss_clip": 0.01046056, "auxiliary_loss_mlp": 0.01002183, "balance_loss_clip": 1.0179913, "balance_loss_mlp": 1.0006094, "epoch": 0.4486998346610552, "flos": 67321195931520.0, "grad_norm": 0.7528637907126196, "language_loss": 0.5449208, "learning_rate": 2.4260905800423462e-06, "loss": 0.5654031, "num_input_tokens_seen": 160093865, "step": 7463, "time_per_iteration": 3.132819890975952 }, { "auxiliary_loss_clip": 0.01117188, "auxiliary_loss_mlp": 0.01031184, "balance_loss_clip": 1.04449058, "balance_loss_mlp": 1.01758814, "epoch": 0.44875995791372314, "flos": 27636816360960.0, "grad_norm": 2.3886431821168954, "language_loss": 0.7580359, "learning_rate": 2.4257100514193775e-06, "loss": 0.77951968, "num_input_tokens_seen": 160113590, "step": 7464, "time_per_iteration": 2.7005674839019775 }, { "auxiliary_loss_clip": 0.01116572, "auxiliary_loss_mlp": 0.01037604, "balance_loss_clip": 1.04709184, "balance_loss_mlp": 1.02484834, "epoch": 0.4488200811663911, "flos": 13005947940480.0, "grad_norm": 1.7787597626645963, "language_loss": 0.74147099, "learning_rate": 2.425329506653441e-06, "loss": 0.76301277, "num_input_tokens_seen": 160131795, "step": 7465, "time_per_iteration": 4.423643112182617 }, { "auxiliary_loss_clip": 0.01110783, "auxiliary_loss_mlp": 0.01040781, "balance_loss_clip": 1.04708648, "balance_loss_mlp": 1.02503395, "epoch": 0.44888020441905907, "flos": 27489977562240.0, "grad_norm": 2.0439366025173347, "language_loss": 0.7991035, "learning_rate": 2.424948945758966e-06, "loss": 0.82061917, "num_input_tokens_seen": 160150635, "step": 7466, "time_per_iteration": 2.7003092765808105 }, { "auxiliary_loss_clip": 0.01110719, "auxiliary_loss_mlp": 0.01035258, "balance_loss_clip": 1.04898739, "balance_loss_mlp": 1.02141774, "epoch": 0.44894032767172704, "flos": 18259678800000.0, "grad_norm": 2.4307522297147357, "language_loss": 0.81000906, "learning_rate": 2.4245683687503844e-06, "loss": 0.83146888, "num_input_tokens_seen": 160168615, "step": 7467, "time_per_iteration": 2.6656453609466553 }, { "auxiliary_loss_clip": 0.01074952, "auxiliary_loss_mlp": 0.01032302, "balance_loss_clip": 1.04580259, "balance_loss_mlp": 1.01924217, "epoch": 0.449000450924395, "flos": 21579835610880.0, "grad_norm": 2.1126461235100726, "language_loss": 0.74707794, "learning_rate": 2.424187775642129e-06, "loss": 0.76815045, "num_input_tokens_seen": 160187295, "step": 7468, "time_per_iteration": 2.7112534046173096 }, { "auxiliary_loss_clip": 0.01097239, "auxiliary_loss_mlp": 0.01031291, "balance_loss_clip": 1.04224133, "balance_loss_mlp": 1.01881611, "epoch": 0.44906057417706297, "flos": 17967904623360.0, "grad_norm": 1.845085412210932, "language_loss": 0.71481991, "learning_rate": 2.4238071664486297e-06, "loss": 0.7361052, "num_input_tokens_seen": 160205115, "step": 7469, "time_per_iteration": 2.680678606033325 }, { "auxiliary_loss_clip": 0.01115577, "auxiliary_loss_mlp": 0.01040939, "balance_loss_clip": 1.04739857, "balance_loss_mlp": 1.02700388, "epoch": 0.44912069742973093, "flos": 20047347803520.0, "grad_norm": 1.9353970520381958, "language_loss": 0.71990728, "learning_rate": 2.4234265411843203e-06, "loss": 0.74147248, "num_input_tokens_seen": 160222580, "step": 7470, "time_per_iteration": 2.6266865730285645 }, { "auxiliary_loss_clip": 0.01085169, "auxiliary_loss_mlp": 0.01037894, "balance_loss_clip": 1.04166925, "balance_loss_mlp": 1.02263546, "epoch": 0.4491808206823989, "flos": 21033526682880.0, "grad_norm": 1.7352200929350259, "language_loss": 0.76839507, "learning_rate": 2.423045899863634e-06, "loss": 0.78962576, "num_input_tokens_seen": 160241520, "step": 7471, "time_per_iteration": 2.692333698272705 }, { "auxiliary_loss_clip": 0.0112922, "auxiliary_loss_mlp": 0.010358, "balance_loss_clip": 1.04736388, "balance_loss_mlp": 1.02259803, "epoch": 0.44924094393506686, "flos": 22967136645120.0, "grad_norm": 1.6949435247941296, "language_loss": 0.70284784, "learning_rate": 2.4226652425010048e-06, "loss": 0.72449803, "num_input_tokens_seen": 160261815, "step": 7472, "time_per_iteration": 2.714059829711914 }, { "auxiliary_loss_clip": 0.01033495, "auxiliary_loss_mlp": 0.01004013, "balance_loss_clip": 1.01477528, "balance_loss_mlp": 1.00226104, "epoch": 0.4493010671877349, "flos": 59233467864960.0, "grad_norm": 0.7390973196636706, "language_loss": 0.6168009, "learning_rate": 2.4222845691108676e-06, "loss": 0.63717604, "num_input_tokens_seen": 160317070, "step": 7473, "time_per_iteration": 3.1489851474761963 }, { "auxiliary_loss_clip": 0.01131224, "auxiliary_loss_mlp": 0.00771593, "balance_loss_clip": 1.04812014, "balance_loss_mlp": 1.0004611, "epoch": 0.44936119044040285, "flos": 18004892653440.0, "grad_norm": 2.3114379148666817, "language_loss": 0.78279471, "learning_rate": 2.421903879707657e-06, "loss": 0.80182284, "num_input_tokens_seen": 160334980, "step": 7474, "time_per_iteration": 2.5561118125915527 }, { "auxiliary_loss_clip": 0.01074804, "auxiliary_loss_mlp": 0.01040047, "balance_loss_clip": 1.03983307, "balance_loss_mlp": 1.0254494, "epoch": 0.4494213136930708, "flos": 21251827589760.0, "grad_norm": 1.6204554836894525, "language_loss": 0.72024751, "learning_rate": 2.4215231743058086e-06, "loss": 0.74139607, "num_input_tokens_seen": 160354500, "step": 7475, "time_per_iteration": 2.7745461463928223 }, { "auxiliary_loss_clip": 0.01080301, "auxiliary_loss_mlp": 0.01041054, "balance_loss_clip": 1.04167461, "balance_loss_mlp": 1.02563405, "epoch": 0.4494814369457388, "flos": 27418695022080.0, "grad_norm": 2.241823557245511, "language_loss": 0.76592773, "learning_rate": 2.4211424529197594e-06, "loss": 0.78714132, "num_input_tokens_seen": 160373650, "step": 7476, "time_per_iteration": 2.7856860160827637 }, { "auxiliary_loss_clip": 0.01122132, "auxiliary_loss_mlp": 0.00773102, "balance_loss_clip": 1.04493368, "balance_loss_mlp": 1.00047529, "epoch": 0.44954156019840674, "flos": 22854053652480.0, "grad_norm": 4.385259299883037, "language_loss": 0.72134888, "learning_rate": 2.4207617155639464e-06, "loss": 0.74030131, "num_input_tokens_seen": 160393430, "step": 7477, "time_per_iteration": 2.641645669937134 }, { "auxiliary_loss_clip": 0.01103781, "auxiliary_loss_mlp": 0.01047956, "balance_loss_clip": 1.04083133, "balance_loss_mlp": 1.03148091, "epoch": 0.4496016834510747, "flos": 17201570935680.0, "grad_norm": 2.795464855062127, "language_loss": 0.67799896, "learning_rate": 2.4203809622528062e-06, "loss": 0.69951636, "num_input_tokens_seen": 160410545, "step": 7478, "time_per_iteration": 2.6307947635650635 }, { "auxiliary_loss_clip": 0.01102543, "auxiliary_loss_mlp": 0.01038923, "balance_loss_clip": 1.04405093, "balance_loss_mlp": 1.02537441, "epoch": 0.4496618067037427, "flos": 18916628595840.0, "grad_norm": 1.8532543047361745, "language_loss": 0.89243561, "learning_rate": 2.420000193000779e-06, "loss": 0.91385025, "num_input_tokens_seen": 160428105, "step": 7479, "time_per_iteration": 2.733828544616699 }, { "auxiliary_loss_clip": 0.01068922, "auxiliary_loss_mlp": 0.01043272, "balance_loss_clip": 1.04273605, "balance_loss_mlp": 1.02804279, "epoch": 0.44972192995641064, "flos": 21031659175680.0, "grad_norm": 2.916606412127397, "language_loss": 0.75539804, "learning_rate": 2.419619407822302e-06, "loss": 0.77652001, "num_input_tokens_seen": 160448815, "step": 7480, "time_per_iteration": 2.8518130779266357 }, { "auxiliary_loss_clip": 0.01095249, "auxiliary_loss_mlp": 0.01035055, "balance_loss_clip": 1.04253781, "balance_loss_mlp": 1.02012968, "epoch": 0.4497820532090786, "flos": 20777088510720.0, "grad_norm": 1.9829776726262367, "language_loss": 0.79885375, "learning_rate": 2.419238606731815e-06, "loss": 0.82015675, "num_input_tokens_seen": 160465940, "step": 7481, "time_per_iteration": 2.7299835681915283 }, { "auxiliary_loss_clip": 0.01102494, "auxiliary_loss_mlp": 0.01039566, "balance_loss_clip": 1.04328001, "balance_loss_mlp": 1.02454567, "epoch": 0.44984217646174657, "flos": 33802606385280.0, "grad_norm": 1.6381608125682177, "language_loss": 0.68340528, "learning_rate": 2.418857789743758e-06, "loss": 0.70482588, "num_input_tokens_seen": 160486710, "step": 7482, "time_per_iteration": 2.8123154640197754 }, { "auxiliary_loss_clip": 0.01122196, "auxiliary_loss_mlp": 0.01040775, "balance_loss_clip": 1.04835725, "balance_loss_mlp": 1.02638626, "epoch": 0.44990229971441453, "flos": 15518365660800.0, "grad_norm": 2.0379383366397232, "language_loss": 0.84707004, "learning_rate": 2.418476956872571e-06, "loss": 0.86869979, "num_input_tokens_seen": 160503405, "step": 7483, "time_per_iteration": 2.718548536300659 }, { "auxiliary_loss_clip": 0.01099077, "auxiliary_loss_mlp": 0.01046214, "balance_loss_clip": 1.04296637, "balance_loss_mlp": 1.03027594, "epoch": 0.4499624229670825, "flos": 29861913191040.0, "grad_norm": 1.8017494037756971, "language_loss": 0.80644262, "learning_rate": 2.4180961081326967e-06, "loss": 0.82789552, "num_input_tokens_seen": 160525080, "step": 7484, "time_per_iteration": 2.8435990810394287 }, { "auxiliary_loss_clip": 0.01075163, "auxiliary_loss_mlp": 0.01037509, "balance_loss_clip": 1.03809166, "balance_loss_mlp": 1.02145171, "epoch": 0.45002254621975046, "flos": 18513674847360.0, "grad_norm": 2.526248303429359, "language_loss": 0.75311351, "learning_rate": 2.4177152435385754e-06, "loss": 0.77424026, "num_input_tokens_seen": 160540895, "step": 7485, "time_per_iteration": 2.7453646659851074 }, { "auxiliary_loss_clip": 0.01027401, "auxiliary_loss_mlp": 0.0100295, "balance_loss_clip": 1.01817155, "balance_loss_mlp": 1.00125754, "epoch": 0.4500826694724185, "flos": 70420394229120.0, "grad_norm": 0.7859680562883086, "language_loss": 0.58644986, "learning_rate": 2.4173343631046504e-06, "loss": 0.60675335, "num_input_tokens_seen": 160598270, "step": 7486, "time_per_iteration": 3.2535924911499023 }, { "auxiliary_loss_clip": 0.0111614, "auxiliary_loss_mlp": 0.01045183, "balance_loss_clip": 1.04657292, "balance_loss_mlp": 1.02917325, "epoch": 0.45014279272508645, "flos": 15778897983360.0, "grad_norm": 2.484631064514228, "language_loss": 0.83677804, "learning_rate": 2.4169534668453654e-06, "loss": 0.85839128, "num_input_tokens_seen": 160614720, "step": 7487, "time_per_iteration": 2.7236413955688477 }, { "auxiliary_loss_clip": 0.01128709, "auxiliary_loss_mlp": 0.01039106, "balance_loss_clip": 1.04632056, "balance_loss_mlp": 1.02443182, "epoch": 0.4502029159777544, "flos": 21799573061760.0, "grad_norm": 1.5508029399024128, "language_loss": 0.77568138, "learning_rate": 2.4165725547751622e-06, "loss": 0.79735959, "num_input_tokens_seen": 160635170, "step": 7488, "time_per_iteration": 2.6660585403442383 }, { "auxiliary_loss_clip": 0.0112874, "auxiliary_loss_mlp": 0.01045145, "balance_loss_clip": 1.04882014, "balance_loss_mlp": 1.02954042, "epoch": 0.4502630392304224, "flos": 28767966531840.0, "grad_norm": 1.97851616048007, "language_loss": 0.72073781, "learning_rate": 2.4161916269084858e-06, "loss": 0.74247664, "num_input_tokens_seen": 160654490, "step": 7489, "time_per_iteration": 2.7274820804595947 }, { "auxiliary_loss_clip": 0.01109274, "auxiliary_loss_mlp": 0.01039798, "balance_loss_clip": 1.04584038, "balance_loss_mlp": 1.02314413, "epoch": 0.45032316248309034, "flos": 15844182952320.0, "grad_norm": 2.9737823054207926, "language_loss": 0.6968661, "learning_rate": 2.4158106832597817e-06, "loss": 0.71835679, "num_input_tokens_seen": 160669400, "step": 7490, "time_per_iteration": 2.650700569152832 }, { "auxiliary_loss_clip": 0.01026171, "auxiliary_loss_mlp": 0.01004705, "balance_loss_clip": 1.0231657, "balance_loss_mlp": 1.00323248, "epoch": 0.4503832857357583, "flos": 57853600945920.0, "grad_norm": 0.7292674820176653, "language_loss": 0.56675166, "learning_rate": 2.415429723843495e-06, "loss": 0.58706039, "num_input_tokens_seen": 160733820, "step": 7491, "time_per_iteration": 3.1893656253814697 }, { "auxiliary_loss_clip": 0.01116518, "auxiliary_loss_mlp": 0.01037403, "balance_loss_clip": 1.04746497, "balance_loss_mlp": 1.02327061, "epoch": 0.4504434089884263, "flos": 23878082488320.0, "grad_norm": 1.6154687272881363, "language_loss": 0.7939685, "learning_rate": 2.4150487486740713e-06, "loss": 0.81550771, "num_input_tokens_seen": 160753175, "step": 7492, "time_per_iteration": 2.7314138412475586 }, { "auxiliary_loss_clip": 0.010986, "auxiliary_loss_mlp": 0.00775969, "balance_loss_clip": 1.04425228, "balance_loss_mlp": 1.000494, "epoch": 0.45050353224109424, "flos": 17785083375360.0, "grad_norm": 2.875303360797025, "language_loss": 0.92825645, "learning_rate": 2.4146677577659573e-06, "loss": 0.94700211, "num_input_tokens_seen": 160768310, "step": 7493, "time_per_iteration": 2.7123935222625732 }, { "auxiliary_loss_clip": 0.01039208, "auxiliary_loss_mlp": 0.01001589, "balance_loss_clip": 1.02041435, "balance_loss_mlp": 0.99994355, "epoch": 0.4505636554937622, "flos": 65063420703360.0, "grad_norm": 0.8110713299155351, "language_loss": 0.62929082, "learning_rate": 2.4142867511336e-06, "loss": 0.64969873, "num_input_tokens_seen": 160827370, "step": 7494, "time_per_iteration": 3.289635181427002 }, { "auxiliary_loss_clip": 0.01129658, "auxiliary_loss_mlp": 0.01035034, "balance_loss_clip": 1.04754305, "balance_loss_mlp": 1.02150989, "epoch": 0.45062377874643017, "flos": 22200084685440.0, "grad_norm": 1.7474777674384385, "language_loss": 0.82263976, "learning_rate": 2.4139057287914484e-06, "loss": 0.84428668, "num_input_tokens_seen": 160849140, "step": 7495, "time_per_iteration": 2.659642219543457 }, { "auxiliary_loss_clip": 0.01115544, "auxiliary_loss_mlp": 0.01041634, "balance_loss_clip": 1.04483461, "balance_loss_mlp": 1.02449155, "epoch": 0.45068390199909814, "flos": 37670293186560.0, "grad_norm": 1.8332713503860085, "language_loss": 0.86039978, "learning_rate": 2.41352469075395e-06, "loss": 0.8819716, "num_input_tokens_seen": 160871280, "step": 7496, "time_per_iteration": 2.798741579055786 }, { "auxiliary_loss_clip": 0.01134499, "auxiliary_loss_mlp": 0.01035754, "balance_loss_clip": 1.04969478, "balance_loss_mlp": 1.02054274, "epoch": 0.4507440252517661, "flos": 22302501338880.0, "grad_norm": 2.0558646291387066, "language_loss": 0.76101983, "learning_rate": 2.4131436370355534e-06, "loss": 0.78272235, "num_input_tokens_seen": 160888625, "step": 7497, "time_per_iteration": 2.6553680896759033 }, { "auxiliary_loss_clip": 0.01098074, "auxiliary_loss_mlp": 0.01037956, "balance_loss_clip": 1.04377723, "balance_loss_mlp": 1.02352023, "epoch": 0.45080414850443407, "flos": 13188374138880.0, "grad_norm": 2.277785969464064, "language_loss": 0.75305939, "learning_rate": 2.4127625676507088e-06, "loss": 0.77441967, "num_input_tokens_seen": 160907040, "step": 7498, "time_per_iteration": 2.6950063705444336 }, { "auxiliary_loss_clip": 0.01133264, "auxiliary_loss_mlp": 0.01044893, "balance_loss_clip": 1.04848719, "balance_loss_mlp": 1.02897298, "epoch": 0.4508642717571021, "flos": 21944939402880.0, "grad_norm": 3.3346599205762826, "language_loss": 0.70080638, "learning_rate": 2.4123814826138663e-06, "loss": 0.72258794, "num_input_tokens_seen": 160927115, "step": 7499, "time_per_iteration": 2.6134774684906006 }, { "auxiliary_loss_clip": 0.01084574, "auxiliary_loss_mlp": 0.0103806, "balance_loss_clip": 1.04212165, "balance_loss_mlp": 1.02309906, "epoch": 0.45092439500977005, "flos": 23367468700800.0, "grad_norm": 1.9346658302408082, "language_loss": 0.77361268, "learning_rate": 2.412000381939477e-06, "loss": 0.79483902, "num_input_tokens_seen": 160944405, "step": 7500, "time_per_iteration": 4.306777000427246 }, { "auxiliary_loss_clip": 0.01084228, "auxiliary_loss_mlp": 0.01034656, "balance_loss_clip": 1.04249573, "balance_loss_mlp": 1.02007651, "epoch": 0.450984518262438, "flos": 20772958446720.0, "grad_norm": 1.9176241989159464, "language_loss": 0.63056326, "learning_rate": 2.411619265641992e-06, "loss": 0.65175211, "num_input_tokens_seen": 160961345, "step": 7501, "time_per_iteration": 5.803133487701416 }, { "auxiliary_loss_clip": 0.01135547, "auxiliary_loss_mlp": 0.01040046, "balance_loss_clip": 1.04915273, "balance_loss_mlp": 1.02445376, "epoch": 0.451044641515106, "flos": 17707372300800.0, "grad_norm": 1.9532762899000093, "language_loss": 0.84446234, "learning_rate": 2.411238133735863e-06, "loss": 0.86621827, "num_input_tokens_seen": 160977330, "step": 7502, "time_per_iteration": 2.604753017425537 }, { "auxiliary_loss_clip": 0.01105383, "auxiliary_loss_mlp": 0.01036548, "balance_loss_clip": 1.04670203, "balance_loss_mlp": 1.02238584, "epoch": 0.45110476476777395, "flos": 20594698225920.0, "grad_norm": 1.3813112457968315, "language_loss": 0.79642487, "learning_rate": 2.4108569862355418e-06, "loss": 0.81784415, "num_input_tokens_seen": 160997280, "step": 7503, "time_per_iteration": 2.666677236557007 }, { "auxiliary_loss_clip": 0.01104325, "auxiliary_loss_mlp": 0.01036781, "balance_loss_clip": 1.04764807, "balance_loss_mlp": 1.02240419, "epoch": 0.4511648880204419, "flos": 16034043265920.0, "grad_norm": 2.051596804130354, "language_loss": 0.81191939, "learning_rate": 2.410475823155484e-06, "loss": 0.83333045, "num_input_tokens_seen": 161014235, "step": 7504, "time_per_iteration": 4.276456117630005 }, { "auxiliary_loss_clip": 0.01087433, "auxiliary_loss_mlp": 0.01038305, "balance_loss_clip": 1.04069161, "balance_loss_mlp": 1.02469158, "epoch": 0.4512250112731099, "flos": 23978811202560.0, "grad_norm": 1.5834485358881918, "language_loss": 0.63315797, "learning_rate": 2.4100946445101405e-06, "loss": 0.65441537, "num_input_tokens_seen": 161032360, "step": 7505, "time_per_iteration": 2.947556734085083 }, { "auxiliary_loss_clip": 0.01014942, "auxiliary_loss_mlp": 0.01003244, "balance_loss_clip": 1.02198029, "balance_loss_mlp": 1.00188541, "epoch": 0.45128513452577784, "flos": 71462308037760.0, "grad_norm": 0.8317919198459461, "language_loss": 0.58857071, "learning_rate": 2.409713450313968e-06, "loss": 0.60875255, "num_input_tokens_seen": 161091360, "step": 7506, "time_per_iteration": 3.395158052444458 }, { "auxiliary_loss_clip": 0.01075605, "auxiliary_loss_mlp": 0.01037451, "balance_loss_clip": 1.04096067, "balance_loss_mlp": 1.02287173, "epoch": 0.4513452577784458, "flos": 22090844448000.0, "grad_norm": 1.7149339287343461, "language_loss": 0.79334831, "learning_rate": 2.40933224058142e-06, "loss": 0.81447887, "num_input_tokens_seen": 161110825, "step": 7507, "time_per_iteration": 2.8281381130218506 }, { "auxiliary_loss_clip": 0.01091142, "auxiliary_loss_mlp": 0.01036706, "balance_loss_clip": 1.0425905, "balance_loss_mlp": 1.02066064, "epoch": 0.4514053810311138, "flos": 24276403382400.0, "grad_norm": 1.5823194059388275, "language_loss": 0.73703611, "learning_rate": 2.4089510153269526e-06, "loss": 0.75831455, "num_input_tokens_seen": 161130685, "step": 7508, "time_per_iteration": 2.75742506980896 }, { "auxiliary_loss_clip": 0.01118642, "auxiliary_loss_mlp": 0.0103619, "balance_loss_clip": 1.04927611, "balance_loss_mlp": 1.02279091, "epoch": 0.45146550428378174, "flos": 17886781756800.0, "grad_norm": 2.075832981658432, "language_loss": 0.79118419, "learning_rate": 2.4085697745650217e-06, "loss": 0.81273252, "num_input_tokens_seen": 161147555, "step": 7509, "time_per_iteration": 2.6641790866851807 }, { "auxiliary_loss_clip": 0.01130929, "auxiliary_loss_mlp": 0.01034567, "balance_loss_clip": 1.05022097, "balance_loss_mlp": 1.02104306, "epoch": 0.4515256275364497, "flos": 24243437675520.0, "grad_norm": 1.9616298828862797, "language_loss": 0.73389792, "learning_rate": 2.4081885183100837e-06, "loss": 0.75555289, "num_input_tokens_seen": 161166255, "step": 7510, "time_per_iteration": 2.754516839981079 }, { "auxiliary_loss_clip": 0.01129503, "auxiliary_loss_mlp": 0.01032701, "balance_loss_clip": 1.04575419, "balance_loss_mlp": 1.01789534, "epoch": 0.45158575078911767, "flos": 20631039811200.0, "grad_norm": 1.8899584921112549, "language_loss": 0.77046561, "learning_rate": 2.4078072465765964e-06, "loss": 0.79208767, "num_input_tokens_seen": 161184720, "step": 7511, "time_per_iteration": 2.633896589279175 }, { "auxiliary_loss_clip": 0.01119455, "auxiliary_loss_mlp": 0.01033368, "balance_loss_clip": 1.04665303, "balance_loss_mlp": 1.01832986, "epoch": 0.45164587404178563, "flos": 23327751237120.0, "grad_norm": 1.8239087865443961, "language_loss": 0.78791374, "learning_rate": 2.4074259593790174e-06, "loss": 0.80944192, "num_input_tokens_seen": 161204360, "step": 7512, "time_per_iteration": 2.701643466949463 }, { "auxiliary_loss_clip": 0.01094327, "auxiliary_loss_mlp": 0.01039327, "balance_loss_clip": 1.04103267, "balance_loss_mlp": 1.02404392, "epoch": 0.45170599729445365, "flos": 23805973935360.0, "grad_norm": 2.0955290596831713, "language_loss": 0.87512183, "learning_rate": 2.4070446567318053e-06, "loss": 0.89645839, "num_input_tokens_seen": 161223575, "step": 7513, "time_per_iteration": 2.716236114501953 }, { "auxiliary_loss_clip": 0.01110578, "auxiliary_loss_mlp": 0.0103311, "balance_loss_clip": 1.0445292, "balance_loss_mlp": 1.02031827, "epoch": 0.4517661205471216, "flos": 23512942782720.0, "grad_norm": 2.109318524386585, "language_loss": 0.6707387, "learning_rate": 2.406663338649419e-06, "loss": 0.69217563, "num_input_tokens_seen": 161243805, "step": 7514, "time_per_iteration": 2.665377140045166 }, { "auxiliary_loss_clip": 0.01113013, "auxiliary_loss_mlp": 0.0103579, "balance_loss_clip": 1.04554498, "balance_loss_mlp": 1.01995873, "epoch": 0.4518262437997896, "flos": 23513948363520.0, "grad_norm": 2.2260653694398242, "language_loss": 0.69152886, "learning_rate": 2.406282005146318e-06, "loss": 0.71301687, "num_input_tokens_seen": 161261450, "step": 7515, "time_per_iteration": 2.6233787536621094 }, { "auxiliary_loss_clip": 0.01114597, "auxiliary_loss_mlp": 0.01038013, "balance_loss_clip": 1.04228842, "balance_loss_mlp": 1.02269435, "epoch": 0.45188636705245755, "flos": 14568061489920.0, "grad_norm": 6.104635540487547, "language_loss": 0.82568568, "learning_rate": 2.405900656236963e-06, "loss": 0.84721178, "num_input_tokens_seen": 161276965, "step": 7516, "time_per_iteration": 2.7125158309936523 }, { "auxiliary_loss_clip": 0.0112394, "auxiliary_loss_mlp": 0.0103396, "balance_loss_clip": 1.04487455, "balance_loss_mlp": 1.02003694, "epoch": 0.4519464903051255, "flos": 19901550499200.0, "grad_norm": 1.657947130481532, "language_loss": 0.65597039, "learning_rate": 2.4055192919358137e-06, "loss": 0.67754936, "num_input_tokens_seen": 161295375, "step": 7517, "time_per_iteration": 2.6732585430145264 }, { "auxiliary_loss_clip": 0.01091101, "auxiliary_loss_mlp": 0.01032878, "balance_loss_clip": 1.04268789, "balance_loss_mlp": 1.02015853, "epoch": 0.4520066135577935, "flos": 18844376388480.0, "grad_norm": 2.0502430920821904, "language_loss": 0.63127112, "learning_rate": 2.405137912257333e-06, "loss": 0.65251088, "num_input_tokens_seen": 161313010, "step": 7518, "time_per_iteration": 2.6873538494110107 }, { "auxiliary_loss_clip": 0.01116444, "auxiliary_loss_mlp": 0.01033811, "balance_loss_clip": 1.0465678, "balance_loss_mlp": 1.02015519, "epoch": 0.45206673681046144, "flos": 48214419713280.0, "grad_norm": 1.68859992173611, "language_loss": 0.59658802, "learning_rate": 2.404756517215982e-06, "loss": 0.61809057, "num_input_tokens_seen": 161336690, "step": 7519, "time_per_iteration": 2.8561198711395264 }, { "auxiliary_loss_clip": 0.01116298, "auxiliary_loss_mlp": 0.01038351, "balance_loss_clip": 1.0457139, "balance_loss_mlp": 1.02468395, "epoch": 0.4521268600631294, "flos": 23842171866240.0, "grad_norm": 1.5141513880128057, "language_loss": 0.72439361, "learning_rate": 2.404375106826223e-06, "loss": 0.74594009, "num_input_tokens_seen": 161357845, "step": 7520, "time_per_iteration": 2.709179162979126 }, { "auxiliary_loss_clip": 0.0110396, "auxiliary_loss_mlp": 0.01036085, "balance_loss_clip": 1.04404962, "balance_loss_mlp": 1.02297747, "epoch": 0.4521869833157974, "flos": 18843622202880.0, "grad_norm": 2.131399149186965, "language_loss": 0.75379634, "learning_rate": 2.4039936811025194e-06, "loss": 0.77519679, "num_input_tokens_seen": 161375160, "step": 7521, "time_per_iteration": 2.78236722946167 }, { "auxiliary_loss_clip": 0.01109339, "auxiliary_loss_mlp": 0.01039668, "balance_loss_clip": 1.04502964, "balance_loss_mlp": 1.02507663, "epoch": 0.45224710656846534, "flos": 19788072456960.0, "grad_norm": 2.2802922264962247, "language_loss": 0.68217206, "learning_rate": 2.4036122400593343e-06, "loss": 0.70366216, "num_input_tokens_seen": 161393690, "step": 7522, "time_per_iteration": 2.698141574859619 }, { "auxiliary_loss_clip": 0.01111702, "auxiliary_loss_mlp": 0.01036701, "balance_loss_clip": 1.04239058, "balance_loss_mlp": 1.02306962, "epoch": 0.4523072298211333, "flos": 28256131681920.0, "grad_norm": 1.6149288487041198, "language_loss": 0.6114409, "learning_rate": 2.403230783711134e-06, "loss": 0.63292497, "num_input_tokens_seen": 161415015, "step": 7523, "time_per_iteration": 2.765838623046875 }, { "auxiliary_loss_clip": 0.01122412, "auxiliary_loss_mlp": 0.01039402, "balance_loss_clip": 1.04672575, "balance_loss_mlp": 1.02425027, "epoch": 0.45236735307380127, "flos": 11181039511680.0, "grad_norm": 2.0249866031396837, "language_loss": 0.78044772, "learning_rate": 2.4028493120723813e-06, "loss": 0.80206585, "num_input_tokens_seen": 161432940, "step": 7524, "time_per_iteration": 2.6178715229034424 }, { "auxiliary_loss_clip": 0.01083067, "auxiliary_loss_mlp": 0.0103962, "balance_loss_clip": 1.04386139, "balance_loss_mlp": 1.02560115, "epoch": 0.45242747632646924, "flos": 22601386408320.0, "grad_norm": 2.4629173570449447, "language_loss": 0.63756073, "learning_rate": 2.4024678251575417e-06, "loss": 0.65878761, "num_input_tokens_seen": 161452215, "step": 7525, "time_per_iteration": 2.767791509628296 }, { "auxiliary_loss_clip": 0.01116902, "auxiliary_loss_mlp": 0.01037108, "balance_loss_clip": 1.04607654, "balance_loss_mlp": 1.02390599, "epoch": 0.45248759957913726, "flos": 18256267008000.0, "grad_norm": 1.8561008840058875, "language_loss": 0.78973663, "learning_rate": 2.402086322981083e-06, "loss": 0.81127673, "num_input_tokens_seen": 161469520, "step": 7526, "time_per_iteration": 2.6315999031066895 }, { "auxiliary_loss_clip": 0.01098614, "auxiliary_loss_mlp": 0.01030271, "balance_loss_clip": 1.04242575, "balance_loss_mlp": 1.01696694, "epoch": 0.4525477228318052, "flos": 22450094323200.0, "grad_norm": 1.8159616365895555, "language_loss": 0.80961096, "learning_rate": 2.40170480555747e-06, "loss": 0.83089983, "num_input_tokens_seen": 161487335, "step": 7527, "time_per_iteration": 2.6868715286254883 }, { "auxiliary_loss_clip": 0.01092415, "auxiliary_loss_mlp": 0.01031467, "balance_loss_clip": 1.04517341, "balance_loss_mlp": 1.01763892, "epoch": 0.4526078460844732, "flos": 29644869260160.0, "grad_norm": 11.448753069744305, "language_loss": 0.6562798, "learning_rate": 2.4013232729011706e-06, "loss": 0.67751861, "num_input_tokens_seen": 161510095, "step": 7528, "time_per_iteration": 2.816391944885254 }, { "auxiliary_loss_clip": 0.01100127, "auxiliary_loss_mlp": 0.01033759, "balance_loss_clip": 1.04077947, "balance_loss_mlp": 1.02030635, "epoch": 0.45266796933714115, "flos": 23039747988480.0, "grad_norm": 1.584867366654962, "language_loss": 0.75341809, "learning_rate": 2.4009417250266525e-06, "loss": 0.77475703, "num_input_tokens_seen": 161528725, "step": 7529, "time_per_iteration": 2.688854694366455 }, { "auxiliary_loss_clip": 0.01127981, "auxiliary_loss_mlp": 0.0103405, "balance_loss_clip": 1.04677176, "balance_loss_mlp": 1.02092457, "epoch": 0.4527280925898091, "flos": 14428405411200.0, "grad_norm": 2.148118662824089, "language_loss": 0.73154545, "learning_rate": 2.400560161948384e-06, "loss": 0.75316578, "num_input_tokens_seen": 161547195, "step": 7530, "time_per_iteration": 2.626149892807007 }, { "auxiliary_loss_clip": 0.01097205, "auxiliary_loss_mlp": 0.01036532, "balance_loss_clip": 1.04691768, "balance_loss_mlp": 1.0233357, "epoch": 0.4527882158424771, "flos": 22925515760640.0, "grad_norm": 1.600682021317837, "language_loss": 0.75962186, "learning_rate": 2.400178583680834e-06, "loss": 0.78095925, "num_input_tokens_seen": 161565565, "step": 7531, "time_per_iteration": 2.7901298999786377 }, { "auxiliary_loss_clip": 0.01122835, "auxiliary_loss_mlp": 0.01036019, "balance_loss_clip": 1.04418015, "balance_loss_mlp": 1.02203524, "epoch": 0.45284833909514505, "flos": 25555326105600.0, "grad_norm": 1.5467116056600763, "language_loss": 0.66987002, "learning_rate": 2.3997969902384717e-06, "loss": 0.69145852, "num_input_tokens_seen": 161586630, "step": 7532, "time_per_iteration": 2.693523645401001 }, { "auxiliary_loss_clip": 0.01115241, "auxiliary_loss_mlp": 0.0104024, "balance_loss_clip": 1.04580188, "balance_loss_mlp": 1.02715659, "epoch": 0.452908462347813, "flos": 18150007599360.0, "grad_norm": 3.168484665922808, "language_loss": 0.78721988, "learning_rate": 2.399415381635768e-06, "loss": 0.80877471, "num_input_tokens_seen": 161603815, "step": 7533, "time_per_iteration": 2.6418774127960205 }, { "auxiliary_loss_clip": 0.01101942, "auxiliary_loss_mlp": 0.01039812, "balance_loss_clip": 1.04315686, "balance_loss_mlp": 1.0244813, "epoch": 0.452968585600481, "flos": 19062749122560.0, "grad_norm": 2.220433880382594, "language_loss": 0.83064616, "learning_rate": 2.3990337578871927e-06, "loss": 0.85206366, "num_input_tokens_seen": 161622900, "step": 7534, "time_per_iteration": 2.751016855239868 }, { "auxiliary_loss_clip": 0.01102917, "auxiliary_loss_mlp": 0.0103851, "balance_loss_clip": 1.04744101, "balance_loss_mlp": 1.02389479, "epoch": 0.45302870885314894, "flos": 22051737515520.0, "grad_norm": 1.8531826529396993, "language_loss": 0.76665461, "learning_rate": 2.3986521190072176e-06, "loss": 0.78806889, "num_input_tokens_seen": 161641700, "step": 7535, "time_per_iteration": 2.6611855030059814 }, { "auxiliary_loss_clip": 0.01083875, "auxiliary_loss_mlp": 0.01036335, "balance_loss_clip": 1.04374576, "balance_loss_mlp": 1.02368724, "epoch": 0.4530888321058169, "flos": 20376217751040.0, "grad_norm": 1.5302063461742579, "language_loss": 0.80437911, "learning_rate": 2.3982704650103138e-06, "loss": 0.82558113, "num_input_tokens_seen": 161661955, "step": 7536, "time_per_iteration": 2.7666051387786865 }, { "auxiliary_loss_clip": 0.01097222, "auxiliary_loss_mlp": 0.01036263, "balance_loss_clip": 1.04180908, "balance_loss_mlp": 1.02248287, "epoch": 0.4531489553584849, "flos": 14830425406080.0, "grad_norm": 2.016168707097938, "language_loss": 0.76173598, "learning_rate": 2.3978887959109544e-06, "loss": 0.78307086, "num_input_tokens_seen": 161679245, "step": 7537, "time_per_iteration": 2.690034866333008 }, { "auxiliary_loss_clip": 0.01118629, "auxiliary_loss_mlp": 0.01035481, "balance_loss_clip": 1.04544806, "balance_loss_mlp": 1.0222249, "epoch": 0.45320907861115284, "flos": 21944975316480.0, "grad_norm": 1.9502516921913984, "language_loss": 0.75985712, "learning_rate": 2.3975071117236118e-06, "loss": 0.78139818, "num_input_tokens_seen": 161698795, "step": 7538, "time_per_iteration": 2.692582130432129 }, { "auxiliary_loss_clip": 0.01037446, "auxiliary_loss_mlp": 0.01009452, "balance_loss_clip": 1.01847482, "balance_loss_mlp": 1.00774765, "epoch": 0.45326920186382086, "flos": 66251455038720.0, "grad_norm": 0.7823640203744525, "language_loss": 0.62291718, "learning_rate": 2.3971254124627593e-06, "loss": 0.64338624, "num_input_tokens_seen": 161761980, "step": 7539, "time_per_iteration": 6.417045593261719 }, { "auxiliary_loss_clip": 0.01129753, "auxiliary_loss_mlp": 0.01046019, "balance_loss_clip": 1.04852843, "balance_loss_mlp": 1.03270316, "epoch": 0.4533293251164888, "flos": 14684233052160.0, "grad_norm": 1.7334435648675772, "language_loss": 0.65637821, "learning_rate": 2.396743698142872e-06, "loss": 0.67813587, "num_input_tokens_seen": 161779455, "step": 7540, "time_per_iteration": 2.7546002864837646 }, { "auxiliary_loss_clip": 0.01106819, "auxiliary_loss_mlp": 0.01043222, "balance_loss_clip": 1.0439229, "balance_loss_mlp": 1.02768898, "epoch": 0.4533894483691568, "flos": 22601206840320.0, "grad_norm": 2.0843332238803587, "language_loss": 0.84594655, "learning_rate": 2.396361968778424e-06, "loss": 0.86744702, "num_input_tokens_seen": 161798980, "step": 7541, "time_per_iteration": 4.3779473304748535 }, { "auxiliary_loss_clip": 0.01103981, "auxiliary_loss_mlp": 0.01038274, "balance_loss_clip": 1.04346132, "balance_loss_mlp": 1.02451134, "epoch": 0.45344957162182475, "flos": 34751617666560.0, "grad_norm": 1.786741767322354, "language_loss": 0.76398253, "learning_rate": 2.395980224383889e-06, "loss": 0.78540504, "num_input_tokens_seen": 161819745, "step": 7542, "time_per_iteration": 2.8061442375183105 }, { "auxiliary_loss_clip": 0.01100521, "auxiliary_loss_mlp": 0.01030908, "balance_loss_clip": 1.04320002, "balance_loss_mlp": 1.01665092, "epoch": 0.4535096948744927, "flos": 23550218121600.0, "grad_norm": 4.384838077420028, "language_loss": 0.80294377, "learning_rate": 2.395598464973746e-06, "loss": 0.82425809, "num_input_tokens_seen": 161838575, "step": 7543, "time_per_iteration": 4.4142186641693115 }, { "auxiliary_loss_clip": 0.01116855, "auxiliary_loss_mlp": 0.00771625, "balance_loss_clip": 1.04452896, "balance_loss_mlp": 1.00043499, "epoch": 0.4535698181271607, "flos": 25557552748800.0, "grad_norm": 1.7946145717938884, "language_loss": 0.75708425, "learning_rate": 2.395216690562469e-06, "loss": 0.77596909, "num_input_tokens_seen": 161858590, "step": 7544, "time_per_iteration": 2.706681966781616 }, { "auxiliary_loss_clip": 0.01097765, "auxiliary_loss_mlp": 0.01037632, "balance_loss_clip": 1.04519629, "balance_loss_mlp": 1.02378595, "epoch": 0.45362994137982865, "flos": 24864117713280.0, "grad_norm": 1.7108154873098056, "language_loss": 0.75483274, "learning_rate": 2.3948349011645355e-06, "loss": 0.7761867, "num_input_tokens_seen": 161878390, "step": 7545, "time_per_iteration": 2.741312026977539 }, { "auxiliary_loss_clip": 0.01106771, "auxiliary_loss_mlp": 0.0103517, "balance_loss_clip": 1.04418731, "balance_loss_mlp": 1.02098417, "epoch": 0.4536900646324966, "flos": 30806794408320.0, "grad_norm": 2.2011621045210057, "language_loss": 0.72520149, "learning_rate": 2.394453096794423e-06, "loss": 0.74662089, "num_input_tokens_seen": 161898610, "step": 7546, "time_per_iteration": 2.7891902923583984 }, { "auxiliary_loss_clip": 0.01108307, "auxiliary_loss_mlp": 0.01035115, "balance_loss_clip": 1.04388261, "balance_loss_mlp": 1.02008224, "epoch": 0.4537501878851646, "flos": 23404313076480.0, "grad_norm": 1.593135285125141, "language_loss": 0.75609434, "learning_rate": 2.394071277466609e-06, "loss": 0.77752858, "num_input_tokens_seen": 161918210, "step": 7547, "time_per_iteration": 2.7260210514068604 }, { "auxiliary_loss_clip": 0.01120791, "auxiliary_loss_mlp": 0.01033715, "balance_loss_clip": 1.04588616, "balance_loss_mlp": 1.01945722, "epoch": 0.45381031113783254, "flos": 18149289327360.0, "grad_norm": 2.150959748604014, "language_loss": 0.70081824, "learning_rate": 2.393689443195573e-06, "loss": 0.72236335, "num_input_tokens_seen": 161936950, "step": 7548, "time_per_iteration": 2.652388095855713 }, { "auxiliary_loss_clip": 0.01129285, "auxiliary_loss_mlp": 0.01039378, "balance_loss_clip": 1.04662538, "balance_loss_mlp": 1.0256331, "epoch": 0.4538704343905005, "flos": 25336666062720.0, "grad_norm": 2.8840782688813293, "language_loss": 0.73135072, "learning_rate": 2.393307593995794e-06, "loss": 0.75303733, "num_input_tokens_seen": 161955550, "step": 7549, "time_per_iteration": 2.8452274799346924 }, { "auxiliary_loss_clip": 0.01091023, "auxiliary_loss_mlp": 0.01028579, "balance_loss_clip": 1.040573, "balance_loss_mlp": 1.01576996, "epoch": 0.4539305576431685, "flos": 28731445378560.0, "grad_norm": 1.9190169905093657, "language_loss": 0.65320408, "learning_rate": 2.392925729881751e-06, "loss": 0.67440009, "num_input_tokens_seen": 161976760, "step": 7550, "time_per_iteration": 2.783653497695923 }, { "auxiliary_loss_clip": 0.01113741, "auxiliary_loss_mlp": 0.01035092, "balance_loss_clip": 1.05046797, "balance_loss_mlp": 1.02172232, "epoch": 0.45399068089583644, "flos": 22492397566080.0, "grad_norm": 1.6128261499338563, "language_loss": 0.69028163, "learning_rate": 2.3925438508679263e-06, "loss": 0.71176994, "num_input_tokens_seen": 161996120, "step": 7551, "time_per_iteration": 2.6571664810180664 }, { "auxiliary_loss_clip": 0.01115638, "auxiliary_loss_mlp": 0.010339, "balance_loss_clip": 1.04326105, "balance_loss_mlp": 1.01979804, "epoch": 0.45405080414850446, "flos": 12893403651840.0, "grad_norm": 1.789312830614556, "language_loss": 0.79496789, "learning_rate": 2.392161956968798e-06, "loss": 0.81646329, "num_input_tokens_seen": 162011125, "step": 7552, "time_per_iteration": 2.6482155323028564 }, { "auxiliary_loss_clip": 0.01042694, "auxiliary_loss_mlp": 0.0100358, "balance_loss_clip": 1.02483499, "balance_loss_mlp": 1.00200677, "epoch": 0.4541109274011724, "flos": 59766919724160.0, "grad_norm": 0.8270469682211425, "language_loss": 0.57826698, "learning_rate": 2.39178004819885e-06, "loss": 0.59872973, "num_input_tokens_seen": 162068705, "step": 7553, "time_per_iteration": 3.1456856727600098 }, { "auxiliary_loss_clip": 0.01064062, "auxiliary_loss_mlp": 0.01034097, "balance_loss_clip": 1.04350471, "balance_loss_mlp": 1.02177691, "epoch": 0.4541710506538404, "flos": 28511743841280.0, "grad_norm": 1.3658485385977341, "language_loss": 0.76709622, "learning_rate": 2.3913981245725626e-06, "loss": 0.78807783, "num_input_tokens_seen": 162089655, "step": 7554, "time_per_iteration": 2.8080356121063232 }, { "auxiliary_loss_clip": 0.01108851, "auxiliary_loss_mlp": 0.01035523, "balance_loss_clip": 1.0467329, "balance_loss_mlp": 1.02056265, "epoch": 0.45423117390650836, "flos": 17675591742720.0, "grad_norm": 3.0408177613289014, "language_loss": 0.7764836, "learning_rate": 2.3910161861044194e-06, "loss": 0.79792738, "num_input_tokens_seen": 162108465, "step": 7555, "time_per_iteration": 2.6776504516601562 }, { "auxiliary_loss_clip": 0.01059757, "auxiliary_loss_mlp": 0.01032208, "balance_loss_clip": 1.04157853, "balance_loss_mlp": 1.01914918, "epoch": 0.4542912971591763, "flos": 28072556248320.0, "grad_norm": 1.7035673731774164, "language_loss": 0.72646725, "learning_rate": 2.390634232808903e-06, "loss": 0.74738687, "num_input_tokens_seen": 162129910, "step": 7556, "time_per_iteration": 2.851022720336914 }, { "auxiliary_loss_clip": 0.01133495, "auxiliary_loss_mlp": 0.01038462, "balance_loss_clip": 1.04808855, "balance_loss_mlp": 1.02491426, "epoch": 0.4543514204118443, "flos": 22671771108480.0, "grad_norm": 2.040538066845486, "language_loss": 0.6298486, "learning_rate": 2.3902522647004982e-06, "loss": 0.65156817, "num_input_tokens_seen": 162148840, "step": 7557, "time_per_iteration": 2.7630646228790283 }, { "auxiliary_loss_clip": 0.01029784, "auxiliary_loss_mlp": 0.0100461, "balance_loss_clip": 1.02091062, "balance_loss_mlp": 1.00302434, "epoch": 0.45441154366451225, "flos": 58216549921920.0, "grad_norm": 0.683633086089208, "language_loss": 0.57569897, "learning_rate": 2.3898702817936875e-06, "loss": 0.59604287, "num_input_tokens_seen": 162208500, "step": 7558, "time_per_iteration": 3.1137866973876953 }, { "auxiliary_loss_clip": 0.01120146, "auxiliary_loss_mlp": 0.0104176, "balance_loss_clip": 1.04774594, "balance_loss_mlp": 1.02645946, "epoch": 0.4544716669171802, "flos": 16764286763520.0, "grad_norm": 4.36821938683546, "language_loss": 0.56214309, "learning_rate": 2.3894882841029573e-06, "loss": 0.58376217, "num_input_tokens_seen": 162224650, "step": 7559, "time_per_iteration": 2.6453661918640137 }, { "auxiliary_loss_clip": 0.01114034, "auxiliary_loss_mlp": 0.00771404, "balance_loss_clip": 1.04701853, "balance_loss_mlp": 1.00053644, "epoch": 0.4545317901698482, "flos": 15925233991680.0, "grad_norm": 3.62707185125481, "language_loss": 0.72154331, "learning_rate": 2.389106271642792e-06, "loss": 0.74039769, "num_input_tokens_seen": 162242930, "step": 7560, "time_per_iteration": 2.734957456588745 }, { "auxiliary_loss_clip": 0.01047807, "auxiliary_loss_mlp": 0.01042508, "balance_loss_clip": 1.03757131, "balance_loss_mlp": 1.02745199, "epoch": 0.45459191342251615, "flos": 17639752947840.0, "grad_norm": 2.1379103724447517, "language_loss": 0.69509232, "learning_rate": 2.3887242444276775e-06, "loss": 0.71599543, "num_input_tokens_seen": 162261455, "step": 7561, "time_per_iteration": 2.8633503913879395 }, { "auxiliary_loss_clip": 0.01103836, "auxiliary_loss_mlp": 0.01038069, "balance_loss_clip": 1.04502749, "balance_loss_mlp": 1.02508128, "epoch": 0.4546520366751841, "flos": 16176608346240.0, "grad_norm": 1.7850356135584633, "language_loss": 0.85308814, "learning_rate": 2.3883422024721015e-06, "loss": 0.87450719, "num_input_tokens_seen": 162279725, "step": 7562, "time_per_iteration": 2.6936264038085938 }, { "auxiliary_loss_clip": 0.01113259, "auxiliary_loss_mlp": 0.01038297, "balance_loss_clip": 1.04309893, "balance_loss_mlp": 1.0244745, "epoch": 0.4547121599278521, "flos": 19751443562880.0, "grad_norm": 1.7930294917475702, "language_loss": 0.89894032, "learning_rate": 2.38796014579055e-06, "loss": 0.92045587, "num_input_tokens_seen": 162297865, "step": 7563, "time_per_iteration": 2.6632707118988037 }, { "auxiliary_loss_clip": 0.01128772, "auxiliary_loss_mlp": 0.00772113, "balance_loss_clip": 1.04633093, "balance_loss_mlp": 1.00060475, "epoch": 0.45477228318052004, "flos": 19937461121280.0, "grad_norm": 1.7120070486519374, "language_loss": 0.71349525, "learning_rate": 2.3875780743975097e-06, "loss": 0.73250407, "num_input_tokens_seen": 162316010, "step": 7564, "time_per_iteration": 2.6610071659088135 }, { "auxiliary_loss_clip": 0.01118776, "auxiliary_loss_mlp": 0.01037586, "balance_loss_clip": 1.04351079, "balance_loss_mlp": 1.02376413, "epoch": 0.454832406433188, "flos": 21288312829440.0, "grad_norm": 2.3273072225052998, "language_loss": 0.67977536, "learning_rate": 2.3871959883074713e-06, "loss": 0.70133895, "num_input_tokens_seen": 162336115, "step": 7565, "time_per_iteration": 2.645447015762329 }, { "auxiliary_loss_clip": 0.01084701, "auxiliary_loss_mlp": 0.01033288, "balance_loss_clip": 1.04171932, "balance_loss_mlp": 1.02002633, "epoch": 0.45489252968585603, "flos": 24498726612480.0, "grad_norm": 1.877770036567151, "language_loss": 0.80176723, "learning_rate": 2.386813887534922e-06, "loss": 0.82294714, "num_input_tokens_seen": 162355705, "step": 7566, "time_per_iteration": 2.7949163913726807 }, { "auxiliary_loss_clip": 0.01090452, "auxiliary_loss_mlp": 0.01035417, "balance_loss_clip": 1.04210711, "balance_loss_mlp": 1.01981235, "epoch": 0.454952652938524, "flos": 17092474352640.0, "grad_norm": 1.6100724605132029, "language_loss": 0.73702621, "learning_rate": 2.3864317720943508e-06, "loss": 0.75828493, "num_input_tokens_seen": 162374055, "step": 7567, "time_per_iteration": 2.8082687854766846 }, { "auxiliary_loss_clip": 0.01093893, "auxiliary_loss_mlp": 0.01039284, "balance_loss_clip": 1.04401243, "balance_loss_mlp": 1.02519345, "epoch": 0.45501277619119196, "flos": 27630387826560.0, "grad_norm": 1.3909583171669249, "language_loss": 0.81125635, "learning_rate": 2.386049642000249e-06, "loss": 0.83258814, "num_input_tokens_seen": 162393560, "step": 7568, "time_per_iteration": 2.7837767601013184 }, { "auxiliary_loss_clip": 0.01126615, "auxiliary_loss_mlp": 0.01047153, "balance_loss_clip": 1.04950857, "balance_loss_mlp": 1.03145313, "epoch": 0.4550728994438599, "flos": 19974664632960.0, "grad_norm": 2.2201304610210175, "language_loss": 0.79881442, "learning_rate": 2.3856674972671055e-06, "loss": 0.82055211, "num_input_tokens_seen": 162413170, "step": 7569, "time_per_iteration": 2.6318490505218506 }, { "auxiliary_loss_clip": 0.01121847, "auxiliary_loss_mlp": 0.01038069, "balance_loss_clip": 1.04655576, "balance_loss_mlp": 1.02286983, "epoch": 0.4551330226965279, "flos": 26066873646720.0, "grad_norm": 1.3612588382742794, "language_loss": 0.75316679, "learning_rate": 2.385285337909412e-06, "loss": 0.77476597, "num_input_tokens_seen": 162434080, "step": 7570, "time_per_iteration": 2.6693389415740967 }, { "auxiliary_loss_clip": 0.0110874, "auxiliary_loss_mlp": 0.01042662, "balance_loss_clip": 1.0496285, "balance_loss_mlp": 1.02787971, "epoch": 0.45519314594919585, "flos": 32781091501440.0, "grad_norm": 1.7331933441120846, "language_loss": 0.74851429, "learning_rate": 2.3849031639416596e-06, "loss": 0.77002835, "num_input_tokens_seen": 162455445, "step": 7571, "time_per_iteration": 2.8367550373077393 }, { "auxiliary_loss_clip": 0.01118243, "auxiliary_loss_mlp": 0.01037051, "balance_loss_clip": 1.04903221, "balance_loss_mlp": 1.02305007, "epoch": 0.4552532692018638, "flos": 19172671718400.0, "grad_norm": 1.8103885190184377, "language_loss": 0.81033444, "learning_rate": 2.3845209753783414e-06, "loss": 0.83188736, "num_input_tokens_seen": 162474940, "step": 7572, "time_per_iteration": 2.654205322265625 }, { "auxiliary_loss_clip": 0.01114723, "auxiliary_loss_mlp": 0.01041135, "balance_loss_clip": 1.04709005, "balance_loss_mlp": 1.02511287, "epoch": 0.4553133924545318, "flos": 26027156183040.0, "grad_norm": 1.7361541689984175, "language_loss": 0.7262516, "learning_rate": 2.3841387722339486e-06, "loss": 0.74781018, "num_input_tokens_seen": 162493340, "step": 7573, "time_per_iteration": 2.7468600273132324 }, { "auxiliary_loss_clip": 0.01124507, "auxiliary_loss_mlp": 0.01039816, "balance_loss_clip": 1.04916418, "balance_loss_mlp": 1.02327013, "epoch": 0.45537351570719975, "flos": 30661535808000.0, "grad_norm": 1.869301925708578, "language_loss": 0.74335551, "learning_rate": 2.3837565545229748e-06, "loss": 0.76499879, "num_input_tokens_seen": 162514360, "step": 7574, "time_per_iteration": 2.7575597763061523 }, { "auxiliary_loss_clip": 0.01121884, "auxiliary_loss_mlp": 0.01036714, "balance_loss_clip": 1.04758859, "balance_loss_mlp": 1.02184868, "epoch": 0.4554336389598677, "flos": 24353396184960.0, "grad_norm": 1.5603127476263212, "language_loss": 0.7161333, "learning_rate": 2.383374322259915e-06, "loss": 0.7377193, "num_input_tokens_seen": 162535240, "step": 7575, "time_per_iteration": 2.6638269424438477 }, { "auxiliary_loss_clip": 0.01106959, "auxiliary_loss_mlp": 0.01035456, "balance_loss_clip": 1.04536855, "balance_loss_mlp": 1.02120471, "epoch": 0.4554937622125357, "flos": 20557925677440.0, "grad_norm": 1.872589408642276, "language_loss": 0.73370463, "learning_rate": 2.3829920754592617e-06, "loss": 0.7551288, "num_input_tokens_seen": 162553880, "step": 7576, "time_per_iteration": 2.686311721801758 }, { "auxiliary_loss_clip": 0.01129005, "auxiliary_loss_mlp": 0.01036522, "balance_loss_clip": 1.04784572, "balance_loss_mlp": 1.02179956, "epoch": 0.45555388546520365, "flos": 22820764723200.0, "grad_norm": 1.7873556557153987, "language_loss": 0.66664052, "learning_rate": 2.382609814135511e-06, "loss": 0.68829584, "num_input_tokens_seen": 162574485, "step": 7577, "time_per_iteration": 2.6766581535339355 }, { "auxiliary_loss_clip": 0.01103092, "auxiliary_loss_mlp": 0.01046596, "balance_loss_clip": 1.04435253, "balance_loss_mlp": 1.0300076, "epoch": 0.4556140087178716, "flos": 21725992051200.0, "grad_norm": 1.9298557564452474, "language_loss": 0.74309111, "learning_rate": 2.382227538303157e-06, "loss": 0.76458794, "num_input_tokens_seen": 162595130, "step": 7578, "time_per_iteration": 4.310480356216431 }, { "auxiliary_loss_clip": 0.01079377, "auxiliary_loss_mlp": 0.00774819, "balance_loss_clip": 1.04437256, "balance_loss_mlp": 1.00061071, "epoch": 0.45567413197053963, "flos": 25994513698560.0, "grad_norm": 1.7583976894464832, "language_loss": 0.69843179, "learning_rate": 2.381845247976697e-06, "loss": 0.71697378, "num_input_tokens_seen": 162615720, "step": 7579, "time_per_iteration": 4.325899362564087 }, { "auxiliary_loss_clip": 0.01116252, "auxiliary_loss_mlp": 0.01033231, "balance_loss_clip": 1.0446142, "balance_loss_mlp": 1.0195992, "epoch": 0.4557342552232076, "flos": 21537604195200.0, "grad_norm": 1.7639178263730233, "language_loss": 0.78628397, "learning_rate": 2.381462943170627e-06, "loss": 0.80777884, "num_input_tokens_seen": 162635825, "step": 7580, "time_per_iteration": 2.6391446590423584 }, { "auxiliary_loss_clip": 0.0113405, "auxiliary_loss_mlp": 0.01031474, "balance_loss_clip": 1.05214024, "balance_loss_mlp": 1.01697779, "epoch": 0.45579437847587556, "flos": 40001972647680.0, "grad_norm": 1.99718885063772, "language_loss": 0.68943548, "learning_rate": 2.381080623899444e-06, "loss": 0.71109068, "num_input_tokens_seen": 162659130, "step": 7581, "time_per_iteration": 4.234206914901733 }, { "auxiliary_loss_clip": 0.01111938, "auxiliary_loss_mlp": 0.01032821, "balance_loss_clip": 1.04282808, "balance_loss_mlp": 1.01836669, "epoch": 0.4558545017285435, "flos": 31138501530240.0, "grad_norm": 1.6647606381596314, "language_loss": 0.73356318, "learning_rate": 2.3806982901776455e-06, "loss": 0.75501084, "num_input_tokens_seen": 162681665, "step": 7582, "time_per_iteration": 4.333024978637695 }, { "auxiliary_loss_clip": 0.0113626, "auxiliary_loss_mlp": 0.01043946, "balance_loss_clip": 1.05043411, "balance_loss_mlp": 1.02829337, "epoch": 0.4559146249812115, "flos": 21725776569600.0, "grad_norm": 1.9011112097623832, "language_loss": 0.72327513, "learning_rate": 2.380315942019729e-06, "loss": 0.74507719, "num_input_tokens_seen": 162702040, "step": 7583, "time_per_iteration": 2.633423089981079 }, { "auxiliary_loss_clip": 0.01122524, "auxiliary_loss_mlp": 0.01037395, "balance_loss_clip": 1.05119634, "balance_loss_mlp": 1.02291131, "epoch": 0.45597474823387946, "flos": 23805973935360.0, "grad_norm": 1.6028864846132196, "language_loss": 0.72692537, "learning_rate": 2.379933579440195e-06, "loss": 0.74852461, "num_input_tokens_seen": 162722375, "step": 7584, "time_per_iteration": 2.6895499229431152 }, { "auxiliary_loss_clip": 0.01089384, "auxiliary_loss_mlp": 0.01040718, "balance_loss_clip": 1.04311633, "balance_loss_mlp": 1.02606773, "epoch": 0.4560348714865474, "flos": 31905661230720.0, "grad_norm": 1.833639423310481, "language_loss": 0.68204761, "learning_rate": 2.379551202453541e-06, "loss": 0.70334864, "num_input_tokens_seen": 162746095, "step": 7585, "time_per_iteration": 2.7882261276245117 }, { "auxiliary_loss_clip": 0.01132515, "auxiliary_loss_mlp": 0.01030518, "balance_loss_clip": 1.05002046, "balance_loss_mlp": 1.01725006, "epoch": 0.4560949947392154, "flos": 22048828513920.0, "grad_norm": 1.65915998971852, "language_loss": 0.7634117, "learning_rate": 2.379168811074267e-06, "loss": 0.78504205, "num_input_tokens_seen": 162766330, "step": 7586, "time_per_iteration": 2.636626720428467 }, { "auxiliary_loss_clip": 0.01109504, "auxiliary_loss_mlp": 0.01029829, "balance_loss_clip": 1.04642403, "balance_loss_mlp": 1.01651323, "epoch": 0.45615511799188335, "flos": 24571804832640.0, "grad_norm": 44.63874812648689, "language_loss": 0.78151405, "learning_rate": 2.3787864053168747e-06, "loss": 0.80290735, "num_input_tokens_seen": 162784755, "step": 7587, "time_per_iteration": 2.7801096439361572 }, { "auxiliary_loss_clip": 0.01105539, "auxiliary_loss_mlp": 0.01044536, "balance_loss_clip": 1.04288149, "balance_loss_mlp": 1.02933669, "epoch": 0.4562152412445513, "flos": 18330709944960.0, "grad_norm": 2.252015566278715, "language_loss": 0.6950196, "learning_rate": 2.378403985195863e-06, "loss": 0.71652043, "num_input_tokens_seen": 162803850, "step": 7588, "time_per_iteration": 2.7108840942382812 }, { "auxiliary_loss_clip": 0.01118383, "auxiliary_loss_mlp": 0.01036327, "balance_loss_clip": 1.05038464, "balance_loss_mlp": 1.02234375, "epoch": 0.4562753644972193, "flos": 13516525814400.0, "grad_norm": 1.6983482750091652, "language_loss": 0.79372728, "learning_rate": 2.378021550725735e-06, "loss": 0.81527448, "num_input_tokens_seen": 162820775, "step": 7589, "time_per_iteration": 2.6967854499816895 }, { "auxiliary_loss_clip": 0.01121003, "auxiliary_loss_mlp": 0.01035976, "balance_loss_clip": 1.04755974, "balance_loss_mlp": 1.02120006, "epoch": 0.45633548774988725, "flos": 29639697701760.0, "grad_norm": 2.457585749278853, "language_loss": 0.62875861, "learning_rate": 2.377639101920992e-06, "loss": 0.6503284, "num_input_tokens_seen": 162839695, "step": 7590, "time_per_iteration": 2.6659393310546875 }, { "auxiliary_loss_clip": 0.01101858, "auxiliary_loss_mlp": 0.01045493, "balance_loss_clip": 1.04248881, "balance_loss_mlp": 1.03150392, "epoch": 0.4563956110025552, "flos": 22233409528320.0, "grad_norm": 1.8064400322650376, "language_loss": 0.73125023, "learning_rate": 2.377256638796135e-06, "loss": 0.75272369, "num_input_tokens_seen": 162856095, "step": 7591, "time_per_iteration": 2.7296926975250244 }, { "auxiliary_loss_clip": 0.01113505, "auxiliary_loss_mlp": 0.01043243, "balance_loss_clip": 1.04979515, "balance_loss_mlp": 1.02757883, "epoch": 0.45645573425522323, "flos": 17092043389440.0, "grad_norm": 2.6622201495184923, "language_loss": 0.76661623, "learning_rate": 2.3768741613656695e-06, "loss": 0.78818369, "num_input_tokens_seen": 162874070, "step": 7592, "time_per_iteration": 2.855787992477417 }, { "auxiliary_loss_clip": 0.01104851, "auxiliary_loss_mlp": 0.01042123, "balance_loss_clip": 1.04489005, "balance_loss_mlp": 1.026191, "epoch": 0.4565158575078912, "flos": 20332334309760.0, "grad_norm": 2.112667667080726, "language_loss": 0.6938538, "learning_rate": 2.376491669644098e-06, "loss": 0.71532357, "num_input_tokens_seen": 162891000, "step": 7593, "time_per_iteration": 2.7688679695129395 }, { "auxiliary_loss_clip": 0.01110049, "auxiliary_loss_mlp": 0.01034633, "balance_loss_clip": 1.04238796, "balance_loss_mlp": 1.02174079, "epoch": 0.45657598076055916, "flos": 23983013093760.0, "grad_norm": 2.174557271524546, "language_loss": 0.83913857, "learning_rate": 2.3761091636459248e-06, "loss": 0.86058539, "num_input_tokens_seen": 162910120, "step": 7594, "time_per_iteration": 2.807098865509033 }, { "auxiliary_loss_clip": 0.01036589, "auxiliary_loss_mlp": 0.00753626, "balance_loss_clip": 1.01769352, "balance_loss_mlp": 1.00077426, "epoch": 0.45663610401322713, "flos": 69364297526400.0, "grad_norm": 0.7884707903863047, "language_loss": 0.52737939, "learning_rate": 2.375726643385654e-06, "loss": 0.54528153, "num_input_tokens_seen": 162963720, "step": 7595, "time_per_iteration": 3.2812860012054443 }, { "auxiliary_loss_clip": 0.01096992, "auxiliary_loss_mlp": 0.01034204, "balance_loss_clip": 1.04297972, "balance_loss_mlp": 1.01864684, "epoch": 0.4566962272658951, "flos": 15149095891200.0, "grad_norm": 2.562717754165903, "language_loss": 0.87188721, "learning_rate": 2.3753441088777915e-06, "loss": 0.89319921, "num_input_tokens_seen": 162975760, "step": 7596, "time_per_iteration": 2.683833122253418 }, { "auxiliary_loss_clip": 0.01126007, "auxiliary_loss_mlp": 0.01046188, "balance_loss_clip": 1.05094647, "balance_loss_mlp": 1.03226399, "epoch": 0.45675635051856306, "flos": 18697465762560.0, "grad_norm": 8.947162495751469, "language_loss": 0.77418292, "learning_rate": 2.374961560136843e-06, "loss": 0.79590482, "num_input_tokens_seen": 162994865, "step": 7597, "time_per_iteration": 2.686328887939453 }, { "auxiliary_loss_clip": 0.01117589, "auxiliary_loss_mlp": 0.01038291, "balance_loss_clip": 1.04493558, "balance_loss_mlp": 1.02389073, "epoch": 0.456816473771231, "flos": 19098300608640.0, "grad_norm": 1.6036220935275767, "language_loss": 0.78581583, "learning_rate": 2.374578997177314e-06, "loss": 0.80737466, "num_input_tokens_seen": 163014730, "step": 7598, "time_per_iteration": 2.6856606006622314 }, { "auxiliary_loss_clip": 0.01128723, "auxiliary_loss_mlp": 0.01034286, "balance_loss_clip": 1.04699326, "balance_loss_mlp": 1.02080941, "epoch": 0.456876597023899, "flos": 28950069507840.0, "grad_norm": 3.021485745265107, "language_loss": 0.71589166, "learning_rate": 2.374196420013712e-06, "loss": 0.73752177, "num_input_tokens_seen": 163033405, "step": 7599, "time_per_iteration": 2.672055244445801 }, { "auxiliary_loss_clip": 0.0109465, "auxiliary_loss_mlp": 0.01038748, "balance_loss_clip": 1.04185176, "balance_loss_mlp": 1.02445507, "epoch": 0.45693672027656695, "flos": 23289470317440.0, "grad_norm": 2.0431074720876046, "language_loss": 0.70262265, "learning_rate": 2.373813828660544e-06, "loss": 0.72395658, "num_input_tokens_seen": 163051400, "step": 7600, "time_per_iteration": 2.8163371086120605 }, { "auxiliary_loss_clip": 0.01066248, "auxiliary_loss_mlp": 0.01041467, "balance_loss_clip": 1.04143667, "balance_loss_mlp": 1.02802658, "epoch": 0.4569968435292349, "flos": 20558212986240.0, "grad_norm": 6.700465706217943, "language_loss": 0.79066253, "learning_rate": 2.373431223132319e-06, "loss": 0.81173962, "num_input_tokens_seen": 163069250, "step": 7601, "time_per_iteration": 2.8098480701446533 }, { "auxiliary_loss_clip": 0.01100447, "auxiliary_loss_mlp": 0.01041284, "balance_loss_clip": 1.04293573, "balance_loss_mlp": 1.02730095, "epoch": 0.4570569667819029, "flos": 41282619223680.0, "grad_norm": 6.824528646616988, "language_loss": 0.71565419, "learning_rate": 2.3730486034435448e-06, "loss": 0.73707151, "num_input_tokens_seen": 163091755, "step": 7602, "time_per_iteration": 2.8971548080444336 }, { "auxiliary_loss_clip": 0.01115269, "auxiliary_loss_mlp": 0.01034582, "balance_loss_clip": 1.04276979, "balance_loss_mlp": 1.01859641, "epoch": 0.45711709003457085, "flos": 26031573555840.0, "grad_norm": 1.8661067599139867, "language_loss": 0.73023772, "learning_rate": 2.372665969608729e-06, "loss": 0.75173628, "num_input_tokens_seen": 163111600, "step": 7603, "time_per_iteration": 2.709261417388916 }, { "auxiliary_loss_clip": 0.01120961, "auxiliary_loss_mlp": 0.01043179, "balance_loss_clip": 1.04799032, "balance_loss_mlp": 1.02714539, "epoch": 0.4571772132872388, "flos": 22158068751360.0, "grad_norm": 1.901129043888336, "language_loss": 0.83068597, "learning_rate": 2.372283321642383e-06, "loss": 0.85232735, "num_input_tokens_seen": 163127350, "step": 7604, "time_per_iteration": 2.713744640350342 }, { "auxiliary_loss_clip": 0.01113838, "auxiliary_loss_mlp": 0.01045941, "balance_loss_clip": 1.05216503, "balance_loss_mlp": 1.02981162, "epoch": 0.45723733653990684, "flos": 23878872587520.0, "grad_norm": 2.0592585158299133, "language_loss": 0.85998154, "learning_rate": 2.371900659559016e-06, "loss": 0.88157928, "num_input_tokens_seen": 163145855, "step": 7605, "time_per_iteration": 2.6666319370269775 }, { "auxiliary_loss_clip": 0.010831, "auxiliary_loss_mlp": 0.01041844, "balance_loss_clip": 1.04206753, "balance_loss_mlp": 1.02670407, "epoch": 0.4572974597925748, "flos": 16871803148160.0, "grad_norm": 1.8551011968860212, "language_loss": 0.73551464, "learning_rate": 2.371517983373138e-06, "loss": 0.75676405, "num_input_tokens_seen": 163163830, "step": 7606, "time_per_iteration": 2.8618602752685547 }, { "auxiliary_loss_clip": 0.01100268, "auxiliary_loss_mlp": 0.01043762, "balance_loss_clip": 1.0450927, "balance_loss_mlp": 1.02790761, "epoch": 0.45735758304524277, "flos": 13771491528960.0, "grad_norm": 1.9296458941386103, "language_loss": 0.80260599, "learning_rate": 2.371135293099262e-06, "loss": 0.82404631, "num_input_tokens_seen": 163180700, "step": 7607, "time_per_iteration": 2.717987537384033 }, { "auxiliary_loss_clip": 0.01097097, "auxiliary_loss_mlp": 0.01046228, "balance_loss_clip": 1.05015063, "balance_loss_mlp": 1.03169668, "epoch": 0.45741770629791073, "flos": 21100750986240.0, "grad_norm": 1.7686881404445909, "language_loss": 0.81263912, "learning_rate": 2.3707525887518982e-06, "loss": 0.83407241, "num_input_tokens_seen": 163199450, "step": 7608, "time_per_iteration": 2.7047500610351562 }, { "auxiliary_loss_clip": 0.01110681, "auxiliary_loss_mlp": 0.01043615, "balance_loss_clip": 1.04563498, "balance_loss_mlp": 1.02828515, "epoch": 0.4574778295505787, "flos": 23112898035840.0, "grad_norm": 3.284613619336592, "language_loss": 0.68429869, "learning_rate": 2.370369870345559e-06, "loss": 0.70584166, "num_input_tokens_seen": 163217875, "step": 7609, "time_per_iteration": 2.7123308181762695 }, { "auxiliary_loss_clip": 0.01105383, "auxiliary_loss_mlp": 0.01045291, "balance_loss_clip": 1.04979467, "balance_loss_mlp": 1.03011012, "epoch": 0.45753795280324666, "flos": 24352929308160.0, "grad_norm": 1.7858891409698046, "language_loss": 0.80873275, "learning_rate": 2.369987137894757e-06, "loss": 0.83023953, "num_input_tokens_seen": 163237430, "step": 7610, "time_per_iteration": 2.707108497619629 }, { "auxiliary_loss_clip": 0.01122367, "auxiliary_loss_mlp": 0.01042029, "balance_loss_clip": 1.04675138, "balance_loss_mlp": 1.02698421, "epoch": 0.4575980760559146, "flos": 16653789550080.0, "grad_norm": 2.2133206913732746, "language_loss": 0.82100248, "learning_rate": 2.3696043914140057e-06, "loss": 0.84264642, "num_input_tokens_seen": 163253905, "step": 7611, "time_per_iteration": 2.6911368370056152 }, { "auxiliary_loss_clip": 0.01127544, "auxiliary_loss_mlp": 0.01034771, "balance_loss_clip": 1.05061793, "balance_loss_mlp": 1.01889205, "epoch": 0.4576581993085826, "flos": 35911423912320.0, "grad_norm": 2.6253593942917677, "language_loss": 0.73971558, "learning_rate": 2.369221630917819e-06, "loss": 0.76133871, "num_input_tokens_seen": 163274285, "step": 7612, "time_per_iteration": 2.8162691593170166 }, { "auxiliary_loss_clip": 0.01103651, "auxiliary_loss_mlp": 0.01042157, "balance_loss_clip": 1.04241323, "balance_loss_mlp": 1.02680302, "epoch": 0.45771832256125056, "flos": 20080421251200.0, "grad_norm": 1.6042487302929564, "language_loss": 0.84652913, "learning_rate": 2.368838856420711e-06, "loss": 0.86798728, "num_input_tokens_seen": 163293150, "step": 7613, "time_per_iteration": 2.66471266746521 }, { "auxiliary_loss_clip": 0.01096161, "auxiliary_loss_mlp": 0.01038746, "balance_loss_clip": 1.04437852, "balance_loss_mlp": 1.02373135, "epoch": 0.4577784458139185, "flos": 10744329957120.0, "grad_norm": 2.314421678604919, "language_loss": 0.75271547, "learning_rate": 2.3684560679371965e-06, "loss": 0.77406454, "num_input_tokens_seen": 163310065, "step": 7614, "time_per_iteration": 2.740011215209961 }, { "auxiliary_loss_clip": 0.01132592, "auxiliary_loss_mlp": 0.01037968, "balance_loss_clip": 1.05067575, "balance_loss_mlp": 1.02378809, "epoch": 0.4578385690665865, "flos": 21907269014400.0, "grad_norm": 1.5980870069512307, "language_loss": 0.75026065, "learning_rate": 2.368073265481791e-06, "loss": 0.77196622, "num_input_tokens_seen": 163329415, "step": 7615, "time_per_iteration": 2.694354772567749 }, { "auxiliary_loss_clip": 0.01037366, "auxiliary_loss_mlp": 0.01005104, "balance_loss_clip": 1.02879357, "balance_loss_mlp": 1.00286281, "epoch": 0.45789869231925445, "flos": 64758286667520.0, "grad_norm": 0.785268606967784, "language_loss": 0.57671446, "learning_rate": 2.3676904490690105e-06, "loss": 0.59713912, "num_input_tokens_seen": 163385875, "step": 7616, "time_per_iteration": 3.2036197185516357 }, { "auxiliary_loss_clip": 0.010986, "auxiliary_loss_mlp": 0.00772301, "balance_loss_clip": 1.04307699, "balance_loss_mlp": 1.00081253, "epoch": 0.4579588155719224, "flos": 16144001775360.0, "grad_norm": 1.6020549029918738, "language_loss": 0.70836008, "learning_rate": 2.3673076187133704e-06, "loss": 0.72706908, "num_input_tokens_seen": 163405170, "step": 7617, "time_per_iteration": 2.7075886726379395 }, { "auxiliary_loss_clip": 0.01137127, "auxiliary_loss_mlp": 0.01037359, "balance_loss_clip": 1.05343175, "balance_loss_mlp": 1.02264261, "epoch": 0.45801893882459044, "flos": 21395541905280.0, "grad_norm": 1.8894449061399028, "language_loss": 0.76292491, "learning_rate": 2.36692477442939e-06, "loss": 0.78466976, "num_input_tokens_seen": 163423155, "step": 7618, "time_per_iteration": 5.8249146938323975 }, { "auxiliary_loss_clip": 0.01101544, "auxiliary_loss_mlp": 0.01045871, "balance_loss_clip": 1.05301738, "balance_loss_mlp": 1.03189957, "epoch": 0.4580790620772584, "flos": 19536554448000.0, "grad_norm": 1.7481433677396025, "language_loss": 0.77097881, "learning_rate": 2.366541916231585e-06, "loss": 0.79245299, "num_input_tokens_seen": 163442450, "step": 7619, "time_per_iteration": 2.766615629196167 }, { "auxiliary_loss_clip": 0.01134342, "auxiliary_loss_mlp": 0.01040375, "balance_loss_clip": 1.05348432, "balance_loss_mlp": 1.02757239, "epoch": 0.45813918532992637, "flos": 16581070465920.0, "grad_norm": 1.8920903156272437, "language_loss": 0.72002041, "learning_rate": 2.366159044134473e-06, "loss": 0.74176759, "num_input_tokens_seen": 163459810, "step": 7620, "time_per_iteration": 4.087975025177002 }, { "auxiliary_loss_clip": 0.01109227, "auxiliary_loss_mlp": 0.01032686, "balance_loss_clip": 1.04942107, "balance_loss_mlp": 1.01892948, "epoch": 0.45819930858259433, "flos": 42230301701760.0, "grad_norm": 1.5465249381842834, "language_loss": 0.77770388, "learning_rate": 2.3657761581525748e-06, "loss": 0.79912305, "num_input_tokens_seen": 163482970, "step": 7621, "time_per_iteration": 2.9124109745025635 }, { "auxiliary_loss_clip": 0.01044673, "auxiliary_loss_mlp": 0.01001257, "balance_loss_clip": 1.02584982, "balance_loss_mlp": 0.99903959, "epoch": 0.4582594318352623, "flos": 63714795638400.0, "grad_norm": 0.7823065471017115, "language_loss": 0.64958, "learning_rate": 2.3653932583004063e-06, "loss": 0.6700393, "num_input_tokens_seen": 163545330, "step": 7622, "time_per_iteration": 4.778898477554321 }, { "auxiliary_loss_clip": 0.01120212, "auxiliary_loss_mlp": 0.01034924, "balance_loss_clip": 1.05105555, "balance_loss_mlp": 1.02016604, "epoch": 0.45831955508793026, "flos": 26869979882880.0, "grad_norm": 3.654827974152138, "language_loss": 0.79468191, "learning_rate": 2.3650103445924903e-06, "loss": 0.81623328, "num_input_tokens_seen": 163564620, "step": 7623, "time_per_iteration": 2.7033259868621826 }, { "auxiliary_loss_clip": 0.01078844, "auxiliary_loss_mlp": 0.01041828, "balance_loss_clip": 1.04181957, "balance_loss_mlp": 1.02728403, "epoch": 0.45837967834059823, "flos": 18733951002240.0, "grad_norm": 1.8933831090876323, "language_loss": 0.70283759, "learning_rate": 2.3646274170433452e-06, "loss": 0.72404432, "num_input_tokens_seen": 163581010, "step": 7624, "time_per_iteration": 2.8526861667633057 }, { "auxiliary_loss_clip": 0.01100025, "auxiliary_loss_mlp": 0.01040188, "balance_loss_clip": 1.04250479, "balance_loss_mlp": 1.02558446, "epoch": 0.4584398015932662, "flos": 21178102924800.0, "grad_norm": 2.2295023596293273, "language_loss": 0.73171687, "learning_rate": 2.364244475667491e-06, "loss": 0.75311905, "num_input_tokens_seen": 163599955, "step": 7625, "time_per_iteration": 2.77284574508667 }, { "auxiliary_loss_clip": 0.01120178, "auxiliary_loss_mlp": 0.01036964, "balance_loss_clip": 1.05209434, "balance_loss_mlp": 1.02369022, "epoch": 0.45849992484593416, "flos": 19790047704960.0, "grad_norm": 2.499945379712242, "language_loss": 0.77924562, "learning_rate": 2.363861520479451e-06, "loss": 0.80081707, "num_input_tokens_seen": 163618545, "step": 7626, "time_per_iteration": 2.813945770263672 }, { "auxiliary_loss_clip": 0.01137615, "auxiliary_loss_mlp": 0.01040207, "balance_loss_clip": 1.05263078, "balance_loss_mlp": 1.02645612, "epoch": 0.4585600480986021, "flos": 18223265387520.0, "grad_norm": 1.5689934094814115, "language_loss": 0.84652817, "learning_rate": 2.3634785514937445e-06, "loss": 0.8683064, "num_input_tokens_seen": 163636055, "step": 7627, "time_per_iteration": 2.659053087234497 }, { "auxiliary_loss_clip": 0.01138145, "auxiliary_loss_mlp": 0.01040233, "balance_loss_clip": 1.05155802, "balance_loss_mlp": 1.02531946, "epoch": 0.4586201713512701, "flos": 29022213974400.0, "grad_norm": 1.5125222475387885, "language_loss": 0.6911087, "learning_rate": 2.3630955687248953e-06, "loss": 0.71289253, "num_input_tokens_seen": 163657485, "step": 7628, "time_per_iteration": 2.693678617477417 }, { "auxiliary_loss_clip": 0.01118783, "auxiliary_loss_mlp": 0.01034859, "balance_loss_clip": 1.04731619, "balance_loss_mlp": 1.02110827, "epoch": 0.45868029460393805, "flos": 23404600385280.0, "grad_norm": 1.4972122231294245, "language_loss": 0.78672099, "learning_rate": 2.3627125721874265e-06, "loss": 0.80825746, "num_input_tokens_seen": 163676030, "step": 7629, "time_per_iteration": 2.6437535285949707 }, { "auxiliary_loss_clip": 0.01113389, "auxiliary_loss_mlp": 0.01045555, "balance_loss_clip": 1.04590559, "balance_loss_mlp": 1.03034973, "epoch": 0.458740417856606, "flos": 18221972497920.0, "grad_norm": 2.2059444062956985, "language_loss": 0.79377991, "learning_rate": 2.3623295618958595e-06, "loss": 0.81536937, "num_input_tokens_seen": 163694490, "step": 7630, "time_per_iteration": 2.7565791606903076 }, { "auxiliary_loss_clip": 0.01111942, "auxiliary_loss_mlp": 0.01039415, "balance_loss_clip": 1.04838312, "balance_loss_mlp": 1.02481222, "epoch": 0.458800541109274, "flos": 34568760504960.0, "grad_norm": 2.1212994157581293, "language_loss": 0.72087741, "learning_rate": 2.3619465378647198e-06, "loss": 0.74239099, "num_input_tokens_seen": 163717035, "step": 7631, "time_per_iteration": 2.7880306243896484 }, { "auxiliary_loss_clip": 0.01094955, "auxiliary_loss_mlp": 0.01048432, "balance_loss_clip": 1.04605651, "balance_loss_mlp": 1.03280342, "epoch": 0.458860664361942, "flos": 17712112896000.0, "grad_norm": 2.4606182879569145, "language_loss": 0.71433818, "learning_rate": 2.361563500108531e-06, "loss": 0.73577201, "num_input_tokens_seen": 163734525, "step": 7632, "time_per_iteration": 2.7352800369262695 }, { "auxiliary_loss_clip": 0.01081835, "auxiliary_loss_mlp": 0.00774034, "balance_loss_clip": 1.04268694, "balance_loss_mlp": 1.00058782, "epoch": 0.45892078761460997, "flos": 18441889516800.0, "grad_norm": 2.5758659525876824, "language_loss": 0.68867576, "learning_rate": 2.3611804486418178e-06, "loss": 0.7072345, "num_input_tokens_seen": 163752860, "step": 7633, "time_per_iteration": 2.848534107208252 }, { "auxiliary_loss_clip": 0.01122955, "auxiliary_loss_mlp": 0.01043952, "balance_loss_clip": 1.05012798, "balance_loss_mlp": 1.02942061, "epoch": 0.45898091086727794, "flos": 22672956257280.0, "grad_norm": 1.690968390723207, "language_loss": 0.80858737, "learning_rate": 2.3607973834791062e-06, "loss": 0.83025646, "num_input_tokens_seen": 163772495, "step": 7634, "time_per_iteration": 2.6536448001861572 }, { "auxiliary_loss_clip": 0.01122911, "auxiliary_loss_mlp": 0.00773021, "balance_loss_clip": 1.04987049, "balance_loss_mlp": 1.00053596, "epoch": 0.4590410341199459, "flos": 21652949744640.0, "grad_norm": 1.6933583063541449, "language_loss": 0.81255853, "learning_rate": 2.3604143046349216e-06, "loss": 0.83151788, "num_input_tokens_seen": 163791475, "step": 7635, "time_per_iteration": 2.6140496730804443 }, { "auxiliary_loss_clip": 0.01110725, "auxiliary_loss_mlp": 0.01043522, "balance_loss_clip": 1.04990745, "balance_loss_mlp": 1.02941322, "epoch": 0.45910115737261387, "flos": 36535372087680.0, "grad_norm": 1.4938285014309638, "language_loss": 0.64786839, "learning_rate": 2.3600312121237905e-06, "loss": 0.66941082, "num_input_tokens_seen": 163812995, "step": 7636, "time_per_iteration": 2.9211695194244385 }, { "auxiliary_loss_clip": 0.01117391, "auxiliary_loss_mlp": 0.01034493, "balance_loss_clip": 1.05096126, "balance_loss_mlp": 1.0207361, "epoch": 0.45916128062528183, "flos": 24419866302720.0, "grad_norm": 1.5704675488980822, "language_loss": 0.8052876, "learning_rate": 2.3596481059602395e-06, "loss": 0.82680643, "num_input_tokens_seen": 163833945, "step": 7637, "time_per_iteration": 2.703902244567871 }, { "auxiliary_loss_clip": 0.0110221, "auxiliary_loss_mlp": 0.0104296, "balance_loss_clip": 1.04369295, "balance_loss_mlp": 1.02650893, "epoch": 0.4592214038779498, "flos": 23221958705280.0, "grad_norm": 1.340585421251073, "language_loss": 0.75339955, "learning_rate": 2.3592649861587965e-06, "loss": 0.7748512, "num_input_tokens_seen": 163853885, "step": 7638, "time_per_iteration": 2.8683316707611084 }, { "auxiliary_loss_clip": 0.01118666, "auxiliary_loss_mlp": 0.01037335, "balance_loss_clip": 1.04785442, "balance_loss_mlp": 1.02312553, "epoch": 0.45928152713061776, "flos": 19172133014400.0, "grad_norm": 1.8020175509044534, "language_loss": 0.74017608, "learning_rate": 2.358881852733989e-06, "loss": 0.76173615, "num_input_tokens_seen": 163871855, "step": 7639, "time_per_iteration": 2.6385724544525146 }, { "auxiliary_loss_clip": 0.01134704, "auxiliary_loss_mlp": 0.01038079, "balance_loss_clip": 1.05116391, "balance_loss_mlp": 1.02403021, "epoch": 0.4593416503832857, "flos": 22414686491520.0, "grad_norm": 1.704541952239469, "language_loss": 0.68183744, "learning_rate": 2.358498705700346e-06, "loss": 0.7035653, "num_input_tokens_seen": 163891450, "step": 7640, "time_per_iteration": 2.6786441802978516 }, { "auxiliary_loss_clip": 0.01104644, "auxiliary_loss_mlp": 0.01040873, "balance_loss_clip": 1.04305553, "balance_loss_mlp": 1.02640736, "epoch": 0.4594017736359537, "flos": 18880215183360.0, "grad_norm": 1.6440653073556697, "language_loss": 0.75610799, "learning_rate": 2.3581155450723958e-06, "loss": 0.77756315, "num_input_tokens_seen": 163909345, "step": 7641, "time_per_iteration": 2.6967337131500244 }, { "auxiliary_loss_clip": 0.01107468, "auxiliary_loss_mlp": 0.0103519, "balance_loss_clip": 1.04473758, "balance_loss_mlp": 1.01987791, "epoch": 0.45946189688862166, "flos": 20518567349760.0, "grad_norm": 1.7366807351650166, "language_loss": 0.7477932, "learning_rate": 2.357732370864668e-06, "loss": 0.76921976, "num_input_tokens_seen": 163926940, "step": 7642, "time_per_iteration": 2.7593836784362793 }, { "auxiliary_loss_clip": 0.01033439, "auxiliary_loss_mlp": 0.01015123, "balance_loss_clip": 1.02063584, "balance_loss_mlp": 1.01360917, "epoch": 0.4595220201412896, "flos": 61405990162560.0, "grad_norm": 0.8870453562304583, "language_loss": 0.58169055, "learning_rate": 2.357349183091694e-06, "loss": 0.60217613, "num_input_tokens_seen": 163977785, "step": 7643, "time_per_iteration": 3.008721351623535 }, { "auxiliary_loss_clip": 0.01126407, "auxiliary_loss_mlp": 0.01039184, "balance_loss_clip": 1.04902744, "balance_loss_mlp": 1.02468801, "epoch": 0.4595821433939576, "flos": 23330947547520.0, "grad_norm": 1.6727361984558426, "language_loss": 0.92977291, "learning_rate": 2.3569659817680016e-06, "loss": 0.95142883, "num_input_tokens_seen": 163996630, "step": 7644, "time_per_iteration": 2.6844348907470703 }, { "auxiliary_loss_clip": 0.01118806, "auxiliary_loss_mlp": 0.0103695, "balance_loss_clip": 1.04879534, "balance_loss_mlp": 1.02278805, "epoch": 0.4596422666466256, "flos": 14282356711680.0, "grad_norm": 2.49930104784668, "language_loss": 0.82485175, "learning_rate": 2.3565827669081243e-06, "loss": 0.84640932, "num_input_tokens_seen": 164013190, "step": 7645, "time_per_iteration": 2.649367332458496 }, { "auxiliary_loss_clip": 0.01010103, "auxiliary_loss_mlp": 0.00999811, "balance_loss_clip": 1.01816797, "balance_loss_mlp": 0.99795145, "epoch": 0.4597023898992936, "flos": 65727337737600.0, "grad_norm": 0.7581805782249401, "language_loss": 0.59857589, "learning_rate": 2.356199538526593e-06, "loss": 0.61867499, "num_input_tokens_seen": 164074030, "step": 7646, "time_per_iteration": 3.211512327194214 }, { "auxiliary_loss_clip": 0.01116258, "auxiliary_loss_mlp": 0.01035245, "balance_loss_clip": 1.04631102, "balance_loss_mlp": 1.02006984, "epoch": 0.45976251315196154, "flos": 26907075653760.0, "grad_norm": 1.794903772385352, "language_loss": 0.72503293, "learning_rate": 2.355816296637939e-06, "loss": 0.74654794, "num_input_tokens_seen": 164095515, "step": 7647, "time_per_iteration": 2.792795419692993 }, { "auxiliary_loss_clip": 0.01096575, "auxiliary_loss_mlp": 0.01041791, "balance_loss_clip": 1.04206514, "balance_loss_mlp": 1.02684855, "epoch": 0.4598226364046295, "flos": 26618066824320.0, "grad_norm": 1.7350588372730733, "language_loss": 0.66805142, "learning_rate": 2.3554330412566957e-06, "loss": 0.68943512, "num_input_tokens_seen": 164117270, "step": 7648, "time_per_iteration": 2.798882484436035 }, { "auxiliary_loss_clip": 0.01120443, "auxiliary_loss_mlp": 0.01037713, "balance_loss_clip": 1.04601169, "balance_loss_mlp": 1.0234313, "epoch": 0.45988275965729747, "flos": 24387762522240.0, "grad_norm": 1.4487791655991338, "language_loss": 0.78854847, "learning_rate": 2.3550497723973953e-06, "loss": 0.81013, "num_input_tokens_seen": 164137850, "step": 7649, "time_per_iteration": 2.710026979446411 }, { "auxiliary_loss_clip": 0.01071387, "auxiliary_loss_mlp": 0.01039161, "balance_loss_clip": 1.0469979, "balance_loss_mlp": 1.02459955, "epoch": 0.45994288290996543, "flos": 24535822383360.0, "grad_norm": 1.68877556398497, "language_loss": 0.69140404, "learning_rate": 2.3546664900745726e-06, "loss": 0.71250951, "num_input_tokens_seen": 164157960, "step": 7650, "time_per_iteration": 2.862882375717163 }, { "auxiliary_loss_clip": 0.01128714, "auxiliary_loss_mlp": 0.01042169, "balance_loss_clip": 1.05184257, "balance_loss_mlp": 1.02592099, "epoch": 0.4600030061626334, "flos": 14830245838080.0, "grad_norm": 2.8986833449878686, "language_loss": 0.844868, "learning_rate": 2.354283194302761e-06, "loss": 0.86657685, "num_input_tokens_seen": 164174590, "step": 7651, "time_per_iteration": 2.624094247817993 }, { "auxiliary_loss_clip": 0.01108337, "auxiliary_loss_mlp": 0.00771732, "balance_loss_clip": 1.04726708, "balance_loss_mlp": 1.00045896, "epoch": 0.46006312941530136, "flos": 18113845582080.0, "grad_norm": 1.8740934460638858, "language_loss": 0.75375748, "learning_rate": 2.3538998850964948e-06, "loss": 0.77255821, "num_input_tokens_seen": 164192935, "step": 7652, "time_per_iteration": 2.7064099311828613 }, { "auxiliary_loss_clip": 0.01083449, "auxiliary_loss_mlp": 0.01033562, "balance_loss_clip": 1.04353166, "balance_loss_mlp": 1.019364, "epoch": 0.46012325266796933, "flos": 21976468565760.0, "grad_norm": 1.6780448716001595, "language_loss": 0.75990206, "learning_rate": 2.3535165624703097e-06, "loss": 0.78107214, "num_input_tokens_seen": 164213160, "step": 7653, "time_per_iteration": 2.840228319168091 }, { "auxiliary_loss_clip": 0.01090017, "auxiliary_loss_mlp": 0.01037352, "balance_loss_clip": 1.04773235, "balance_loss_mlp": 1.02063906, "epoch": 0.4601833759206373, "flos": 15268068714240.0, "grad_norm": 4.060223218919271, "language_loss": 0.65658432, "learning_rate": 2.353133226438741e-06, "loss": 0.67785805, "num_input_tokens_seen": 164229330, "step": 7654, "time_per_iteration": 2.8097331523895264 }, { "auxiliary_loss_clip": 0.0110323, "auxiliary_loss_mlp": 0.01038674, "balance_loss_clip": 1.04187179, "balance_loss_mlp": 1.02436912, "epoch": 0.46024349917330526, "flos": 27088999061760.0, "grad_norm": 1.8761760458574834, "language_loss": 0.79274917, "learning_rate": 2.3527498770163248e-06, "loss": 0.81416821, "num_input_tokens_seen": 164248240, "step": 7655, "time_per_iteration": 2.758086681365967 }, { "auxiliary_loss_clip": 0.01090903, "auxiliary_loss_mlp": 0.01032546, "balance_loss_clip": 1.0439781, "balance_loss_mlp": 1.01801491, "epoch": 0.4603036224259732, "flos": 24462923731200.0, "grad_norm": 1.6240518023721515, "language_loss": 0.68172526, "learning_rate": 2.3523665142175985e-06, "loss": 0.70295978, "num_input_tokens_seen": 164268020, "step": 7656, "time_per_iteration": 2.740079402923584 }, { "auxiliary_loss_clip": 0.01107571, "auxiliary_loss_mlp": 0.01034222, "balance_loss_clip": 1.04353023, "balance_loss_mlp": 1.02023935, "epoch": 0.4603637456786412, "flos": 28109292883200.0, "grad_norm": 2.01428243239582, "language_loss": 0.80944681, "learning_rate": 2.351983138057098e-06, "loss": 0.83086479, "num_input_tokens_seen": 164287305, "step": 7657, "time_per_iteration": 5.946510314941406 }, { "auxiliary_loss_clip": 0.01130018, "auxiliary_loss_mlp": 0.00771647, "balance_loss_clip": 1.04671657, "balance_loss_mlp": 1.00056028, "epoch": 0.4604238689313092, "flos": 24348942898560.0, "grad_norm": 2.997035997447325, "language_loss": 0.70678955, "learning_rate": 2.3515997485493623e-06, "loss": 0.72580624, "num_input_tokens_seen": 164306835, "step": 7658, "time_per_iteration": 2.710728883743286 }, { "auxiliary_loss_clip": 0.01037878, "auxiliary_loss_mlp": 0.01003053, "balance_loss_clip": 1.01928806, "balance_loss_mlp": 1.00126505, "epoch": 0.4604839921839772, "flos": 53606229431040.0, "grad_norm": 0.9879963677197028, "language_loss": 0.62104321, "learning_rate": 2.351216345708928e-06, "loss": 0.64145255, "num_input_tokens_seen": 164367095, "step": 7659, "time_per_iteration": 4.733903646469116 }, { "auxiliary_loss_clip": 0.01079557, "auxiliary_loss_mlp": 0.01042331, "balance_loss_clip": 1.04242504, "balance_loss_mlp": 1.02548122, "epoch": 0.46054411543664514, "flos": 31248424126080.0, "grad_norm": 1.6833434349921483, "language_loss": 0.68750244, "learning_rate": 2.350832929550336e-06, "loss": 0.70872128, "num_input_tokens_seen": 164388895, "step": 7660, "time_per_iteration": 2.8501877784729004 }, { "auxiliary_loss_clip": 0.01115644, "auxiliary_loss_mlp": 0.01039595, "balance_loss_clip": 1.04312992, "balance_loss_mlp": 1.02450275, "epoch": 0.4606042386893131, "flos": 24092863862400.0, "grad_norm": 4.508470627980692, "language_loss": 0.77059424, "learning_rate": 2.3504495000881227e-06, "loss": 0.79214668, "num_input_tokens_seen": 164409080, "step": 7661, "time_per_iteration": 4.375652313232422 }, { "auxiliary_loss_clip": 0.01111668, "auxiliary_loss_mlp": 0.01045702, "balance_loss_clip": 1.04530478, "balance_loss_mlp": 1.02989531, "epoch": 0.46066436194198107, "flos": 26578457101440.0, "grad_norm": 1.8557827945777399, "language_loss": 0.75165689, "learning_rate": 2.3500660573368305e-06, "loss": 0.77323061, "num_input_tokens_seen": 164427585, "step": 7662, "time_per_iteration": 2.654381513595581 }, { "auxiliary_loss_clip": 0.01104085, "auxiliary_loss_mlp": 0.01041771, "balance_loss_clip": 1.0422461, "balance_loss_mlp": 1.02585697, "epoch": 0.46072448519464904, "flos": 17775602184960.0, "grad_norm": 3.5055114571256922, "language_loss": 0.79886508, "learning_rate": 2.349682601310998e-06, "loss": 0.82032371, "num_input_tokens_seen": 164438455, "step": 7663, "time_per_iteration": 2.6240744590759277 }, { "auxiliary_loss_clip": 0.0111588, "auxiliary_loss_mlp": 0.01034844, "balance_loss_clip": 1.04562616, "balance_loss_mlp": 1.02098536, "epoch": 0.460784608447317, "flos": 15086109392640.0, "grad_norm": 2.0015713101361565, "language_loss": 0.73791528, "learning_rate": 2.3492991320251653e-06, "loss": 0.75942254, "num_input_tokens_seen": 164456830, "step": 7664, "time_per_iteration": 2.673335075378418 }, { "auxiliary_loss_clip": 0.01096445, "auxiliary_loss_mlp": 0.01036863, "balance_loss_clip": 1.04571927, "balance_loss_mlp": 1.02313614, "epoch": 0.46084473169998497, "flos": 18588261438720.0, "grad_norm": 1.5274295482700302, "language_loss": 0.7257731, "learning_rate": 2.3489156494938753e-06, "loss": 0.74710619, "num_input_tokens_seen": 164475375, "step": 7665, "time_per_iteration": 2.7057924270629883 }, { "auxiliary_loss_clip": 0.01104187, "auxiliary_loss_mlp": 0.01034968, "balance_loss_clip": 1.04968786, "balance_loss_mlp": 1.02148521, "epoch": 0.46090485495265293, "flos": 19494789909120.0, "grad_norm": 1.7665019302136358, "language_loss": 0.78369665, "learning_rate": 2.348532153731669e-06, "loss": 0.80508822, "num_input_tokens_seen": 164492040, "step": 7666, "time_per_iteration": 2.6954169273376465 }, { "auxiliary_loss_clip": 0.0108371, "auxiliary_loss_mlp": 0.01035058, "balance_loss_clip": 1.04061627, "balance_loss_mlp": 1.01935792, "epoch": 0.4609649782053209, "flos": 33364927163520.0, "grad_norm": 1.7291426769142197, "language_loss": 0.74374932, "learning_rate": 2.348148644753088e-06, "loss": 0.76493704, "num_input_tokens_seen": 164513665, "step": 7667, "time_per_iteration": 2.781087636947632 }, { "auxiliary_loss_clip": 0.01083108, "auxiliary_loss_mlp": 0.01038011, "balance_loss_clip": 1.04470205, "balance_loss_mlp": 1.02440965, "epoch": 0.46102510145798886, "flos": 23769165473280.0, "grad_norm": 1.4213815945133983, "language_loss": 0.75993818, "learning_rate": 2.347765122572676e-06, "loss": 0.78114939, "num_input_tokens_seen": 164533890, "step": 7668, "time_per_iteration": 2.8653104305267334 }, { "auxiliary_loss_clip": 0.010726, "auxiliary_loss_mlp": 0.01033857, "balance_loss_clip": 1.04025698, "balance_loss_mlp": 1.02047563, "epoch": 0.4610852247106568, "flos": 23294821443840.0, "grad_norm": 1.7696248586775516, "language_loss": 0.78228277, "learning_rate": 2.347381587204975e-06, "loss": 0.80334735, "num_input_tokens_seen": 164553815, "step": 7669, "time_per_iteration": 2.783662796020508 }, { "auxiliary_loss_clip": 0.01110483, "auxiliary_loss_mlp": 0.01038047, "balance_loss_clip": 1.04095972, "balance_loss_mlp": 1.02259183, "epoch": 0.4611453479633248, "flos": 25447450584960.0, "grad_norm": 1.7322551840105593, "language_loss": 0.82352221, "learning_rate": 2.34699803866453e-06, "loss": 0.84500754, "num_input_tokens_seen": 164573125, "step": 7670, "time_per_iteration": 2.6722826957702637 }, { "auxiliary_loss_clip": 0.01118191, "auxiliary_loss_mlp": 0.01034929, "balance_loss_clip": 1.04624724, "balance_loss_mlp": 1.02086234, "epoch": 0.4612054712159928, "flos": 21139606523520.0, "grad_norm": 1.6399167633004121, "language_loss": 0.63361788, "learning_rate": 2.3466144769658845e-06, "loss": 0.6551491, "num_input_tokens_seen": 164592575, "step": 7671, "time_per_iteration": 2.6507785320281982 }, { "auxiliary_loss_clip": 0.01038838, "auxiliary_loss_mlp": 0.01005964, "balance_loss_clip": 1.02976012, "balance_loss_mlp": 1.0044564, "epoch": 0.4612655944686608, "flos": 69959266404480.0, "grad_norm": 0.6926647500019024, "language_loss": 0.55842638, "learning_rate": 2.346230902123583e-06, "loss": 0.57887447, "num_input_tokens_seen": 164659795, "step": 7672, "time_per_iteration": 3.330268144607544 }, { "auxiliary_loss_clip": 0.01119098, "auxiliary_loss_mlp": 0.01040288, "balance_loss_clip": 1.04617, "balance_loss_mlp": 1.02645397, "epoch": 0.46132571772132874, "flos": 16837149502080.0, "grad_norm": 1.8809200572873195, "language_loss": 0.70954943, "learning_rate": 2.3458473141521715e-06, "loss": 0.7311433, "num_input_tokens_seen": 164678735, "step": 7673, "time_per_iteration": 2.65659499168396 }, { "auxiliary_loss_clip": 0.01103001, "auxiliary_loss_mlp": 0.01033294, "balance_loss_clip": 1.04363799, "balance_loss_mlp": 1.01938248, "epoch": 0.4613858409739967, "flos": 35808935431680.0, "grad_norm": 1.9110713796675685, "language_loss": 0.70837104, "learning_rate": 2.345463713066195e-06, "loss": 0.72973394, "num_input_tokens_seen": 164700885, "step": 7674, "time_per_iteration": 2.8332366943359375 }, { "auxiliary_loss_clip": 0.01103023, "auxiliary_loss_mlp": 0.0104104, "balance_loss_clip": 1.04143381, "balance_loss_mlp": 1.02709818, "epoch": 0.4614459642266647, "flos": 35266756567680.0, "grad_norm": 1.6933433527162, "language_loss": 0.65489, "learning_rate": 2.3450800988801996e-06, "loss": 0.67633063, "num_input_tokens_seen": 164726960, "step": 7675, "time_per_iteration": 2.8454952239990234 }, { "auxiliary_loss_clip": 0.01047065, "auxiliary_loss_mlp": 0.01003099, "balance_loss_clip": 1.02009785, "balance_loss_mlp": 1.00131118, "epoch": 0.46150608747933264, "flos": 66704610044160.0, "grad_norm": 0.8598142136337862, "language_loss": 0.58659744, "learning_rate": 2.3446964716087327e-06, "loss": 0.60709906, "num_input_tokens_seen": 164788525, "step": 7676, "time_per_iteration": 3.1523091793060303 }, { "auxiliary_loss_clip": 0.0101473, "auxiliary_loss_mlp": 0.01002448, "balance_loss_clip": 1.01614749, "balance_loss_mlp": 1.00077868, "epoch": 0.4615662107320006, "flos": 55830177025920.0, "grad_norm": 0.7931279707742926, "language_loss": 0.62803817, "learning_rate": 2.344312831266341e-06, "loss": 0.64820993, "num_input_tokens_seen": 164843525, "step": 7677, "time_per_iteration": 3.1055288314819336 }, { "auxiliary_loss_clip": 0.01103004, "auxiliary_loss_mlp": 0.01036602, "balance_loss_clip": 1.04363084, "balance_loss_mlp": 1.02309012, "epoch": 0.46162633398466857, "flos": 15483245137920.0, "grad_norm": 2.4819209870900636, "language_loss": 0.76371491, "learning_rate": 2.3439291778675718e-06, "loss": 0.78511101, "num_input_tokens_seen": 164859895, "step": 7678, "time_per_iteration": 2.6796817779541016 }, { "auxiliary_loss_clip": 0.01131922, "auxiliary_loss_mlp": 0.01035943, "balance_loss_clip": 1.04888463, "balance_loss_mlp": 1.02157795, "epoch": 0.46168645723733653, "flos": 20011437181440.0, "grad_norm": 2.4568506909255974, "language_loss": 0.66881382, "learning_rate": 2.343545511426974e-06, "loss": 0.69049251, "num_input_tokens_seen": 164878030, "step": 7679, "time_per_iteration": 2.669527053833008 }, { "auxiliary_loss_clip": 0.01095986, "auxiliary_loss_mlp": 0.01037988, "balance_loss_clip": 1.04533219, "balance_loss_mlp": 1.02469063, "epoch": 0.4617465804900045, "flos": 20298542590080.0, "grad_norm": 2.335341416202827, "language_loss": 0.70432782, "learning_rate": 2.3431618319590963e-06, "loss": 0.7256676, "num_input_tokens_seen": 164895710, "step": 7680, "time_per_iteration": 2.7286808490753174 }, { "auxiliary_loss_clip": 0.01137583, "auxiliary_loss_mlp": 0.01043671, "balance_loss_clip": 1.05160725, "balance_loss_mlp": 1.02904963, "epoch": 0.46180670374267246, "flos": 22346312952960.0, "grad_norm": 1.9037139750308347, "language_loss": 0.63464803, "learning_rate": 2.342778139478487e-06, "loss": 0.65646052, "num_input_tokens_seen": 164913365, "step": 7681, "time_per_iteration": 2.6214568614959717 }, { "auxiliary_loss_clip": 0.01116453, "auxiliary_loss_mlp": 0.01029466, "balance_loss_clip": 1.04633749, "balance_loss_mlp": 1.01636481, "epoch": 0.46186682699534043, "flos": 19895696582400.0, "grad_norm": 1.5164971745129476, "language_loss": 0.67357612, "learning_rate": 2.342394433999697e-06, "loss": 0.69503522, "num_input_tokens_seen": 164931620, "step": 7682, "time_per_iteration": 2.647353410720825 }, { "auxiliary_loss_clip": 0.01088835, "auxiliary_loss_mlp": 0.01041013, "balance_loss_clip": 1.04340196, "balance_loss_mlp": 1.02619505, "epoch": 0.4619269502480084, "flos": 31503569408640.0, "grad_norm": 2.227871519060849, "language_loss": 0.73820949, "learning_rate": 2.342010715537275e-06, "loss": 0.75950789, "num_input_tokens_seen": 164950905, "step": 7683, "time_per_iteration": 2.7580692768096924 }, { "auxiliary_loss_clip": 0.01128951, "auxiliary_loss_mlp": 0.01039533, "balance_loss_clip": 1.04759753, "balance_loss_mlp": 1.02627087, "epoch": 0.46198707350067636, "flos": 25009484054400.0, "grad_norm": 1.7711337337418462, "language_loss": 0.76479292, "learning_rate": 2.3416269841057726e-06, "loss": 0.7864778, "num_input_tokens_seen": 164970950, "step": 7684, "time_per_iteration": 2.6827478408813477 }, { "auxiliary_loss_clip": 0.01136661, "auxiliary_loss_mlp": 0.01044253, "balance_loss_clip": 1.0495609, "balance_loss_mlp": 1.02969098, "epoch": 0.4620471967533444, "flos": 18292357198080.0, "grad_norm": 1.8114594945271643, "language_loss": 0.79657519, "learning_rate": 2.3412432397197412e-06, "loss": 0.81838435, "num_input_tokens_seen": 164989855, "step": 7685, "time_per_iteration": 2.6539084911346436 }, { "auxiliary_loss_clip": 0.01085193, "auxiliary_loss_mlp": 0.01046975, "balance_loss_clip": 1.04328656, "balance_loss_mlp": 1.03158486, "epoch": 0.46210732000601235, "flos": 33985104410880.0, "grad_norm": 2.276305365525513, "language_loss": 0.66791403, "learning_rate": 2.340859482393731e-06, "loss": 0.68923569, "num_input_tokens_seen": 165012290, "step": 7686, "time_per_iteration": 2.8229949474334717 }, { "auxiliary_loss_clip": 0.01106797, "auxiliary_loss_mlp": 0.00772257, "balance_loss_clip": 1.04507184, "balance_loss_mlp": 1.00066257, "epoch": 0.4621674432586803, "flos": 25009412227200.0, "grad_norm": 2.1846142929829693, "language_loss": 0.73938292, "learning_rate": 2.340475712142296e-06, "loss": 0.75817347, "num_input_tokens_seen": 165030810, "step": 7687, "time_per_iteration": 2.8577284812927246 }, { "auxiliary_loss_clip": 0.01066455, "auxiliary_loss_mlp": 0.01038717, "balance_loss_clip": 1.0470593, "balance_loss_mlp": 1.02399492, "epoch": 0.4622275665113483, "flos": 22014031213440.0, "grad_norm": 2.1409043019128253, "language_loss": 0.74955392, "learning_rate": 2.3400919289799873e-06, "loss": 0.77060568, "num_input_tokens_seen": 165050205, "step": 7688, "time_per_iteration": 2.8981478214263916 }, { "auxiliary_loss_clip": 0.01076735, "auxiliary_loss_mlp": 0.00771909, "balance_loss_clip": 1.03838563, "balance_loss_mlp": 1.0005393, "epoch": 0.46228768976401624, "flos": 24058820747520.0, "grad_norm": 1.6416992765701228, "language_loss": 0.78753114, "learning_rate": 2.3397081329213585e-06, "loss": 0.80601752, "num_input_tokens_seen": 165069370, "step": 7689, "time_per_iteration": 2.8450090885162354 }, { "auxiliary_loss_clip": 0.01117226, "auxiliary_loss_mlp": 0.01039789, "balance_loss_clip": 1.04319644, "balance_loss_mlp": 1.02512646, "epoch": 0.4623478130166842, "flos": 26651391667200.0, "grad_norm": 2.047300589730092, "language_loss": 0.56996405, "learning_rate": 2.339324323980964e-06, "loss": 0.5915342, "num_input_tokens_seen": 165089610, "step": 7690, "time_per_iteration": 2.6919097900390625 }, { "auxiliary_loss_clip": 0.0111777, "auxiliary_loss_mlp": 0.01042754, "balance_loss_clip": 1.04474783, "balance_loss_mlp": 1.02853799, "epoch": 0.46240793626935217, "flos": 20558428467840.0, "grad_norm": 2.950419828824325, "language_loss": 0.82586032, "learning_rate": 2.3389405021733562e-06, "loss": 0.84746557, "num_input_tokens_seen": 165109050, "step": 7691, "time_per_iteration": 2.695331573486328 }, { "auxiliary_loss_clip": 0.01108828, "auxiliary_loss_mlp": 0.01034489, "balance_loss_clip": 1.04660177, "balance_loss_mlp": 1.02088761, "epoch": 0.46246805952202014, "flos": 22456055980800.0, "grad_norm": 1.4872733065963748, "language_loss": 0.75199407, "learning_rate": 2.338556667513091e-06, "loss": 0.77342725, "num_input_tokens_seen": 165130130, "step": 7692, "time_per_iteration": 2.6822991371154785 }, { "auxiliary_loss_clip": 0.01097579, "auxiliary_loss_mlp": 0.01044516, "balance_loss_clip": 1.04742086, "balance_loss_mlp": 1.0297097, "epoch": 0.4625281827746881, "flos": 35041308854400.0, "grad_norm": 1.6276482481397991, "language_loss": 0.74345845, "learning_rate": 2.338172820014723e-06, "loss": 0.76487935, "num_input_tokens_seen": 165152685, "step": 7693, "time_per_iteration": 2.8581414222717285 }, { "auxiliary_loss_clip": 0.01087933, "auxiliary_loss_mlp": 0.01056162, "balance_loss_clip": 1.04530871, "balance_loss_mlp": 1.04086781, "epoch": 0.46258830602735607, "flos": 21068647205760.0, "grad_norm": 2.088066659615079, "language_loss": 0.85329688, "learning_rate": 2.337788959692808e-06, "loss": 0.8747378, "num_input_tokens_seen": 165173315, "step": 7694, "time_per_iteration": 2.730196237564087 }, { "auxiliary_loss_clip": 0.01111115, "auxiliary_loss_mlp": 0.01042848, "balance_loss_clip": 1.04707479, "balance_loss_mlp": 1.02936506, "epoch": 0.46264842928002403, "flos": 26177227205760.0, "grad_norm": 2.853578946778756, "language_loss": 0.79611814, "learning_rate": 2.337405086561902e-06, "loss": 0.81765783, "num_input_tokens_seen": 165192395, "step": 7695, "time_per_iteration": 2.7454562187194824 }, { "auxiliary_loss_clip": 0.01114811, "auxiliary_loss_mlp": 0.01037414, "balance_loss_clip": 1.04553604, "balance_loss_mlp": 1.02390218, "epoch": 0.462708552532692, "flos": 16764214936320.0, "grad_norm": 1.803891217274167, "language_loss": 0.72445035, "learning_rate": 2.3370212006365606e-06, "loss": 0.74597263, "num_input_tokens_seen": 165211355, "step": 7696, "time_per_iteration": 4.214217901229858 }, { "auxiliary_loss_clip": 0.01110882, "auxiliary_loss_mlp": 0.01046867, "balance_loss_clip": 1.04748213, "balance_loss_mlp": 1.03221607, "epoch": 0.46276867578535996, "flos": 15560453422080.0, "grad_norm": 1.5710514609338178, "language_loss": 0.69939005, "learning_rate": 2.3366373019313423e-06, "loss": 0.72096753, "num_input_tokens_seen": 165229380, "step": 7697, "time_per_iteration": 4.213683843612671 }, { "auxiliary_loss_clip": 0.01133171, "auxiliary_loss_mlp": 0.01036334, "balance_loss_clip": 1.05145979, "balance_loss_mlp": 1.02264249, "epoch": 0.462828799038028, "flos": 22415404763520.0, "grad_norm": 1.9243080556164578, "language_loss": 0.84559363, "learning_rate": 2.3362533904608025e-06, "loss": 0.86728865, "num_input_tokens_seen": 165247200, "step": 7698, "time_per_iteration": 2.6434006690979004 }, { "auxiliary_loss_clip": 0.01130166, "auxiliary_loss_mlp": 0.01037324, "balance_loss_clip": 1.04838073, "balance_loss_mlp": 1.02357352, "epoch": 0.46288892229069595, "flos": 21069580959360.0, "grad_norm": 8.31912219741259, "language_loss": 0.71345413, "learning_rate": 2.335869466239502e-06, "loss": 0.73512906, "num_input_tokens_seen": 165265825, "step": 7699, "time_per_iteration": 4.157729387283325 }, { "auxiliary_loss_clip": 0.01073609, "auxiliary_loss_mlp": 0.01040377, "balance_loss_clip": 1.04345739, "balance_loss_mlp": 1.02550519, "epoch": 0.4629490455433639, "flos": 23185688947200.0, "grad_norm": 1.732328117704307, "language_loss": 0.71911675, "learning_rate": 2.335485529281996e-06, "loss": 0.74025667, "num_input_tokens_seen": 165284380, "step": 7700, "time_per_iteration": 2.8432295322418213 }, { "auxiliary_loss_clip": 0.01128125, "auxiliary_loss_mlp": 0.00771852, "balance_loss_clip": 1.04640698, "balance_loss_mlp": 1.00047588, "epoch": 0.4630091687960319, "flos": 18835541642880.0, "grad_norm": 2.4184025660528863, "language_loss": 0.73149109, "learning_rate": 2.3351015796028467e-06, "loss": 0.7504909, "num_input_tokens_seen": 165300320, "step": 7701, "time_per_iteration": 4.2371203899383545 }, { "auxiliary_loss_clip": 0.01087014, "auxiliary_loss_mlp": 0.01044166, "balance_loss_clip": 1.04401398, "balance_loss_mlp": 1.02921128, "epoch": 0.46306929204869984, "flos": 38907020407680.0, "grad_norm": 2.4372676297457216, "language_loss": 0.65005761, "learning_rate": 2.3347176172166114e-06, "loss": 0.67136943, "num_input_tokens_seen": 165318130, "step": 7702, "time_per_iteration": 2.875633716583252 }, { "auxiliary_loss_clip": 0.01103467, "auxiliary_loss_mlp": 0.01032726, "balance_loss_clip": 1.04441071, "balance_loss_mlp": 1.01875424, "epoch": 0.4631294153013678, "flos": 19644178573440.0, "grad_norm": 1.9024039666922008, "language_loss": 0.73310453, "learning_rate": 2.33433364213785e-06, "loss": 0.75446641, "num_input_tokens_seen": 165336225, "step": 7703, "time_per_iteration": 2.7307324409484863 }, { "auxiliary_loss_clip": 0.01109216, "auxiliary_loss_mlp": 0.0103683, "balance_loss_clip": 1.04673266, "balance_loss_mlp": 1.02145839, "epoch": 0.4631895385540358, "flos": 24608254158720.0, "grad_norm": 1.9428423147374236, "language_loss": 0.68751299, "learning_rate": 2.3339496543811243e-06, "loss": 0.70897353, "num_input_tokens_seen": 165355005, "step": 7704, "time_per_iteration": 2.7113852500915527 }, { "auxiliary_loss_clip": 0.01120314, "auxiliary_loss_mlp": 0.01033991, "balance_loss_clip": 1.04720986, "balance_loss_mlp": 1.01935196, "epoch": 0.46324966180670374, "flos": 26320115508480.0, "grad_norm": 2.3420396256779443, "language_loss": 0.81331742, "learning_rate": 2.3335656539609934e-06, "loss": 0.83486044, "num_input_tokens_seen": 165374910, "step": 7705, "time_per_iteration": 2.804708480834961 }, { "auxiliary_loss_clip": 0.01119161, "auxiliary_loss_mlp": 0.01035806, "balance_loss_clip": 1.04762256, "balance_loss_mlp": 1.02172124, "epoch": 0.4633097850593717, "flos": 19240506552960.0, "grad_norm": 1.6909152504462979, "language_loss": 0.77714217, "learning_rate": 2.3331816408920196e-06, "loss": 0.79869187, "num_input_tokens_seen": 165392590, "step": 7706, "time_per_iteration": 2.67990779876709 }, { "auxiliary_loss_clip": 0.01102016, "auxiliary_loss_mlp": 0.01033802, "balance_loss_clip": 1.04767776, "balance_loss_mlp": 1.02023578, "epoch": 0.46336990831203967, "flos": 22783166161920.0, "grad_norm": 2.039386256395222, "language_loss": 0.699494, "learning_rate": 2.3327976151887654e-06, "loss": 0.7208522, "num_input_tokens_seen": 165411195, "step": 7707, "time_per_iteration": 2.7109720706939697 }, { "auxiliary_loss_clip": 0.01111011, "auxiliary_loss_mlp": 0.01038647, "balance_loss_clip": 1.04469609, "balance_loss_mlp": 1.02306628, "epoch": 0.46343003156470763, "flos": 38210604543360.0, "grad_norm": 1.931472234163978, "language_loss": 0.61287057, "learning_rate": 2.332413576865791e-06, "loss": 0.63436711, "num_input_tokens_seen": 165430150, "step": 7708, "time_per_iteration": 2.8489346504211426 }, { "auxiliary_loss_clip": 0.01089075, "auxiliary_loss_mlp": 0.01033464, "balance_loss_clip": 1.04273093, "balance_loss_mlp": 1.01930773, "epoch": 0.4634901548173756, "flos": 31938555110400.0, "grad_norm": 2.4081522593734332, "language_loss": 0.77443427, "learning_rate": 2.3320295259376614e-06, "loss": 0.79565972, "num_input_tokens_seen": 165450595, "step": 7709, "time_per_iteration": 2.720604419708252 }, { "auxiliary_loss_clip": 0.01134634, "auxiliary_loss_mlp": 0.0103959, "balance_loss_clip": 1.04938257, "balance_loss_mlp": 1.02433753, "epoch": 0.46355027807004356, "flos": 20082540153600.0, "grad_norm": 1.78810829524809, "language_loss": 0.77216917, "learning_rate": 2.3316454624189385e-06, "loss": 0.79391134, "num_input_tokens_seen": 165469515, "step": 7710, "time_per_iteration": 2.5303022861480713 }, { "auxiliary_loss_clip": 0.01122514, "auxiliary_loss_mlp": 0.01037619, "balance_loss_clip": 1.04637122, "balance_loss_mlp": 1.02172804, "epoch": 0.4636104013227116, "flos": 24061370613120.0, "grad_norm": 2.2400017320201187, "language_loss": 0.73509276, "learning_rate": 2.3312613863241865e-06, "loss": 0.75669408, "num_input_tokens_seen": 165488125, "step": 7711, "time_per_iteration": 2.5654797554016113 }, { "auxiliary_loss_clip": 0.0110546, "auxiliary_loss_mlp": 0.01046309, "balance_loss_clip": 1.04776788, "balance_loss_mlp": 1.03109789, "epoch": 0.46367052457537955, "flos": 23914639555200.0, "grad_norm": 1.4625168937424313, "language_loss": 0.71734262, "learning_rate": 2.33087729766797e-06, "loss": 0.73886031, "num_input_tokens_seen": 165509225, "step": 7712, "time_per_iteration": 2.6021108627319336 }, { "auxiliary_loss_clip": 0.01109448, "auxiliary_loss_mlp": 0.01039959, "balance_loss_clip": 1.04681897, "balance_loss_mlp": 1.02359128, "epoch": 0.4637306478280475, "flos": 26396533693440.0, "grad_norm": 10.680731903132253, "language_loss": 0.73100054, "learning_rate": 2.3304931964648524e-06, "loss": 0.75249463, "num_input_tokens_seen": 165529945, "step": 7713, "time_per_iteration": 2.7074029445648193 }, { "auxiliary_loss_clip": 0.01098034, "auxiliary_loss_mlp": 0.01037925, "balance_loss_clip": 1.0441041, "balance_loss_mlp": 1.02191556, "epoch": 0.4637907710807155, "flos": 21980706370560.0, "grad_norm": 1.6982870192648571, "language_loss": 0.5889293, "learning_rate": 2.3301090827294e-06, "loss": 0.61028892, "num_input_tokens_seen": 165550690, "step": 7714, "time_per_iteration": 2.710048198699951 }, { "auxiliary_loss_clip": 0.01120282, "auxiliary_loss_mlp": 0.01034073, "balance_loss_clip": 1.04763293, "balance_loss_mlp": 1.01950562, "epoch": 0.46385089433338345, "flos": 12422291846400.0, "grad_norm": 1.91274815186046, "language_loss": 0.70204347, "learning_rate": 2.3297249564761784e-06, "loss": 0.72358704, "num_input_tokens_seen": 165567775, "step": 7715, "time_per_iteration": 2.6403465270996094 }, { "auxiliary_loss_clip": 0.01138235, "auxiliary_loss_mlp": 0.01041941, "balance_loss_clip": 1.04938495, "balance_loss_mlp": 1.02725387, "epoch": 0.4639110175860514, "flos": 23915752876800.0, "grad_norm": 2.6000471859571777, "language_loss": 0.68646967, "learning_rate": 2.3293408177197527e-06, "loss": 0.7082715, "num_input_tokens_seen": 165587010, "step": 7716, "time_per_iteration": 2.6233439445495605 }, { "auxiliary_loss_clip": 0.01132713, "auxiliary_loss_mlp": 0.01031179, "balance_loss_clip": 1.0472188, "balance_loss_mlp": 1.01599193, "epoch": 0.4639711408387194, "flos": 25300396304640.0, "grad_norm": 1.7614766285874086, "language_loss": 0.809901, "learning_rate": 2.328956666474691e-06, "loss": 0.83153987, "num_input_tokens_seen": 165607850, "step": 7717, "time_per_iteration": 2.6267318725585938 }, { "auxiliary_loss_clip": 0.01131786, "auxiliary_loss_mlp": 0.01036738, "balance_loss_clip": 1.0477078, "balance_loss_mlp": 1.02206373, "epoch": 0.46403126409138734, "flos": 21211822817280.0, "grad_norm": 1.7513215449973674, "language_loss": 0.73192513, "learning_rate": 2.3285725027555593e-06, "loss": 0.75361037, "num_input_tokens_seen": 165627175, "step": 7718, "time_per_iteration": 2.5936009883880615 }, { "auxiliary_loss_clip": 0.01129362, "auxiliary_loss_mlp": 0.00772229, "balance_loss_clip": 1.04671347, "balance_loss_mlp": 1.00063276, "epoch": 0.4640913873440553, "flos": 35845564325760.0, "grad_norm": 1.6991265809872926, "language_loss": 0.70156294, "learning_rate": 2.3281883265769254e-06, "loss": 0.72057891, "num_input_tokens_seen": 165648340, "step": 7719, "time_per_iteration": 2.7047362327575684 }, { "auxiliary_loss_clip": 0.01112084, "auxiliary_loss_mlp": 0.01036441, "balance_loss_clip": 1.05082273, "balance_loss_mlp": 1.02101541, "epoch": 0.46415151059672327, "flos": 19166207270400.0, "grad_norm": 2.142564905802957, "language_loss": 0.86823177, "learning_rate": 2.327804137953357e-06, "loss": 0.88971704, "num_input_tokens_seen": 165667195, "step": 7720, "time_per_iteration": 2.7309963703155518 }, { "auxiliary_loss_clip": 0.01032352, "auxiliary_loss_mlp": 0.01008212, "balance_loss_clip": 1.02414155, "balance_loss_mlp": 1.00647151, "epoch": 0.46421163384939124, "flos": 58912750304640.0, "grad_norm": 0.7188509278747012, "language_loss": 0.55039424, "learning_rate": 2.3274199368994226e-06, "loss": 0.57079989, "num_input_tokens_seen": 165726760, "step": 7721, "time_per_iteration": 3.236877679824829 }, { "auxiliary_loss_clip": 0.01107525, "auxiliary_loss_mlp": 0.01036882, "balance_loss_clip": 1.04643178, "balance_loss_mlp": 1.02240443, "epoch": 0.4642717571020592, "flos": 20157342226560.0, "grad_norm": 2.140310045449241, "language_loss": 0.79792923, "learning_rate": 2.3270357234296918e-06, "loss": 0.81937331, "num_input_tokens_seen": 165745005, "step": 7722, "time_per_iteration": 2.660754919052124 }, { "auxiliary_loss_clip": 0.01135285, "auxiliary_loss_mlp": 0.01039973, "balance_loss_clip": 1.04771972, "balance_loss_mlp": 1.02478552, "epoch": 0.46433188035472717, "flos": 25046184775680.0, "grad_norm": 1.8420199747356898, "language_loss": 0.77947485, "learning_rate": 2.3266514975587332e-06, "loss": 0.80122739, "num_input_tokens_seen": 165765750, "step": 7723, "time_per_iteration": 2.650667667388916 }, { "auxiliary_loss_clip": 0.010296, "auxiliary_loss_mlp": 0.01034411, "balance_loss_clip": 1.03560913, "balance_loss_mlp": 1.01945066, "epoch": 0.4643920036073952, "flos": 28075644817920.0, "grad_norm": 1.6775959652720056, "language_loss": 0.68506896, "learning_rate": 2.326267259301118e-06, "loss": 0.7057091, "num_input_tokens_seen": 165787515, "step": 7724, "time_per_iteration": 3.0586209297180176 }, { "auxiliary_loss_clip": 0.01115779, "auxiliary_loss_mlp": 0.01034262, "balance_loss_clip": 1.04832113, "balance_loss_mlp": 1.0193367, "epoch": 0.46445212686006315, "flos": 18369350000640.0, "grad_norm": 3.606583728635542, "language_loss": 0.67163348, "learning_rate": 2.325883008671415e-06, "loss": 0.69313383, "num_input_tokens_seen": 165806675, "step": 7725, "time_per_iteration": 2.9137332439422607 }, { "auxiliary_loss_clip": 0.01113984, "auxiliary_loss_mlp": 0.01038381, "balance_loss_clip": 1.04604602, "balance_loss_mlp": 1.02554178, "epoch": 0.4645122501127311, "flos": 31721618920320.0, "grad_norm": 1.751091551827286, "language_loss": 0.65037453, "learning_rate": 2.3254987456841955e-06, "loss": 0.67189825, "num_input_tokens_seen": 165829835, "step": 7726, "time_per_iteration": 2.7184534072875977 }, { "auxiliary_loss_clip": 0.0110497, "auxiliary_loss_mlp": 0.00772968, "balance_loss_clip": 1.04436016, "balance_loss_mlp": 1.00061822, "epoch": 0.4645723733653991, "flos": 23768806337280.0, "grad_norm": 1.6559858063545494, "language_loss": 0.74796247, "learning_rate": 2.3251144703540307e-06, "loss": 0.76674187, "num_input_tokens_seen": 165849380, "step": 7727, "time_per_iteration": 2.7193634510040283 }, { "auxiliary_loss_clip": 0.01107461, "auxiliary_loss_mlp": 0.0104049, "balance_loss_clip": 1.0458529, "balance_loss_mlp": 1.02506471, "epoch": 0.46463249661806705, "flos": 33145512935040.0, "grad_norm": 2.1928121253358293, "language_loss": 0.78549933, "learning_rate": 2.3247301826954936e-06, "loss": 0.80697882, "num_input_tokens_seen": 165868620, "step": 7728, "time_per_iteration": 2.744900703430176 }, { "auxiliary_loss_clip": 0.01092904, "auxiliary_loss_mlp": 0.01038861, "balance_loss_clip": 1.0414784, "balance_loss_mlp": 1.02373958, "epoch": 0.464692619870735, "flos": 18296020385280.0, "grad_norm": 2.0549050897499135, "language_loss": 0.75892472, "learning_rate": 2.324345882723155e-06, "loss": 0.78024244, "num_input_tokens_seen": 165885915, "step": 7729, "time_per_iteration": 2.7145724296569824 }, { "auxiliary_loss_clip": 0.01108829, "auxiliary_loss_mlp": 0.01047351, "balance_loss_clip": 1.0485568, "balance_loss_mlp": 1.03153229, "epoch": 0.464752743123403, "flos": 22638051216000.0, "grad_norm": 1.8824527818993837, "language_loss": 0.79760742, "learning_rate": 2.323961570451588e-06, "loss": 0.81916922, "num_input_tokens_seen": 165905465, "step": 7730, "time_per_iteration": 2.7782390117645264 }, { "auxiliary_loss_clip": 0.01130146, "auxiliary_loss_mlp": 0.01037223, "balance_loss_clip": 1.04756629, "balance_loss_mlp": 1.02265573, "epoch": 0.46481286637607094, "flos": 20412128373120.0, "grad_norm": 1.6262082138117517, "language_loss": 0.77182668, "learning_rate": 2.3235772458953655e-06, "loss": 0.79350036, "num_input_tokens_seen": 165924640, "step": 7731, "time_per_iteration": 2.617314577102661 }, { "auxiliary_loss_clip": 0.01090917, "auxiliary_loss_mlp": 0.01035098, "balance_loss_clip": 1.04506755, "balance_loss_mlp": 1.02119207, "epoch": 0.4648729896287389, "flos": 34275406129920.0, "grad_norm": 1.6446435516271722, "language_loss": 0.65999961, "learning_rate": 2.323192909069061e-06, "loss": 0.68125969, "num_input_tokens_seen": 165945765, "step": 7732, "time_per_iteration": 2.806825876235962 }, { "auxiliary_loss_clip": 0.01109545, "auxiliary_loss_mlp": 0.0104247, "balance_loss_clip": 1.04427695, "balance_loss_mlp": 1.02551866, "epoch": 0.4649331128814069, "flos": 21321781326720.0, "grad_norm": 2.341941786180864, "language_loss": 0.72770941, "learning_rate": 2.32280855998725e-06, "loss": 0.74922955, "num_input_tokens_seen": 165964025, "step": 7733, "time_per_iteration": 2.6884191036224365 }, { "auxiliary_loss_clip": 0.01046209, "auxiliary_loss_mlp": 0.01002418, "balance_loss_clip": 1.01885557, "balance_loss_mlp": 1.00089204, "epoch": 0.46499323613407484, "flos": 58308515717760.0, "grad_norm": 1.2786299900123337, "language_loss": 0.51944834, "learning_rate": 2.3224241986645057e-06, "loss": 0.53993464, "num_input_tokens_seen": 166021950, "step": 7734, "time_per_iteration": 3.0932440757751465 }, { "auxiliary_loss_clip": 0.01111419, "auxiliary_loss_mlp": 0.01034362, "balance_loss_clip": 1.05044913, "balance_loss_mlp": 1.01990235, "epoch": 0.4650533593867428, "flos": 10889660384640.0, "grad_norm": 2.1631100357564788, "language_loss": 0.75439203, "learning_rate": 2.3220398251154035e-06, "loss": 0.77584982, "num_input_tokens_seen": 166039675, "step": 7735, "time_per_iteration": 4.546087265014648 }, { "auxiliary_loss_clip": 0.01087553, "auxiliary_loss_mlp": 0.01045865, "balance_loss_clip": 1.04543328, "balance_loss_mlp": 1.0305233, "epoch": 0.46511348263941077, "flos": 19974592805760.0, "grad_norm": 2.3653554564968435, "language_loss": 0.69901764, "learning_rate": 2.321655439354519e-06, "loss": 0.72035182, "num_input_tokens_seen": 166057745, "step": 7736, "time_per_iteration": 4.302860498428345 }, { "auxiliary_loss_clip": 0.01128458, "auxiliary_loss_mlp": 0.01036991, "balance_loss_clip": 1.0473057, "balance_loss_mlp": 1.0228653, "epoch": 0.46517360589207873, "flos": 19678401256320.0, "grad_norm": 1.6411657567334208, "language_loss": 0.71995008, "learning_rate": 2.321271041396427e-06, "loss": 0.74160457, "num_input_tokens_seen": 166076440, "step": 7737, "time_per_iteration": 2.566603183746338 }, { "auxiliary_loss_clip": 0.01111802, "auxiliary_loss_mlp": 0.01040407, "balance_loss_clip": 1.05224276, "balance_loss_mlp": 1.02456391, "epoch": 0.46523372914474675, "flos": 16872665074560.0, "grad_norm": 2.50928704064022, "language_loss": 0.83606738, "learning_rate": 2.3208866312557065e-06, "loss": 0.85758948, "num_input_tokens_seen": 166092520, "step": 7738, "time_per_iteration": 2.602149486541748 }, { "auxiliary_loss_clip": 0.0103645, "auxiliary_loss_mlp": 0.01000487, "balance_loss_clip": 1.01920033, "balance_loss_mlp": 0.99899715, "epoch": 0.4652938523974147, "flos": 53439138339840.0, "grad_norm": 0.7761784242108043, "language_loss": 0.57855058, "learning_rate": 2.320502208946932e-06, "loss": 0.59891999, "num_input_tokens_seen": 166156285, "step": 7739, "time_per_iteration": 4.744653940200806 }, { "auxiliary_loss_clip": 0.01111735, "auxiliary_loss_mlp": 0.0104196, "balance_loss_clip": 1.04867125, "balance_loss_mlp": 1.02728581, "epoch": 0.4653539756500827, "flos": 15231296165760.0, "grad_norm": 1.7825482177936647, "language_loss": 0.85391408, "learning_rate": 2.3201177744846815e-06, "loss": 0.87545103, "num_input_tokens_seen": 166173455, "step": 7740, "time_per_iteration": 4.26358962059021 }, { "auxiliary_loss_clip": 0.01103788, "auxiliary_loss_mlp": 0.01043392, "balance_loss_clip": 1.04354095, "balance_loss_mlp": 1.02769184, "epoch": 0.46541409890275065, "flos": 23732249270400.0, "grad_norm": 1.728452967927443, "language_loss": 0.75540549, "learning_rate": 2.3197333278835327e-06, "loss": 0.77687728, "num_input_tokens_seen": 166194370, "step": 7741, "time_per_iteration": 2.7189860343933105 }, { "auxiliary_loss_clip": 0.01102378, "auxiliary_loss_mlp": 0.0103993, "balance_loss_clip": 1.04642224, "balance_loss_mlp": 1.02583992, "epoch": 0.4654742221554186, "flos": 20847329556480.0, "grad_norm": 1.6912495786690362, "language_loss": 0.80807334, "learning_rate": 2.319348869158064e-06, "loss": 0.82949644, "num_input_tokens_seen": 166213195, "step": 7742, "time_per_iteration": 2.7285542488098145 }, { "auxiliary_loss_clip": 0.01109172, "auxiliary_loss_mlp": 0.01044204, "balance_loss_clip": 1.04378545, "balance_loss_mlp": 1.02846837, "epoch": 0.4655343454080866, "flos": 20704836303360.0, "grad_norm": 2.554211916953899, "language_loss": 0.7287879, "learning_rate": 2.3189643983228555e-06, "loss": 0.75032163, "num_input_tokens_seen": 166231350, "step": 7743, "time_per_iteration": 2.8064794540405273 }, { "auxiliary_loss_clip": 0.01097309, "auxiliary_loss_mlp": 0.01035628, "balance_loss_clip": 1.044186, "balance_loss_mlp": 1.01989281, "epoch": 0.46559446866075455, "flos": 18989850470400.0, "grad_norm": 1.9272268848768948, "language_loss": 0.71113133, "learning_rate": 2.318579915392483e-06, "loss": 0.73246074, "num_input_tokens_seen": 166250530, "step": 7744, "time_per_iteration": 2.7021846771240234 }, { "auxiliary_loss_clip": 0.01081647, "auxiliary_loss_mlp": 0.01033676, "balance_loss_clip": 1.04821372, "balance_loss_mlp": 1.01952028, "epoch": 0.4656545919134225, "flos": 34496364643200.0, "grad_norm": 1.5788774332625253, "language_loss": 0.84865856, "learning_rate": 2.31819542038153e-06, "loss": 0.86981177, "num_input_tokens_seen": 166272545, "step": 7745, "time_per_iteration": 2.8962950706481934 }, { "auxiliary_loss_clip": 0.01118243, "auxiliary_loss_mlp": 0.01044667, "balance_loss_clip": 1.04609525, "balance_loss_mlp": 1.02958083, "epoch": 0.4657147151660905, "flos": 24310554238080.0, "grad_norm": 1.3325532903447972, "language_loss": 0.72868127, "learning_rate": 2.317810913304574e-06, "loss": 0.75031042, "num_input_tokens_seen": 166292135, "step": 7746, "time_per_iteration": 2.654744863510132 }, { "auxiliary_loss_clip": 0.01115957, "auxiliary_loss_mlp": 0.01039896, "balance_loss_clip": 1.04620576, "balance_loss_mlp": 1.02557254, "epoch": 0.46577483841875844, "flos": 58795139220480.0, "grad_norm": 2.5149225133479667, "language_loss": 0.69942105, "learning_rate": 2.3174263941761963e-06, "loss": 0.72097951, "num_input_tokens_seen": 166316710, "step": 7747, "time_per_iteration": 2.946551561355591 }, { "auxiliary_loss_clip": 0.01087715, "auxiliary_loss_mlp": 0.01043482, "balance_loss_clip": 1.04082656, "balance_loss_mlp": 1.0269475, "epoch": 0.4658349616714264, "flos": 31321969223040.0, "grad_norm": 1.543824419854341, "language_loss": 0.67369974, "learning_rate": 2.317041863010978e-06, "loss": 0.69501168, "num_input_tokens_seen": 166338535, "step": 7748, "time_per_iteration": 2.7577450275421143 }, { "auxiliary_loss_clip": 0.01095867, "auxiliary_loss_mlp": 0.01040613, "balance_loss_clip": 1.04655099, "balance_loss_mlp": 1.0242455, "epoch": 0.46589508492409437, "flos": 14860338456960.0, "grad_norm": 2.2493825617355805, "language_loss": 0.6400212, "learning_rate": 2.3166573198235007e-06, "loss": 0.66138601, "num_input_tokens_seen": 166355540, "step": 7749, "time_per_iteration": 2.6768271923065186 }, { "auxiliary_loss_clip": 0.01124878, "auxiliary_loss_mlp": 0.01035356, "balance_loss_clip": 1.04833543, "balance_loss_mlp": 1.01912558, "epoch": 0.46595520817676234, "flos": 12895989431040.0, "grad_norm": 2.0851109379556414, "language_loss": 0.74756414, "learning_rate": 2.3162727646283456e-06, "loss": 0.76916647, "num_input_tokens_seen": 166372635, "step": 7750, "time_per_iteration": 2.6180553436279297 }, { "auxiliary_loss_clip": 0.01112353, "auxiliary_loss_mlp": 0.01032354, "balance_loss_clip": 1.04888475, "balance_loss_mlp": 1.01699984, "epoch": 0.46601533142943036, "flos": 32854169721600.0, "grad_norm": 2.1197385056246, "language_loss": 0.74433059, "learning_rate": 2.3158881974400963e-06, "loss": 0.76577765, "num_input_tokens_seen": 166393175, "step": 7751, "time_per_iteration": 2.7448816299438477 }, { "auxiliary_loss_clip": 0.01105983, "auxiliary_loss_mlp": 0.01039216, "balance_loss_clip": 1.049245, "balance_loss_mlp": 1.02301598, "epoch": 0.4660754546820983, "flos": 19967517826560.0, "grad_norm": 2.5234072122891176, "language_loss": 0.73595881, "learning_rate": 2.3155036182733345e-06, "loss": 0.75741076, "num_input_tokens_seen": 166408630, "step": 7752, "time_per_iteration": 2.6944475173950195 }, { "auxiliary_loss_clip": 0.01108633, "auxiliary_loss_mlp": 0.01040109, "balance_loss_clip": 1.04941273, "balance_loss_mlp": 1.02493417, "epoch": 0.4661355779347663, "flos": 26688164215680.0, "grad_norm": 2.044776600528041, "language_loss": 0.69086194, "learning_rate": 2.315119027142644e-06, "loss": 0.7123493, "num_input_tokens_seen": 166428170, "step": 7753, "time_per_iteration": 2.736854076385498 }, { "auxiliary_loss_clip": 0.01099142, "auxiliary_loss_mlp": 0.01040064, "balance_loss_clip": 1.04148221, "balance_loss_mlp": 1.02494824, "epoch": 0.46619570118743425, "flos": 20959442881920.0, "grad_norm": 2.155464287948458, "language_loss": 0.72724748, "learning_rate": 2.3147344240626076e-06, "loss": 0.74863952, "num_input_tokens_seen": 166446705, "step": 7754, "time_per_iteration": 2.6782143115997314 }, { "auxiliary_loss_clip": 0.01113403, "auxiliary_loss_mlp": 0.0103567, "balance_loss_clip": 1.04633951, "balance_loss_mlp": 1.01993394, "epoch": 0.4662558244401022, "flos": 24426079355520.0, "grad_norm": 1.424199388432646, "language_loss": 0.78797996, "learning_rate": 2.3143498090478114e-06, "loss": 0.80947065, "num_input_tokens_seen": 166466750, "step": 7755, "time_per_iteration": 2.8091399669647217 }, { "auxiliary_loss_clip": 0.01115387, "auxiliary_loss_mlp": 0.01030352, "balance_loss_clip": 1.04450297, "balance_loss_mlp": 1.01545656, "epoch": 0.4663159476927702, "flos": 20595452411520.0, "grad_norm": 1.631642654170447, "language_loss": 0.72453964, "learning_rate": 2.3139651821128382e-06, "loss": 0.74599707, "num_input_tokens_seen": 166485400, "step": 7756, "time_per_iteration": 2.7136480808258057 }, { "auxiliary_loss_clip": 0.01117973, "auxiliary_loss_mlp": 0.01036177, "balance_loss_clip": 1.04585207, "balance_loss_mlp": 1.02137136, "epoch": 0.46637607094543815, "flos": 25661872823040.0, "grad_norm": 2.024488409117557, "language_loss": 0.78578007, "learning_rate": 2.313580543272274e-06, "loss": 0.80732161, "num_input_tokens_seen": 166505730, "step": 7757, "time_per_iteration": 2.6828832626342773 }, { "auxiliary_loss_clip": 0.01090573, "auxiliary_loss_mlp": 0.01031697, "balance_loss_clip": 1.04173446, "balance_loss_mlp": 1.01717782, "epoch": 0.4664361941981061, "flos": 24273853516800.0, "grad_norm": 2.116616009232987, "language_loss": 0.6656999, "learning_rate": 2.313195892540705e-06, "loss": 0.68692255, "num_input_tokens_seen": 166523770, "step": 7758, "time_per_iteration": 2.7238266468048096 }, { "auxiliary_loss_clip": 0.01098442, "auxiliary_loss_mlp": 0.01044236, "balance_loss_clip": 1.04272914, "balance_loss_mlp": 1.02916837, "epoch": 0.4664963174507741, "flos": 18405871153920.0, "grad_norm": 1.6471741103867168, "language_loss": 0.74542332, "learning_rate": 2.3128112299327147e-06, "loss": 0.76685011, "num_input_tokens_seen": 166542935, "step": 7759, "time_per_iteration": 2.648406744003296 }, { "auxiliary_loss_clip": 0.01110559, "auxiliary_loss_mlp": 0.01047546, "balance_loss_clip": 1.04692769, "balance_loss_mlp": 1.0325253, "epoch": 0.46655644070344204, "flos": 22455122227200.0, "grad_norm": 1.575011375316493, "language_loss": 0.77734709, "learning_rate": 2.312426555462893e-06, "loss": 0.79892808, "num_input_tokens_seen": 166563935, "step": 7760, "time_per_iteration": 2.715393543243408 }, { "auxiliary_loss_clip": 0.01104604, "auxiliary_loss_mlp": 0.01034603, "balance_loss_clip": 1.04476929, "balance_loss_mlp": 1.01968408, "epoch": 0.46661656395611, "flos": 13808407731840.0, "grad_norm": 1.8509707336449404, "language_loss": 0.74408627, "learning_rate": 2.3120418691458237e-06, "loss": 0.76547837, "num_input_tokens_seen": 166582175, "step": 7761, "time_per_iteration": 2.679760217666626 }, { "auxiliary_loss_clip": 0.01118037, "auxiliary_loss_mlp": 0.01038779, "balance_loss_clip": 1.04605913, "balance_loss_mlp": 1.02199411, "epoch": 0.466676687208778, "flos": 21652159645440.0, "grad_norm": 1.9428650174374826, "language_loss": 0.78880894, "learning_rate": 2.3116571709960956e-06, "loss": 0.81037712, "num_input_tokens_seen": 166601870, "step": 7762, "time_per_iteration": 2.6236844062805176 }, { "auxiliary_loss_clip": 0.01032755, "auxiliary_loss_mlp": 0.01004567, "balance_loss_clip": 1.01497078, "balance_loss_mlp": 1.00300527, "epoch": 0.46673681046144594, "flos": 68534259068160.0, "grad_norm": 0.7915263755311791, "language_loss": 0.59707403, "learning_rate": 2.311272461028297e-06, "loss": 0.61744726, "num_input_tokens_seen": 166668960, "step": 7763, "time_per_iteration": 3.2309603691101074 }, { "auxiliary_loss_clip": 0.01092007, "auxiliary_loss_mlp": 0.01038011, "balance_loss_clip": 1.04239237, "balance_loss_mlp": 1.02181077, "epoch": 0.46679693371411396, "flos": 15814449469440.0, "grad_norm": 2.1149132662524766, "language_loss": 0.78707278, "learning_rate": 2.3108877392570146e-06, "loss": 0.80837297, "num_input_tokens_seen": 166686110, "step": 7764, "time_per_iteration": 2.667523145675659 }, { "auxiliary_loss_clip": 0.01102497, "auxiliary_loss_mlp": 0.01038126, "balance_loss_clip": 1.05066562, "balance_loss_mlp": 1.02470863, "epoch": 0.4668570569667819, "flos": 18514572687360.0, "grad_norm": 1.9076684434806583, "language_loss": 0.72103167, "learning_rate": 2.310503005696839e-06, "loss": 0.74243796, "num_input_tokens_seen": 166703930, "step": 7765, "time_per_iteration": 2.695037364959717 }, { "auxiliary_loss_clip": 0.0108654, "auxiliary_loss_mlp": 0.01041419, "balance_loss_clip": 1.04354358, "balance_loss_mlp": 1.02578509, "epoch": 0.4669171802194499, "flos": 19206643006080.0, "grad_norm": 3.5524770939500763, "language_loss": 0.77958077, "learning_rate": 2.3101182603623576e-06, "loss": 0.80086035, "num_input_tokens_seen": 166719940, "step": 7766, "time_per_iteration": 2.7083003520965576 }, { "auxiliary_loss_clip": 0.01111478, "auxiliary_loss_mlp": 0.01041119, "balance_loss_clip": 1.0413723, "balance_loss_mlp": 1.02596176, "epoch": 0.46697730347211786, "flos": 12276135406080.0, "grad_norm": 2.008926604773062, "language_loss": 0.64852947, "learning_rate": 2.3097335032681607e-06, "loss": 0.67005551, "num_input_tokens_seen": 166738285, "step": 7767, "time_per_iteration": 2.6344571113586426 }, { "auxiliary_loss_clip": 0.01120029, "auxiliary_loss_mlp": 0.0104422, "balance_loss_clip": 1.04623926, "balance_loss_mlp": 1.02955675, "epoch": 0.4670374267247858, "flos": 23586739274880.0, "grad_norm": 1.9514245068590486, "language_loss": 0.74225283, "learning_rate": 2.3093487344288393e-06, "loss": 0.76389533, "num_input_tokens_seen": 166758170, "step": 7768, "time_per_iteration": 2.7037155628204346 }, { "auxiliary_loss_clip": 0.01101883, "auxiliary_loss_mlp": 0.01035933, "balance_loss_clip": 1.04606605, "balance_loss_mlp": 1.02081776, "epoch": 0.4670975499774538, "flos": 15991093578240.0, "grad_norm": 1.8795722363955685, "language_loss": 0.70699239, "learning_rate": 2.308963953858982e-06, "loss": 0.72837055, "num_input_tokens_seen": 166775750, "step": 7769, "time_per_iteration": 2.6716794967651367 }, { "auxiliary_loss_clip": 0.0112823, "auxiliary_loss_mlp": 0.01035755, "balance_loss_clip": 1.04401624, "balance_loss_mlp": 1.02156949, "epoch": 0.46715767323012175, "flos": 15377596260480.0, "grad_norm": 2.0624542877059158, "language_loss": 0.81268704, "learning_rate": 2.3085791615731803e-06, "loss": 0.83432686, "num_input_tokens_seen": 166791720, "step": 7770, "time_per_iteration": 2.5958662033081055 }, { "auxiliary_loss_clip": 0.01043437, "auxiliary_loss_mlp": 0.01001838, "balance_loss_clip": 1.01635242, "balance_loss_mlp": 1.00027645, "epoch": 0.4672177964827897, "flos": 60252217401600.0, "grad_norm": 0.7961749107066677, "language_loss": 0.5562135, "learning_rate": 2.3081943575860265e-06, "loss": 0.57666636, "num_input_tokens_seen": 166856360, "step": 7771, "time_per_iteration": 3.1569736003875732 }, { "auxiliary_loss_clip": 0.01114939, "auxiliary_loss_mlp": 0.00771824, "balance_loss_clip": 1.04351723, "balance_loss_mlp": 1.00060511, "epoch": 0.4672779197354577, "flos": 27636134002560.0, "grad_norm": 1.896331384644372, "language_loss": 0.65528286, "learning_rate": 2.3078095419121117e-06, "loss": 0.67415047, "num_input_tokens_seen": 166875925, "step": 7772, "time_per_iteration": 2.7263035774230957 }, { "auxiliary_loss_clip": 0.01113556, "auxiliary_loss_mlp": 0.01034989, "balance_loss_clip": 1.04692101, "balance_loss_mlp": 1.02061212, "epoch": 0.46733804298812565, "flos": 31394257344000.0, "grad_norm": 2.0574903106475513, "language_loss": 0.63557553, "learning_rate": 2.3074247145660283e-06, "loss": 0.65706098, "num_input_tokens_seen": 166896520, "step": 7773, "time_per_iteration": 2.691378593444824 }, { "auxiliary_loss_clip": 0.01112174, "auxiliary_loss_mlp": 0.01040289, "balance_loss_clip": 1.04673469, "balance_loss_mlp": 1.02454185, "epoch": 0.4673981662407936, "flos": 19500607912320.0, "grad_norm": 1.9630472969764714, "language_loss": 0.80073929, "learning_rate": 2.3070398755623685e-06, "loss": 0.8222639, "num_input_tokens_seen": 166915370, "step": 7774, "time_per_iteration": 2.661416530609131 }, { "auxiliary_loss_clip": 0.01096265, "auxiliary_loss_mlp": 0.01033594, "balance_loss_clip": 1.04382384, "balance_loss_mlp": 1.01813269, "epoch": 0.4674582894934616, "flos": 20521835487360.0, "grad_norm": 1.5987951306887498, "language_loss": 0.77369159, "learning_rate": 2.306655024915726e-06, "loss": 0.79499024, "num_input_tokens_seen": 166934875, "step": 7775, "time_per_iteration": 4.281586647033691 }, { "auxiliary_loss_clip": 0.01096609, "auxiliary_loss_mlp": 0.01036176, "balance_loss_clip": 1.04498506, "balance_loss_mlp": 1.02137041, "epoch": 0.46751841274612954, "flos": 22090952188800.0, "grad_norm": 1.8524613051021832, "language_loss": 0.69526893, "learning_rate": 2.306270162640694e-06, "loss": 0.71659672, "num_input_tokens_seen": 166954285, "step": 7776, "time_per_iteration": 4.289973497390747 }, { "auxiliary_loss_clip": 0.0112105, "auxiliary_loss_mlp": 0.0103614, "balance_loss_clip": 1.04810274, "balance_loss_mlp": 1.02246058, "epoch": 0.46757853599879756, "flos": 26980082046720.0, "grad_norm": 1.5322212077638444, "language_loss": 0.73980904, "learning_rate": 2.3058852887518678e-06, "loss": 0.76138097, "num_input_tokens_seen": 166975975, "step": 7777, "time_per_iteration": 2.7370285987854004 }, { "auxiliary_loss_clip": 0.01118243, "auxiliary_loss_mlp": 0.01036883, "balance_loss_clip": 1.045416, "balance_loss_mlp": 1.02208281, "epoch": 0.4676386592514655, "flos": 24134053783680.0, "grad_norm": 2.891298768731385, "language_loss": 0.69314432, "learning_rate": 2.3055004032638394e-06, "loss": 0.71469557, "num_input_tokens_seen": 166996140, "step": 7778, "time_per_iteration": 4.159350633621216 }, { "auxiliary_loss_clip": 0.01119786, "auxiliary_loss_mlp": 0.01041292, "balance_loss_clip": 1.04801941, "balance_loss_mlp": 1.02624786, "epoch": 0.4676987825041335, "flos": 25483720343040.0, "grad_norm": 2.158752703527913, "language_loss": 0.73216277, "learning_rate": 2.305115506191206e-06, "loss": 0.75377357, "num_input_tokens_seen": 167016105, "step": 7779, "time_per_iteration": 2.6880576610565186 }, { "auxiliary_loss_clip": 0.0108513, "auxiliary_loss_mlp": 0.01043402, "balance_loss_clip": 1.04270327, "balance_loss_mlp": 1.02963924, "epoch": 0.46775890575680146, "flos": 21945298538880.0, "grad_norm": 1.532169986090066, "language_loss": 0.72447348, "learning_rate": 2.304730597548562e-06, "loss": 0.74575877, "num_input_tokens_seen": 167036185, "step": 7780, "time_per_iteration": 4.378252267837524 }, { "auxiliary_loss_clip": 0.01098995, "auxiliary_loss_mlp": 0.01052099, "balance_loss_clip": 1.03960943, "balance_loss_mlp": 1.03428912, "epoch": 0.4678190290094694, "flos": 25228395492480.0, "grad_norm": 1.8072634784489867, "language_loss": 0.74489224, "learning_rate": 2.3043456773505023e-06, "loss": 0.7664032, "num_input_tokens_seen": 167054515, "step": 7781, "time_per_iteration": 2.684298038482666 }, { "auxiliary_loss_clip": 0.01121556, "auxiliary_loss_mlp": 0.01040282, "balance_loss_clip": 1.04655743, "balance_loss_mlp": 1.02464151, "epoch": 0.4678791522621374, "flos": 32268358811520.0, "grad_norm": 3.3303395339611486, "language_loss": 0.62934184, "learning_rate": 2.3039607456116252e-06, "loss": 0.65096015, "num_input_tokens_seen": 167077245, "step": 7782, "time_per_iteration": 2.801643133163452 }, { "auxiliary_loss_clip": 0.01112208, "auxiliary_loss_mlp": 0.01044015, "balance_loss_clip": 1.04610753, "balance_loss_mlp": 1.02925098, "epoch": 0.46793927551480535, "flos": 27046480337280.0, "grad_norm": 2.527604831052906, "language_loss": 0.63679516, "learning_rate": 2.3035758023465254e-06, "loss": 0.65835738, "num_input_tokens_seen": 167097235, "step": 7783, "time_per_iteration": 2.779493570327759 }, { "auxiliary_loss_clip": 0.01126101, "auxiliary_loss_mlp": 0.01040434, "balance_loss_clip": 1.04948771, "balance_loss_mlp": 1.02393532, "epoch": 0.4679993987674733, "flos": 17457398576640.0, "grad_norm": 2.4796959185267884, "language_loss": 0.67925286, "learning_rate": 2.303190847569801e-06, "loss": 0.70091814, "num_input_tokens_seen": 167113155, "step": 7784, "time_per_iteration": 2.640165090560913 }, { "auxiliary_loss_clip": 0.01100267, "auxiliary_loss_mlp": 0.01033313, "balance_loss_clip": 1.04564571, "balance_loss_mlp": 1.0193001, "epoch": 0.4680595220201413, "flos": 17165121609600.0, "grad_norm": 2.0879148282250304, "language_loss": 0.84605902, "learning_rate": 2.3028058812960497e-06, "loss": 0.8673948, "num_input_tokens_seen": 167131765, "step": 7785, "time_per_iteration": 2.6447336673736572 }, { "auxiliary_loss_clip": 0.01095846, "auxiliary_loss_mlp": 0.01038359, "balance_loss_clip": 1.0473485, "balance_loss_mlp": 1.02278996, "epoch": 0.46811964527280925, "flos": 11327591001600.0, "grad_norm": 1.936392485305852, "language_loss": 0.77363992, "learning_rate": 2.3024209035398678e-06, "loss": 0.79498196, "num_input_tokens_seen": 167149030, "step": 7786, "time_per_iteration": 2.7023332118988037 }, { "auxiliary_loss_clip": 0.01116619, "auxiliary_loss_mlp": 0.01034917, "balance_loss_clip": 1.04685593, "balance_loss_mlp": 1.02089214, "epoch": 0.4681797685254772, "flos": 24278809593600.0, "grad_norm": 2.0886119764466686, "language_loss": 0.74195051, "learning_rate": 2.302035914315856e-06, "loss": 0.76346588, "num_input_tokens_seen": 167167375, "step": 7787, "time_per_iteration": 2.704002618789673 }, { "auxiliary_loss_clip": 0.0110227, "auxiliary_loss_mlp": 0.01041247, "balance_loss_clip": 1.04562151, "balance_loss_mlp": 1.02654815, "epoch": 0.4682398917781452, "flos": 31650372293760.0, "grad_norm": 1.9198703232455803, "language_loss": 0.65471619, "learning_rate": 2.3016509136386116e-06, "loss": 0.67615134, "num_input_tokens_seen": 167188065, "step": 7788, "time_per_iteration": 2.767409324645996 }, { "auxiliary_loss_clip": 0.01117478, "auxiliary_loss_mlp": 0.01034939, "balance_loss_clip": 1.0463376, "balance_loss_mlp": 1.02198708, "epoch": 0.46830001503081314, "flos": 28110765340800.0, "grad_norm": 1.576175997941932, "language_loss": 0.63680893, "learning_rate": 2.3012659015227343e-06, "loss": 0.65833306, "num_input_tokens_seen": 167209675, "step": 7789, "time_per_iteration": 2.686382532119751 }, { "auxiliary_loss_clip": 0.01034678, "auxiliary_loss_mlp": 0.01000229, "balance_loss_clip": 1.01769471, "balance_loss_mlp": 0.99867934, "epoch": 0.4683601382834811, "flos": 57881718316800.0, "grad_norm": 0.6946835696901172, "language_loss": 0.61856973, "learning_rate": 2.300880877982825e-06, "loss": 0.63891876, "num_input_tokens_seen": 167273940, "step": 7790, "time_per_iteration": 3.2082865238189697 }, { "auxiliary_loss_clip": 0.01088531, "auxiliary_loss_mlp": 0.01040894, "balance_loss_clip": 1.04553008, "balance_loss_mlp": 1.02514648, "epoch": 0.46842026153614913, "flos": 21871933009920.0, "grad_norm": 1.7348641955250894, "language_loss": 0.79120016, "learning_rate": 2.3004958430334808e-06, "loss": 0.81249446, "num_input_tokens_seen": 167292730, "step": 7791, "time_per_iteration": 2.7868592739105225 }, { "auxiliary_loss_clip": 0.0112267, "auxiliary_loss_mlp": 0.01038559, "balance_loss_clip": 1.05027902, "balance_loss_mlp": 1.0236336, "epoch": 0.4684803847888171, "flos": 24900818434560.0, "grad_norm": 1.5319083860586857, "language_loss": 0.7509321, "learning_rate": 2.3001107966893052e-06, "loss": 0.77254432, "num_input_tokens_seen": 167313460, "step": 7792, "time_per_iteration": 2.6591553688049316 }, { "auxiliary_loss_clip": 0.01093652, "auxiliary_loss_mlp": 0.01040808, "balance_loss_clip": 1.03941143, "balance_loss_mlp": 1.02582359, "epoch": 0.46854050804148506, "flos": 26251670142720.0, "grad_norm": 1.6679874379457267, "language_loss": 0.68283308, "learning_rate": 2.299725738964898e-06, "loss": 0.70417762, "num_input_tokens_seen": 167335385, "step": 7793, "time_per_iteration": 2.714614152908325 }, { "auxiliary_loss_clip": 0.01120793, "auxiliary_loss_mlp": 0.00770869, "balance_loss_clip": 1.05047464, "balance_loss_mlp": 1.00063658, "epoch": 0.468600631294153, "flos": 21579799697280.0, "grad_norm": 1.5900503410544595, "language_loss": 0.74045742, "learning_rate": 2.2993406698748607e-06, "loss": 0.75937402, "num_input_tokens_seen": 167353625, "step": 7794, "time_per_iteration": 2.631113052368164 }, { "auxiliary_loss_clip": 0.01101487, "auxiliary_loss_mlp": 0.01040191, "balance_loss_clip": 1.04786825, "balance_loss_mlp": 1.02505112, "epoch": 0.468660754546821, "flos": 25885632597120.0, "grad_norm": 1.7758607044197945, "language_loss": 0.63441491, "learning_rate": 2.2989555894337953e-06, "loss": 0.65583163, "num_input_tokens_seen": 167374565, "step": 7795, "time_per_iteration": 2.755208969116211 }, { "auxiliary_loss_clip": 0.01090992, "auxiliary_loss_mlp": 0.01033775, "balance_loss_clip": 1.04455793, "balance_loss_mlp": 1.01939869, "epoch": 0.46872087779948896, "flos": 35475001666560.0, "grad_norm": 1.5780628651808217, "language_loss": 0.6815629, "learning_rate": 2.298570497656304e-06, "loss": 0.70281053, "num_input_tokens_seen": 167395010, "step": 7796, "time_per_iteration": 2.8338258266448975 }, { "auxiliary_loss_clip": 0.01132709, "auxiliary_loss_mlp": 0.00772271, "balance_loss_clip": 1.05046582, "balance_loss_mlp": 1.00074291, "epoch": 0.4687810010521569, "flos": 26396425952640.0, "grad_norm": 3.1208322005509705, "language_loss": 0.7061345, "learning_rate": 2.2981853945569894e-06, "loss": 0.72518432, "num_input_tokens_seen": 167415285, "step": 7797, "time_per_iteration": 2.7184929847717285 }, { "auxiliary_loss_clip": 0.01108205, "auxiliary_loss_mlp": 0.01035831, "balance_loss_clip": 1.04716921, "balance_loss_mlp": 1.01992226, "epoch": 0.4688411243048249, "flos": 19972761212160.0, "grad_norm": 2.050220537358762, "language_loss": 0.67158788, "learning_rate": 2.297800280150454e-06, "loss": 0.69302827, "num_input_tokens_seen": 167432405, "step": 7798, "time_per_iteration": 2.707491159439087 }, { "auxiliary_loss_clip": 0.01033434, "auxiliary_loss_mlp": 0.00999628, "balance_loss_clip": 1.01507461, "balance_loss_mlp": 0.99782771, "epoch": 0.46890124755749285, "flos": 63977015900160.0, "grad_norm": 0.9512995219109956, "language_loss": 0.64611268, "learning_rate": 2.2974151544513033e-06, "loss": 0.66644335, "num_input_tokens_seen": 167499365, "step": 7799, "time_per_iteration": 3.3521087169647217 }, { "auxiliary_loss_clip": 0.01103151, "auxiliary_loss_mlp": 0.01029152, "balance_loss_clip": 1.0488441, "balance_loss_mlp": 1.01467967, "epoch": 0.4689613708101608, "flos": 23768985905280.0, "grad_norm": 1.342329921678728, "language_loss": 0.72313237, "learning_rate": 2.2970300174741395e-06, "loss": 0.74445534, "num_input_tokens_seen": 167520390, "step": 7800, "time_per_iteration": 2.7983593940734863 }, { "auxiliary_loss_clip": 0.01128952, "auxiliary_loss_mlp": 0.01035275, "balance_loss_clip": 1.04984462, "balance_loss_mlp": 1.0224781, "epoch": 0.4690214940628288, "flos": 24788705109120.0, "grad_norm": 1.7150056694833848, "language_loss": 0.7285912, "learning_rate": 2.296644869233568e-06, "loss": 0.75023353, "num_input_tokens_seen": 167539865, "step": 7801, "time_per_iteration": 2.635540008544922 }, { "auxiliary_loss_clip": 0.01097741, "auxiliary_loss_mlp": 0.010419, "balance_loss_clip": 1.04270506, "balance_loss_mlp": 1.02579427, "epoch": 0.46908161731549675, "flos": 18077324428800.0, "grad_norm": 1.930712957606368, "language_loss": 0.62748474, "learning_rate": 2.2962597097441936e-06, "loss": 0.64888108, "num_input_tokens_seen": 167558190, "step": 7802, "time_per_iteration": 2.8309857845306396 }, { "auxiliary_loss_clip": 0.01131707, "auxiliary_loss_mlp": 0.01041126, "balance_loss_clip": 1.04824543, "balance_loss_mlp": 1.02705908, "epoch": 0.4691417405681647, "flos": 25703350053120.0, "grad_norm": 2.0983906256852647, "language_loss": 0.73465741, "learning_rate": 2.2958745390206206e-06, "loss": 0.75638568, "num_input_tokens_seen": 167577685, "step": 7803, "time_per_iteration": 2.639453172683716 }, { "auxiliary_loss_clip": 0.01105851, "auxiliary_loss_mlp": 0.00771349, "balance_loss_clip": 1.04883635, "balance_loss_mlp": 1.00065053, "epoch": 0.46920186382083273, "flos": 17457039440640.0, "grad_norm": 2.3177200047102486, "language_loss": 0.77396876, "learning_rate": 2.2954893570774558e-06, "loss": 0.7927407, "num_input_tokens_seen": 167596390, "step": 7804, "time_per_iteration": 2.6661806106567383 }, { "auxiliary_loss_clip": 0.01105528, "auxiliary_loss_mlp": 0.01031688, "balance_loss_clip": 1.04877174, "balance_loss_mlp": 1.01763344, "epoch": 0.4692619870735007, "flos": 20339445202560.0, "grad_norm": 2.089417814933236, "language_loss": 0.77330643, "learning_rate": 2.295104163929305e-06, "loss": 0.79467863, "num_input_tokens_seen": 167614980, "step": 7805, "time_per_iteration": 2.6670541763305664 }, { "auxiliary_loss_clip": 0.01140382, "auxiliary_loss_mlp": 0.01050591, "balance_loss_clip": 1.05195141, "balance_loss_mlp": 1.03487957, "epoch": 0.46932211032616866, "flos": 29496558003840.0, "grad_norm": 1.6834011453476339, "language_loss": 0.82446682, "learning_rate": 2.2947189595907742e-06, "loss": 0.84637654, "num_input_tokens_seen": 167635895, "step": 7806, "time_per_iteration": 2.641126871109009 }, { "auxiliary_loss_clip": 0.01109262, "auxiliary_loss_mlp": 0.01041295, "balance_loss_clip": 1.04739761, "balance_loss_mlp": 1.02634656, "epoch": 0.4693822335788366, "flos": 36211242735360.0, "grad_norm": 1.815437092056069, "language_loss": 0.77320337, "learning_rate": 2.294333744076472e-06, "loss": 0.79470897, "num_input_tokens_seen": 167657440, "step": 7807, "time_per_iteration": 2.768772840499878 }, { "auxiliary_loss_clip": 0.0110914, "auxiliary_loss_mlp": 0.01038695, "balance_loss_clip": 1.05083752, "balance_loss_mlp": 1.02354348, "epoch": 0.4694423568315046, "flos": 20338978325760.0, "grad_norm": 2.201580678066969, "language_loss": 0.51815701, "learning_rate": 2.2939485174010035e-06, "loss": 0.53963536, "num_input_tokens_seen": 167675025, "step": 7808, "time_per_iteration": 2.6565470695495605 }, { "auxiliary_loss_clip": 0.01003405, "auxiliary_loss_mlp": 0.01005455, "balance_loss_clip": 1.0168457, "balance_loss_mlp": 1.00391757, "epoch": 0.46950248008417256, "flos": 64326353621760.0, "grad_norm": 0.78732179125356, "language_loss": 0.57700193, "learning_rate": 2.293563279578978e-06, "loss": 0.59709048, "num_input_tokens_seen": 167729635, "step": 7809, "time_per_iteration": 3.1529645919799805 }, { "auxiliary_loss_clip": 0.01087624, "auxiliary_loss_mlp": 0.01039585, "balance_loss_clip": 1.04826307, "balance_loss_mlp": 1.02535129, "epoch": 0.4695626033368405, "flos": 19200106730880.0, "grad_norm": 2.4452536224375403, "language_loss": 0.7153672, "learning_rate": 2.2931780306250045e-06, "loss": 0.73663932, "num_input_tokens_seen": 167745135, "step": 7810, "time_per_iteration": 2.730975389480591 }, { "auxiliary_loss_clip": 0.01122205, "auxiliary_loss_mlp": 0.01041582, "balance_loss_clip": 1.04927683, "balance_loss_mlp": 1.02719331, "epoch": 0.4696227265895085, "flos": 23002436736000.0, "grad_norm": 3.7864250348919284, "language_loss": 0.81469715, "learning_rate": 2.29279277055369e-06, "loss": 0.83633506, "num_input_tokens_seen": 167763875, "step": 7811, "time_per_iteration": 2.689089059829712 }, { "auxiliary_loss_clip": 0.01117579, "auxiliary_loss_mlp": 0.01038248, "balance_loss_clip": 1.04989529, "balance_loss_mlp": 1.02302504, "epoch": 0.46968284984217645, "flos": 21870855601920.0, "grad_norm": 1.6520361935296233, "language_loss": 0.8041414, "learning_rate": 2.292407499379644e-06, "loss": 0.82569969, "num_input_tokens_seen": 167784895, "step": 7812, "time_per_iteration": 2.6615161895751953 }, { "auxiliary_loss_clip": 0.01075193, "auxiliary_loss_mlp": 0.01036276, "balance_loss_clip": 1.04313707, "balance_loss_mlp": 1.02170289, "epoch": 0.4697429730948444, "flos": 19974987855360.0, "grad_norm": 1.6393784799199496, "language_loss": 0.74155343, "learning_rate": 2.292022217117477e-06, "loss": 0.76266813, "num_input_tokens_seen": 167803185, "step": 7813, "time_per_iteration": 2.7426726818084717 }, { "auxiliary_loss_clip": 0.01102658, "auxiliary_loss_mlp": 0.01036665, "balance_loss_clip": 1.04594994, "balance_loss_mlp": 1.02108407, "epoch": 0.4698030963475124, "flos": 15156206784000.0, "grad_norm": 2.3178266219619994, "language_loss": 0.84324849, "learning_rate": 2.291636923781798e-06, "loss": 0.86464167, "num_input_tokens_seen": 167816550, "step": 7814, "time_per_iteration": 2.6519999504089355 }, { "auxiliary_loss_clip": 0.01105673, "auxiliary_loss_mlp": 0.01036813, "balance_loss_clip": 1.04427862, "balance_loss_mlp": 1.02291358, "epoch": 0.46986321960018035, "flos": 15151178880000.0, "grad_norm": 1.8698068393605216, "language_loss": 0.81723464, "learning_rate": 2.291251619387217e-06, "loss": 0.83865952, "num_input_tokens_seen": 167831845, "step": 7815, "time_per_iteration": 5.720506906509399 }, { "auxiliary_loss_clip": 0.01088353, "auxiliary_loss_mlp": 0.01038971, "balance_loss_clip": 1.04897821, "balance_loss_mlp": 1.023808, "epoch": 0.4699233428528483, "flos": 23108911626240.0, "grad_norm": 2.071255754681328, "language_loss": 0.77463031, "learning_rate": 2.2908663039483468e-06, "loss": 0.79590356, "num_input_tokens_seen": 167850360, "step": 7816, "time_per_iteration": 2.738074541091919 }, { "auxiliary_loss_clip": 0.01044982, "auxiliary_loss_mlp": 0.01001103, "balance_loss_clip": 1.01830792, "balance_loss_mlp": 0.99944633, "epoch": 0.46998346610551633, "flos": 68105558246400.0, "grad_norm": 0.838650178196428, "language_loss": 0.58987319, "learning_rate": 2.290480977479796e-06, "loss": 0.6103341, "num_input_tokens_seen": 167908660, "step": 7817, "time_per_iteration": 3.1292662620544434 }, { "auxiliary_loss_clip": 0.01107632, "auxiliary_loss_mlp": 0.01034089, "balance_loss_clip": 1.04874861, "balance_loss_mlp": 1.02005172, "epoch": 0.4700435893581843, "flos": 24129456842880.0, "grad_norm": 1.7123630681211415, "language_loss": 0.79417968, "learning_rate": 2.2900956399961775e-06, "loss": 0.81559694, "num_input_tokens_seen": 167927905, "step": 7818, "time_per_iteration": 5.943104028701782 }, { "auxiliary_loss_clip": 0.0113212, "auxiliary_loss_mlp": 0.01037162, "balance_loss_clip": 1.04868269, "balance_loss_mlp": 1.02325034, "epoch": 0.47010371261085226, "flos": 20150518642560.0, "grad_norm": 1.6838154241149696, "language_loss": 0.83469647, "learning_rate": 2.289710291512104e-06, "loss": 0.85638928, "num_input_tokens_seen": 167945995, "step": 7819, "time_per_iteration": 2.6600770950317383 }, { "auxiliary_loss_clip": 0.01101069, "auxiliary_loss_mlp": 0.0103721, "balance_loss_clip": 1.04507041, "balance_loss_mlp": 1.02214193, "epoch": 0.47016383586352023, "flos": 15122199582720.0, "grad_norm": 2.5448578806987974, "language_loss": 0.7640624, "learning_rate": 2.289324932042186e-06, "loss": 0.78544521, "num_input_tokens_seen": 167963380, "step": 7820, "time_per_iteration": 2.720524549484253 }, { "auxiliary_loss_clip": 0.01114996, "auxiliary_loss_mlp": 0.01040886, "balance_loss_clip": 1.05066848, "balance_loss_mlp": 1.02641368, "epoch": 0.4702239591161882, "flos": 13552975140480.0, "grad_norm": 1.835793139157851, "language_loss": 0.74591041, "learning_rate": 2.288939561601039e-06, "loss": 0.76746929, "num_input_tokens_seen": 167981740, "step": 7821, "time_per_iteration": 2.6208953857421875 }, { "auxiliary_loss_clip": 0.0112785, "auxiliary_loss_mlp": 0.01044502, "balance_loss_clip": 1.04762793, "balance_loss_mlp": 1.03104329, "epoch": 0.47028408236885616, "flos": 24276511123200.0, "grad_norm": 1.8086110799443134, "language_loss": 0.89176404, "learning_rate": 2.2885541802032746e-06, "loss": 0.91348755, "num_input_tokens_seen": 167999380, "step": 7822, "time_per_iteration": 2.641425371170044 }, { "auxiliary_loss_clip": 0.01113329, "auxiliary_loss_mlp": 0.01033656, "balance_loss_clip": 1.04665482, "balance_loss_mlp": 1.01981544, "epoch": 0.4703442056215241, "flos": 22856926740480.0, "grad_norm": 1.7930134528553263, "language_loss": 0.79694283, "learning_rate": 2.2881687878635055e-06, "loss": 0.81841266, "num_input_tokens_seen": 168018395, "step": 7823, "time_per_iteration": 2.632756233215332 }, { "auxiliary_loss_clip": 0.01025068, "auxiliary_loss_mlp": 0.01003424, "balance_loss_clip": 1.02190793, "balance_loss_mlp": 1.00163603, "epoch": 0.4704043288741921, "flos": 69240227950080.0, "grad_norm": 0.8086269167579946, "language_loss": 0.56642514, "learning_rate": 2.2877833845963487e-06, "loss": 0.5867101, "num_input_tokens_seen": 168084080, "step": 7824, "time_per_iteration": 3.3140807151794434 }, { "auxiliary_loss_clip": 0.01104679, "auxiliary_loss_mlp": 0.01042887, "balance_loss_clip": 1.04395127, "balance_loss_mlp": 1.02718711, "epoch": 0.47046445212686006, "flos": 18041090584320.0, "grad_norm": 1.8843796036347318, "language_loss": 0.81223321, "learning_rate": 2.2873979704164157e-06, "loss": 0.83370888, "num_input_tokens_seen": 168101555, "step": 7825, "time_per_iteration": 2.700547695159912 }, { "auxiliary_loss_clip": 0.01111276, "auxiliary_loss_mlp": 0.01036611, "balance_loss_clip": 1.0480845, "balance_loss_mlp": 1.02218676, "epoch": 0.470524575379528, "flos": 23951448017280.0, "grad_norm": 1.7729512383292405, "language_loss": 0.66719514, "learning_rate": 2.287012545338324e-06, "loss": 0.68867397, "num_input_tokens_seen": 168121530, "step": 7826, "time_per_iteration": 2.6998069286346436 }, { "auxiliary_loss_clip": 0.01105784, "auxiliary_loss_mlp": 0.01039915, "balance_loss_clip": 1.04433072, "balance_loss_mlp": 1.02479887, "epoch": 0.470584698632196, "flos": 18113558273280.0, "grad_norm": 1.8432989970829954, "language_loss": 0.84173524, "learning_rate": 2.2866271093766877e-06, "loss": 0.86319232, "num_input_tokens_seen": 168140335, "step": 7827, "time_per_iteration": 2.692657709121704 }, { "auxiliary_loss_clip": 0.01024445, "auxiliary_loss_mlp": 0.01004787, "balance_loss_clip": 1.01622581, "balance_loss_mlp": 1.00303495, "epoch": 0.47064482188486395, "flos": 57251916224640.0, "grad_norm": 0.8086690003326286, "language_loss": 0.5568617, "learning_rate": 2.286241662546122e-06, "loss": 0.57715398, "num_input_tokens_seen": 168200535, "step": 7828, "time_per_iteration": 3.184593439102173 }, { "auxiliary_loss_clip": 0.01128245, "auxiliary_loss_mlp": 0.01033804, "balance_loss_clip": 1.04770434, "balance_loss_mlp": 1.02036309, "epoch": 0.4707049451375319, "flos": 17895077798400.0, "grad_norm": 2.799236307786822, "language_loss": 0.80882025, "learning_rate": 2.285856204861245e-06, "loss": 0.8304407, "num_input_tokens_seen": 168219610, "step": 7829, "time_per_iteration": 2.5789284706115723 }, { "auxiliary_loss_clip": 0.01128236, "auxiliary_loss_mlp": 0.01036042, "balance_loss_clip": 1.04866183, "balance_loss_mlp": 1.02311337, "epoch": 0.47076506839019994, "flos": 25232669210880.0, "grad_norm": 1.589084017915349, "language_loss": 0.76252091, "learning_rate": 2.2854707363366703e-06, "loss": 0.78416359, "num_input_tokens_seen": 168242505, "step": 7830, "time_per_iteration": 2.6604039669036865 }, { "auxiliary_loss_clip": 0.01094201, "auxiliary_loss_mlp": 0.01033866, "balance_loss_clip": 1.04519463, "balance_loss_mlp": 1.01907206, "epoch": 0.4708251916428679, "flos": 13479681438720.0, "grad_norm": 1.9041514810278948, "language_loss": 0.7839942, "learning_rate": 2.2850852569870177e-06, "loss": 0.8052749, "num_input_tokens_seen": 168260220, "step": 7831, "time_per_iteration": 2.7709531784057617 }, { "auxiliary_loss_clip": 0.01084793, "auxiliary_loss_mlp": 0.01045555, "balance_loss_clip": 1.03967106, "balance_loss_mlp": 1.0289377, "epoch": 0.47088531489553587, "flos": 30147833450880.0, "grad_norm": 3.4524245779244045, "language_loss": 0.75518548, "learning_rate": 2.2846997668269033e-06, "loss": 0.7764889, "num_input_tokens_seen": 168277360, "step": 7832, "time_per_iteration": 2.9078352451324463 }, { "auxiliary_loss_clip": 0.01100887, "auxiliary_loss_mlp": 0.01027155, "balance_loss_clip": 1.04597783, "balance_loss_mlp": 1.01476312, "epoch": 0.47094543814820383, "flos": 21798280172160.0, "grad_norm": 1.3033633023675582, "language_loss": 0.74446917, "learning_rate": 2.2843142658709454e-06, "loss": 0.76574957, "num_input_tokens_seen": 168296605, "step": 7833, "time_per_iteration": 2.7040505409240723 }, { "auxiliary_loss_clip": 0.01115931, "auxiliary_loss_mlp": 0.01039232, "balance_loss_clip": 1.04605532, "balance_loss_mlp": 1.02489686, "epoch": 0.4710055614008718, "flos": 23003011353600.0, "grad_norm": 1.6784231271486025, "language_loss": 0.75652939, "learning_rate": 2.283928754133762e-06, "loss": 0.778081, "num_input_tokens_seen": 168316205, "step": 7834, "time_per_iteration": 2.651439666748047 }, { "auxiliary_loss_clip": 0.01080958, "auxiliary_loss_mlp": 0.0104352, "balance_loss_clip": 1.04571462, "balance_loss_mlp": 1.02942359, "epoch": 0.47106568465353976, "flos": 42741346452480.0, "grad_norm": 1.5705960877616694, "language_loss": 0.66198736, "learning_rate": 2.283543231629972e-06, "loss": 0.68323219, "num_input_tokens_seen": 168338935, "step": 7835, "time_per_iteration": 2.8833723068237305 }, { "auxiliary_loss_clip": 0.01030822, "auxiliary_loss_mlp": 0.0075266, "balance_loss_clip": 1.01354921, "balance_loss_mlp": 1.00055587, "epoch": 0.4711258079062077, "flos": 68554008570240.0, "grad_norm": 0.8682696962056556, "language_loss": 0.62114525, "learning_rate": 2.283157698374194e-06, "loss": 0.63898003, "num_input_tokens_seen": 168392800, "step": 7836, "time_per_iteration": 3.271106243133545 }, { "auxiliary_loss_clip": 0.01089899, "auxiliary_loss_mlp": 0.00772396, "balance_loss_clip": 1.04188919, "balance_loss_mlp": 1.00066912, "epoch": 0.4711859311588757, "flos": 25446588658560.0, "grad_norm": 2.9726849992756623, "language_loss": 0.69634271, "learning_rate": 2.2827721543810475e-06, "loss": 0.71496564, "num_input_tokens_seen": 168412940, "step": 7837, "time_per_iteration": 2.7227394580841064 }, { "auxiliary_loss_clip": 0.01114908, "auxiliary_loss_mlp": 0.01040024, "balance_loss_clip": 1.04658818, "balance_loss_mlp": 1.02449143, "epoch": 0.47124605441154366, "flos": 21981891519360.0, "grad_norm": 1.834184212780789, "language_loss": 0.66073495, "learning_rate": 2.282386599665153e-06, "loss": 0.68228424, "num_input_tokens_seen": 168431995, "step": 7838, "time_per_iteration": 2.63415265083313 }, { "auxiliary_loss_clip": 0.01101595, "auxiliary_loss_mlp": 0.01040478, "balance_loss_clip": 1.04245853, "balance_loss_mlp": 1.02488542, "epoch": 0.4713061776642116, "flos": 25412689198080.0, "grad_norm": 1.6613879226075605, "language_loss": 0.77071315, "learning_rate": 2.2820010342411304e-06, "loss": 0.79213387, "num_input_tokens_seen": 168454585, "step": 7839, "time_per_iteration": 2.702371835708618 }, { "auxiliary_loss_clip": 0.01089161, "auxiliary_loss_mlp": 0.01035056, "balance_loss_clip": 1.04446244, "balance_loss_mlp": 1.0215137, "epoch": 0.4713663009168796, "flos": 26542259170560.0, "grad_norm": 2.064347613929302, "language_loss": 0.72607076, "learning_rate": 2.2816154581235993e-06, "loss": 0.74731302, "num_input_tokens_seen": 168471265, "step": 7840, "time_per_iteration": 2.7578155994415283 }, { "auxiliary_loss_clip": 0.01098285, "auxiliary_loss_mlp": 0.01033804, "balance_loss_clip": 1.04248786, "balance_loss_mlp": 1.01975548, "epoch": 0.47142642416954755, "flos": 23623583650560.0, "grad_norm": 1.634270857219127, "language_loss": 0.75153434, "learning_rate": 2.2812298713271833e-06, "loss": 0.77285522, "num_input_tokens_seen": 168491360, "step": 7841, "time_per_iteration": 2.7571516036987305 }, { "auxiliary_loss_clip": 0.01097356, "auxiliary_loss_mlp": 0.01036522, "balance_loss_clip": 1.04522789, "balance_loss_mlp": 1.02271175, "epoch": 0.4714865474222155, "flos": 22310150935680.0, "grad_norm": 1.514171980299406, "language_loss": 0.70372689, "learning_rate": 2.280844273866501e-06, "loss": 0.72506565, "num_input_tokens_seen": 168511335, "step": 7842, "time_per_iteration": 2.6693220138549805 }, { "auxiliary_loss_clip": 0.01122506, "auxiliary_loss_mlp": 0.01036861, "balance_loss_clip": 1.05041289, "balance_loss_mlp": 1.02272844, "epoch": 0.4715466706748835, "flos": 17822430541440.0, "grad_norm": 2.3877412842319243, "language_loss": 0.78754079, "learning_rate": 2.280458665756177e-06, "loss": 0.80913448, "num_input_tokens_seen": 168529920, "step": 7843, "time_per_iteration": 2.584821939468384 }, { "auxiliary_loss_clip": 0.01112783, "auxiliary_loss_mlp": 0.01033598, "balance_loss_clip": 1.04609227, "balance_loss_mlp": 1.02013922, "epoch": 0.4716067939275515, "flos": 23659530186240.0, "grad_norm": 1.5083750473310347, "language_loss": 0.73945224, "learning_rate": 2.280073047010832e-06, "loss": 0.76091611, "num_input_tokens_seen": 168550595, "step": 7844, "time_per_iteration": 2.6947662830352783 }, { "auxiliary_loss_clip": 0.01103523, "auxiliary_loss_mlp": 0.01045426, "balance_loss_clip": 1.04754925, "balance_loss_mlp": 1.03077483, "epoch": 0.47166691718021947, "flos": 17930162407680.0, "grad_norm": 1.6596812780951513, "language_loss": 0.7849918, "learning_rate": 2.279687417645088e-06, "loss": 0.8064813, "num_input_tokens_seen": 168569765, "step": 7845, "time_per_iteration": 2.64786434173584 }, { "auxiliary_loss_clip": 0.01116093, "auxiliary_loss_mlp": 0.01035695, "balance_loss_clip": 1.04657555, "balance_loss_mlp": 1.02204597, "epoch": 0.47172704043288743, "flos": 26614583205120.0, "grad_norm": 1.4795134607526772, "language_loss": 0.73325998, "learning_rate": 2.2793017776735703e-06, "loss": 0.75477785, "num_input_tokens_seen": 168591525, "step": 7846, "time_per_iteration": 2.6890015602111816 }, { "auxiliary_loss_clip": 0.01112295, "auxiliary_loss_mlp": 0.01033387, "balance_loss_clip": 1.04567862, "balance_loss_mlp": 1.02053618, "epoch": 0.4717871636855554, "flos": 27922700707200.0, "grad_norm": 1.365245213481775, "language_loss": 0.74306214, "learning_rate": 2.2789161271109e-06, "loss": 0.76451898, "num_input_tokens_seen": 168611235, "step": 7847, "time_per_iteration": 2.664600133895874 }, { "auxiliary_loss_clip": 0.01076671, "auxiliary_loss_mlp": 0.01036211, "balance_loss_clip": 1.04269147, "balance_loss_mlp": 1.02244806, "epoch": 0.47184728693822336, "flos": 14502237816960.0, "grad_norm": 1.614512390946798, "language_loss": 0.80744767, "learning_rate": 2.278530465971703e-06, "loss": 0.82857651, "num_input_tokens_seen": 168628710, "step": 7848, "time_per_iteration": 2.7662644386291504 }, { "auxiliary_loss_clip": 0.01118674, "auxiliary_loss_mlp": 0.01035868, "balance_loss_clip": 1.04767179, "balance_loss_mlp": 1.02170014, "epoch": 0.47190741019089133, "flos": 17856545483520.0, "grad_norm": 3.381301580597114, "language_loss": 0.70282733, "learning_rate": 2.2781447942706032e-06, "loss": 0.72437274, "num_input_tokens_seen": 168645645, "step": 7849, "time_per_iteration": 2.628324031829834 }, { "auxiliary_loss_clip": 0.01102555, "auxiliary_loss_mlp": 0.01043039, "balance_loss_clip": 1.04688513, "balance_loss_mlp": 1.02679062, "epoch": 0.4719675334435593, "flos": 17895472848000.0, "grad_norm": 2.2108635677358968, "language_loss": 0.6920523, "learning_rate": 2.277759112022224e-06, "loss": 0.71350825, "num_input_tokens_seen": 168664165, "step": 7850, "time_per_iteration": 2.678515672683716 }, { "auxiliary_loss_clip": 0.01071934, "auxiliary_loss_mlp": 0.0103323, "balance_loss_clip": 1.04294968, "balance_loss_mlp": 1.0192523, "epoch": 0.47202765669622726, "flos": 20704369426560.0, "grad_norm": 1.8559154127156776, "language_loss": 0.75022864, "learning_rate": 2.2773734192411916e-06, "loss": 0.77128029, "num_input_tokens_seen": 168681940, "step": 7851, "time_per_iteration": 2.7907421588897705 }, { "auxiliary_loss_clip": 0.01058717, "auxiliary_loss_mlp": 0.0104416, "balance_loss_clip": 1.03438354, "balance_loss_mlp": 1.02636182, "epoch": 0.4720877799488952, "flos": 16360255607040.0, "grad_norm": 1.8954666463572496, "language_loss": 0.76087546, "learning_rate": 2.276987715942132e-06, "loss": 0.78190422, "num_input_tokens_seen": 168698830, "step": 7852, "time_per_iteration": 2.751862049102783 }, { "auxiliary_loss_clip": 0.01090696, "auxiliary_loss_mlp": 0.01031466, "balance_loss_clip": 1.0440855, "balance_loss_mlp": 1.01667845, "epoch": 0.4721479032015632, "flos": 20668171495680.0, "grad_norm": 1.6687991208994266, "language_loss": 0.69092613, "learning_rate": 2.2766020021396696e-06, "loss": 0.71214771, "num_input_tokens_seen": 168718305, "step": 7853, "time_per_iteration": 2.8860716819763184 }, { "auxiliary_loss_clip": 0.01023698, "auxiliary_loss_mlp": 0.01005171, "balance_loss_clip": 1.03293765, "balance_loss_mlp": 1.00360918, "epoch": 0.47220802645423116, "flos": 67750438435200.0, "grad_norm": 0.7060966439190681, "language_loss": 0.50175303, "learning_rate": 2.276216277848432e-06, "loss": 0.52204174, "num_input_tokens_seen": 168782365, "step": 7854, "time_per_iteration": 4.915671110153198 }, { "auxiliary_loss_clip": 0.0112187, "auxiliary_loss_mlp": 0.01035341, "balance_loss_clip": 1.04927993, "balance_loss_mlp": 1.02046967, "epoch": 0.4722681497068991, "flos": 20921449271040.0, "grad_norm": 1.8544471627611243, "language_loss": 0.63919318, "learning_rate": 2.2758305430830455e-06, "loss": 0.66076523, "num_input_tokens_seen": 168800485, "step": 7855, "time_per_iteration": 4.303591728210449 }, { "auxiliary_loss_clip": 0.01115964, "auxiliary_loss_mlp": 0.01039633, "balance_loss_clip": 1.04526174, "balance_loss_mlp": 1.02463675, "epoch": 0.4723282729595671, "flos": 28293083798400.0, "grad_norm": 6.403691145457763, "language_loss": 0.75835574, "learning_rate": 2.2754447978581376e-06, "loss": 0.77991176, "num_input_tokens_seen": 168818965, "step": 7856, "time_per_iteration": 2.669156074523926 }, { "auxiliary_loss_clip": 0.01102045, "auxiliary_loss_mlp": 0.01036544, "balance_loss_clip": 1.04435217, "balance_loss_mlp": 1.02334714, "epoch": 0.4723883962122351, "flos": 27125053338240.0, "grad_norm": 1.8316073665627561, "language_loss": 0.7513321, "learning_rate": 2.2750590421883347e-06, "loss": 0.77271795, "num_input_tokens_seen": 168840355, "step": 7857, "time_per_iteration": 5.926163673400879 }, { "auxiliary_loss_clip": 0.0110506, "auxiliary_loss_mlp": 0.01044055, "balance_loss_clip": 1.04619288, "balance_loss_mlp": 1.03164554, "epoch": 0.47244851946490307, "flos": 31537253387520.0, "grad_norm": 1.4352718890089464, "language_loss": 0.64871937, "learning_rate": 2.2746732760882655e-06, "loss": 0.67021048, "num_input_tokens_seen": 168861765, "step": 7858, "time_per_iteration": 2.7516961097717285 }, { "auxiliary_loss_clip": 0.01115653, "auxiliary_loss_mlp": 0.00772171, "balance_loss_clip": 1.04487467, "balance_loss_mlp": 1.00070405, "epoch": 0.47250864271757104, "flos": 20886544229760.0, "grad_norm": 4.333924209566871, "language_loss": 0.70584702, "learning_rate": 2.2742874995725575e-06, "loss": 0.72472525, "num_input_tokens_seen": 168881310, "step": 7859, "time_per_iteration": 2.63272762298584 }, { "auxiliary_loss_clip": 0.01132339, "auxiliary_loss_mlp": 0.01038437, "balance_loss_clip": 1.0472064, "balance_loss_mlp": 1.02420318, "epoch": 0.472568765970239, "flos": 20522086882560.0, "grad_norm": 1.7578939418215658, "language_loss": 0.62056947, "learning_rate": 2.2739017126558413e-06, "loss": 0.64227724, "num_input_tokens_seen": 168899470, "step": 7860, "time_per_iteration": 2.579881429672241 }, { "auxiliary_loss_clip": 0.01104772, "auxiliary_loss_mlp": 0.01042498, "balance_loss_clip": 1.04455113, "balance_loss_mlp": 1.02835417, "epoch": 0.47262888922290697, "flos": 35805200417280.0, "grad_norm": 2.5847882369160584, "language_loss": 0.71352196, "learning_rate": 2.2735159153527445e-06, "loss": 0.73499465, "num_input_tokens_seen": 168921495, "step": 7861, "time_per_iteration": 2.7616021633148193 }, { "auxiliary_loss_clip": 0.01100093, "auxiliary_loss_mlp": 0.01035425, "balance_loss_clip": 1.04298115, "balance_loss_mlp": 1.02136993, "epoch": 0.47268901247557493, "flos": 20667740532480.0, "grad_norm": 1.877615917676971, "language_loss": 0.85056359, "learning_rate": 2.273130107677896e-06, "loss": 0.87191874, "num_input_tokens_seen": 168940515, "step": 7862, "time_per_iteration": 2.730851173400879 }, { "auxiliary_loss_clip": 0.01126067, "auxiliary_loss_mlp": 0.01032341, "balance_loss_clip": 1.04310465, "balance_loss_mlp": 1.01836395, "epoch": 0.4727491357282429, "flos": 19573291082880.0, "grad_norm": 1.8403668162610285, "language_loss": 0.84233111, "learning_rate": 2.272744289645927e-06, "loss": 0.86391521, "num_input_tokens_seen": 168958340, "step": 7863, "time_per_iteration": 2.7247161865234375 }, { "auxiliary_loss_clip": 0.01104075, "auxiliary_loss_mlp": 0.01041818, "balance_loss_clip": 1.04576826, "balance_loss_mlp": 1.02810335, "epoch": 0.47280925898091086, "flos": 18217231902720.0, "grad_norm": 2.0137135318025843, "language_loss": 0.66243893, "learning_rate": 2.272358461271467e-06, "loss": 0.68389785, "num_input_tokens_seen": 168974850, "step": 7864, "time_per_iteration": 2.7027535438537598 }, { "auxiliary_loss_clip": 0.01126031, "auxiliary_loss_mlp": 0.01038902, "balance_loss_clip": 1.04373837, "balance_loss_mlp": 1.02402425, "epoch": 0.4728693822335788, "flos": 17821820010240.0, "grad_norm": 1.9458421333469222, "language_loss": 0.64846861, "learning_rate": 2.271972622569147e-06, "loss": 0.67011791, "num_input_tokens_seen": 168992860, "step": 7865, "time_per_iteration": 2.599947214126587 }, { "auxiliary_loss_clip": 0.01095039, "auxiliary_loss_mlp": 0.00771615, "balance_loss_clip": 1.04065597, "balance_loss_mlp": 1.00069022, "epoch": 0.4729295054862468, "flos": 20595057361920.0, "grad_norm": 1.8988594463693396, "language_loss": 0.73979223, "learning_rate": 2.2715867735535976e-06, "loss": 0.75845885, "num_input_tokens_seen": 169010325, "step": 7866, "time_per_iteration": 2.6904079914093018 }, { "auxiliary_loss_clip": 0.01127633, "auxiliary_loss_mlp": 0.01036812, "balance_loss_clip": 1.0444746, "balance_loss_mlp": 1.02215528, "epoch": 0.47298962873891476, "flos": 23368079232000.0, "grad_norm": 1.7138995799513466, "language_loss": 0.82882631, "learning_rate": 2.271200914239451e-06, "loss": 0.85047078, "num_input_tokens_seen": 169029840, "step": 7867, "time_per_iteration": 2.66166353225708 }, { "auxiliary_loss_clip": 0.01113116, "auxiliary_loss_mlp": 0.01035066, "balance_loss_clip": 1.04474282, "balance_loss_mlp": 1.02197099, "epoch": 0.4730497519915827, "flos": 22052240305920.0, "grad_norm": 1.59304374398017, "language_loss": 0.79711115, "learning_rate": 2.2708150446413385e-06, "loss": 0.81859303, "num_input_tokens_seen": 169049975, "step": 7868, "time_per_iteration": 2.639418363571167 }, { "auxiliary_loss_clip": 0.01048577, "auxiliary_loss_mlp": 0.01036292, "balance_loss_clip": 1.03682256, "balance_loss_mlp": 1.02049041, "epoch": 0.4731098752442507, "flos": 21069724613760.0, "grad_norm": 2.2697646545371772, "language_loss": 0.74715841, "learning_rate": 2.2704291647738915e-06, "loss": 0.7680071, "num_input_tokens_seen": 169069540, "step": 7869, "time_per_iteration": 2.822831153869629 }, { "auxiliary_loss_clip": 0.01108509, "auxiliary_loss_mlp": 0.01048779, "balance_loss_clip": 1.04608214, "balance_loss_mlp": 1.03300154, "epoch": 0.4731699984969187, "flos": 22528775064960.0, "grad_norm": 2.141854382789547, "language_loss": 0.73684996, "learning_rate": 2.2700432746517443e-06, "loss": 0.75842285, "num_input_tokens_seen": 169089940, "step": 7870, "time_per_iteration": 2.7175748348236084 }, { "auxiliary_loss_clip": 0.01133545, "auxiliary_loss_mlp": 0.01041593, "balance_loss_clip": 1.04755211, "balance_loss_mlp": 1.02635777, "epoch": 0.4732301217495867, "flos": 24898124914560.0, "grad_norm": 2.253339307670162, "language_loss": 0.81085944, "learning_rate": 2.2696573742895292e-06, "loss": 0.83261085, "num_input_tokens_seen": 169109650, "step": 7871, "time_per_iteration": 2.6193602085113525 }, { "auxiliary_loss_clip": 0.01113818, "auxiliary_loss_mlp": 0.01036061, "balance_loss_clip": 1.04329586, "balance_loss_mlp": 1.02133834, "epoch": 0.47329024500225464, "flos": 22784423137920.0, "grad_norm": 1.5762073479047713, "language_loss": 0.75922841, "learning_rate": 2.269271463701879e-06, "loss": 0.78072715, "num_input_tokens_seen": 169128990, "step": 7872, "time_per_iteration": 2.6391725540161133 }, { "auxiliary_loss_clip": 0.01091788, "auxiliary_loss_mlp": 0.01038432, "balance_loss_clip": 1.04121172, "balance_loss_mlp": 1.02376986, "epoch": 0.4733503682549226, "flos": 38695902220800.0, "grad_norm": 3.094756801604535, "language_loss": 0.67562377, "learning_rate": 2.268885542903428e-06, "loss": 0.696926, "num_input_tokens_seen": 169154645, "step": 7873, "time_per_iteration": 2.8466758728027344 }, { "auxiliary_loss_clip": 0.01117181, "auxiliary_loss_mlp": 0.01036678, "balance_loss_clip": 1.04567063, "balance_loss_mlp": 1.02267087, "epoch": 0.47341049150759057, "flos": 22966849336320.0, "grad_norm": 1.6392218744116203, "language_loss": 0.72839928, "learning_rate": 2.26849961190881e-06, "loss": 0.74993783, "num_input_tokens_seen": 169174995, "step": 7874, "time_per_iteration": 2.721020221710205 }, { "auxiliary_loss_clip": 0.01113028, "auxiliary_loss_mlp": 0.01038664, "balance_loss_clip": 1.04846478, "balance_loss_mlp": 1.02471697, "epoch": 0.47347061476025853, "flos": 14538471661440.0, "grad_norm": 3.032092549096925, "language_loss": 0.65002596, "learning_rate": 2.26811367073266e-06, "loss": 0.67154288, "num_input_tokens_seen": 169191815, "step": 7875, "time_per_iteration": 2.6652960777282715 }, { "auxiliary_loss_clip": 0.01083743, "auxiliary_loss_mlp": 0.01035273, "balance_loss_clip": 1.04805076, "balance_loss_mlp": 1.02059197, "epoch": 0.4735307380129265, "flos": 30263250827520.0, "grad_norm": 2.768907187204124, "language_loss": 0.8101728, "learning_rate": 2.2677277193896125e-06, "loss": 0.83136296, "num_input_tokens_seen": 169210430, "step": 7876, "time_per_iteration": 2.7860774993896484 }, { "auxiliary_loss_clip": 0.01096604, "auxiliary_loss_mlp": 0.01049403, "balance_loss_clip": 1.04034781, "balance_loss_mlp": 1.03362572, "epoch": 0.47359086126559446, "flos": 19391044452480.0, "grad_norm": 1.718915834241656, "language_loss": 0.79123086, "learning_rate": 2.267341757894304e-06, "loss": 0.81269091, "num_input_tokens_seen": 169229295, "step": 7877, "time_per_iteration": 2.6741349697113037 }, { "auxiliary_loss_clip": 0.01119367, "auxiliary_loss_mlp": 0.00771148, "balance_loss_clip": 1.04634619, "balance_loss_mlp": 1.00065994, "epoch": 0.47365098451826243, "flos": 21939408708480.0, "grad_norm": 1.9321122257733154, "language_loss": 0.7070595, "learning_rate": 2.2669557862613685e-06, "loss": 0.72596461, "num_input_tokens_seen": 169247855, "step": 7878, "time_per_iteration": 2.65336012840271 }, { "auxiliary_loss_clip": 0.01091201, "auxiliary_loss_mlp": 0.01041141, "balance_loss_clip": 1.04987168, "balance_loss_mlp": 1.02767622, "epoch": 0.4737111077709304, "flos": 25845053207040.0, "grad_norm": 1.650502341043129, "language_loss": 0.75037253, "learning_rate": 2.2665698045054425e-06, "loss": 0.77169597, "num_input_tokens_seen": 169268860, "step": 7879, "time_per_iteration": 2.731395721435547 }, { "auxiliary_loss_clip": 0.01030587, "auxiliary_loss_mlp": 0.01009103, "balance_loss_clip": 1.02360272, "balance_loss_mlp": 1.00741053, "epoch": 0.47377123102359836, "flos": 67760886314880.0, "grad_norm": 0.7327852929375173, "language_loss": 0.61306548, "learning_rate": 2.266183812641164e-06, "loss": 0.63346243, "num_input_tokens_seen": 169331855, "step": 7880, "time_per_iteration": 3.224714756011963 }, { "auxiliary_loss_clip": 0.0110857, "auxiliary_loss_mlp": 0.01041962, "balance_loss_clip": 1.04677773, "balance_loss_mlp": 1.02690625, "epoch": 0.4738313542762663, "flos": 24315977191680.0, "grad_norm": 1.5081125335533625, "language_loss": 0.68397921, "learning_rate": 2.2657978106831675e-06, "loss": 0.70548451, "num_input_tokens_seen": 169352175, "step": 7881, "time_per_iteration": 2.7536203861236572 }, { "auxiliary_loss_clip": 0.01068036, "auxiliary_loss_mlp": 0.01031577, "balance_loss_clip": 1.04936802, "balance_loss_mlp": 1.01798737, "epoch": 0.4738914775289343, "flos": 20705339093760.0, "grad_norm": 1.7877053000392102, "language_loss": 0.77066004, "learning_rate": 2.265411798646092e-06, "loss": 0.7916562, "num_input_tokens_seen": 169371215, "step": 7882, "time_per_iteration": 2.873434543609619 }, { "auxiliary_loss_clip": 0.01116489, "auxiliary_loss_mlp": 0.01035892, "balance_loss_clip": 1.04511285, "balance_loss_mlp": 1.02132463, "epoch": 0.4739516007816023, "flos": 25446337263360.0, "grad_norm": 2.3087904075212204, "language_loss": 0.76111883, "learning_rate": 2.2650257765445747e-06, "loss": 0.78264266, "num_input_tokens_seen": 169391745, "step": 7883, "time_per_iteration": 2.7326574325561523 }, { "auxiliary_loss_clip": 0.01107432, "auxiliary_loss_mlp": 0.01031652, "balance_loss_clip": 1.04656231, "balance_loss_mlp": 1.01863456, "epoch": 0.4740117240342703, "flos": 19974341410560.0, "grad_norm": 1.7217647008431887, "language_loss": 0.72281808, "learning_rate": 2.2646397443932525e-06, "loss": 0.74420893, "num_input_tokens_seen": 169409845, "step": 7884, "time_per_iteration": 2.660172462463379 }, { "auxiliary_loss_clip": 0.01123059, "auxiliary_loss_mlp": 0.01037646, "balance_loss_clip": 1.04745269, "balance_loss_mlp": 1.02225614, "epoch": 0.47407184728693824, "flos": 15661146222720.0, "grad_norm": 2.1356892731193557, "language_loss": 0.82255256, "learning_rate": 2.2642537022067655e-06, "loss": 0.8441596, "num_input_tokens_seen": 169426085, "step": 7885, "time_per_iteration": 2.6816513538360596 }, { "auxiliary_loss_clip": 0.01093494, "auxiliary_loss_mlp": 0.01050029, "balance_loss_clip": 1.0418942, "balance_loss_mlp": 1.0338043, "epoch": 0.4741319705396062, "flos": 18588800142720.0, "grad_norm": 1.6528542083339792, "language_loss": 0.73020607, "learning_rate": 2.263867649999751e-06, "loss": 0.75164127, "num_input_tokens_seen": 169444705, "step": 7886, "time_per_iteration": 2.6734073162078857 }, { "auxiliary_loss_clip": 0.01110604, "auxiliary_loss_mlp": 0.01038225, "balance_loss_clip": 1.04582644, "balance_loss_mlp": 1.02251315, "epoch": 0.47419209379227417, "flos": 13261093223040.0, "grad_norm": 2.0346146652784327, "language_loss": 0.74043691, "learning_rate": 2.263481587786849e-06, "loss": 0.76192516, "num_input_tokens_seen": 169460850, "step": 7887, "time_per_iteration": 2.6761467456817627 }, { "auxiliary_loss_clip": 0.01118145, "auxiliary_loss_mlp": 0.01031795, "balance_loss_clip": 1.0474298, "balance_loss_mlp": 1.01849771, "epoch": 0.47425221704494214, "flos": 20044043752320.0, "grad_norm": 1.7788052130685665, "language_loss": 0.77452385, "learning_rate": 2.2630955155826993e-06, "loss": 0.79602331, "num_input_tokens_seen": 169478890, "step": 7888, "time_per_iteration": 2.6402924060821533 }, { "auxiliary_loss_clip": 0.01118769, "auxiliary_loss_mlp": 0.01034654, "balance_loss_clip": 1.0469296, "balance_loss_mlp": 1.02044427, "epoch": 0.4743123402976101, "flos": 27271892136960.0, "grad_norm": 4.211713497556063, "language_loss": 0.72521853, "learning_rate": 2.2627094334019406e-06, "loss": 0.7467528, "num_input_tokens_seen": 169499690, "step": 7889, "time_per_iteration": 2.693746566772461 }, { "auxiliary_loss_clip": 0.0104991, "auxiliary_loss_mlp": 0.01005818, "balance_loss_clip": 1.02273417, "balance_loss_mlp": 1.00418472, "epoch": 0.47437246355027807, "flos": 55393970261760.0, "grad_norm": 0.7194077429508707, "language_loss": 0.5605737, "learning_rate": 2.262323341259214e-06, "loss": 0.58113098, "num_input_tokens_seen": 169560475, "step": 7890, "time_per_iteration": 3.180250883102417 }, { "auxiliary_loss_clip": 0.01120493, "auxiliary_loss_mlp": 0.01032412, "balance_loss_clip": 1.04944348, "balance_loss_mlp": 1.01705146, "epoch": 0.47443258680294603, "flos": 23878477537920.0, "grad_norm": 1.9527728253341778, "language_loss": 0.65866226, "learning_rate": 2.2619372391691605e-06, "loss": 0.68019128, "num_input_tokens_seen": 169580110, "step": 7891, "time_per_iteration": 2.6768221855163574 }, { "auxiliary_loss_clip": 0.01135111, "auxiliary_loss_mlp": 0.01039265, "balance_loss_clip": 1.04865634, "balance_loss_mlp": 1.02342188, "epoch": 0.474492710055614, "flos": 21977761455360.0, "grad_norm": 2.2722368949670493, "language_loss": 0.7100271, "learning_rate": 2.26155112714642e-06, "loss": 0.73177087, "num_input_tokens_seen": 169597510, "step": 7892, "time_per_iteration": 2.5857720375061035 }, { "auxiliary_loss_clip": 0.01021432, "auxiliary_loss_mlp": 0.01001129, "balance_loss_clip": 1.01879561, "balance_loss_mlp": 0.99938869, "epoch": 0.47455283330828196, "flos": 62557180122240.0, "grad_norm": 0.8083016633053688, "language_loss": 0.5854069, "learning_rate": 2.2611650052056355e-06, "loss": 0.60563254, "num_input_tokens_seen": 169660010, "step": 7893, "time_per_iteration": 3.298412799835205 }, { "auxiliary_loss_clip": 0.01119918, "auxiliary_loss_mlp": 0.01040659, "balance_loss_clip": 1.04893851, "balance_loss_mlp": 1.02661026, "epoch": 0.47461295656094993, "flos": 12093637380480.0, "grad_norm": 2.1787400532077608, "language_loss": 0.77515149, "learning_rate": 2.2607788733614463e-06, "loss": 0.79675728, "num_input_tokens_seen": 169678485, "step": 7894, "time_per_iteration": 4.300025463104248 }, { "auxiliary_loss_clip": 0.01119579, "auxiliary_loss_mlp": 0.01038145, "balance_loss_clip": 1.04634869, "balance_loss_mlp": 1.02365553, "epoch": 0.4746730798136179, "flos": 20884568981760.0, "grad_norm": 1.6992264056336024, "language_loss": 0.75134289, "learning_rate": 2.260392731628497e-06, "loss": 0.77292013, "num_input_tokens_seen": 169697335, "step": 7895, "time_per_iteration": 4.2042882442474365 }, { "auxiliary_loss_clip": 0.01115221, "auxiliary_loss_mlp": 0.01035192, "balance_loss_clip": 1.04379582, "balance_loss_mlp": 1.02000451, "epoch": 0.4747332030662859, "flos": 19974808287360.0, "grad_norm": 2.3363867956596462, "language_loss": 0.83016753, "learning_rate": 2.260006580021429e-06, "loss": 0.85167164, "num_input_tokens_seen": 169715395, "step": 7896, "time_per_iteration": 2.6993515491485596 }, { "auxiliary_loss_clip": 0.01115945, "auxiliary_loss_mlp": 0.01033612, "balance_loss_clip": 1.04578996, "balance_loss_mlp": 1.01843619, "epoch": 0.4747933263189539, "flos": 16034186920320.0, "grad_norm": 2.109517003677199, "language_loss": 0.7557857, "learning_rate": 2.259620418554886e-06, "loss": 0.77728134, "num_input_tokens_seen": 169733755, "step": 7897, "time_per_iteration": 4.253166198730469 }, { "auxiliary_loss_clip": 0.01108787, "auxiliary_loss_mlp": 0.01040894, "balance_loss_clip": 1.04561198, "balance_loss_mlp": 1.02645135, "epoch": 0.47485344957162184, "flos": 13955102876160.0, "grad_norm": 2.267424442093673, "language_loss": 0.63623869, "learning_rate": 2.25923424724351e-06, "loss": 0.65773547, "num_input_tokens_seen": 169751390, "step": 7898, "time_per_iteration": 2.672621011734009 }, { "auxiliary_loss_clip": 0.01091849, "auxiliary_loss_mlp": 0.01057132, "balance_loss_clip": 1.04254556, "balance_loss_mlp": 1.03949475, "epoch": 0.4749135728242898, "flos": 20449080489600.0, "grad_norm": 3.549969153580447, "language_loss": 0.70200998, "learning_rate": 2.258848066101946e-06, "loss": 0.72349977, "num_input_tokens_seen": 169769500, "step": 7899, "time_per_iteration": 2.6986401081085205 }, { "auxiliary_loss_clip": 0.01119057, "auxiliary_loss_mlp": 0.01040719, "balance_loss_clip": 1.04576528, "balance_loss_mlp": 1.02590108, "epoch": 0.4749736960769578, "flos": 28949961767040.0, "grad_norm": 1.9384177803560112, "language_loss": 0.68627715, "learning_rate": 2.258461875144837e-06, "loss": 0.70787489, "num_input_tokens_seen": 169789215, "step": 7900, "time_per_iteration": 2.695420265197754 }, { "auxiliary_loss_clip": 0.01088615, "auxiliary_loss_mlp": 0.01048142, "balance_loss_clip": 1.04223442, "balance_loss_mlp": 1.0335629, "epoch": 0.47503381932962574, "flos": 31938770592000.0, "grad_norm": 2.214181272016126, "language_loss": 0.70571202, "learning_rate": 2.2580756743868273e-06, "loss": 0.72707957, "num_input_tokens_seen": 169808825, "step": 7901, "time_per_iteration": 2.7880799770355225 }, { "auxiliary_loss_clip": 0.01101024, "auxiliary_loss_mlp": 0.01063852, "balance_loss_clip": 1.04344749, "balance_loss_mlp": 1.04805636, "epoch": 0.4750939425822937, "flos": 22127257860480.0, "grad_norm": 1.723548754677231, "language_loss": 0.73669708, "learning_rate": 2.2576894638425636e-06, "loss": 0.75834584, "num_input_tokens_seen": 169827590, "step": 7902, "time_per_iteration": 2.67350172996521 }, { "auxiliary_loss_clip": 0.01087876, "auxiliary_loss_mlp": 0.01040789, "balance_loss_clip": 1.04317856, "balance_loss_mlp": 1.02710962, "epoch": 0.47515406583496167, "flos": 20850094903680.0, "grad_norm": 1.7450056007143964, "language_loss": 0.68050694, "learning_rate": 2.257303243526688e-06, "loss": 0.70179355, "num_input_tokens_seen": 169844925, "step": 7903, "time_per_iteration": 2.7626256942749023 }, { "auxiliary_loss_clip": 0.01104723, "auxiliary_loss_mlp": 0.01035743, "balance_loss_clip": 1.043818, "balance_loss_mlp": 1.02206981, "epoch": 0.47521418908762963, "flos": 17524802448000.0, "grad_norm": 1.9051075920789844, "language_loss": 0.72356462, "learning_rate": 2.256917013453848e-06, "loss": 0.74496931, "num_input_tokens_seen": 169862705, "step": 7904, "time_per_iteration": 2.6790597438812256 }, { "auxiliary_loss_clip": 0.01065198, "auxiliary_loss_mlp": 0.01045369, "balance_loss_clip": 1.03584373, "balance_loss_mlp": 1.02957416, "epoch": 0.4752743123402976, "flos": 20559434048640.0, "grad_norm": 1.6154437659751681, "language_loss": 0.86472631, "learning_rate": 2.25653077363869e-06, "loss": 0.88583207, "num_input_tokens_seen": 169880155, "step": 7905, "time_per_iteration": 2.733799457550049 }, { "auxiliary_loss_clip": 0.0110676, "auxiliary_loss_mlp": 0.01037063, "balance_loss_clip": 1.04021764, "balance_loss_mlp": 1.02423561, "epoch": 0.47533443559296557, "flos": 26360623071360.0, "grad_norm": 1.7729713006372103, "language_loss": 0.82212102, "learning_rate": 2.2561445240958583e-06, "loss": 0.84355921, "num_input_tokens_seen": 169901525, "step": 7906, "time_per_iteration": 2.6994829177856445 }, { "auxiliary_loss_clip": 0.01029489, "auxiliary_loss_mlp": 0.01023044, "balance_loss_clip": 1.03056157, "balance_loss_mlp": 1.02150619, "epoch": 0.47539455884563353, "flos": 65949660967680.0, "grad_norm": 0.6767545541611142, "language_loss": 0.58947372, "learning_rate": 2.255758264840002e-06, "loss": 0.60999906, "num_input_tokens_seen": 169970345, "step": 7907, "time_per_iteration": 3.409289836883545 }, { "auxiliary_loss_clip": 0.01112328, "auxiliary_loss_mlp": 0.0103978, "balance_loss_clip": 1.04298031, "balance_loss_mlp": 1.02575445, "epoch": 0.4754546820983015, "flos": 17238128002560.0, "grad_norm": 2.5037076646878664, "language_loss": 0.81147426, "learning_rate": 2.255371995885765e-06, "loss": 0.83299541, "num_input_tokens_seen": 169986440, "step": 7908, "time_per_iteration": 2.6126997470855713 }, { "auxiliary_loss_clip": 0.01120375, "auxiliary_loss_mlp": 0.01045183, "balance_loss_clip": 1.04887652, "balance_loss_mlp": 1.03041351, "epoch": 0.47551480535096946, "flos": 19825886499840.0, "grad_norm": 1.7145689882234993, "language_loss": 0.73805857, "learning_rate": 2.254985717247797e-06, "loss": 0.75971419, "num_input_tokens_seen": 170005705, "step": 7909, "time_per_iteration": 2.7153172492980957 }, { "auxiliary_loss_clip": 0.01098915, "auxiliary_loss_mlp": 0.0103739, "balance_loss_clip": 1.04232681, "balance_loss_mlp": 1.02348399, "epoch": 0.4755749286036375, "flos": 22163958581760.0, "grad_norm": 1.5099683944930966, "language_loss": 0.75533628, "learning_rate": 2.2545994289407457e-06, "loss": 0.77669942, "num_input_tokens_seen": 170023415, "step": 7910, "time_per_iteration": 2.7330431938171387 }, { "auxiliary_loss_clip": 0.01113687, "auxiliary_loss_mlp": 0.01030183, "balance_loss_clip": 1.04379678, "balance_loss_mlp": 1.01749897, "epoch": 0.47563505185630545, "flos": 21648280976640.0, "grad_norm": 1.931062443356086, "language_loss": 0.79401493, "learning_rate": 2.2542131309792577e-06, "loss": 0.81545365, "num_input_tokens_seen": 170042395, "step": 7911, "time_per_iteration": 2.6149117946624756 }, { "auxiliary_loss_clip": 0.01098041, "auxiliary_loss_mlp": 0.00773063, "balance_loss_clip": 1.04096794, "balance_loss_mlp": 1.00061882, "epoch": 0.4756951751089734, "flos": 20628777254400.0, "grad_norm": 2.2768804327487113, "language_loss": 0.75414324, "learning_rate": 2.253826823377983e-06, "loss": 0.77285427, "num_input_tokens_seen": 170061610, "step": 7912, "time_per_iteration": 2.680414915084839 }, { "auxiliary_loss_clip": 0.01123715, "auxiliary_loss_mlp": 0.01037472, "balance_loss_clip": 1.04319668, "balance_loss_mlp": 1.02353013, "epoch": 0.4757552983616414, "flos": 25848788221440.0, "grad_norm": 1.4371041113730632, "language_loss": 0.74065906, "learning_rate": 2.253440506151569e-06, "loss": 0.76227093, "num_input_tokens_seen": 170083505, "step": 7913, "time_per_iteration": 2.6565608978271484 }, { "auxiliary_loss_clip": 0.0110748, "auxiliary_loss_mlp": 0.01031808, "balance_loss_clip": 1.04591024, "balance_loss_mlp": 1.01694882, "epoch": 0.47581542161430934, "flos": 18223013992320.0, "grad_norm": 2.17158702079863, "language_loss": 0.72123522, "learning_rate": 2.253054179314666e-06, "loss": 0.7426281, "num_input_tokens_seen": 170100690, "step": 7914, "time_per_iteration": 2.6789934635162354 }, { "auxiliary_loss_clip": 0.01103912, "auxiliary_loss_mlp": 0.01042984, "balance_loss_clip": 1.04652143, "balance_loss_mlp": 1.02944756, "epoch": 0.4758755448669773, "flos": 21579763783680.0, "grad_norm": 2.3786315570139345, "language_loss": 0.64855683, "learning_rate": 2.2526678428819227e-06, "loss": 0.67002577, "num_input_tokens_seen": 170119240, "step": 7915, "time_per_iteration": 2.65608549118042 }, { "auxiliary_loss_clip": 0.01123163, "auxiliary_loss_mlp": 0.01041838, "balance_loss_clip": 1.04508734, "balance_loss_mlp": 1.02774107, "epoch": 0.47593566811964527, "flos": 15231152511360.0, "grad_norm": 1.7019759484121837, "language_loss": 0.76935744, "learning_rate": 2.2522814968679896e-06, "loss": 0.79100746, "num_input_tokens_seen": 170136450, "step": 7916, "time_per_iteration": 2.585491418838501 }, { "auxiliary_loss_clip": 0.01125392, "auxiliary_loss_mlp": 0.01036553, "balance_loss_clip": 1.04389, "balance_loss_mlp": 1.02302265, "epoch": 0.47599579137231324, "flos": 21543242630400.0, "grad_norm": 2.0866631919048175, "language_loss": 0.63895321, "learning_rate": 2.2518951412875173e-06, "loss": 0.66057259, "num_input_tokens_seen": 170155295, "step": 7917, "time_per_iteration": 2.5544540882110596 }, { "auxiliary_loss_clip": 0.01017258, "auxiliary_loss_mlp": 0.01002335, "balance_loss_clip": 1.01986837, "balance_loss_mlp": 1.00074983, "epoch": 0.4760559146249812, "flos": 64554602595840.0, "grad_norm": 0.8370962757635343, "language_loss": 0.65689212, "learning_rate": 2.2515087761551557e-06, "loss": 0.67708808, "num_input_tokens_seen": 170222325, "step": 7918, "time_per_iteration": 3.4263010025024414 }, { "auxiliary_loss_clip": 0.01114985, "auxiliary_loss_mlp": 0.00771917, "balance_loss_clip": 1.04313397, "balance_loss_mlp": 1.00057673, "epoch": 0.47611603787764917, "flos": 22233876405120.0, "grad_norm": 2.4555452771674067, "language_loss": 0.68450713, "learning_rate": 2.2511224014855563e-06, "loss": 0.70337617, "num_input_tokens_seen": 170241625, "step": 7919, "time_per_iteration": 2.7581801414489746 }, { "auxiliary_loss_clip": 0.01105197, "auxiliary_loss_mlp": 0.01042973, "balance_loss_clip": 1.04329574, "balance_loss_mlp": 1.02922797, "epoch": 0.47617616113031713, "flos": 22780005765120.0, "grad_norm": 1.6063666097186406, "language_loss": 0.75389183, "learning_rate": 2.2507360172933694e-06, "loss": 0.77537358, "num_input_tokens_seen": 170262470, "step": 7920, "time_per_iteration": 2.7888362407684326 }, { "auxiliary_loss_clip": 0.01109747, "auxiliary_loss_mlp": 0.01034602, "balance_loss_clip": 1.04727352, "balance_loss_mlp": 1.01956415, "epoch": 0.4762362843829851, "flos": 24133802388480.0, "grad_norm": 1.5207523519625543, "language_loss": 0.7761817, "learning_rate": 2.2503496235932487e-06, "loss": 0.79762518, "num_input_tokens_seen": 170283460, "step": 7921, "time_per_iteration": 2.7462785243988037 }, { "auxiliary_loss_clip": 0.01108901, "auxiliary_loss_mlp": 0.01043608, "balance_loss_clip": 1.0445503, "balance_loss_mlp": 1.02778864, "epoch": 0.47629640763565306, "flos": 22452069571200.0, "grad_norm": 3.2907516590332024, "language_loss": 0.78146785, "learning_rate": 2.249963220399845e-06, "loss": 0.80299294, "num_input_tokens_seen": 170304225, "step": 7922, "time_per_iteration": 2.6893417835235596 }, { "auxiliary_loss_clip": 0.01094796, "auxiliary_loss_mlp": 0.01043063, "balance_loss_clip": 1.04391539, "balance_loss_mlp": 1.02719617, "epoch": 0.4763565308883211, "flos": 11181398647680.0, "grad_norm": 1.6628631162014398, "language_loss": 0.7275365, "learning_rate": 2.2495768077278104e-06, "loss": 0.74891508, "num_input_tokens_seen": 170322110, "step": 7923, "time_per_iteration": 2.732468605041504 }, { "auxiliary_loss_clip": 0.01102187, "auxiliary_loss_mlp": 0.01039061, "balance_loss_clip": 1.04838657, "balance_loss_mlp": 1.02511382, "epoch": 0.47641665414098905, "flos": 22382151747840.0, "grad_norm": 1.679365493038583, "language_loss": 0.82141626, "learning_rate": 2.2491903855917992e-06, "loss": 0.84282875, "num_input_tokens_seen": 170340700, "step": 7924, "time_per_iteration": 2.7680320739746094 }, { "auxiliary_loss_clip": 0.01126329, "auxiliary_loss_mlp": 0.01038575, "balance_loss_clip": 1.0495019, "balance_loss_mlp": 1.02264822, "epoch": 0.476476777393657, "flos": 25046148862080.0, "grad_norm": 2.2679110024074705, "language_loss": 0.80316466, "learning_rate": 2.2488039540064626e-06, "loss": 0.82481372, "num_input_tokens_seen": 170359780, "step": 7925, "time_per_iteration": 2.649615526199341 }, { "auxiliary_loss_clip": 0.01101728, "auxiliary_loss_mlp": 0.01041222, "balance_loss_clip": 1.04264617, "balance_loss_mlp": 1.02741158, "epoch": 0.476536900646325, "flos": 27269916888960.0, "grad_norm": 1.5530829773494035, "language_loss": 0.72051573, "learning_rate": 2.2484175129864558e-06, "loss": 0.74194521, "num_input_tokens_seen": 170381260, "step": 7926, "time_per_iteration": 2.7393877506256104 }, { "auxiliary_loss_clip": 0.0111858, "auxiliary_loss_mlp": 0.01035544, "balance_loss_clip": 1.04556048, "balance_loss_mlp": 1.02015448, "epoch": 0.47659702389899294, "flos": 25301401885440.0, "grad_norm": 1.973296217359943, "language_loss": 0.68039131, "learning_rate": 2.248031062546432e-06, "loss": 0.70193255, "num_input_tokens_seen": 170400595, "step": 7927, "time_per_iteration": 2.7364554405212402 }, { "auxiliary_loss_clip": 0.01088729, "auxiliary_loss_mlp": 0.01031301, "balance_loss_clip": 1.04246449, "balance_loss_mlp": 1.01772344, "epoch": 0.4766571471516609, "flos": 25992861672960.0, "grad_norm": 1.624613635266834, "language_loss": 0.67674315, "learning_rate": 2.247644602701045e-06, "loss": 0.69794345, "num_input_tokens_seen": 170421110, "step": 7928, "time_per_iteration": 2.7200751304626465 }, { "auxiliary_loss_clip": 0.01128959, "auxiliary_loss_mlp": 0.0103446, "balance_loss_clip": 1.04645658, "balance_loss_mlp": 1.01979089, "epoch": 0.4767172704043289, "flos": 16032211672320.0, "grad_norm": 2.0796504226810497, "language_loss": 0.78678215, "learning_rate": 2.2472581334649496e-06, "loss": 0.80841631, "num_input_tokens_seen": 170436700, "step": 7929, "time_per_iteration": 2.6817221641540527 }, { "auxiliary_loss_clip": 0.01102478, "auxiliary_loss_mlp": 0.01039975, "balance_loss_clip": 1.04257607, "balance_loss_mlp": 1.0262301, "epoch": 0.47677739365699684, "flos": 39235351651200.0, "grad_norm": 1.8131309373477071, "language_loss": 0.6663419, "learning_rate": 2.2468716548528016e-06, "loss": 0.68776643, "num_input_tokens_seen": 170459555, "step": 7930, "time_per_iteration": 2.856072187423706 }, { "auxiliary_loss_clip": 0.0111358, "auxiliary_loss_mlp": 0.01036755, "balance_loss_clip": 1.04616833, "balance_loss_mlp": 1.02318919, "epoch": 0.4768375169096648, "flos": 24717781704960.0, "grad_norm": 7.611219304969564, "language_loss": 0.7973817, "learning_rate": 2.2464851668792555e-06, "loss": 0.81888509, "num_input_tokens_seen": 170479175, "step": 7931, "time_per_iteration": 2.646108865737915 }, { "auxiliary_loss_clip": 0.01100642, "auxiliary_loss_mlp": 0.01036826, "balance_loss_clip": 1.04248762, "balance_loss_mlp": 1.02181768, "epoch": 0.47689764016233277, "flos": 22528667324160.0, "grad_norm": 1.747640555146421, "language_loss": 0.76035368, "learning_rate": 2.2460986695589678e-06, "loss": 0.78172839, "num_input_tokens_seen": 170498450, "step": 7932, "time_per_iteration": 2.6632022857666016 }, { "auxiliary_loss_clip": 0.01103619, "auxiliary_loss_mlp": 0.00770594, "balance_loss_clip": 1.04416108, "balance_loss_mlp": 1.00076032, "epoch": 0.47695776341500074, "flos": 15120619384320.0, "grad_norm": 1.7743205398157191, "language_loss": 0.79733002, "learning_rate": 2.245712162906593e-06, "loss": 0.81607223, "num_input_tokens_seen": 170516255, "step": 7933, "time_per_iteration": 4.2387471199035645 }, { "auxiliary_loss_clip": 0.01123015, "auxiliary_loss_mlp": 0.01041506, "balance_loss_clip": 1.04555225, "balance_loss_mlp": 1.02532899, "epoch": 0.4770178866676687, "flos": 14678917839360.0, "grad_norm": 1.9828909232489866, "language_loss": 0.73883361, "learning_rate": 2.2453256469367888e-06, "loss": 0.76047885, "num_input_tokens_seen": 170532705, "step": 7934, "time_per_iteration": 4.074187517166138 }, { "auxiliary_loss_clip": 0.01116756, "auxiliary_loss_mlp": 0.01034977, "balance_loss_clip": 1.04362082, "balance_loss_mlp": 1.02075577, "epoch": 0.47707800992033667, "flos": 22565583527040.0, "grad_norm": 1.8305920873714958, "language_loss": 0.80197936, "learning_rate": 2.244939121664211e-06, "loss": 0.8234967, "num_input_tokens_seen": 170551925, "step": 7935, "time_per_iteration": 2.650474786758423 }, { "auxiliary_loss_clip": 0.01101181, "auxiliary_loss_mlp": 0.01043502, "balance_loss_clip": 1.04532123, "balance_loss_mlp": 1.02818346, "epoch": 0.4771381331730047, "flos": 30918225375360.0, "grad_norm": 5.908138115579588, "language_loss": 0.71829689, "learning_rate": 2.2445525871035177e-06, "loss": 0.73974371, "num_input_tokens_seen": 170572320, "step": 7936, "time_per_iteration": 4.428630113601685 }, { "auxiliary_loss_clip": 0.01130752, "auxiliary_loss_mlp": 0.01039041, "balance_loss_clip": 1.04646921, "balance_loss_mlp": 1.02419913, "epoch": 0.47719825642567265, "flos": 25738901539200.0, "grad_norm": 2.4038439056994156, "language_loss": 0.675704, "learning_rate": 2.2441660432693656e-06, "loss": 0.69740188, "num_input_tokens_seen": 170589470, "step": 7937, "time_per_iteration": 4.458148241043091 }, { "auxiliary_loss_clip": 0.01034806, "auxiliary_loss_mlp": 0.00999407, "balance_loss_clip": 1.01822138, "balance_loss_mlp": 0.99804842, "epoch": 0.4772583796783406, "flos": 66355128668160.0, "grad_norm": 0.7105047811157361, "language_loss": 0.56384945, "learning_rate": 2.2437794901764128e-06, "loss": 0.58419156, "num_input_tokens_seen": 170662265, "step": 7938, "time_per_iteration": 3.3967578411102295 }, { "auxiliary_loss_clip": 0.01099667, "auxiliary_loss_mlp": 0.0104562, "balance_loss_clip": 1.04193783, "balance_loss_mlp": 1.02908564, "epoch": 0.4773185029310086, "flos": 22051091070720.0, "grad_norm": 3.053079154163393, "language_loss": 0.88725203, "learning_rate": 2.243392927839317e-06, "loss": 0.90870488, "num_input_tokens_seen": 170679680, "step": 7939, "time_per_iteration": 2.7099897861480713 }, { "auxiliary_loss_clip": 0.01115778, "auxiliary_loss_mlp": 0.01037609, "balance_loss_clip": 1.04160845, "balance_loss_mlp": 1.02400148, "epoch": 0.47737862618367655, "flos": 16727801523840.0, "grad_norm": 1.7393189284877646, "language_loss": 0.77381486, "learning_rate": 2.2430063562727367e-06, "loss": 0.79534876, "num_input_tokens_seen": 170697340, "step": 7940, "time_per_iteration": 2.5913469791412354 }, { "auxiliary_loss_clip": 0.01104457, "auxiliary_loss_mlp": 0.01036057, "balance_loss_clip": 1.04589248, "balance_loss_mlp": 1.02288485, "epoch": 0.4774387494363445, "flos": 19609453100160.0, "grad_norm": 1.5893003235088359, "language_loss": 0.8474015, "learning_rate": 2.2426197754913322e-06, "loss": 0.8688066, "num_input_tokens_seen": 170714905, "step": 7941, "time_per_iteration": 2.605090856552124 }, { "auxiliary_loss_clip": 0.0110803, "auxiliary_loss_mlp": 0.01041538, "balance_loss_clip": 1.04433787, "balance_loss_mlp": 1.02682161, "epoch": 0.4774988726890125, "flos": 16653969118080.0, "grad_norm": 2.1303607813841237, "language_loss": 0.75943714, "learning_rate": 2.24223318550976e-06, "loss": 0.78093278, "num_input_tokens_seen": 170731810, "step": 7942, "time_per_iteration": 2.612901449203491 }, { "auxiliary_loss_clip": 0.01115811, "auxiliary_loss_mlp": 0.01038801, "balance_loss_clip": 1.04779172, "balance_loss_mlp": 1.02491331, "epoch": 0.47755899594168044, "flos": 20485565729280.0, "grad_norm": 1.7564628488897216, "language_loss": 0.6467554, "learning_rate": 2.241846586342682e-06, "loss": 0.66830152, "num_input_tokens_seen": 170750270, "step": 7943, "time_per_iteration": 2.6675846576690674 }, { "auxiliary_loss_clip": 0.01088131, "auxiliary_loss_mlp": 0.01040732, "balance_loss_clip": 1.04014313, "balance_loss_mlp": 1.02544951, "epoch": 0.4776191191943484, "flos": 21652806090240.0, "grad_norm": 3.30514620611564, "language_loss": 0.73474699, "learning_rate": 2.2414599780047577e-06, "loss": 0.75603563, "num_input_tokens_seen": 170769015, "step": 7944, "time_per_iteration": 2.6938626766204834 }, { "auxiliary_loss_clip": 0.01116316, "auxiliary_loss_mlp": 0.01035661, "balance_loss_clip": 1.04835653, "balance_loss_mlp": 1.01982975, "epoch": 0.4776792424470164, "flos": 18770220760320.0, "grad_norm": 2.01255819211095, "language_loss": 0.67873627, "learning_rate": 2.2410733605106456e-06, "loss": 0.70025599, "num_input_tokens_seen": 170785725, "step": 7945, "time_per_iteration": 2.5940043926239014 }, { "auxiliary_loss_clip": 0.0108787, "auxiliary_loss_mlp": 0.00774963, "balance_loss_clip": 1.03865957, "balance_loss_mlp": 1.00055337, "epoch": 0.47773936569968434, "flos": 29715828577920.0, "grad_norm": 1.9730762461064726, "language_loss": 0.75473535, "learning_rate": 2.240686733875009e-06, "loss": 0.77336371, "num_input_tokens_seen": 170804600, "step": 7946, "time_per_iteration": 2.762983560562134 }, { "auxiliary_loss_clip": 0.01105207, "auxiliary_loss_mlp": 0.01042769, "balance_loss_clip": 1.04477096, "balance_loss_mlp": 1.0274632, "epoch": 0.4777994889523523, "flos": 24791542283520.0, "grad_norm": 2.190560640838335, "language_loss": 0.79071236, "learning_rate": 2.240300098112506e-06, "loss": 0.81219208, "num_input_tokens_seen": 170824230, "step": 7947, "time_per_iteration": 2.692763328552246 }, { "auxiliary_loss_clip": 0.010955, "auxiliary_loss_mlp": 0.01037272, "balance_loss_clip": 1.0440042, "balance_loss_mlp": 1.02317524, "epoch": 0.47785961220502027, "flos": 17858161595520.0, "grad_norm": 2.294285239078615, "language_loss": 0.7329706, "learning_rate": 2.2399134532377998e-06, "loss": 0.75429833, "num_input_tokens_seen": 170843365, "step": 7948, "time_per_iteration": 2.6743998527526855 }, { "auxiliary_loss_clip": 0.01106692, "auxiliary_loss_mlp": 0.01038667, "balance_loss_clip": 1.04329944, "balance_loss_mlp": 1.0235039, "epoch": 0.4779197354576883, "flos": 20266546550400.0, "grad_norm": 1.7991446580624026, "language_loss": 0.78139675, "learning_rate": 2.2395267992655514e-06, "loss": 0.80285037, "num_input_tokens_seen": 170863515, "step": 7949, "time_per_iteration": 2.694549560546875 }, { "auxiliary_loss_clip": 0.01096582, "auxiliary_loss_mlp": 0.0104007, "balance_loss_clip": 1.04018211, "balance_loss_mlp": 1.0263133, "epoch": 0.47797985871035625, "flos": 17056599644160.0, "grad_norm": 2.242348781817659, "language_loss": 0.74315739, "learning_rate": 2.2391401362104227e-06, "loss": 0.76452386, "num_input_tokens_seen": 170881245, "step": 7950, "time_per_iteration": 2.718254327774048 }, { "auxiliary_loss_clip": 0.01095843, "auxiliary_loss_mlp": 0.01046859, "balance_loss_clip": 1.04172587, "balance_loss_mlp": 1.03179109, "epoch": 0.4780399819630242, "flos": 31358418549120.0, "grad_norm": 1.9896022122587003, "language_loss": 0.74343586, "learning_rate": 2.2387534640870756e-06, "loss": 0.7648629, "num_input_tokens_seen": 170901285, "step": 7951, "time_per_iteration": 2.7827391624450684 }, { "auxiliary_loss_clip": 0.01094802, "auxiliary_loss_mlp": 0.01036097, "balance_loss_clip": 1.04424548, "balance_loss_mlp": 1.02120781, "epoch": 0.4781001052156922, "flos": 24899597372160.0, "grad_norm": 2.198904574593956, "language_loss": 0.80032581, "learning_rate": 2.238366782910174e-06, "loss": 0.82163477, "num_input_tokens_seen": 170919740, "step": 7952, "time_per_iteration": 2.812988519668579 }, { "auxiliary_loss_clip": 0.01107213, "auxiliary_loss_mlp": 0.01044893, "balance_loss_clip": 1.04275584, "balance_loss_mlp": 1.03007555, "epoch": 0.47816022846836015, "flos": 18697717157760.0, "grad_norm": 1.8177204893019177, "language_loss": 0.7794894, "learning_rate": 2.23798009269438e-06, "loss": 0.80101049, "num_input_tokens_seen": 170938510, "step": 7953, "time_per_iteration": 2.6617591381073 }, { "auxiliary_loss_clip": 0.01120456, "auxiliary_loss_mlp": 0.01038164, "balance_loss_clip": 1.04588997, "balance_loss_mlp": 1.0237813, "epoch": 0.4782203517210281, "flos": 11977573559040.0, "grad_norm": 2.347215604083738, "language_loss": 0.84714645, "learning_rate": 2.2375933934543566e-06, "loss": 0.86873269, "num_input_tokens_seen": 170951170, "step": 7954, "time_per_iteration": 2.6208479404449463 }, { "auxiliary_loss_clip": 0.01097068, "auxiliary_loss_mlp": 0.01038972, "balance_loss_clip": 1.0426054, "balance_loss_mlp": 1.0248698, "epoch": 0.4782804749736961, "flos": 20813501923200.0, "grad_norm": 1.4277916214046864, "language_loss": 0.70472121, "learning_rate": 2.237206685204768e-06, "loss": 0.72608161, "num_input_tokens_seen": 170970990, "step": 7955, "time_per_iteration": 2.821913719177246 }, { "auxiliary_loss_clip": 0.0110203, "auxiliary_loss_mlp": 0.01041668, "balance_loss_clip": 1.04433143, "balance_loss_mlp": 1.0281322, "epoch": 0.47834059822636404, "flos": 23840304359040.0, "grad_norm": 1.5047634516845327, "language_loss": 0.82269239, "learning_rate": 2.2368199679602787e-06, "loss": 0.84412932, "num_input_tokens_seen": 170991215, "step": 7956, "time_per_iteration": 2.683924913406372 }, { "auxiliary_loss_clip": 0.01105668, "auxiliary_loss_mlp": 0.01036371, "balance_loss_clip": 1.04529083, "balance_loss_mlp": 1.02021837, "epoch": 0.478400721479032, "flos": 22633777497600.0, "grad_norm": 2.448858103633137, "language_loss": 0.84977531, "learning_rate": 2.2364332417355516e-06, "loss": 0.87119567, "num_input_tokens_seen": 171007325, "step": 7957, "time_per_iteration": 2.6371145248413086 }, { "auxiliary_loss_clip": 0.01118227, "auxiliary_loss_mlp": 0.01040317, "balance_loss_clip": 1.04562736, "balance_loss_mlp": 1.02653635, "epoch": 0.4784608447317, "flos": 19354954262400.0, "grad_norm": 1.5628888100251457, "language_loss": 0.79777038, "learning_rate": 2.2360465065452527e-06, "loss": 0.81935579, "num_input_tokens_seen": 171025650, "step": 7958, "time_per_iteration": 2.639721632003784 }, { "auxiliary_loss_clip": 0.01085054, "auxiliary_loss_mlp": 0.0077548, "balance_loss_clip": 1.03763032, "balance_loss_mlp": 1.00064015, "epoch": 0.47852096798436794, "flos": 24021114445440.0, "grad_norm": 1.8018566992199279, "language_loss": 0.82972836, "learning_rate": 2.235659762404047e-06, "loss": 0.84833372, "num_input_tokens_seen": 171045045, "step": 7959, "time_per_iteration": 2.733668565750122 }, { "auxiliary_loss_clip": 0.01090487, "auxiliary_loss_mlp": 0.01036767, "balance_loss_clip": 1.04364586, "balance_loss_mlp": 1.02436292, "epoch": 0.4785810912370359, "flos": 25666433850240.0, "grad_norm": 2.7562627438628504, "language_loss": 0.73275614, "learning_rate": 2.235273009326599e-06, "loss": 0.75402862, "num_input_tokens_seen": 171062910, "step": 7960, "time_per_iteration": 2.6994166374206543 }, { "auxiliary_loss_clip": 0.0109086, "auxiliary_loss_mlp": 0.0103472, "balance_loss_clip": 1.04504585, "balance_loss_mlp": 1.02170801, "epoch": 0.47864121448970387, "flos": 21432134885760.0, "grad_norm": 1.6649690841938434, "language_loss": 0.76878142, "learning_rate": 2.2348862473275745e-06, "loss": 0.79003716, "num_input_tokens_seen": 171080875, "step": 7961, "time_per_iteration": 2.7051572799682617 }, { "auxiliary_loss_clip": 0.01087757, "auxiliary_loss_mlp": 0.0103463, "balance_loss_clip": 1.04447055, "balance_loss_mlp": 1.02050352, "epoch": 0.47870133774237184, "flos": 16143894034560.0, "grad_norm": 7.35679067145723, "language_loss": 0.7769649, "learning_rate": 2.2344994764216405e-06, "loss": 0.79818881, "num_input_tokens_seen": 171099190, "step": 7962, "time_per_iteration": 2.7466347217559814 }, { "auxiliary_loss_clip": 0.0110573, "auxiliary_loss_mlp": 0.01042926, "balance_loss_clip": 1.04702401, "balance_loss_mlp": 1.02871001, "epoch": 0.47876146099503986, "flos": 26906788344960.0, "grad_norm": 1.6387698321198922, "language_loss": 0.64764994, "learning_rate": 2.2341126966234635e-06, "loss": 0.66913652, "num_input_tokens_seen": 171119060, "step": 7963, "time_per_iteration": 2.77663516998291 }, { "auxiliary_loss_clip": 0.01117113, "auxiliary_loss_mlp": 0.01035904, "balance_loss_clip": 1.04389668, "balance_loss_mlp": 1.02196217, "epoch": 0.4788215842477078, "flos": 45332085778560.0, "grad_norm": 1.655648847764305, "language_loss": 0.77503848, "learning_rate": 2.2337259079477083e-06, "loss": 0.79656863, "num_input_tokens_seen": 171141900, "step": 7964, "time_per_iteration": 2.9196712970733643 }, { "auxiliary_loss_clip": 0.01120902, "auxiliary_loss_mlp": 0.01036482, "balance_loss_clip": 1.04660964, "balance_loss_mlp": 1.02042508, "epoch": 0.4788817075003758, "flos": 22237180456320.0, "grad_norm": 2.8996801764774505, "language_loss": 0.76540697, "learning_rate": 2.233339110409044e-06, "loss": 0.78698087, "num_input_tokens_seen": 171161045, "step": 7965, "time_per_iteration": 2.6720781326293945 }, { "auxiliary_loss_clip": 0.0106828, "auxiliary_loss_mlp": 0.0105005, "balance_loss_clip": 1.03929722, "balance_loss_mlp": 1.03433788, "epoch": 0.47894183075304375, "flos": 16471183783680.0, "grad_norm": 1.712219755604538, "language_loss": 0.74560332, "learning_rate": 2.232952304022137e-06, "loss": 0.76678663, "num_input_tokens_seen": 171179675, "step": 7966, "time_per_iteration": 2.7669286727905273 }, { "auxiliary_loss_clip": 0.01101486, "auxiliary_loss_mlp": 0.0103808, "balance_loss_clip": 1.0444622, "balance_loss_mlp": 1.02388787, "epoch": 0.4790019540057117, "flos": 24282688262400.0, "grad_norm": 2.605899190258409, "language_loss": 0.73308432, "learning_rate": 2.232565488801655e-06, "loss": 0.75448, "num_input_tokens_seen": 171201175, "step": 7967, "time_per_iteration": 2.7271900177001953 }, { "auxiliary_loss_clip": 0.01102984, "auxiliary_loss_mlp": 0.01032784, "balance_loss_clip": 1.04409146, "balance_loss_mlp": 1.01838326, "epoch": 0.4790620772583797, "flos": 25666469763840.0, "grad_norm": 2.103515425969552, "language_loss": 0.79279423, "learning_rate": 2.232178664762267e-06, "loss": 0.81415194, "num_input_tokens_seen": 171221750, "step": 7968, "time_per_iteration": 2.707740545272827 }, { "auxiliary_loss_clip": 0.0102077, "auxiliary_loss_mlp": 0.01020427, "balance_loss_clip": 1.02207994, "balance_loss_mlp": 1.01903248, "epoch": 0.47912220051104765, "flos": 69428077102080.0, "grad_norm": 0.7660555925772923, "language_loss": 0.62198806, "learning_rate": 2.2317918319186408e-06, "loss": 0.64240003, "num_input_tokens_seen": 171292235, "step": 7969, "time_per_iteration": 3.3662569522857666 }, { "auxiliary_loss_clip": 0.01087594, "auxiliary_loss_mlp": 0.01029477, "balance_loss_clip": 1.04418397, "balance_loss_mlp": 1.01662636, "epoch": 0.4791823237637156, "flos": 24168922911360.0, "grad_norm": 1.7596129166374368, "language_loss": 0.77306086, "learning_rate": 2.2314049902854446e-06, "loss": 0.79423159, "num_input_tokens_seen": 171312215, "step": 7970, "time_per_iteration": 2.69364857673645 }, { "auxiliary_loss_clip": 0.01116664, "auxiliary_loss_mlp": 0.01038161, "balance_loss_clip": 1.04511642, "balance_loss_mlp": 1.0235939, "epoch": 0.4792424470163836, "flos": 24751465683840.0, "grad_norm": 1.5706742055007812, "language_loss": 0.70431626, "learning_rate": 2.231018139877349e-06, "loss": 0.72586453, "num_input_tokens_seen": 171332975, "step": 7971, "time_per_iteration": 2.690791130065918 }, { "auxiliary_loss_clip": 0.01072275, "auxiliary_loss_mlp": 0.01033862, "balance_loss_clip": 1.03982508, "balance_loss_mlp": 1.01899683, "epoch": 0.47930257026905154, "flos": 23257905240960.0, "grad_norm": 1.30993945009872, "language_loss": 0.79995155, "learning_rate": 2.230631280709021e-06, "loss": 0.82101291, "num_input_tokens_seen": 171353880, "step": 7972, "time_per_iteration": 2.829455852508545 }, { "auxiliary_loss_clip": 0.0111891, "auxiliary_loss_mlp": 0.01028077, "balance_loss_clip": 1.0466361, "balance_loss_mlp": 1.01299727, "epoch": 0.4793626935217195, "flos": 14064091718400.0, "grad_norm": 2.2411370214837807, "language_loss": 0.69401908, "learning_rate": 2.2302444127951327e-06, "loss": 0.71548891, "num_input_tokens_seen": 171370930, "step": 7973, "time_per_iteration": 4.2368669509887695 }, { "auxiliary_loss_clip": 0.01120125, "auxiliary_loss_mlp": 0.01039183, "balance_loss_clip": 1.05002046, "balance_loss_mlp": 1.02575445, "epoch": 0.4794228167743875, "flos": 21798854789760.0, "grad_norm": 1.967830357691446, "language_loss": 0.78792048, "learning_rate": 2.2298575361503523e-06, "loss": 0.80951357, "num_input_tokens_seen": 171387575, "step": 7974, "time_per_iteration": 2.666619300842285 }, { "auxiliary_loss_clip": 0.01029245, "auxiliary_loss_mlp": 0.01003452, "balance_loss_clip": 1.02188838, "balance_loss_mlp": 1.00173593, "epoch": 0.47948294002705544, "flos": 66968805553920.0, "grad_norm": 0.7538441683533625, "language_loss": 0.54051983, "learning_rate": 2.2294706507893517e-06, "loss": 0.56084681, "num_input_tokens_seen": 171449980, "step": 7975, "time_per_iteration": 4.964555501937866 }, { "auxiliary_loss_clip": 0.01114672, "auxiliary_loss_mlp": 0.01039108, "balance_loss_clip": 1.04530835, "balance_loss_mlp": 1.02287221, "epoch": 0.47954306327972346, "flos": 12422471414400.0, "grad_norm": 2.0524308160251707, "language_loss": 0.89917016, "learning_rate": 2.2290837567268008e-06, "loss": 0.92070794, "num_input_tokens_seen": 171465290, "step": 7976, "time_per_iteration": 4.202557802200317 }, { "auxiliary_loss_clip": 0.01135185, "auxiliary_loss_mlp": 0.01039598, "balance_loss_clip": 1.05056477, "balance_loss_mlp": 1.02431524, "epoch": 0.4796031865323914, "flos": 18361951799040.0, "grad_norm": 2.222330138734667, "language_loss": 0.73720783, "learning_rate": 2.2286968539773713e-06, "loss": 0.75895566, "num_input_tokens_seen": 171481130, "step": 7977, "time_per_iteration": 2.653036117553711 }, { "auxiliary_loss_clip": 0.01112997, "auxiliary_loss_mlp": 0.00772063, "balance_loss_clip": 1.0468123, "balance_loss_mlp": 1.00047266, "epoch": 0.4796633097850594, "flos": 21835088634240.0, "grad_norm": 1.5823767711410588, "language_loss": 0.78372079, "learning_rate": 2.228309942555734e-06, "loss": 0.80257142, "num_input_tokens_seen": 171501140, "step": 7978, "time_per_iteration": 2.7036852836608887 }, { "auxiliary_loss_clip": 0.01106382, "auxiliary_loss_mlp": 0.01039525, "balance_loss_clip": 1.04519784, "balance_loss_mlp": 1.02526784, "epoch": 0.47972343303772735, "flos": 23437350610560.0, "grad_norm": 2.6635738232298944, "language_loss": 0.89488423, "learning_rate": 2.22792302247656e-06, "loss": 0.91634321, "num_input_tokens_seen": 171519835, "step": 7979, "time_per_iteration": 2.653221845626831 }, { "auxiliary_loss_clip": 0.01122392, "auxiliary_loss_mlp": 0.01040662, "balance_loss_clip": 1.04798067, "balance_loss_mlp": 1.02475905, "epoch": 0.4797835562903953, "flos": 24899776940160.0, "grad_norm": 1.5901773617653536, "language_loss": 0.76710582, "learning_rate": 2.227536093754523e-06, "loss": 0.78873634, "num_input_tokens_seen": 171540980, "step": 7980, "time_per_iteration": 2.6700520515441895 }, { "auxiliary_loss_clip": 0.01103639, "auxiliary_loss_mlp": 0.01039114, "balance_loss_clip": 1.04525447, "balance_loss_mlp": 1.02261567, "epoch": 0.4798436795430633, "flos": 35042996793600.0, "grad_norm": 1.9068781398056245, "language_loss": 0.7128244, "learning_rate": 2.227149156404295e-06, "loss": 0.73425198, "num_input_tokens_seen": 171563600, "step": 7981, "time_per_iteration": 2.817458391189575 }, { "auxiliary_loss_clip": 0.01130721, "auxiliary_loss_mlp": 0.01034361, "balance_loss_clip": 1.05059981, "balance_loss_mlp": 1.02040792, "epoch": 0.47990380279573125, "flos": 20590209025920.0, "grad_norm": 2.189836625005686, "language_loss": 0.70604527, "learning_rate": 2.2267622104405473e-06, "loss": 0.72769606, "num_input_tokens_seen": 171580700, "step": 7982, "time_per_iteration": 2.639772891998291 }, { "auxiliary_loss_clip": 0.01101365, "auxiliary_loss_mlp": 0.01031884, "balance_loss_clip": 1.04456162, "balance_loss_mlp": 1.01928937, "epoch": 0.4799639260483992, "flos": 26359402008960.0, "grad_norm": 6.366705109750511, "language_loss": 0.71019757, "learning_rate": 2.2263752558779544e-06, "loss": 0.73153007, "num_input_tokens_seen": 171602035, "step": 7983, "time_per_iteration": 2.7794454097747803 }, { "auxiliary_loss_clip": 0.01038182, "auxiliary_loss_mlp": 0.00752618, "balance_loss_clip": 1.0209136, "balance_loss_mlp": 1.00015247, "epoch": 0.4800240493010672, "flos": 70979021521920.0, "grad_norm": 0.8025064053403466, "language_loss": 0.59461898, "learning_rate": 2.2259882927311883e-06, "loss": 0.61252695, "num_input_tokens_seen": 171659215, "step": 7984, "time_per_iteration": 3.1715712547302246 }, { "auxiliary_loss_clip": 0.01068728, "auxiliary_loss_mlp": 0.01050145, "balance_loss_clip": 1.03732944, "balance_loss_mlp": 1.03350329, "epoch": 0.48008417255373514, "flos": 17086656349440.0, "grad_norm": 1.9659657952718743, "language_loss": 0.66784835, "learning_rate": 2.2256013210149247e-06, "loss": 0.68903708, "num_input_tokens_seen": 171675710, "step": 7985, "time_per_iteration": 2.8482425212860107 }, { "auxiliary_loss_clip": 0.01105712, "auxiliary_loss_mlp": 0.010384, "balance_loss_clip": 1.04168916, "balance_loss_mlp": 1.02367198, "epoch": 0.4801442958064031, "flos": 15413435055360.0, "grad_norm": 1.731655766205416, "language_loss": 0.69907761, "learning_rate": 2.225214340743835e-06, "loss": 0.72051871, "num_input_tokens_seen": 171692510, "step": 7986, "time_per_iteration": 2.78254771232605 }, { "auxiliary_loss_clip": 0.01094439, "auxiliary_loss_mlp": 0.0104069, "balance_loss_clip": 1.04537976, "balance_loss_mlp": 1.02534223, "epoch": 0.4802044190590711, "flos": 11473747441920.0, "grad_norm": 2.3008677930118844, "language_loss": 0.78930938, "learning_rate": 2.2248273519325956e-06, "loss": 0.81066066, "num_input_tokens_seen": 171710235, "step": 7987, "time_per_iteration": 2.8055880069732666 }, { "auxiliary_loss_clip": 0.01076423, "auxiliary_loss_mlp": 0.01042206, "balance_loss_clip": 1.04216504, "balance_loss_mlp": 1.02793634, "epoch": 0.48026454231173904, "flos": 20951003185920.0, "grad_norm": 2.0041399034857537, "language_loss": 0.75381374, "learning_rate": 2.2244403545958812e-06, "loss": 0.77499998, "num_input_tokens_seen": 171726715, "step": 7988, "time_per_iteration": 2.7931642532348633 }, { "auxiliary_loss_clip": 0.01099185, "auxiliary_loss_mlp": 0.01033353, "balance_loss_clip": 1.04829884, "balance_loss_mlp": 1.01920891, "epoch": 0.48032466556440706, "flos": 20448110822400.0, "grad_norm": 2.2052350267481984, "language_loss": 0.79056877, "learning_rate": 2.224053348748365e-06, "loss": 0.81189418, "num_input_tokens_seen": 171743605, "step": 7989, "time_per_iteration": 2.7195966243743896 }, { "auxiliary_loss_clip": 0.01109361, "auxiliary_loss_mlp": 0.01046506, "balance_loss_clip": 1.04376316, "balance_loss_mlp": 1.03094292, "epoch": 0.480384788817075, "flos": 37120823861760.0, "grad_norm": 1.9525154549019321, "language_loss": 0.73684812, "learning_rate": 2.223666334404724e-06, "loss": 0.75840676, "num_input_tokens_seen": 171765445, "step": 7990, "time_per_iteration": 2.8826913833618164 }, { "auxiliary_loss_clip": 0.01039921, "auxiliary_loss_mlp": 0.00752733, "balance_loss_clip": 1.02231336, "balance_loss_mlp": 1.00023639, "epoch": 0.480444912069743, "flos": 69552577641600.0, "grad_norm": 0.7651324576674445, "language_loss": 0.59016085, "learning_rate": 2.223279311579633e-06, "loss": 0.60808742, "num_input_tokens_seen": 171830115, "step": 7991, "time_per_iteration": 3.325892448425293 }, { "auxiliary_loss_clip": 0.01119355, "auxiliary_loss_mlp": 0.00772289, "balance_loss_clip": 1.04751837, "balance_loss_mlp": 1.00058734, "epoch": 0.48050503532241096, "flos": 29822231640960.0, "grad_norm": 2.03548436048953, "language_loss": 0.67551184, "learning_rate": 2.222892280287768e-06, "loss": 0.69442832, "num_input_tokens_seen": 171849135, "step": 7992, "time_per_iteration": 2.7717204093933105 }, { "auxiliary_loss_clip": 0.01102719, "auxiliary_loss_mlp": 0.01037971, "balance_loss_clip": 1.04047358, "balance_loss_mlp": 1.02267683, "epoch": 0.4805651585750789, "flos": 23948539015680.0, "grad_norm": 1.7261557593206558, "language_loss": 0.76166683, "learning_rate": 2.2225052405438056e-06, "loss": 0.78307372, "num_input_tokens_seen": 171868880, "step": 7993, "time_per_iteration": 2.739190101623535 }, { "auxiliary_loss_clip": 0.01080291, "auxiliary_loss_mlp": 0.01038498, "balance_loss_clip": 1.04301596, "balance_loss_mlp": 1.02469933, "epoch": 0.4806252818277469, "flos": 25665428269440.0, "grad_norm": 1.8324818551458955, "language_loss": 0.79029763, "learning_rate": 2.222118192362422e-06, "loss": 0.81148541, "num_input_tokens_seen": 171889455, "step": 7994, "time_per_iteration": 2.775120973587036 }, { "auxiliary_loss_clip": 0.01107812, "auxiliary_loss_mlp": 0.0103252, "balance_loss_clip": 1.04342794, "balance_loss_mlp": 1.01851845, "epoch": 0.48068540508041485, "flos": 13151996640000.0, "grad_norm": 2.168964016546684, "language_loss": 0.79452056, "learning_rate": 2.2217311357582946e-06, "loss": 0.81592381, "num_input_tokens_seen": 171906070, "step": 7995, "time_per_iteration": 2.684086561203003 }, { "auxiliary_loss_clip": 0.01071477, "auxiliary_loss_mlp": 0.01034963, "balance_loss_clip": 1.04075575, "balance_loss_mlp": 1.02081871, "epoch": 0.4807455283330828, "flos": 21176738208000.0, "grad_norm": 1.4272883159105954, "language_loss": 0.82732481, "learning_rate": 2.2213440707461e-06, "loss": 0.84838915, "num_input_tokens_seen": 171926515, "step": 7996, "time_per_iteration": 2.801893711090088 }, { "auxiliary_loss_clip": 0.0105538, "auxiliary_loss_mlp": 0.01038724, "balance_loss_clip": 1.03635919, "balance_loss_mlp": 1.02432358, "epoch": 0.4808056515857508, "flos": 12275991751680.0, "grad_norm": 1.7665973767451764, "language_loss": 0.81008822, "learning_rate": 2.220956997340516e-06, "loss": 0.8310293, "num_input_tokens_seen": 171943845, "step": 7997, "time_per_iteration": 2.7309181690216064 }, { "auxiliary_loss_clip": 0.01079437, "auxiliary_loss_mlp": 0.0103905, "balance_loss_clip": 1.04144287, "balance_loss_mlp": 1.0246973, "epoch": 0.48086577483841875, "flos": 24826052275200.0, "grad_norm": 4.4511101438837555, "language_loss": 0.7285195, "learning_rate": 2.220569915556221e-06, "loss": 0.74970436, "num_input_tokens_seen": 171964970, "step": 7998, "time_per_iteration": 2.793765068054199 }, { "auxiliary_loss_clip": 0.01129175, "auxiliary_loss_mlp": 0.01042213, "balance_loss_clip": 1.04769647, "balance_loss_mlp": 1.02756786, "epoch": 0.4809258980910867, "flos": 24465365856000.0, "grad_norm": 1.6928626075088686, "language_loss": 0.71266204, "learning_rate": 2.220182825407892e-06, "loss": 0.73437595, "num_input_tokens_seen": 171986340, "step": 7999, "time_per_iteration": 2.698373556137085 }, { "auxiliary_loss_clip": 0.01120573, "auxiliary_loss_mlp": 0.01049678, "balance_loss_clip": 1.04650939, "balance_loss_mlp": 1.035707, "epoch": 0.4809860213437547, "flos": 21215952881280.0, "grad_norm": 3.5525090623309525, "language_loss": 0.71445537, "learning_rate": 2.2197957269102083e-06, "loss": 0.73615789, "num_input_tokens_seen": 172007300, "step": 8000, "time_per_iteration": 2.677906036376953 }, { "auxiliary_loss_clip": 0.01120936, "auxiliary_loss_mlp": 0.01045001, "balance_loss_clip": 1.04962945, "balance_loss_mlp": 1.03024244, "epoch": 0.48104614459642264, "flos": 37632084094080.0, "grad_norm": 1.397364252260559, "language_loss": 0.75031364, "learning_rate": 2.2194086200778485e-06, "loss": 0.77197301, "num_input_tokens_seen": 172029585, "step": 8001, "time_per_iteration": 2.8079638481140137 }, { "auxiliary_loss_clip": 0.01120097, "auxiliary_loss_mlp": 0.01045878, "balance_loss_clip": 1.04740191, "balance_loss_mlp": 1.03150105, "epoch": 0.48110626784909066, "flos": 18406122549120.0, "grad_norm": 1.760961408245497, "language_loss": 0.8157444, "learning_rate": 2.219021504925493e-06, "loss": 0.83740413, "num_input_tokens_seen": 172047495, "step": 8002, "time_per_iteration": 2.6615140438079834 }, { "auxiliary_loss_clip": 0.01127724, "auxiliary_loss_mlp": 0.01043569, "balance_loss_clip": 1.05275476, "balance_loss_mlp": 1.02780938, "epoch": 0.48116639110175863, "flos": 28439814856320.0, "grad_norm": 1.7356718355873448, "language_loss": 0.71858382, "learning_rate": 2.218634381467819e-06, "loss": 0.74029678, "num_input_tokens_seen": 172067625, "step": 8003, "time_per_iteration": 2.7304186820983887 }, { "auxiliary_loss_clip": 0.01114781, "auxiliary_loss_mlp": 0.01040333, "balance_loss_clip": 1.04751146, "balance_loss_mlp": 1.02654088, "epoch": 0.4812265143544266, "flos": 21725237865600.0, "grad_norm": 1.7533221004579713, "language_loss": 0.82598346, "learning_rate": 2.218247249719507e-06, "loss": 0.84753454, "num_input_tokens_seen": 172087885, "step": 8004, "time_per_iteration": 2.718576192855835 }, { "auxiliary_loss_clip": 0.01110853, "auxiliary_loss_mlp": 0.01042863, "balance_loss_clip": 1.04705787, "balance_loss_mlp": 1.02601874, "epoch": 0.48128663760709456, "flos": 13224679810560.0, "grad_norm": 2.3721289724239587, "language_loss": 0.77786469, "learning_rate": 2.217860109695239e-06, "loss": 0.79940188, "num_input_tokens_seen": 172105815, "step": 8005, "time_per_iteration": 2.7602009773254395 }, { "auxiliary_loss_clip": 0.01116298, "auxiliary_loss_mlp": 0.01040763, "balance_loss_clip": 1.04861951, "balance_loss_mlp": 1.02662444, "epoch": 0.4813467608597625, "flos": 24243437675520.0, "grad_norm": 1.8330364183017236, "language_loss": 0.70666707, "learning_rate": 2.217472961409692e-06, "loss": 0.72823763, "num_input_tokens_seen": 172126125, "step": 8006, "time_per_iteration": 2.7916948795318604 }, { "auxiliary_loss_clip": 0.01101733, "auxiliary_loss_mlp": 0.01039864, "balance_loss_clip": 1.04409337, "balance_loss_mlp": 1.02521324, "epoch": 0.4814068841124305, "flos": 27480424544640.0, "grad_norm": 1.7951056960252978, "language_loss": 0.70724428, "learning_rate": 2.2170858048775495e-06, "loss": 0.72866029, "num_input_tokens_seen": 172141945, "step": 8007, "time_per_iteration": 2.7661349773406982 }, { "auxiliary_loss_clip": 0.01130133, "auxiliary_loss_mlp": 0.0103276, "balance_loss_clip": 1.0476191, "balance_loss_mlp": 1.01881254, "epoch": 0.48146700736509845, "flos": 19572896033280.0, "grad_norm": 11.665968104344772, "language_loss": 0.71553946, "learning_rate": 2.2166986401134914e-06, "loss": 0.73716843, "num_input_tokens_seen": 172161095, "step": 8008, "time_per_iteration": 2.7019124031066895 }, { "auxiliary_loss_clip": 0.01096611, "auxiliary_loss_mlp": 0.01050794, "balance_loss_clip": 1.04696894, "balance_loss_mlp": 1.03467739, "epoch": 0.4815271306177664, "flos": 20627771673600.0, "grad_norm": 2.289909942865545, "language_loss": 0.60779428, "learning_rate": 2.216311467132199e-06, "loss": 0.62926841, "num_input_tokens_seen": 172178750, "step": 8009, "time_per_iteration": 2.713092088699341 }, { "auxiliary_loss_clip": 0.01022233, "auxiliary_loss_mlp": 0.01005627, "balance_loss_clip": 1.02350807, "balance_loss_mlp": 1.00431013, "epoch": 0.4815872538704344, "flos": 67691076232320.0, "grad_norm": 0.8584252589427176, "language_loss": 0.61326265, "learning_rate": 2.2159242859483547e-06, "loss": 0.63354123, "num_input_tokens_seen": 172240235, "step": 8010, "time_per_iteration": 3.2182729244232178 }, { "auxiliary_loss_clip": 0.01123367, "auxiliary_loss_mlp": 0.01044563, "balance_loss_clip": 1.0506475, "balance_loss_mlp": 1.02956653, "epoch": 0.48164737712310235, "flos": 22820764723200.0, "grad_norm": 1.7901877328371896, "language_loss": 0.73432398, "learning_rate": 2.215537096576639e-06, "loss": 0.75600326, "num_input_tokens_seen": 172259875, "step": 8011, "time_per_iteration": 2.671487331390381 }, { "auxiliary_loss_clip": 0.01103596, "auxiliary_loss_mlp": 0.01035148, "balance_loss_clip": 1.04422355, "balance_loss_mlp": 1.02199948, "epoch": 0.4817075003757703, "flos": 23733865382400.0, "grad_norm": 1.7774743588215727, "language_loss": 0.79526579, "learning_rate": 2.2151498990317354e-06, "loss": 0.81665325, "num_input_tokens_seen": 172280150, "step": 8012, "time_per_iteration": 5.769195079803467 }, { "auxiliary_loss_clip": 0.01092738, "auxiliary_loss_mlp": 0.01042222, "balance_loss_clip": 1.04738641, "balance_loss_mlp": 1.02718425, "epoch": 0.4817676236284383, "flos": 28182909807360.0, "grad_norm": 1.8494845342416013, "language_loss": 0.73714077, "learning_rate": 2.214762693328326e-06, "loss": 0.75849032, "num_input_tokens_seen": 172300810, "step": 8013, "time_per_iteration": 2.77451491355896 }, { "auxiliary_loss_clip": 0.01105203, "auxiliary_loss_mlp": 0.0103627, "balance_loss_clip": 1.05056131, "balance_loss_mlp": 1.02266848, "epoch": 0.48182774688110624, "flos": 17091756080640.0, "grad_norm": 2.3240899529345858, "language_loss": 0.90755451, "learning_rate": 2.214375479481094e-06, "loss": 0.92896926, "num_input_tokens_seen": 172317930, "step": 8014, "time_per_iteration": 4.2677695751190186 }, { "auxiliary_loss_clip": 0.0113526, "auxiliary_loss_mlp": 0.01039779, "balance_loss_clip": 1.04945207, "balance_loss_mlp": 1.02497888, "epoch": 0.4818878701337742, "flos": 12567873669120.0, "grad_norm": 3.070306284191698, "language_loss": 0.7404421, "learning_rate": 2.213988257504722e-06, "loss": 0.76219249, "num_input_tokens_seen": 172336340, "step": 8015, "time_per_iteration": 4.188862085342407 }, { "auxiliary_loss_clip": 0.01113922, "auxiliary_loss_mlp": 0.01040149, "balance_loss_clip": 1.04792023, "balance_loss_mlp": 1.02514613, "epoch": 0.48194799338644223, "flos": 24608505553920.0, "grad_norm": 2.1594847398910164, "language_loss": 0.80143541, "learning_rate": 2.213601027413894e-06, "loss": 0.82297611, "num_input_tokens_seen": 172354315, "step": 8016, "time_per_iteration": 2.745352268218994 }, { "auxiliary_loss_clip": 0.01115904, "auxiliary_loss_mlp": 0.010317, "balance_loss_clip": 1.04995775, "balance_loss_mlp": 1.01803231, "epoch": 0.4820081166391102, "flos": 21105204272640.0, "grad_norm": 1.9897571760317019, "language_loss": 0.77120233, "learning_rate": 2.2132137892232933e-06, "loss": 0.79267836, "num_input_tokens_seen": 172372695, "step": 8017, "time_per_iteration": 2.7234907150268555 }, { "auxiliary_loss_clip": 0.01117431, "auxiliary_loss_mlp": 0.01033072, "balance_loss_clip": 1.05067015, "balance_loss_mlp": 1.01848102, "epoch": 0.48206823989177816, "flos": 25264593423360.0, "grad_norm": 2.391907623354337, "language_loss": 0.80211884, "learning_rate": 2.2128265429476043e-06, "loss": 0.8236239, "num_input_tokens_seen": 172390905, "step": 8018, "time_per_iteration": 2.805011749267578 }, { "auxiliary_loss_clip": 0.01113573, "auxiliary_loss_mlp": 0.01031362, "balance_loss_clip": 1.05918038, "balance_loss_mlp": 1.01767111, "epoch": 0.4821283631444461, "flos": 24645062620800.0, "grad_norm": 1.818966225047076, "language_loss": 0.75859058, "learning_rate": 2.2124392886015124e-06, "loss": 0.78003991, "num_input_tokens_seen": 172412295, "step": 8019, "time_per_iteration": 2.767993688583374 }, { "auxiliary_loss_clip": 0.01092977, "auxiliary_loss_mlp": 0.01036734, "balance_loss_clip": 1.04580545, "balance_loss_mlp": 1.02204108, "epoch": 0.4821884863971141, "flos": 23952094462080.0, "grad_norm": 1.8745546244507358, "language_loss": 0.7907865, "learning_rate": 2.212052026199701e-06, "loss": 0.8120836, "num_input_tokens_seen": 172432625, "step": 8020, "time_per_iteration": 2.708779811859131 }, { "auxiliary_loss_clip": 0.01127117, "auxiliary_loss_mlp": 0.01036574, "balance_loss_clip": 1.04847205, "balance_loss_mlp": 1.02219176, "epoch": 0.48224860964978206, "flos": 17160668323200.0, "grad_norm": 2.712415162483374, "language_loss": 0.69893312, "learning_rate": 2.211664755756855e-06, "loss": 0.72057003, "num_input_tokens_seen": 172450010, "step": 8021, "time_per_iteration": 2.6083900928497314 }, { "auxiliary_loss_clip": 0.01102125, "auxiliary_loss_mlp": 0.01031516, "balance_loss_clip": 1.04406881, "balance_loss_mlp": 1.01672244, "epoch": 0.48230873290245, "flos": 23075838178560.0, "grad_norm": 1.7410194963021717, "language_loss": 0.62778926, "learning_rate": 2.2112774772876603e-06, "loss": 0.6491257, "num_input_tokens_seen": 172469080, "step": 8022, "time_per_iteration": 2.677368640899658 }, { "auxiliary_loss_clip": 0.01108316, "auxiliary_loss_mlp": 0.00770954, "balance_loss_clip": 1.04996586, "balance_loss_mlp": 1.00044918, "epoch": 0.482368856155118, "flos": 19353517718400.0, "grad_norm": 2.505400955117215, "language_loss": 0.66446078, "learning_rate": 2.2108901908068028e-06, "loss": 0.68325341, "num_input_tokens_seen": 172484850, "step": 8023, "time_per_iteration": 2.6412739753723145 }, { "auxiliary_loss_clip": 0.01054811, "auxiliary_loss_mlp": 0.0104073, "balance_loss_clip": 1.03875041, "balance_loss_mlp": 1.02531052, "epoch": 0.48242897940778595, "flos": 20078984707200.0, "grad_norm": 1.7010312143912936, "language_loss": 0.76777267, "learning_rate": 2.2105028963289683e-06, "loss": 0.78872806, "num_input_tokens_seen": 172503525, "step": 8024, "time_per_iteration": 2.858891010284424 }, { "auxiliary_loss_clip": 0.01109606, "auxiliary_loss_mlp": 0.01039089, "balance_loss_clip": 1.04908574, "balance_loss_mlp": 1.02432442, "epoch": 0.4824891026604539, "flos": 23403989854080.0, "grad_norm": 1.4778625856906076, "language_loss": 0.75417542, "learning_rate": 2.2101155938688423e-06, "loss": 0.77566242, "num_input_tokens_seen": 172524360, "step": 8025, "time_per_iteration": 2.6743719577789307 }, { "auxiliary_loss_clip": 0.01129031, "auxiliary_loss_mlp": 0.01034028, "balance_loss_clip": 1.04835987, "balance_loss_mlp": 1.01994324, "epoch": 0.4825492259131219, "flos": 20368675895040.0, "grad_norm": 1.785974704334164, "language_loss": 0.71310222, "learning_rate": 2.209728283441112e-06, "loss": 0.73473275, "num_input_tokens_seen": 172541480, "step": 8026, "time_per_iteration": 2.5739991664886475 }, { "auxiliary_loss_clip": 0.01115668, "auxiliary_loss_mlp": 0.01045724, "balance_loss_clip": 1.04429471, "balance_loss_mlp": 1.02949929, "epoch": 0.48260934916578985, "flos": 14319021519360.0, "grad_norm": 2.0186797289800182, "language_loss": 0.74956793, "learning_rate": 2.209340965060465e-06, "loss": 0.77118182, "num_input_tokens_seen": 172559005, "step": 8027, "time_per_iteration": 2.7139828205108643 }, { "auxiliary_loss_clip": 0.01105318, "auxiliary_loss_mlp": 0.01037258, "balance_loss_clip": 1.04597318, "balance_loss_mlp": 1.02348971, "epoch": 0.4826694724184578, "flos": 22121152548480.0, "grad_norm": 1.6779938031508344, "language_loss": 0.67332339, "learning_rate": 2.2089536387415868e-06, "loss": 0.69474924, "num_input_tokens_seen": 172578435, "step": 8028, "time_per_iteration": 2.809757709503174 }, { "auxiliary_loss_clip": 0.01105459, "auxiliary_loss_mlp": 0.01039975, "balance_loss_clip": 1.04472148, "balance_loss_mlp": 1.02583039, "epoch": 0.48272959567112583, "flos": 16181169373440.0, "grad_norm": 1.5400710398474027, "language_loss": 0.72719157, "learning_rate": 2.2085663044991655e-06, "loss": 0.7486459, "num_input_tokens_seen": 172596095, "step": 8029, "time_per_iteration": 2.692643165588379 }, { "auxiliary_loss_clip": 0.01103521, "auxiliary_loss_mlp": 0.01033131, "balance_loss_clip": 1.04666233, "balance_loss_mlp": 1.01880252, "epoch": 0.4827897189237938, "flos": 23180445561600.0, "grad_norm": 1.8484439777749806, "language_loss": 0.84841061, "learning_rate": 2.2081789623478896e-06, "loss": 0.86977708, "num_input_tokens_seen": 172615255, "step": 8030, "time_per_iteration": 2.6717677116394043 }, { "auxiliary_loss_clip": 0.01094989, "auxiliary_loss_mlp": 0.01034714, "balance_loss_clip": 1.04217124, "balance_loss_mlp": 1.02120733, "epoch": 0.48284984217646176, "flos": 21652626522240.0, "grad_norm": 2.0183604756392715, "language_loss": 0.74026352, "learning_rate": 2.2077916123024466e-06, "loss": 0.76156056, "num_input_tokens_seen": 172633185, "step": 8031, "time_per_iteration": 2.640707015991211 }, { "auxiliary_loss_clip": 0.01099826, "auxiliary_loss_mlp": 0.0104306, "balance_loss_clip": 1.04307055, "balance_loss_mlp": 1.02747965, "epoch": 0.48290996542912973, "flos": 31467443304960.0, "grad_norm": 1.5998759210668847, "language_loss": 0.71785772, "learning_rate": 2.2074042543775245e-06, "loss": 0.7392866, "num_input_tokens_seen": 172654280, "step": 8032, "time_per_iteration": 2.803567886352539 }, { "auxiliary_loss_clip": 0.0110819, "auxiliary_loss_mlp": 0.01037978, "balance_loss_clip": 1.04093766, "balance_loss_mlp": 1.02310669, "epoch": 0.4829700886817977, "flos": 24461954064000.0, "grad_norm": 1.7179702458807065, "language_loss": 0.73965132, "learning_rate": 2.2070168885878126e-06, "loss": 0.76111305, "num_input_tokens_seen": 172675545, "step": 8033, "time_per_iteration": 2.7292799949645996 }, { "auxiliary_loss_clip": 0.01073662, "auxiliary_loss_mlp": 0.01036715, "balance_loss_clip": 1.04669857, "balance_loss_mlp": 1.0225054, "epoch": 0.48303021193446566, "flos": 25702164904320.0, "grad_norm": 1.7431687715385025, "language_loss": 0.83544624, "learning_rate": 2.2066295149479996e-06, "loss": 0.85655004, "num_input_tokens_seen": 172696455, "step": 8034, "time_per_iteration": 2.807359218597412 }, { "auxiliary_loss_clip": 0.01095417, "auxiliary_loss_mlp": 0.01031736, "balance_loss_clip": 1.04668856, "balance_loss_mlp": 1.01843822, "epoch": 0.4830903351871336, "flos": 20085233673600.0, "grad_norm": 1.6936524854207098, "language_loss": 0.79185474, "learning_rate": 2.2062421334727744e-06, "loss": 0.81312621, "num_input_tokens_seen": 172716720, "step": 8035, "time_per_iteration": 2.7641072273254395 }, { "auxiliary_loss_clip": 0.01102103, "auxiliary_loss_mlp": 0.00772882, "balance_loss_clip": 1.04296494, "balance_loss_mlp": 1.00034285, "epoch": 0.4831504584398016, "flos": 39452216014080.0, "grad_norm": 1.8720500560152205, "language_loss": 0.69804895, "learning_rate": 2.2058547441768267e-06, "loss": 0.71679878, "num_input_tokens_seen": 172737435, "step": 8036, "time_per_iteration": 2.8137052059173584 }, { "auxiliary_loss_clip": 0.01112606, "auxiliary_loss_mlp": 0.01031964, "balance_loss_clip": 1.04274416, "balance_loss_mlp": 1.01839805, "epoch": 0.48321058169246955, "flos": 20006588845440.0, "grad_norm": 1.9208219105474362, "language_loss": 0.72910142, "learning_rate": 2.205467347074847e-06, "loss": 0.75054711, "num_input_tokens_seen": 172755700, "step": 8037, "time_per_iteration": 2.635277271270752 }, { "auxiliary_loss_clip": 0.01078506, "auxiliary_loss_mlp": 0.0104898, "balance_loss_clip": 1.04335546, "balance_loss_mlp": 1.03224301, "epoch": 0.4832707049451375, "flos": 20741465197440.0, "grad_norm": 3.147603880487906, "language_loss": 0.68890101, "learning_rate": 2.205079942181525e-06, "loss": 0.71017587, "num_input_tokens_seen": 172775185, "step": 8038, "time_per_iteration": 2.782864570617676 }, { "auxiliary_loss_clip": 0.01090364, "auxiliary_loss_mlp": 0.01038251, "balance_loss_clip": 1.04244566, "balance_loss_mlp": 1.02438653, "epoch": 0.4833308281978055, "flos": 33145584762240.0, "grad_norm": 1.8173480840244864, "language_loss": 0.79258525, "learning_rate": 2.20469252951155e-06, "loss": 0.81387138, "num_input_tokens_seen": 172796990, "step": 8039, "time_per_iteration": 2.7726707458496094 }, { "auxiliary_loss_clip": 0.01115294, "auxiliary_loss_mlp": 0.01034301, "balance_loss_clip": 1.04610348, "balance_loss_mlp": 1.02035379, "epoch": 0.48339095145047345, "flos": 19099234362240.0, "grad_norm": 1.6327731998252513, "language_loss": 0.77608567, "learning_rate": 2.2043051090796143e-06, "loss": 0.79758161, "num_input_tokens_seen": 172814915, "step": 8040, "time_per_iteration": 2.634373903274536 }, { "auxiliary_loss_clip": 0.01117481, "auxiliary_loss_mlp": 0.01034936, "balance_loss_clip": 1.04517746, "balance_loss_mlp": 1.02007651, "epoch": 0.4834510747031414, "flos": 34459448440320.0, "grad_norm": 1.603418513383397, "language_loss": 0.75737631, "learning_rate": 2.203917680900409e-06, "loss": 0.7789005, "num_input_tokens_seen": 172837060, "step": 8041, "time_per_iteration": 2.7551445960998535 }, { "auxiliary_loss_clip": 0.01089791, "auxiliary_loss_mlp": 0.01038452, "balance_loss_clip": 1.04363966, "balance_loss_mlp": 1.02388501, "epoch": 0.48351119795580944, "flos": 27380845065600.0, "grad_norm": 1.7873938615261085, "language_loss": 0.6681267, "learning_rate": 2.203530244988624e-06, "loss": 0.6894092, "num_input_tokens_seen": 172856545, "step": 8042, "time_per_iteration": 2.7318594455718994 }, { "auxiliary_loss_clip": 0.01029662, "auxiliary_loss_mlp": 0.0100431, "balance_loss_clip": 1.0224936, "balance_loss_mlp": 1.00289762, "epoch": 0.4835713212084774, "flos": 67143941291520.0, "grad_norm": 0.6894070214322334, "language_loss": 0.5854131, "learning_rate": 2.2031428013589517e-06, "loss": 0.60575283, "num_input_tokens_seen": 172923055, "step": 8043, "time_per_iteration": 3.2759408950805664 }, { "auxiliary_loss_clip": 0.01104355, "auxiliary_loss_mlp": 0.01041979, "balance_loss_clip": 1.04400086, "balance_loss_mlp": 1.02605903, "epoch": 0.48363144446114537, "flos": 17967473660160.0, "grad_norm": 1.92903629391714, "language_loss": 0.71673858, "learning_rate": 2.2027553500260847e-06, "loss": 0.73820192, "num_input_tokens_seen": 172940700, "step": 8044, "time_per_iteration": 2.6627197265625 }, { "auxiliary_loss_clip": 0.01073602, "auxiliary_loss_mlp": 0.01033421, "balance_loss_clip": 1.04103553, "balance_loss_mlp": 1.01863277, "epoch": 0.48369156771381333, "flos": 20593513077120.0, "grad_norm": 1.3783700874379357, "language_loss": 0.75982356, "learning_rate": 2.202367891004714e-06, "loss": 0.7808938, "num_input_tokens_seen": 172961125, "step": 8045, "time_per_iteration": 2.7301156520843506 }, { "auxiliary_loss_clip": 0.01083343, "auxiliary_loss_mlp": 0.01040882, "balance_loss_clip": 1.04626942, "balance_loss_mlp": 1.02615929, "epoch": 0.4837516909664813, "flos": 22675075159680.0, "grad_norm": 1.8085917066759625, "language_loss": 0.70038342, "learning_rate": 2.201980424309533e-06, "loss": 0.72162569, "num_input_tokens_seen": 172980405, "step": 8046, "time_per_iteration": 2.853160858154297 }, { "auxiliary_loss_clip": 0.01127438, "auxiliary_loss_mlp": 0.0103679, "balance_loss_clip": 1.04603601, "balance_loss_mlp": 1.02220488, "epoch": 0.48381181421914926, "flos": 25518625384320.0, "grad_norm": 2.1605387354357193, "language_loss": 0.82558095, "learning_rate": 2.2015929499552337e-06, "loss": 0.84722322, "num_input_tokens_seen": 172999105, "step": 8047, "time_per_iteration": 2.711172103881836 }, { "auxiliary_loss_clip": 0.01095021, "auxiliary_loss_mlp": 0.01034535, "balance_loss_clip": 1.04198444, "balance_loss_mlp": 1.02066541, "epoch": 0.4838719374718172, "flos": 24207491139840.0, "grad_norm": 1.6956601095110444, "language_loss": 0.80573416, "learning_rate": 2.2012054679565092e-06, "loss": 0.82702971, "num_input_tokens_seen": 173019935, "step": 8048, "time_per_iteration": 2.714733839035034 }, { "auxiliary_loss_clip": 0.01119221, "auxiliary_loss_mlp": 0.01039156, "balance_loss_clip": 1.04571271, "balance_loss_mlp": 1.02458251, "epoch": 0.4839320607244852, "flos": 26724577628160.0, "grad_norm": 1.6136989522042802, "language_loss": 0.81565118, "learning_rate": 2.200817978328054e-06, "loss": 0.83723497, "num_input_tokens_seen": 173039700, "step": 8049, "time_per_iteration": 2.740396738052368 }, { "auxiliary_loss_clip": 0.0110148, "auxiliary_loss_mlp": 0.01032329, "balance_loss_clip": 1.04652369, "balance_loss_mlp": 1.01979959, "epoch": 0.48399218397715316, "flos": 20448900921600.0, "grad_norm": 1.738899019363266, "language_loss": 0.72696805, "learning_rate": 2.2004304810845602e-06, "loss": 0.74830616, "num_input_tokens_seen": 173059170, "step": 8050, "time_per_iteration": 2.671696424484253 }, { "auxiliary_loss_clip": 0.01036049, "auxiliary_loss_mlp": 0.00752282, "balance_loss_clip": 1.01914835, "balance_loss_mlp": 1.00025868, "epoch": 0.4840523072298211, "flos": 67180570185600.0, "grad_norm": 0.6909377773009905, "language_loss": 0.562814, "learning_rate": 2.200042976240723e-06, "loss": 0.5806973, "num_input_tokens_seen": 173119000, "step": 8051, "time_per_iteration": 6.922944784164429 }, { "auxiliary_loss_clip": 0.01088902, "auxiliary_loss_mlp": 0.01035544, "balance_loss_clip": 1.04290557, "balance_loss_mlp": 1.0208869, "epoch": 0.4841124304824891, "flos": 22411490181120.0, "grad_norm": 1.8410570377760342, "language_loss": 0.75224304, "learning_rate": 2.199655463811236e-06, "loss": 0.77348751, "num_input_tokens_seen": 173137570, "step": 8052, "time_per_iteration": 2.7672088146209717 }, { "auxiliary_loss_clip": 0.01115072, "auxiliary_loss_mlp": 0.01037343, "balance_loss_clip": 1.04730511, "balance_loss_mlp": 1.02388382, "epoch": 0.48417255373515705, "flos": 13843959217920.0, "grad_norm": 2.7757616025011296, "language_loss": 0.6599009, "learning_rate": 2.1992679438107936e-06, "loss": 0.68142503, "num_input_tokens_seen": 173154355, "step": 8053, "time_per_iteration": 2.7092020511627197 }, { "auxiliary_loss_clip": 0.01118659, "auxiliary_loss_mlp": 0.0103362, "balance_loss_clip": 1.04970407, "balance_loss_mlp": 1.02048898, "epoch": 0.484232676987825, "flos": 31649689935360.0, "grad_norm": 1.9021914395644282, "language_loss": 0.69075954, "learning_rate": 2.198880416254091e-06, "loss": 0.7122823, "num_input_tokens_seen": 173174845, "step": 8054, "time_per_iteration": 5.934173583984375 }, { "auxiliary_loss_clip": 0.01055753, "auxiliary_loss_mlp": 0.01032099, "balance_loss_clip": 1.03702974, "balance_loss_mlp": 1.01789522, "epoch": 0.48429280024049304, "flos": 24095377814400.0, "grad_norm": 1.7332498206286664, "language_loss": 0.69624376, "learning_rate": 2.1984928811558233e-06, "loss": 0.71712232, "num_input_tokens_seen": 173195025, "step": 8055, "time_per_iteration": 2.811734676361084 }, { "auxiliary_loss_clip": 0.01121016, "auxiliary_loss_mlp": 0.01038771, "balance_loss_clip": 1.04966474, "balance_loss_mlp": 1.02396512, "epoch": 0.484352923493161, "flos": 17530081747200.0, "grad_norm": 2.8015304711701154, "language_loss": 0.63522434, "learning_rate": 2.198105338530685e-06, "loss": 0.6568222, "num_input_tokens_seen": 173213065, "step": 8056, "time_per_iteration": 2.6111772060394287 }, { "auxiliary_loss_clip": 0.01115568, "auxiliary_loss_mlp": 0.01036144, "balance_loss_clip": 1.04465592, "balance_loss_mlp": 1.0212791, "epoch": 0.48441304674582897, "flos": 29166862043520.0, "grad_norm": 2.044514393553715, "language_loss": 0.67968506, "learning_rate": 2.1977177883933726e-06, "loss": 0.70120221, "num_input_tokens_seen": 173234545, "step": 8057, "time_per_iteration": 2.678311824798584 }, { "auxiliary_loss_clip": 0.01089017, "auxiliary_loss_mlp": 0.01041569, "balance_loss_clip": 1.04114962, "balance_loss_mlp": 1.02560723, "epoch": 0.48447316999849693, "flos": 15886701676800.0, "grad_norm": 1.6304795591829788, "language_loss": 0.8145591, "learning_rate": 2.1973302307585827e-06, "loss": 0.83586496, "num_input_tokens_seen": 173252175, "step": 8058, "time_per_iteration": 2.676553964614868 }, { "auxiliary_loss_clip": 0.0111574, "auxiliary_loss_mlp": 0.01037327, "balance_loss_clip": 1.04488969, "balance_loss_mlp": 1.02229452, "epoch": 0.4845332932511649, "flos": 24381405815040.0, "grad_norm": 1.66967797618368, "language_loss": 0.79851902, "learning_rate": 2.1969426656410097e-06, "loss": 0.82004976, "num_input_tokens_seen": 173268790, "step": 8059, "time_per_iteration": 2.672071933746338 }, { "auxiliary_loss_clip": 0.01134552, "auxiliary_loss_mlp": 0.010436, "balance_loss_clip": 1.04998326, "balance_loss_mlp": 1.02804327, "epoch": 0.48459341650383286, "flos": 37116478316160.0, "grad_norm": 1.8700605031219397, "language_loss": 0.6685822, "learning_rate": 2.196555093055352e-06, "loss": 0.69036371, "num_input_tokens_seen": 173288030, "step": 8060, "time_per_iteration": 2.7481517791748047 }, { "auxiliary_loss_clip": 0.01115717, "auxiliary_loss_mlp": 0.01047797, "balance_loss_clip": 1.04782832, "balance_loss_mlp": 1.03283644, "epoch": 0.48465353975650083, "flos": 22966777509120.0, "grad_norm": 1.918934253409618, "language_loss": 0.67403054, "learning_rate": 2.1961675130163046e-06, "loss": 0.69566566, "num_input_tokens_seen": 173305965, "step": 8061, "time_per_iteration": 2.6991710662841797 }, { "auxiliary_loss_clip": 0.01112971, "auxiliary_loss_mlp": 0.01047446, "balance_loss_clip": 1.0495888, "balance_loss_mlp": 1.03176975, "epoch": 0.4847136630091688, "flos": 17707695523200.0, "grad_norm": 2.027913918653662, "language_loss": 0.82387316, "learning_rate": 2.1957799255385653e-06, "loss": 0.84547728, "num_input_tokens_seen": 173321985, "step": 8062, "time_per_iteration": 2.6427886486053467 }, { "auxiliary_loss_clip": 0.01062707, "auxiliary_loss_mlp": 0.0103913, "balance_loss_clip": 1.04044425, "balance_loss_mlp": 1.02433586, "epoch": 0.48477378626183676, "flos": 22018269018240.0, "grad_norm": 1.5908761940571217, "language_loss": 0.74599862, "learning_rate": 2.1953923306368325e-06, "loss": 0.76701701, "num_input_tokens_seen": 173341315, "step": 8063, "time_per_iteration": 2.767857313156128 }, { "auxiliary_loss_clip": 0.01103538, "auxiliary_loss_mlp": 0.01036681, "balance_loss_clip": 1.04380846, "balance_loss_mlp": 1.02177346, "epoch": 0.4848339095145047, "flos": 27962956874880.0, "grad_norm": 1.679199539296889, "language_loss": 0.7897141, "learning_rate": 2.1950047283258023e-06, "loss": 0.81111628, "num_input_tokens_seen": 173361055, "step": 8064, "time_per_iteration": 2.702838182449341 }, { "auxiliary_loss_clip": 0.01127143, "auxiliary_loss_mlp": 0.0077039, "balance_loss_clip": 1.04982877, "balance_loss_mlp": 1.00042999, "epoch": 0.4848940327671727, "flos": 21688752625920.0, "grad_norm": 1.758395032785765, "language_loss": 0.78960353, "learning_rate": 2.194617118620173e-06, "loss": 0.80857891, "num_input_tokens_seen": 173379255, "step": 8065, "time_per_iteration": 2.6464266777038574 }, { "auxiliary_loss_clip": 0.01109206, "auxiliary_loss_mlp": 0.00771166, "balance_loss_clip": 1.04239869, "balance_loss_mlp": 1.00034332, "epoch": 0.48495415601984065, "flos": 20631578515200.0, "grad_norm": 1.717828669503626, "language_loss": 0.76373905, "learning_rate": 2.194229501534644e-06, "loss": 0.78254277, "num_input_tokens_seen": 173398370, "step": 8066, "time_per_iteration": 2.622279405593872 }, { "auxiliary_loss_clip": 0.01129705, "auxiliary_loss_mlp": 0.01032468, "balance_loss_clip": 1.05031133, "balance_loss_mlp": 1.0188905, "epoch": 0.4850142792725086, "flos": 25628152930560.0, "grad_norm": 1.606995638926956, "language_loss": 0.7245208, "learning_rate": 2.193841877083912e-06, "loss": 0.74614257, "num_input_tokens_seen": 173419595, "step": 8067, "time_per_iteration": 2.6863858699798584 }, { "auxiliary_loss_clip": 0.01062315, "auxiliary_loss_mlp": 0.01036403, "balance_loss_clip": 1.04658556, "balance_loss_mlp": 1.02155542, "epoch": 0.4850744025251766, "flos": 13771958405760.0, "grad_norm": 2.9723717970034826, "language_loss": 0.79098403, "learning_rate": 2.1934542452826767e-06, "loss": 0.81197119, "num_input_tokens_seen": 173435390, "step": 8068, "time_per_iteration": 2.736361503601074 }, { "auxiliary_loss_clip": 0.01096742, "auxiliary_loss_mlp": 0.01035763, "balance_loss_clip": 1.04122019, "balance_loss_mlp": 1.02254295, "epoch": 0.4851345257778446, "flos": 20261339078400.0, "grad_norm": 1.4037595191012704, "language_loss": 0.84329617, "learning_rate": 2.193066606145638e-06, "loss": 0.86462128, "num_input_tokens_seen": 173454095, "step": 8069, "time_per_iteration": 2.6671814918518066 }, { "auxiliary_loss_clip": 0.01091404, "auxiliary_loss_mlp": 0.01033062, "balance_loss_clip": 1.04400659, "balance_loss_mlp": 1.01972818, "epoch": 0.48519464903051257, "flos": 27089681420160.0, "grad_norm": 1.7638547734342187, "language_loss": 0.78171504, "learning_rate": 2.192678959687493e-06, "loss": 0.80295968, "num_input_tokens_seen": 173475300, "step": 8070, "time_per_iteration": 2.7715907096862793 }, { "auxiliary_loss_clip": 0.01066151, "auxiliary_loss_mlp": 0.0103257, "balance_loss_clip": 1.04079247, "balance_loss_mlp": 1.01808023, "epoch": 0.48525477228318054, "flos": 17127235739520.0, "grad_norm": 1.9176398781406192, "language_loss": 0.78054178, "learning_rate": 2.192291305922943e-06, "loss": 0.80152905, "num_input_tokens_seen": 173492005, "step": 8071, "time_per_iteration": 2.7427566051483154 }, { "auxiliary_loss_clip": 0.01063848, "auxiliary_loss_mlp": 0.0103312, "balance_loss_clip": 1.04013515, "balance_loss_mlp": 1.01852274, "epoch": 0.4853148955358485, "flos": 28180324028160.0, "grad_norm": 1.9286974806008035, "language_loss": 0.72312587, "learning_rate": 2.1919036448666873e-06, "loss": 0.7440955, "num_input_tokens_seen": 173511995, "step": 8072, "time_per_iteration": 2.8457834720611572 }, { "auxiliary_loss_clip": 0.01077736, "auxiliary_loss_mlp": 0.01038365, "balance_loss_clip": 1.04195118, "balance_loss_mlp": 1.02361333, "epoch": 0.48537501878851647, "flos": 17493309198720.0, "grad_norm": 2.206546835183074, "language_loss": 0.87933266, "learning_rate": 2.1915159765334262e-06, "loss": 0.90049368, "num_input_tokens_seen": 173530215, "step": 8073, "time_per_iteration": 2.7190656661987305 }, { "auxiliary_loss_clip": 0.01081944, "auxiliary_loss_mlp": 0.01041597, "balance_loss_clip": 1.03932655, "balance_loss_mlp": 1.02555168, "epoch": 0.48543514204118443, "flos": 28584857975040.0, "grad_norm": 1.6453725477912577, "language_loss": 0.60954368, "learning_rate": 2.19112830093786e-06, "loss": 0.63077909, "num_input_tokens_seen": 173550920, "step": 8074, "time_per_iteration": 2.757408857345581 }, { "auxiliary_loss_clip": 0.01088022, "auxiliary_loss_mlp": 0.00773092, "balance_loss_clip": 1.0409627, "balance_loss_mlp": 1.00044906, "epoch": 0.4854952652938524, "flos": 20959981585920.0, "grad_norm": 1.6130644581425704, "language_loss": 0.735416, "learning_rate": 2.19074061809469e-06, "loss": 0.75402713, "num_input_tokens_seen": 173569065, "step": 8075, "time_per_iteration": 2.8191847801208496 }, { "auxiliary_loss_clip": 0.01121809, "auxiliary_loss_mlp": 0.01039314, "balance_loss_clip": 1.04537582, "balance_loss_mlp": 1.02567613, "epoch": 0.48555538854652036, "flos": 66529543155840.0, "grad_norm": 2.2867687714704665, "language_loss": 0.81751764, "learning_rate": 2.1903529280186163e-06, "loss": 0.83912885, "num_input_tokens_seen": 173596085, "step": 8076, "time_per_iteration": 3.0270113945007324 }, { "auxiliary_loss_clip": 0.01107841, "auxiliary_loss_mlp": 0.01038327, "balance_loss_clip": 1.04600549, "balance_loss_mlp": 1.02161372, "epoch": 0.4856155117991883, "flos": 15924982596480.0, "grad_norm": 2.702312951735234, "language_loss": 0.86105502, "learning_rate": 2.1899652307243407e-06, "loss": 0.88251674, "num_input_tokens_seen": 173613900, "step": 8077, "time_per_iteration": 2.6272876262664795 }, { "auxiliary_loss_clip": 0.01006449, "auxiliary_loss_mlp": 0.0100721, "balance_loss_clip": 1.01856184, "balance_loss_mlp": 1.00564885, "epoch": 0.4856756350518563, "flos": 71047395060480.0, "grad_norm": 0.8998346956373826, "language_loss": 0.58465588, "learning_rate": 2.189577526226564e-06, "loss": 0.60479248, "num_input_tokens_seen": 173671305, "step": 8078, "time_per_iteration": 3.254561424255371 }, { "auxiliary_loss_clip": 0.01132159, "auxiliary_loss_mlp": 0.01033911, "balance_loss_clip": 1.04961872, "balance_loss_mlp": 1.01946878, "epoch": 0.48573575830452426, "flos": 29825679346560.0, "grad_norm": 1.7198368274974891, "language_loss": 0.72365242, "learning_rate": 2.1891898145399884e-06, "loss": 0.74531311, "num_input_tokens_seen": 173692070, "step": 8079, "time_per_iteration": 2.6532506942749023 }, { "auxiliary_loss_clip": 0.01088509, "auxiliary_loss_mlp": 0.0103276, "balance_loss_clip": 1.04440176, "balance_loss_mlp": 1.01868141, "epoch": 0.4857958815571922, "flos": 17639501552640.0, "grad_norm": 2.749999314487442, "language_loss": 0.79557705, "learning_rate": 2.1888020956793172e-06, "loss": 0.81678975, "num_input_tokens_seen": 173709785, "step": 8080, "time_per_iteration": 2.6242940425872803 }, { "auxiliary_loss_clip": 0.01097632, "auxiliary_loss_mlp": 0.01033589, "balance_loss_clip": 1.04023981, "balance_loss_mlp": 1.01881862, "epoch": 0.4858560048098602, "flos": 21105491581440.0, "grad_norm": 1.9603729393952303, "language_loss": 0.84016395, "learning_rate": 2.188414369659251e-06, "loss": 0.86147618, "num_input_tokens_seen": 173728770, "step": 8081, "time_per_iteration": 2.6701998710632324 }, { "auxiliary_loss_clip": 0.01110096, "auxiliary_loss_mlp": 0.01036956, "balance_loss_clip": 1.04121375, "balance_loss_mlp": 1.02081513, "epoch": 0.4859161280625282, "flos": 22090844448000.0, "grad_norm": 1.4026106187948555, "language_loss": 0.83353597, "learning_rate": 2.1880266364944924e-06, "loss": 0.85500646, "num_input_tokens_seen": 173747355, "step": 8082, "time_per_iteration": 2.6535134315490723 }, { "auxiliary_loss_clip": 0.01102933, "auxiliary_loss_mlp": 0.01034217, "balance_loss_clip": 1.04525304, "balance_loss_mlp": 1.02117527, "epoch": 0.4859762513151962, "flos": 17493452853120.0, "grad_norm": 1.9462739217424578, "language_loss": 0.87314546, "learning_rate": 2.187638896199746e-06, "loss": 0.89451694, "num_input_tokens_seen": 173764825, "step": 8083, "time_per_iteration": 2.6324520111083984 }, { "auxiliary_loss_clip": 0.01080799, "auxiliary_loss_mlp": 0.01047109, "balance_loss_clip": 1.04719186, "balance_loss_mlp": 1.03410375, "epoch": 0.48603637456786414, "flos": 18004246208640.0, "grad_norm": 1.6025248177358018, "language_loss": 0.80759108, "learning_rate": 2.1872511487897126e-06, "loss": 0.82887018, "num_input_tokens_seen": 173783215, "step": 8084, "time_per_iteration": 2.679032325744629 }, { "auxiliary_loss_clip": 0.01114846, "auxiliary_loss_mlp": 0.01035804, "balance_loss_clip": 1.04544878, "balance_loss_mlp": 1.02149308, "epoch": 0.4860964978205321, "flos": 22492038430080.0, "grad_norm": 1.9539653340908196, "language_loss": 0.68145066, "learning_rate": 2.186863394279098e-06, "loss": 0.70295715, "num_input_tokens_seen": 173801905, "step": 8085, "time_per_iteration": 2.6305296421051025 }, { "auxiliary_loss_clip": 0.01113875, "auxiliary_loss_mlp": 0.01040894, "balance_loss_clip": 1.04487717, "balance_loss_mlp": 1.02714896, "epoch": 0.48615662107320007, "flos": 23372532518400.0, "grad_norm": 1.3763064439222144, "language_loss": 0.77494752, "learning_rate": 2.1864756326826046e-06, "loss": 0.79649526, "num_input_tokens_seen": 173824690, "step": 8086, "time_per_iteration": 2.6941890716552734 }, { "auxiliary_loss_clip": 0.01125139, "auxiliary_loss_mlp": 0.01028743, "balance_loss_clip": 1.04536629, "balance_loss_mlp": 1.01461661, "epoch": 0.48621674432586803, "flos": 34418833136640.0, "grad_norm": 2.3947564981199347, "language_loss": 0.7014342, "learning_rate": 2.1860878640149355e-06, "loss": 0.72297299, "num_input_tokens_seen": 173844450, "step": 8087, "time_per_iteration": 2.7329354286193848 }, { "auxiliary_loss_clip": 0.01119086, "auxiliary_loss_mlp": 0.01040298, "balance_loss_clip": 1.04627323, "balance_loss_mlp": 1.0251466, "epoch": 0.486276867578536, "flos": 33107555237760.0, "grad_norm": 1.710106545545042, "language_loss": 0.72521967, "learning_rate": 2.1857000882907974e-06, "loss": 0.74681354, "num_input_tokens_seen": 173864975, "step": 8088, "time_per_iteration": 2.747058391571045 }, { "auxiliary_loss_clip": 0.01103115, "auxiliary_loss_mlp": 0.01037287, "balance_loss_clip": 1.04365635, "balance_loss_mlp": 1.02306569, "epoch": 0.48633699083120396, "flos": 21470703114240.0, "grad_norm": 1.7297894528285667, "language_loss": 0.7543239, "learning_rate": 2.185312305524892e-06, "loss": 0.77572793, "num_input_tokens_seen": 173883805, "step": 8089, "time_per_iteration": 2.6639740467071533 }, { "auxiliary_loss_clip": 0.01092992, "auxiliary_loss_mlp": 0.01031661, "balance_loss_clip": 1.04379344, "balance_loss_mlp": 1.01733255, "epoch": 0.48639711408387193, "flos": 20084335833600.0, "grad_norm": 1.6351614757671693, "language_loss": 0.84245062, "learning_rate": 2.184924515731926e-06, "loss": 0.86369717, "num_input_tokens_seen": 173903520, "step": 8090, "time_per_iteration": 4.404139757156372 }, { "auxiliary_loss_clip": 0.01122239, "auxiliary_loss_mlp": 0.01033955, "balance_loss_clip": 1.04544723, "balance_loss_mlp": 1.0203594, "epoch": 0.4864572373365399, "flos": 20778884190720.0, "grad_norm": 1.7197214823091769, "language_loss": 0.76290631, "learning_rate": 2.1845367189266045e-06, "loss": 0.78446829, "num_input_tokens_seen": 173924255, "step": 8091, "time_per_iteration": 2.7133665084838867 }, { "auxiliary_loss_clip": 0.01115621, "auxiliary_loss_mlp": 0.01029044, "balance_loss_clip": 1.04440069, "balance_loss_mlp": 1.01553202, "epoch": 0.48651736058920786, "flos": 26025360503040.0, "grad_norm": 1.4953838782762103, "language_loss": 0.80510461, "learning_rate": 2.184148915123631e-06, "loss": 0.82655126, "num_input_tokens_seen": 173943285, "step": 8092, "time_per_iteration": 2.682349920272827 }, { "auxiliary_loss_clip": 0.0110052, "auxiliary_loss_mlp": 0.00775072, "balance_loss_clip": 1.04398346, "balance_loss_mlp": 1.00031447, "epoch": 0.4865774838418758, "flos": 20485601642880.0, "grad_norm": 1.434156215667662, "language_loss": 0.71867287, "learning_rate": 2.1837611043377126e-06, "loss": 0.73742878, "num_input_tokens_seen": 173962205, "step": 8093, "time_per_iteration": 5.686015367507935 }, { "auxiliary_loss_clip": 0.01123791, "auxiliary_loss_mlp": 0.01034202, "balance_loss_clip": 1.04521751, "balance_loss_mlp": 1.02074885, "epoch": 0.4866376070945438, "flos": 23547704169600.0, "grad_norm": 1.581585117496142, "language_loss": 0.67704266, "learning_rate": 2.1833732865835545e-06, "loss": 0.69862258, "num_input_tokens_seen": 173980945, "step": 8094, "time_per_iteration": 2.5890355110168457 }, { "auxiliary_loss_clip": 0.01109259, "auxiliary_loss_mlp": 0.01038119, "balance_loss_clip": 1.04752278, "balance_loss_mlp": 1.02342701, "epoch": 0.4866977303472118, "flos": 16690598012160.0, "grad_norm": 2.317379685093866, "language_loss": 0.66784161, "learning_rate": 2.1829854618758636e-06, "loss": 0.68931544, "num_input_tokens_seen": 173998860, "step": 8095, "time_per_iteration": 2.640468120574951 }, { "auxiliary_loss_clip": 0.01110152, "auxiliary_loss_mlp": 0.0103636, "balance_loss_clip": 1.04456031, "balance_loss_mlp": 1.02123296, "epoch": 0.4867578535998798, "flos": 17896011552000.0, "grad_norm": 2.1481069791390346, "language_loss": 0.78540075, "learning_rate": 2.182597630229345e-06, "loss": 0.80686581, "num_input_tokens_seen": 174016665, "step": 8096, "time_per_iteration": 2.585015058517456 }, { "auxiliary_loss_clip": 0.01092726, "auxiliary_loss_mlp": 0.01036143, "balance_loss_clip": 1.03732872, "balance_loss_mlp": 1.02165902, "epoch": 0.48681797685254774, "flos": 22637799820800.0, "grad_norm": 1.880706326191671, "language_loss": 0.67753577, "learning_rate": 2.1822097916587067e-06, "loss": 0.69882447, "num_input_tokens_seen": 174034800, "step": 8097, "time_per_iteration": 2.6526336669921875 }, { "auxiliary_loss_clip": 0.01097124, "auxiliary_loss_mlp": 0.01039294, "balance_loss_clip": 1.04311764, "balance_loss_mlp": 1.02491093, "epoch": 0.4868781001052157, "flos": 20886077352960.0, "grad_norm": 1.6144910396326548, "language_loss": 0.71414316, "learning_rate": 2.1818219461786543e-06, "loss": 0.73550731, "num_input_tokens_seen": 174054445, "step": 8098, "time_per_iteration": 2.6669986248016357 }, { "auxiliary_loss_clip": 0.01119656, "auxiliary_loss_mlp": 0.01037345, "balance_loss_clip": 1.04642081, "balance_loss_mlp": 1.02226543, "epoch": 0.48693822335788367, "flos": 41974940937600.0, "grad_norm": 2.9804894060925458, "language_loss": 0.66267806, "learning_rate": 2.1814340938038956e-06, "loss": 0.68424809, "num_input_tokens_seen": 174077890, "step": 8099, "time_per_iteration": 2.7542026042938232 }, { "auxiliary_loss_clip": 0.01070284, "auxiliary_loss_mlp": 0.01040695, "balance_loss_clip": 1.0372566, "balance_loss_mlp": 1.02712917, "epoch": 0.48699834661055164, "flos": 24243294021120.0, "grad_norm": 1.700994432394141, "language_loss": 0.66787708, "learning_rate": 2.181046234549138e-06, "loss": 0.6889869, "num_input_tokens_seen": 174097460, "step": 8100, "time_per_iteration": 2.7499735355377197 }, { "auxiliary_loss_clip": 0.01087635, "auxiliary_loss_mlp": 0.01033762, "balance_loss_clip": 1.04155445, "balance_loss_mlp": 1.02084517, "epoch": 0.4870584698632196, "flos": 25923877603200.0, "grad_norm": 1.427277688843355, "language_loss": 0.76812327, "learning_rate": 2.180658368429088e-06, "loss": 0.78933728, "num_input_tokens_seen": 174120775, "step": 8101, "time_per_iteration": 2.7710418701171875 }, { "auxiliary_loss_clip": 0.010432, "auxiliary_loss_mlp": 0.00999689, "balance_loss_clip": 1.01742899, "balance_loss_mlp": 0.99847281, "epoch": 0.48711859311588757, "flos": 70211933648640.0, "grad_norm": 0.6877166097191185, "language_loss": 0.52341712, "learning_rate": 2.1802704954584565e-06, "loss": 0.54384601, "num_input_tokens_seen": 174189135, "step": 8102, "time_per_iteration": 3.3232975006103516 }, { "auxiliary_loss_clip": 0.0109639, "auxiliary_loss_mlp": 0.0103608, "balance_loss_clip": 1.04584694, "balance_loss_mlp": 1.02250218, "epoch": 0.48717871636855553, "flos": 12342964659840.0, "grad_norm": 2.1242457938350885, "language_loss": 0.7405737, "learning_rate": 2.1798826156519484e-06, "loss": 0.7618984, "num_input_tokens_seen": 174203250, "step": 8103, "time_per_iteration": 2.6988277435302734 }, { "auxiliary_loss_clip": 0.01116672, "auxiliary_loss_mlp": 0.01043644, "balance_loss_clip": 1.04631233, "balance_loss_mlp": 1.0288384, "epoch": 0.4872388396212235, "flos": 23477139901440.0, "grad_norm": 1.6106517558680102, "language_loss": 0.63064033, "learning_rate": 2.1794947290242737e-06, "loss": 0.65224349, "num_input_tokens_seen": 174224145, "step": 8104, "time_per_iteration": 2.629725456237793 }, { "auxiliary_loss_clip": 0.01125564, "auxiliary_loss_mlp": 0.01032477, "balance_loss_clip": 1.04695344, "balance_loss_mlp": 1.01885152, "epoch": 0.48729896287389146, "flos": 31427582186880.0, "grad_norm": 2.7588286364308217, "language_loss": 0.69136071, "learning_rate": 2.1791068355901413e-06, "loss": 0.71294117, "num_input_tokens_seen": 174244435, "step": 8105, "time_per_iteration": 2.6670045852661133 }, { "auxiliary_loss_clip": 0.01084626, "auxiliary_loss_mlp": 0.01030665, "balance_loss_clip": 1.04264283, "balance_loss_mlp": 1.01766491, "epoch": 0.4873590861265594, "flos": 19057936700160.0, "grad_norm": 2.072109036230495, "language_loss": 0.73534381, "learning_rate": 2.178718935364259e-06, "loss": 0.75649679, "num_input_tokens_seen": 174262710, "step": 8106, "time_per_iteration": 2.679194927215576 }, { "auxiliary_loss_clip": 0.01107932, "auxiliary_loss_mlp": 0.00772241, "balance_loss_clip": 1.04675412, "balance_loss_mlp": 1.00038791, "epoch": 0.4874192093792274, "flos": 24348296453760.0, "grad_norm": 2.6438945384360157, "language_loss": 0.76877642, "learning_rate": 2.1783310283613373e-06, "loss": 0.78757817, "num_input_tokens_seen": 174281545, "step": 8107, "time_per_iteration": 2.6732285022735596 }, { "auxiliary_loss_clip": 0.01071333, "auxiliary_loss_mlp": 0.01032073, "balance_loss_clip": 1.04327512, "balance_loss_mlp": 1.01932359, "epoch": 0.4874793326318954, "flos": 23112610727040.0, "grad_norm": 3.5135482389125583, "language_loss": 0.75034302, "learning_rate": 2.1779431145960853e-06, "loss": 0.77137709, "num_input_tokens_seen": 174300290, "step": 8108, "time_per_iteration": 2.8071932792663574 }, { "auxiliary_loss_clip": 0.01111368, "auxiliary_loss_mlp": 0.01030979, "balance_loss_clip": 1.04524517, "balance_loss_mlp": 1.01917136, "epoch": 0.4875394558845634, "flos": 19026156142080.0, "grad_norm": 1.7033835018380465, "language_loss": 0.73611033, "learning_rate": 2.177555194083212e-06, "loss": 0.75753379, "num_input_tokens_seen": 174318490, "step": 8109, "time_per_iteration": 2.642854928970337 }, { "auxiliary_loss_clip": 0.01108586, "auxiliary_loss_mlp": 0.01031639, "balance_loss_clip": 1.04274952, "balance_loss_mlp": 1.01813245, "epoch": 0.48759957913723134, "flos": 21433607343360.0, "grad_norm": 1.8383730211114537, "language_loss": 0.78698927, "learning_rate": 2.177167266837428e-06, "loss": 0.80839157, "num_input_tokens_seen": 174335505, "step": 8110, "time_per_iteration": 2.6471641063690186 }, { "auxiliary_loss_clip": 0.01114056, "auxiliary_loss_mlp": 0.01041552, "balance_loss_clip": 1.04712057, "balance_loss_mlp": 1.02802181, "epoch": 0.4876597023898993, "flos": 17748669962880.0, "grad_norm": 1.8514316559502986, "language_loss": 0.72086185, "learning_rate": 2.176779332873444e-06, "loss": 0.74241793, "num_input_tokens_seen": 174353990, "step": 8111, "time_per_iteration": 2.6277401447296143 }, { "auxiliary_loss_clip": 0.01113402, "auxiliary_loss_mlp": 0.01036579, "balance_loss_clip": 1.04676926, "balance_loss_mlp": 1.02329946, "epoch": 0.4877198256425673, "flos": 17019647527680.0, "grad_norm": 1.5795214961704311, "language_loss": 0.76318377, "learning_rate": 2.17639139220597e-06, "loss": 0.78468353, "num_input_tokens_seen": 174373425, "step": 8112, "time_per_iteration": 2.598010301589966 }, { "auxiliary_loss_clip": 0.01117365, "auxiliary_loss_mlp": 0.01038377, "balance_loss_clip": 1.04562628, "balance_loss_mlp": 1.02425683, "epoch": 0.48777994889523524, "flos": 22384091082240.0, "grad_norm": 1.710789031048389, "language_loss": 0.75035822, "learning_rate": 2.1760034448497166e-06, "loss": 0.77191567, "num_input_tokens_seen": 174393070, "step": 8113, "time_per_iteration": 2.6348531246185303 }, { "auxiliary_loss_clip": 0.01028141, "auxiliary_loss_mlp": 0.0075288, "balance_loss_clip": 1.02038229, "balance_loss_mlp": 1.0004046, "epoch": 0.4878400721479032, "flos": 61241772159360.0, "grad_norm": 0.77879843500845, "language_loss": 0.4887349, "learning_rate": 2.1756154908193943e-06, "loss": 0.50654507, "num_input_tokens_seen": 174446880, "step": 8114, "time_per_iteration": 3.1273062229156494 }, { "auxiliary_loss_clip": 0.0109717, "auxiliary_loss_mlp": 0.01040496, "balance_loss_clip": 1.04649258, "balance_loss_mlp": 1.02591658, "epoch": 0.48790019540057117, "flos": 24536612482560.0, "grad_norm": 1.616579350296871, "language_loss": 0.76760268, "learning_rate": 2.1752275301297155e-06, "loss": 0.78897941, "num_input_tokens_seen": 174468485, "step": 8115, "time_per_iteration": 2.759444236755371 }, { "auxiliary_loss_clip": 0.01107443, "auxiliary_loss_mlp": 0.01033169, "balance_loss_clip": 1.0478245, "balance_loss_mlp": 1.01930535, "epoch": 0.48796031865323913, "flos": 21833939399040.0, "grad_norm": 2.031601085778298, "language_loss": 0.71910083, "learning_rate": 2.1748395627953915e-06, "loss": 0.74050689, "num_input_tokens_seen": 174486360, "step": 8116, "time_per_iteration": 2.7063751220703125 }, { "auxiliary_loss_clip": 0.01088547, "auxiliary_loss_mlp": 0.01035995, "balance_loss_clip": 1.04164481, "balance_loss_mlp": 1.02276874, "epoch": 0.4880204419059071, "flos": 18588907883520.0, "grad_norm": 3.4734402196051, "language_loss": 0.63002747, "learning_rate": 2.1744515888311335e-06, "loss": 0.65127283, "num_input_tokens_seen": 174505075, "step": 8117, "time_per_iteration": 2.713792562484741 }, { "auxiliary_loss_clip": 0.01093551, "auxiliary_loss_mlp": 0.01042447, "balance_loss_clip": 1.04097366, "balance_loss_mlp": 1.02740264, "epoch": 0.48808056515857506, "flos": 19172168928000.0, "grad_norm": 1.6679530296862457, "language_loss": 0.79487926, "learning_rate": 2.1740636082516533e-06, "loss": 0.81623924, "num_input_tokens_seen": 174523385, "step": 8118, "time_per_iteration": 2.6479125022888184 }, { "auxiliary_loss_clip": 0.01102071, "auxiliary_loss_mlp": 0.01036823, "balance_loss_clip": 1.04363036, "balance_loss_mlp": 1.02295303, "epoch": 0.48814068841124303, "flos": 20120497850880.0, "grad_norm": 1.8682176240686432, "language_loss": 0.6328088, "learning_rate": 2.1736756210716645e-06, "loss": 0.65419775, "num_input_tokens_seen": 174542200, "step": 8119, "time_per_iteration": 2.6599643230438232 }, { "auxiliary_loss_clip": 0.01061047, "auxiliary_loss_mlp": 0.00770426, "balance_loss_clip": 1.04209542, "balance_loss_mlp": 1.00037444, "epoch": 0.488200811663911, "flos": 22965592360320.0, "grad_norm": 1.676805190577927, "language_loss": 0.72166741, "learning_rate": 2.173287627305878e-06, "loss": 0.73998219, "num_input_tokens_seen": 174563620, "step": 8120, "time_per_iteration": 2.795185089111328 }, { "auxiliary_loss_clip": 0.01118613, "auxiliary_loss_mlp": 0.01031295, "balance_loss_clip": 1.0469954, "balance_loss_mlp": 1.01728177, "epoch": 0.48826093491657896, "flos": 33910697387520.0, "grad_norm": 2.388334225725702, "language_loss": 0.63951784, "learning_rate": 2.1728996269690075e-06, "loss": 0.66101694, "num_input_tokens_seen": 174586465, "step": 8121, "time_per_iteration": 2.7527153491973877 }, { "auxiliary_loss_clip": 0.01112786, "auxiliary_loss_mlp": 0.01036976, "balance_loss_clip": 1.04261351, "balance_loss_mlp": 1.02283835, "epoch": 0.488321058169247, "flos": 23070307484160.0, "grad_norm": 1.985568603421553, "language_loss": 0.82805705, "learning_rate": 2.1725116200757664e-06, "loss": 0.84955472, "num_input_tokens_seen": 174604035, "step": 8122, "time_per_iteration": 2.668754816055298 }, { "auxiliary_loss_clip": 0.0111403, "auxiliary_loss_mlp": 0.01043394, "balance_loss_clip": 1.04526711, "balance_loss_mlp": 1.02749181, "epoch": 0.48838118142191494, "flos": 19317714837120.0, "grad_norm": 1.7149683973709622, "language_loss": 0.85272485, "learning_rate": 2.172123606640866e-06, "loss": 0.87429905, "num_input_tokens_seen": 174621715, "step": 8123, "time_per_iteration": 2.6014883518218994 }, { "auxiliary_loss_clip": 0.01090574, "auxiliary_loss_mlp": 0.01031767, "balance_loss_clip": 1.04448855, "balance_loss_mlp": 1.0185523, "epoch": 0.4884413046745829, "flos": 25410678036480.0, "grad_norm": 1.3909354864913257, "language_loss": 0.85614896, "learning_rate": 2.1717355866790227e-06, "loss": 0.87737238, "num_input_tokens_seen": 174643835, "step": 8124, "time_per_iteration": 2.754786968231201 }, { "auxiliary_loss_clip": 0.01103222, "auxiliary_loss_mlp": 0.01031579, "balance_loss_clip": 1.04439664, "balance_loss_mlp": 1.0179534, "epoch": 0.4885014279272509, "flos": 20991546662400.0, "grad_norm": 1.926010658269172, "language_loss": 0.79547518, "learning_rate": 2.171347560204948e-06, "loss": 0.81682324, "num_input_tokens_seen": 174660955, "step": 8125, "time_per_iteration": 2.667335271835327 }, { "auxiliary_loss_clip": 0.01078395, "auxiliary_loss_mlp": 0.01040727, "balance_loss_clip": 1.04347515, "balance_loss_mlp": 1.0263145, "epoch": 0.48856155117991884, "flos": 13771599269760.0, "grad_norm": 2.02778788313487, "language_loss": 0.72584462, "learning_rate": 2.170959527233356e-06, "loss": 0.74703586, "num_input_tokens_seen": 174678270, "step": 8126, "time_per_iteration": 2.7370314598083496 }, { "auxiliary_loss_clip": 0.0111111, "auxiliary_loss_mlp": 0.01038149, "balance_loss_clip": 1.0410614, "balance_loss_mlp": 1.02405286, "epoch": 0.4886216744325868, "flos": 32087764206720.0, "grad_norm": 1.7703486674415694, "language_loss": 0.68917644, "learning_rate": 2.1705714877789633e-06, "loss": 0.71066898, "num_input_tokens_seen": 174698360, "step": 8127, "time_per_iteration": 2.811074733734131 }, { "auxiliary_loss_clip": 0.01125381, "auxiliary_loss_mlp": 0.01033584, "balance_loss_clip": 1.04334533, "balance_loss_mlp": 1.01993454, "epoch": 0.48868179768525477, "flos": 19610063631360.0, "grad_norm": 1.5960676368468543, "language_loss": 0.76178646, "learning_rate": 2.170183441856481e-06, "loss": 0.78337616, "num_input_tokens_seen": 174716755, "step": 8128, "time_per_iteration": 2.5751638412475586 }, { "auxiliary_loss_clip": 0.01126548, "auxiliary_loss_mlp": 0.01031229, "balance_loss_clip": 1.04598355, "balance_loss_mlp": 1.01818776, "epoch": 0.48874192093792274, "flos": 21286912199040.0, "grad_norm": 1.5334009671548041, "language_loss": 0.7574327, "learning_rate": 2.1697953894806265e-06, "loss": 0.77901042, "num_input_tokens_seen": 174735560, "step": 8129, "time_per_iteration": 4.080120325088501 }, { "auxiliary_loss_clip": 0.01113338, "auxiliary_loss_mlp": 0.01031411, "balance_loss_clip": 1.04372275, "balance_loss_mlp": 1.0174098, "epoch": 0.4888020441905907, "flos": 14173439696640.0, "grad_norm": 2.756799094025314, "language_loss": 0.64951944, "learning_rate": 2.169407330666114e-06, "loss": 0.67096692, "num_input_tokens_seen": 174752730, "step": 8130, "time_per_iteration": 4.153359413146973 }, { "auxiliary_loss_clip": 0.01087218, "auxiliary_loss_mlp": 0.01036252, "balance_loss_clip": 1.0399828, "balance_loss_mlp": 1.02282333, "epoch": 0.48886216744325867, "flos": 24097891766400.0, "grad_norm": 1.9114203912665453, "language_loss": 0.72505724, "learning_rate": 2.169019265427658e-06, "loss": 0.746292, "num_input_tokens_seen": 174772520, "step": 8131, "time_per_iteration": 2.751070499420166 }, { "auxiliary_loss_clip": 0.0111646, "auxiliary_loss_mlp": 0.01041385, "balance_loss_clip": 1.04625905, "balance_loss_mlp": 1.0270561, "epoch": 0.48892229069592663, "flos": 38431419402240.0, "grad_norm": 1.3981624070335212, "language_loss": 0.69684219, "learning_rate": 2.1686311937799745e-06, "loss": 0.71842068, "num_input_tokens_seen": 174796540, "step": 8132, "time_per_iteration": 4.478942632675171 }, { "auxiliary_loss_clip": 0.01109765, "auxiliary_loss_mlp": 0.01030128, "balance_loss_clip": 1.04673529, "balance_loss_mlp": 1.01630616, "epoch": 0.4889824139485946, "flos": 23843321101440.0, "grad_norm": 1.328560083390073, "language_loss": 0.69882882, "learning_rate": 2.1682431157377797e-06, "loss": 0.72022772, "num_input_tokens_seen": 174817840, "step": 8133, "time_per_iteration": 4.2415807247161865 }, { "auxiliary_loss_clip": 0.01062397, "auxiliary_loss_mlp": 0.01042948, "balance_loss_clip": 1.03593254, "balance_loss_mlp": 1.02922726, "epoch": 0.48904253720126256, "flos": 24425827960320.0, "grad_norm": 1.919712430573748, "language_loss": 0.70950568, "learning_rate": 2.1678550313157883e-06, "loss": 0.73055917, "num_input_tokens_seen": 174837885, "step": 8134, "time_per_iteration": 2.772383689880371 }, { "auxiliary_loss_clip": 0.01084139, "auxiliary_loss_mlp": 0.01035376, "balance_loss_clip": 1.04342508, "balance_loss_mlp": 1.02082086, "epoch": 0.4891026604539306, "flos": 24170682677760.0, "grad_norm": 1.9244253075686233, "language_loss": 0.80356431, "learning_rate": 2.167466940528718e-06, "loss": 0.82475942, "num_input_tokens_seen": 174855240, "step": 8135, "time_per_iteration": 2.7362964153289795 }, { "auxiliary_loss_clip": 0.01124035, "auxiliary_loss_mlp": 0.01035694, "balance_loss_clip": 1.04567957, "balance_loss_mlp": 1.0232842, "epoch": 0.48916278370659855, "flos": 21470954509440.0, "grad_norm": 1.8037329109010316, "language_loss": 0.74794912, "learning_rate": 2.1670788433912843e-06, "loss": 0.76954633, "num_input_tokens_seen": 174875145, "step": 8136, "time_per_iteration": 2.766477346420288 }, { "auxiliary_loss_clip": 0.01097387, "auxiliary_loss_mlp": 0.01043558, "balance_loss_clip": 1.04352307, "balance_loss_mlp": 1.02971756, "epoch": 0.4892229069592665, "flos": 22309755886080.0, "grad_norm": 1.6588593954338173, "language_loss": 0.73403543, "learning_rate": 2.166690739918204e-06, "loss": 0.75544488, "num_input_tokens_seen": 174894770, "step": 8137, "time_per_iteration": 2.720778703689575 }, { "auxiliary_loss_clip": 0.01051073, "auxiliary_loss_mlp": 0.01031061, "balance_loss_clip": 1.03699243, "balance_loss_mlp": 1.01726234, "epoch": 0.4892830302119345, "flos": 12786856934400.0, "grad_norm": 2.090077124931452, "language_loss": 0.75336611, "learning_rate": 2.1663026301241944e-06, "loss": 0.77418739, "num_input_tokens_seen": 174912780, "step": 8138, "time_per_iteration": 2.7975735664367676 }, { "auxiliary_loss_clip": 0.01091927, "auxiliary_loss_mlp": 0.01038351, "balance_loss_clip": 1.04700375, "balance_loss_mlp": 1.02536893, "epoch": 0.48934315346460244, "flos": 20813896972800.0, "grad_norm": 1.6152276292204855, "language_loss": 0.74018902, "learning_rate": 2.165914514023972e-06, "loss": 0.76149184, "num_input_tokens_seen": 174931250, "step": 8139, "time_per_iteration": 2.7135186195373535 }, { "auxiliary_loss_clip": 0.01115319, "auxiliary_loss_mlp": 0.0103739, "balance_loss_clip": 1.04502773, "balance_loss_mlp": 1.02416921, "epoch": 0.4894032767172704, "flos": 19755537713280.0, "grad_norm": 1.878714628680016, "language_loss": 0.62168998, "learning_rate": 2.165526391632255e-06, "loss": 0.64321709, "num_input_tokens_seen": 174951105, "step": 8140, "time_per_iteration": 2.6594550609588623 }, { "auxiliary_loss_clip": 0.0109215, "auxiliary_loss_mlp": 0.01040102, "balance_loss_clip": 1.04310822, "balance_loss_mlp": 1.02509928, "epoch": 0.4894633999699384, "flos": 17818982835840.0, "grad_norm": 1.7004882369900214, "language_loss": 0.82400143, "learning_rate": 2.1651382629637608e-06, "loss": 0.84532392, "num_input_tokens_seen": 174969120, "step": 8141, "time_per_iteration": 2.648696184158325 }, { "auxiliary_loss_clip": 0.01095522, "auxiliary_loss_mlp": 0.01034005, "balance_loss_clip": 1.04897892, "balance_loss_mlp": 1.01975965, "epoch": 0.48952352322260634, "flos": 25523222325120.0, "grad_norm": 1.6750975318537598, "language_loss": 0.72031653, "learning_rate": 2.1647501280332066e-06, "loss": 0.74161184, "num_input_tokens_seen": 174991295, "step": 8142, "time_per_iteration": 2.770524740219116 }, { "auxiliary_loss_clip": 0.01124129, "auxiliary_loss_mlp": 0.01033852, "balance_loss_clip": 1.04588366, "balance_loss_mlp": 1.02094769, "epoch": 0.4895836464752743, "flos": 29055502903680.0, "grad_norm": 8.902000760681485, "language_loss": 0.66877794, "learning_rate": 2.1643619868553105e-06, "loss": 0.6903578, "num_input_tokens_seen": 175012830, "step": 8143, "time_per_iteration": 2.717714786529541 }, { "auxiliary_loss_clip": 0.01116098, "auxiliary_loss_mlp": 0.00770078, "balance_loss_clip": 1.04774415, "balance_loss_mlp": 1.00015235, "epoch": 0.48964376972794227, "flos": 33546958312320.0, "grad_norm": 1.880195910988658, "language_loss": 0.75596797, "learning_rate": 2.163973839444793e-06, "loss": 0.77482975, "num_input_tokens_seen": 175035695, "step": 8144, "time_per_iteration": 2.801825761795044 }, { "auxiliary_loss_clip": 0.01099436, "auxiliary_loss_mlp": 0.01031587, "balance_loss_clip": 1.04169714, "balance_loss_mlp": 1.01753187, "epoch": 0.48970389298061023, "flos": 22054035985920.0, "grad_norm": 1.9123659180679726, "language_loss": 0.75693774, "learning_rate": 2.1635856858163695e-06, "loss": 0.77824795, "num_input_tokens_seen": 175056425, "step": 8145, "time_per_iteration": 2.781550168991089 }, { "auxiliary_loss_clip": 0.01108869, "auxiliary_loss_mlp": 0.0077212, "balance_loss_clip": 1.04549527, "balance_loss_mlp": 1.00018287, "epoch": 0.4897640162332782, "flos": 20084299920000.0, "grad_norm": 1.6675270752681912, "language_loss": 0.80437362, "learning_rate": 2.163197525984761e-06, "loss": 0.82318354, "num_input_tokens_seen": 175074800, "step": 8146, "time_per_iteration": 2.699277400970459 }, { "auxiliary_loss_clip": 0.01109996, "auxiliary_loss_mlp": 0.01033581, "balance_loss_clip": 1.04312873, "balance_loss_mlp": 1.02007508, "epoch": 0.48982413948594616, "flos": 23806225330560.0, "grad_norm": 2.022171046548427, "language_loss": 0.74193209, "learning_rate": 2.162809359964687e-06, "loss": 0.76336789, "num_input_tokens_seen": 175094500, "step": 8147, "time_per_iteration": 2.732973337173462 }, { "auxiliary_loss_clip": 0.01095071, "auxiliary_loss_mlp": 0.01032519, "balance_loss_clip": 1.0448947, "balance_loss_mlp": 1.0193938, "epoch": 0.4898842627386142, "flos": 17639645207040.0, "grad_norm": 2.1017800501084882, "language_loss": 0.8286857, "learning_rate": 2.162421187770864e-06, "loss": 0.84996164, "num_input_tokens_seen": 175112920, "step": 8148, "time_per_iteration": 2.662179708480835 }, { "auxiliary_loss_clip": 0.01091374, "auxiliary_loss_mlp": 0.01033444, "balance_loss_clip": 1.04345882, "balance_loss_mlp": 1.0213387, "epoch": 0.48994438599128215, "flos": 16617914841600.0, "grad_norm": 1.9007753197415815, "language_loss": 0.74256468, "learning_rate": 2.162033009418015e-06, "loss": 0.76381284, "num_input_tokens_seen": 175129910, "step": 8149, "time_per_iteration": 2.7373321056365967 }, { "auxiliary_loss_clip": 0.01130985, "auxiliary_loss_mlp": 0.01037014, "balance_loss_clip": 1.04766726, "balance_loss_mlp": 1.02247095, "epoch": 0.4900045092439501, "flos": 26614834600320.0, "grad_norm": 1.7000980888808985, "language_loss": 0.76319683, "learning_rate": 2.1616448249208567e-06, "loss": 0.78487676, "num_input_tokens_seen": 175148705, "step": 8150, "time_per_iteration": 2.653003692626953 }, { "auxiliary_loss_clip": 0.01103787, "auxiliary_loss_mlp": 0.01035673, "balance_loss_clip": 1.04736936, "balance_loss_mlp": 1.02152276, "epoch": 0.4900646324966181, "flos": 19902125116800.0, "grad_norm": 2.127966402053614, "language_loss": 0.72754669, "learning_rate": 2.1612566342941106e-06, "loss": 0.7489413, "num_input_tokens_seen": 175167425, "step": 8151, "time_per_iteration": 2.7142715454101562 }, { "auxiliary_loss_clip": 0.01018676, "auxiliary_loss_mlp": 0.01008139, "balance_loss_clip": 1.02870607, "balance_loss_mlp": 1.00680435, "epoch": 0.49012475574928605, "flos": 59189620337280.0, "grad_norm": 0.8300028938034224, "language_loss": 0.54350889, "learning_rate": 2.1608684375524977e-06, "loss": 0.56377703, "num_input_tokens_seen": 175227985, "step": 8152, "time_per_iteration": 3.218646764755249 }, { "auxiliary_loss_clip": 0.01066533, "auxiliary_loss_mlp": 0.01034489, "balance_loss_clip": 1.04041779, "balance_loss_mlp": 1.02058959, "epoch": 0.490184879001954, "flos": 45259797657600.0, "grad_norm": 1.9767488244056508, "language_loss": 0.61212152, "learning_rate": 2.1604802347107364e-06, "loss": 0.6331318, "num_input_tokens_seen": 175251895, "step": 8153, "time_per_iteration": 3.043501615524292 }, { "auxiliary_loss_clip": 0.01091315, "auxiliary_loss_mlp": 0.01034977, "balance_loss_clip": 1.04408598, "balance_loss_mlp": 1.02139306, "epoch": 0.490245002254622, "flos": 28002135634560.0, "grad_norm": 1.494326859026801, "language_loss": 0.767699, "learning_rate": 2.160092025783549e-06, "loss": 0.78896195, "num_input_tokens_seen": 175272770, "step": 8154, "time_per_iteration": 2.783686399459839 }, { "auxiliary_loss_clip": 0.01032948, "auxiliary_loss_mlp": 0.01009488, "balance_loss_clip": 1.02573824, "balance_loss_mlp": 1.00805795, "epoch": 0.49030512550728994, "flos": 58951318533120.0, "grad_norm": 0.9569310885457037, "language_loss": 0.6699397, "learning_rate": 2.1597038107856564e-06, "loss": 0.69036406, "num_input_tokens_seen": 175336320, "step": 8155, "time_per_iteration": 3.2836861610412598 }, { "auxiliary_loss_clip": 0.01128627, "auxiliary_loss_mlp": 0.01033153, "balance_loss_clip": 1.04858041, "balance_loss_mlp": 1.01990271, "epoch": 0.4903652487599579, "flos": 19791843384960.0, "grad_norm": 1.7952288566158678, "language_loss": 0.76406527, "learning_rate": 2.1593155897317784e-06, "loss": 0.78568316, "num_input_tokens_seen": 175353540, "step": 8156, "time_per_iteration": 2.77978515625 }, { "auxiliary_loss_clip": 0.01115952, "auxiliary_loss_mlp": 0.01033945, "balance_loss_clip": 1.04693031, "balance_loss_mlp": 1.02066517, "epoch": 0.49042537201262587, "flos": 21762082241280.0, "grad_norm": 2.671892010748055, "language_loss": 0.83756495, "learning_rate": 2.1589273626366377e-06, "loss": 0.85906386, "num_input_tokens_seen": 175370445, "step": 8157, "time_per_iteration": 2.6860296726226807 }, { "auxiliary_loss_clip": 0.01116981, "auxiliary_loss_mlp": 0.0103483, "balance_loss_clip": 1.04626417, "balance_loss_mlp": 1.02103734, "epoch": 0.49048549526529384, "flos": 18953042008320.0, "grad_norm": 1.6916175452091182, "language_loss": 0.79447746, "learning_rate": 2.158539129514956e-06, "loss": 0.81599557, "num_input_tokens_seen": 175389020, "step": 8158, "time_per_iteration": 2.723398208618164 }, { "auxiliary_loss_clip": 0.01130092, "auxiliary_loss_mlp": 0.01036013, "balance_loss_clip": 1.0493114, "balance_loss_mlp": 1.02237535, "epoch": 0.4905456185179618, "flos": 26906393295360.0, "grad_norm": 1.5924994780725177, "language_loss": 0.69469124, "learning_rate": 2.158150890381454e-06, "loss": 0.71635228, "num_input_tokens_seen": 175409545, "step": 8159, "time_per_iteration": 2.685887575149536 }, { "auxiliary_loss_clip": 0.01109209, "auxiliary_loss_mlp": 0.01041597, "balance_loss_clip": 1.04416955, "balance_loss_mlp": 1.02719688, "epoch": 0.49060574177062977, "flos": 20412343854720.0, "grad_norm": 1.8488353997421354, "language_loss": 0.73372805, "learning_rate": 2.157762645250854e-06, "loss": 0.75523615, "num_input_tokens_seen": 175429335, "step": 8160, "time_per_iteration": 2.7002642154693604 }, { "auxiliary_loss_clip": 0.01111433, "auxiliary_loss_mlp": 0.01040851, "balance_loss_clip": 1.04374194, "balance_loss_mlp": 1.02655184, "epoch": 0.4906658650232978, "flos": 17493704248320.0, "grad_norm": 4.058452856445761, "language_loss": 0.71791285, "learning_rate": 2.1573743941378796e-06, "loss": 0.73943567, "num_input_tokens_seen": 175446955, "step": 8161, "time_per_iteration": 2.641211748123169 }, { "auxiliary_loss_clip": 0.01077408, "auxiliary_loss_mlp": 0.01036857, "balance_loss_clip": 1.04114866, "balance_loss_mlp": 1.02337408, "epoch": 0.49072598827596575, "flos": 26614439550720.0, "grad_norm": 1.5881872934975843, "language_loss": 0.68676394, "learning_rate": 2.1569861370572517e-06, "loss": 0.7079066, "num_input_tokens_seen": 175468195, "step": 8162, "time_per_iteration": 2.7768666744232178 }, { "auxiliary_loss_clip": 0.01114289, "auxiliary_loss_mlp": 0.01037181, "balance_loss_clip": 1.04699993, "balance_loss_mlp": 1.02219641, "epoch": 0.4907861115286337, "flos": 20412595249920.0, "grad_norm": 1.6090900616469643, "language_loss": 0.63697332, "learning_rate": 2.1565978740236944e-06, "loss": 0.65848798, "num_input_tokens_seen": 175487455, "step": 8163, "time_per_iteration": 2.658141851425171 }, { "auxiliary_loss_clip": 0.01086004, "auxiliary_loss_mlp": 0.01032891, "balance_loss_clip": 1.03996313, "balance_loss_mlp": 1.01987886, "epoch": 0.4908462347813017, "flos": 14064271286400.0, "grad_norm": 2.5242130171230954, "language_loss": 0.77383208, "learning_rate": 2.1562096050519293e-06, "loss": 0.79502106, "num_input_tokens_seen": 175504450, "step": 8164, "time_per_iteration": 2.6626484394073486 }, { "auxiliary_loss_clip": 0.01110027, "auxiliary_loss_mlp": 0.01037706, "balance_loss_clip": 1.04298282, "balance_loss_mlp": 1.0221138, "epoch": 0.49090635803396965, "flos": 18735100237440.0, "grad_norm": 1.6753117148295888, "language_loss": 0.76749474, "learning_rate": 2.1558213301566806e-06, "loss": 0.78897208, "num_input_tokens_seen": 175523600, "step": 8165, "time_per_iteration": 2.5757079124450684 }, { "auxiliary_loss_clip": 0.0110394, "auxiliary_loss_mlp": 0.01035745, "balance_loss_clip": 1.04666007, "balance_loss_mlp": 1.02205336, "epoch": 0.4909664812866376, "flos": 20558500295040.0, "grad_norm": 1.5531816235742995, "language_loss": 0.77461708, "learning_rate": 2.1554330493526716e-06, "loss": 0.79601395, "num_input_tokens_seen": 175542720, "step": 8166, "time_per_iteration": 2.7169244289398193 }, { "auxiliary_loss_clip": 0.01040608, "auxiliary_loss_mlp": 0.00998968, "balance_loss_clip": 1.02393854, "balance_loss_mlp": 0.99768084, "epoch": 0.4910266045393056, "flos": 54684017948160.0, "grad_norm": 0.7914566078875801, "language_loss": 0.54175258, "learning_rate": 2.1550447626546253e-06, "loss": 0.56214833, "num_input_tokens_seen": 175598640, "step": 8167, "time_per_iteration": 3.192706823348999 }, { "auxiliary_loss_clip": 0.01081549, "auxiliary_loss_mlp": 0.01036447, "balance_loss_clip": 1.04554164, "balance_loss_mlp": 1.02288687, "epoch": 0.49108672779197354, "flos": 16246454342400.0, "grad_norm": 1.702915470367474, "language_loss": 0.85894108, "learning_rate": 2.1546564700772665e-06, "loss": 0.88012105, "num_input_tokens_seen": 175615675, "step": 8168, "time_per_iteration": 2.7353274822235107 }, { "auxiliary_loss_clip": 0.01107152, "auxiliary_loss_mlp": 0.01045094, "balance_loss_clip": 1.04374826, "balance_loss_mlp": 1.030586, "epoch": 0.4911468510446415, "flos": 19825419623040.0, "grad_norm": 1.7298624053450853, "language_loss": 0.73407066, "learning_rate": 2.1542681716353193e-06, "loss": 0.75559318, "num_input_tokens_seen": 175632255, "step": 8169, "time_per_iteration": 5.773583173751831 }, { "auxiliary_loss_clip": 0.01112799, "auxiliary_loss_mlp": 0.01029653, "balance_loss_clip": 1.04443777, "balance_loss_mlp": 1.01692092, "epoch": 0.4912069742973095, "flos": 21212684743680.0, "grad_norm": 1.4410309608870682, "language_loss": 0.77824241, "learning_rate": 2.1538798673435068e-06, "loss": 0.79966694, "num_input_tokens_seen": 175651625, "step": 8170, "time_per_iteration": 2.6583240032196045 }, { "auxiliary_loss_clip": 0.01096689, "auxiliary_loss_mlp": 0.010389, "balance_loss_clip": 1.04164565, "balance_loss_mlp": 1.02643037, "epoch": 0.49126709754997744, "flos": 19537129065600.0, "grad_norm": 2.2423824181328853, "language_loss": 0.76314211, "learning_rate": 2.1534915572165545e-06, "loss": 0.78449798, "num_input_tokens_seen": 175669265, "step": 8171, "time_per_iteration": 4.3524169921875 }, { "auxiliary_loss_clip": 0.01104096, "auxiliary_loss_mlp": 0.01036347, "balance_loss_clip": 1.04284763, "balance_loss_mlp": 1.02299559, "epoch": 0.4913272208026454, "flos": 12239686080000.0, "grad_norm": 1.898078833449508, "language_loss": 0.82055932, "learning_rate": 2.1531032412691875e-06, "loss": 0.84196377, "num_input_tokens_seen": 175686065, "step": 8172, "time_per_iteration": 4.201699495315552 }, { "auxiliary_loss_clip": 0.0104227, "auxiliary_loss_mlp": 0.01009809, "balance_loss_clip": 1.02604604, "balance_loss_mlp": 1.00842655, "epoch": 0.49138734405531337, "flos": 65465871661440.0, "grad_norm": 0.6872688544677212, "language_loss": 0.53258997, "learning_rate": 2.1527149195161295e-06, "loss": 0.55311078, "num_input_tokens_seen": 175748595, "step": 8173, "time_per_iteration": 3.1827917098999023 }, { "auxiliary_loss_clip": 0.0111451, "auxiliary_loss_mlp": 0.00771219, "balance_loss_clip": 1.04312336, "balance_loss_mlp": 1.00013208, "epoch": 0.4914474673079814, "flos": 18439052342400.0, "grad_norm": 2.1937948702767054, "language_loss": 0.63081181, "learning_rate": 2.152326591972107e-06, "loss": 0.64966911, "num_input_tokens_seen": 175766770, "step": 8174, "time_per_iteration": 2.591662883758545 }, { "auxiliary_loss_clip": 0.01086287, "auxiliary_loss_mlp": 0.01044728, "balance_loss_clip": 1.04296112, "balance_loss_mlp": 1.02985096, "epoch": 0.49150759056064935, "flos": 21685053525120.0, "grad_norm": 1.9252900771693722, "language_loss": 0.69252932, "learning_rate": 2.1519382586518445e-06, "loss": 0.71383941, "num_input_tokens_seen": 175783605, "step": 8175, "time_per_iteration": 2.7286670207977295 }, { "auxiliary_loss_clip": 0.01112428, "auxiliary_loss_mlp": 0.01032945, "balance_loss_clip": 1.0438236, "balance_loss_mlp": 1.02018952, "epoch": 0.4915677138133173, "flos": 22382439056640.0, "grad_norm": 1.7316891792167346, "language_loss": 0.74424642, "learning_rate": 2.151549919570068e-06, "loss": 0.76570022, "num_input_tokens_seen": 175801390, "step": 8176, "time_per_iteration": 2.623328685760498 }, { "auxiliary_loss_clip": 0.01117272, "auxiliary_loss_mlp": 0.0104375, "balance_loss_clip": 1.04691124, "balance_loss_mlp": 1.03022528, "epoch": 0.4916278370659853, "flos": 18402890325120.0, "grad_norm": 1.776030453931397, "language_loss": 0.70309961, "learning_rate": 2.1511615747415036e-06, "loss": 0.72470981, "num_input_tokens_seen": 175819830, "step": 8177, "time_per_iteration": 2.642073154449463 }, { "auxiliary_loss_clip": 0.01031811, "auxiliary_loss_mlp": 0.00752155, "balance_loss_clip": 1.02581143, "balance_loss_mlp": 0.99997473, "epoch": 0.49168796031865325, "flos": 66609124715520.0, "grad_norm": 0.6890109431226723, "language_loss": 0.46192822, "learning_rate": 2.150773224180877e-06, "loss": 0.47976786, "num_input_tokens_seen": 175881765, "step": 8178, "time_per_iteration": 3.195594072341919 }, { "auxiliary_loss_clip": 0.0112992, "auxiliary_loss_mlp": 0.01036689, "balance_loss_clip": 1.04735565, "balance_loss_mlp": 1.02215147, "epoch": 0.4917480835713212, "flos": 20959335141120.0, "grad_norm": 1.748461689040465, "language_loss": 0.65961659, "learning_rate": 2.1503848679029147e-06, "loss": 0.6812827, "num_input_tokens_seen": 175901795, "step": 8179, "time_per_iteration": 2.675170421600342 }, { "auxiliary_loss_clip": 0.01036062, "auxiliary_loss_mlp": 0.01047888, "balance_loss_clip": 1.03444839, "balance_loss_mlp": 1.031497, "epoch": 0.4918082068239892, "flos": 15772900412160.0, "grad_norm": 2.3413868243180493, "language_loss": 0.70163, "learning_rate": 2.149996505922343e-06, "loss": 0.72246957, "num_input_tokens_seen": 175917770, "step": 8180, "time_per_iteration": 2.9436681270599365 }, { "auxiliary_loss_clip": 0.01099418, "auxiliary_loss_mlp": 0.01037201, "balance_loss_clip": 1.04268646, "balance_loss_mlp": 1.02306247, "epoch": 0.49186833007665715, "flos": 24604806453120.0, "grad_norm": 1.915055420772654, "language_loss": 0.84369922, "learning_rate": 2.1496081382538895e-06, "loss": 0.86506534, "num_input_tokens_seen": 175937000, "step": 8181, "time_per_iteration": 2.8556039333343506 }, { "auxiliary_loss_clip": 0.01125975, "auxiliary_loss_mlp": 0.010356, "balance_loss_clip": 1.04886341, "balance_loss_mlp": 1.0226841, "epoch": 0.4919284533293251, "flos": 22090557139200.0, "grad_norm": 2.841846979456106, "language_loss": 0.72482812, "learning_rate": 2.1492197649122793e-06, "loss": 0.74644387, "num_input_tokens_seen": 175955170, "step": 8182, "time_per_iteration": 2.5908985137939453 }, { "auxiliary_loss_clip": 0.01088743, "auxiliary_loss_mlp": 0.01035989, "balance_loss_clip": 1.04323542, "balance_loss_mlp": 1.0227685, "epoch": 0.4919885765819931, "flos": 23368043318400.0, "grad_norm": 2.038591418033226, "language_loss": 0.72608387, "learning_rate": 2.1488313859122412e-06, "loss": 0.74733126, "num_input_tokens_seen": 175973725, "step": 8183, "time_per_iteration": 2.7704007625579834 }, { "auxiliary_loss_clip": 0.0106529, "auxiliary_loss_mlp": 0.01035357, "balance_loss_clip": 1.03853834, "balance_loss_mlp": 1.0204556, "epoch": 0.49204869983466104, "flos": 21360493209600.0, "grad_norm": 3.5725391309360406, "language_loss": 0.77354276, "learning_rate": 2.1484430012685015e-06, "loss": 0.79454923, "num_input_tokens_seen": 175993885, "step": 8184, "time_per_iteration": 2.8195126056671143 }, { "auxiliary_loss_clip": 0.01094147, "auxiliary_loss_mlp": 0.01040773, "balance_loss_clip": 1.04233742, "balance_loss_mlp": 1.02739143, "epoch": 0.492108823087329, "flos": 21142695093120.0, "grad_norm": 1.8939343643350832, "language_loss": 0.70917577, "learning_rate": 2.148054610995789e-06, "loss": 0.73052496, "num_input_tokens_seen": 176014210, "step": 8185, "time_per_iteration": 2.678464412689209 }, { "auxiliary_loss_clip": 0.01108334, "auxiliary_loss_mlp": 0.01037918, "balance_loss_clip": 1.0468477, "balance_loss_mlp": 1.02306461, "epoch": 0.49216894633999697, "flos": 25116605389440.0, "grad_norm": 1.7900274786799464, "language_loss": 0.75134045, "learning_rate": 2.147666215108831e-06, "loss": 0.77280295, "num_input_tokens_seen": 176033890, "step": 8186, "time_per_iteration": 2.754204273223877 }, { "auxiliary_loss_clip": 0.01116557, "auxiliary_loss_mlp": 0.01034757, "balance_loss_clip": 1.04770708, "balance_loss_mlp": 1.02050531, "epoch": 0.49222906959266494, "flos": 22637943475200.0, "grad_norm": 2.9803647414716945, "language_loss": 0.67526996, "learning_rate": 2.1472778136223545e-06, "loss": 0.69678307, "num_input_tokens_seen": 176052720, "step": 8187, "time_per_iteration": 2.6845459938049316 }, { "auxiliary_loss_clip": 0.0108036, "auxiliary_loss_mlp": 0.01036841, "balance_loss_clip": 1.04077077, "balance_loss_mlp": 1.02301288, "epoch": 0.49228919284533296, "flos": 20410548174720.0, "grad_norm": 1.410632675975, "language_loss": 0.67109811, "learning_rate": 2.1468894065510894e-06, "loss": 0.6922701, "num_input_tokens_seen": 176072545, "step": 8188, "time_per_iteration": 2.8322603702545166 }, { "auxiliary_loss_clip": 0.01119978, "auxiliary_loss_mlp": 0.01034509, "balance_loss_clip": 1.04967701, "balance_loss_mlp": 1.02131248, "epoch": 0.4923493160980009, "flos": 27122359818240.0, "grad_norm": 1.8145698664310643, "language_loss": 0.74643195, "learning_rate": 2.1465009939097623e-06, "loss": 0.76797676, "num_input_tokens_seen": 176091490, "step": 8189, "time_per_iteration": 2.700728178024292 }, { "auxiliary_loss_clip": 0.01102804, "auxiliary_loss_mlp": 0.01027439, "balance_loss_clip": 1.04349804, "balance_loss_mlp": 1.0138967, "epoch": 0.4924094393506689, "flos": 35736683224320.0, "grad_norm": 1.5012400452063497, "language_loss": 0.63989937, "learning_rate": 2.146112575713104e-06, "loss": 0.66120183, "num_input_tokens_seen": 176113200, "step": 8190, "time_per_iteration": 2.781034231185913 }, { "auxiliary_loss_clip": 0.01127618, "auxiliary_loss_mlp": 0.0103068, "balance_loss_clip": 1.04802811, "balance_loss_mlp": 1.01666641, "epoch": 0.49246956260333685, "flos": 20412487509120.0, "grad_norm": 2.59854956867769, "language_loss": 0.71723747, "learning_rate": 2.1457241519758413e-06, "loss": 0.73882031, "num_input_tokens_seen": 176132485, "step": 8191, "time_per_iteration": 2.6378936767578125 }, { "auxiliary_loss_clip": 0.01125365, "auxiliary_loss_mlp": 0.00771087, "balance_loss_clip": 1.04543817, "balance_loss_mlp": 1.00005293, "epoch": 0.4925296858560048, "flos": 38976938231040.0, "grad_norm": 1.5444009886503365, "language_loss": 0.71964842, "learning_rate": 2.1453357227127043e-06, "loss": 0.73861289, "num_input_tokens_seen": 176155755, "step": 8192, "time_per_iteration": 2.748840570449829 }, { "auxiliary_loss_clip": 0.01029185, "auxiliary_loss_mlp": 0.01001084, "balance_loss_clip": 1.02257538, "balance_loss_mlp": 0.9996711, "epoch": 0.4925898091086728, "flos": 64278917712000.0, "grad_norm": 0.718294486843201, "language_loss": 0.52137887, "learning_rate": 2.1449472879384224e-06, "loss": 0.54168153, "num_input_tokens_seen": 176216295, "step": 8193, "time_per_iteration": 3.264312267303467 }, { "auxiliary_loss_clip": 0.01125829, "auxiliary_loss_mlp": 0.01041308, "balance_loss_clip": 1.04740691, "balance_loss_mlp": 1.02760482, "epoch": 0.49264993236134075, "flos": 23036372110080.0, "grad_norm": 1.4111181716707888, "language_loss": 0.76839447, "learning_rate": 2.1445588476677246e-06, "loss": 0.79006582, "num_input_tokens_seen": 176235925, "step": 8194, "time_per_iteration": 2.7086539268493652 }, { "auxiliary_loss_clip": 0.01098073, "auxiliary_loss_mlp": 0.0103376, "balance_loss_clip": 1.04026222, "balance_loss_mlp": 1.02031338, "epoch": 0.4927100556140087, "flos": 24718212668160.0, "grad_norm": 1.9420104205554047, "language_loss": 0.70233512, "learning_rate": 2.144170401915341e-06, "loss": 0.72365344, "num_input_tokens_seen": 176253865, "step": 8195, "time_per_iteration": 2.6881814002990723 }, { "auxiliary_loss_clip": 0.01087059, "auxiliary_loss_mlp": 0.01033387, "balance_loss_clip": 1.04724264, "balance_loss_mlp": 1.02013052, "epoch": 0.4927701788666767, "flos": 23505544581120.0, "grad_norm": 2.097647655801467, "language_loss": 0.81090224, "learning_rate": 2.143781950696001e-06, "loss": 0.83210671, "num_input_tokens_seen": 176271525, "step": 8196, "time_per_iteration": 2.7997779846191406 }, { "auxiliary_loss_clip": 0.01092387, "auxiliary_loss_mlp": 0.01036049, "balance_loss_clip": 1.04048955, "balance_loss_mlp": 1.0212965, "epoch": 0.49283030211934464, "flos": 22928891639040.0, "grad_norm": 1.9754651417860998, "language_loss": 0.70963365, "learning_rate": 2.1433934940244356e-06, "loss": 0.73091799, "num_input_tokens_seen": 176290810, "step": 8197, "time_per_iteration": 2.687640428543091 }, { "auxiliary_loss_clip": 0.01113685, "auxiliary_loss_mlp": 0.01037816, "balance_loss_clip": 1.04734302, "balance_loss_mlp": 1.0245595, "epoch": 0.4928904253720126, "flos": 16873024210560.0, "grad_norm": 2.0854468186505133, "language_loss": 0.84519106, "learning_rate": 2.143005031915374e-06, "loss": 0.86670601, "num_input_tokens_seen": 176309165, "step": 8198, "time_per_iteration": 2.660125255584717 }, { "auxiliary_loss_clip": 0.01120431, "auxiliary_loss_mlp": 0.01037402, "balance_loss_clip": 1.04784405, "balance_loss_mlp": 1.02326965, "epoch": 0.4929505486246806, "flos": 14866551509760.0, "grad_norm": 1.8081780640264744, "language_loss": 0.76137328, "learning_rate": 2.1426165643835467e-06, "loss": 0.78295165, "num_input_tokens_seen": 176324960, "step": 8199, "time_per_iteration": 2.6528286933898926 }, { "auxiliary_loss_clip": 0.0110111, "auxiliary_loss_mlp": 0.0103715, "balance_loss_clip": 1.0420711, "balance_loss_mlp": 1.02215934, "epoch": 0.49301067187734854, "flos": 23842351434240.0, "grad_norm": 1.5743655623972015, "language_loss": 0.60060918, "learning_rate": 2.1422280914436864e-06, "loss": 0.62199175, "num_input_tokens_seen": 176346195, "step": 8200, "time_per_iteration": 2.725208044052124 }, { "auxiliary_loss_clip": 0.01112367, "auxiliary_loss_mlp": 0.01042559, "balance_loss_clip": 1.04529691, "balance_loss_mlp": 1.0288918, "epoch": 0.49307079513001656, "flos": 22491284244480.0, "grad_norm": 1.489817328340962, "language_loss": 0.79219347, "learning_rate": 2.1418396131105213e-06, "loss": 0.81374276, "num_input_tokens_seen": 176366735, "step": 8201, "time_per_iteration": 2.6749329566955566 }, { "auxiliary_loss_clip": 0.0112059, "auxiliary_loss_mlp": 0.010363, "balance_loss_clip": 1.04529119, "balance_loss_mlp": 1.02063608, "epoch": 0.4931309183826845, "flos": 15924587546880.0, "grad_norm": 2.8764138588073527, "language_loss": 0.67214566, "learning_rate": 2.141451129398785e-06, "loss": 0.69371456, "num_input_tokens_seen": 176384475, "step": 8202, "time_per_iteration": 2.6964852809906006 }, { "auxiliary_loss_clip": 0.01101254, "auxiliary_loss_mlp": 0.01032575, "balance_loss_clip": 1.04416037, "balance_loss_mlp": 1.01929486, "epoch": 0.4931910416353525, "flos": 27309059735040.0, "grad_norm": 2.180124290012348, "language_loss": 0.75387114, "learning_rate": 2.1410626403232076e-06, "loss": 0.77520943, "num_input_tokens_seen": 176402645, "step": 8203, "time_per_iteration": 2.725586175918579 }, { "auxiliary_loss_clip": 0.01070891, "auxiliary_loss_mlp": 0.01037718, "balance_loss_clip": 1.04055309, "balance_loss_mlp": 1.02355599, "epoch": 0.49325116488802045, "flos": 20806139635200.0, "grad_norm": 2.514240753505036, "language_loss": 0.8037259, "learning_rate": 2.1406741458985197e-06, "loss": 0.82481205, "num_input_tokens_seen": 176416715, "step": 8204, "time_per_iteration": 2.6802115440368652 }, { "auxiliary_loss_clip": 0.01112932, "auxiliary_loss_mlp": 0.01040079, "balance_loss_clip": 1.04543495, "balance_loss_mlp": 1.02662015, "epoch": 0.4933112881406884, "flos": 19865963099520.0, "grad_norm": 1.919360097124168, "language_loss": 0.65891969, "learning_rate": 2.140285646139455e-06, "loss": 0.68044984, "num_input_tokens_seen": 176435755, "step": 8205, "time_per_iteration": 2.6556243896484375 }, { "auxiliary_loss_clip": 0.01131728, "auxiliary_loss_mlp": 0.01037034, "balance_loss_clip": 1.04643822, "balance_loss_mlp": 1.02157259, "epoch": 0.4933714113933564, "flos": 21827977741440.0, "grad_norm": 2.0939603582763207, "language_loss": 0.66682738, "learning_rate": 2.139897141060744e-06, "loss": 0.68851495, "num_input_tokens_seen": 176453915, "step": 8206, "time_per_iteration": 2.6004998683929443 }, { "auxiliary_loss_clip": 0.01078434, "auxiliary_loss_mlp": 0.01042651, "balance_loss_clip": 1.04006064, "balance_loss_mlp": 1.02803612, "epoch": 0.49343153464602435, "flos": 27890130049920.0, "grad_norm": 1.7303473596412533, "language_loss": 0.76393557, "learning_rate": 2.1395086306771196e-06, "loss": 0.78514642, "num_input_tokens_seen": 176475175, "step": 8207, "time_per_iteration": 2.7545268535614014 }, { "auxiliary_loss_clip": 0.01104435, "auxiliary_loss_mlp": 0.01037384, "balance_loss_clip": 1.04703426, "balance_loss_mlp": 1.02245331, "epoch": 0.4934916578986923, "flos": 24681080983680.0, "grad_norm": 2.36511926609042, "language_loss": 0.60212123, "learning_rate": 2.1391201150033147e-06, "loss": 0.62353945, "num_input_tokens_seen": 176494250, "step": 8208, "time_per_iteration": 4.556094408035278 }, { "auxiliary_loss_clip": 0.01108642, "auxiliary_loss_mlp": 0.0103495, "balance_loss_clip": 1.04619265, "balance_loss_mlp": 1.01990545, "epoch": 0.4935517811513603, "flos": 23405139089280.0, "grad_norm": 1.7507431161374047, "language_loss": 0.78938925, "learning_rate": 2.1387315940540598e-06, "loss": 0.81082511, "num_input_tokens_seen": 176513325, "step": 8209, "time_per_iteration": 4.171698093414307 }, { "auxiliary_loss_clip": 0.01094204, "auxiliary_loss_mlp": 0.00774879, "balance_loss_clip": 1.03905034, "balance_loss_mlp": 1.00007224, "epoch": 0.49361190440402825, "flos": 21944508439680.0, "grad_norm": 2.001694580419455, "language_loss": 0.79098332, "learning_rate": 2.138343067844089e-06, "loss": 0.80967414, "num_input_tokens_seen": 176532915, "step": 8210, "time_per_iteration": 4.38470196723938 }, { "auxiliary_loss_clip": 0.01113566, "auxiliary_loss_mlp": 0.01039269, "balance_loss_clip": 1.04458427, "balance_loss_mlp": 1.02467823, "epoch": 0.4936720276566962, "flos": 25115671635840.0, "grad_norm": 1.6707024820379262, "language_loss": 0.81313854, "learning_rate": 2.1379545363881363e-06, "loss": 0.83466691, "num_input_tokens_seen": 176552775, "step": 8211, "time_per_iteration": 4.290592193603516 }, { "auxiliary_loss_clip": 0.01082515, "auxiliary_loss_mlp": 0.01050398, "balance_loss_clip": 1.04066169, "balance_loss_mlp": 1.03376865, "epoch": 0.4937321509093642, "flos": 26358935132160.0, "grad_norm": 2.2904212815365477, "language_loss": 0.9144789, "learning_rate": 2.137565999700933e-06, "loss": 0.93580806, "num_input_tokens_seen": 176572185, "step": 8212, "time_per_iteration": 2.77516508102417 }, { "auxiliary_loss_clip": 0.010785, "auxiliary_loss_mlp": 0.01041938, "balance_loss_clip": 1.03849816, "balance_loss_mlp": 1.02666783, "epoch": 0.49379227416203214, "flos": 22961390469120.0, "grad_norm": 2.314209741920176, "language_loss": 0.65430582, "learning_rate": 2.1371774577972138e-06, "loss": 0.67551017, "num_input_tokens_seen": 176591490, "step": 8213, "time_per_iteration": 2.844672203063965 }, { "auxiliary_loss_clip": 0.01074353, "auxiliary_loss_mlp": 0.00772712, "balance_loss_clip": 1.03954375, "balance_loss_mlp": 1.00013876, "epoch": 0.49385239741470016, "flos": 32489101843200.0, "grad_norm": 1.8844803433311228, "language_loss": 0.7592994, "learning_rate": 2.136788910691711e-06, "loss": 0.77777004, "num_input_tokens_seen": 176612715, "step": 8214, "time_per_iteration": 2.828538179397583 }, { "auxiliary_loss_clip": 0.01131168, "auxiliary_loss_mlp": 0.01038594, "balance_loss_clip": 1.0492506, "balance_loss_mlp": 1.02410388, "epoch": 0.4939125206673681, "flos": 22492864442880.0, "grad_norm": 2.152096163807918, "language_loss": 0.84490359, "learning_rate": 2.1364003583991594e-06, "loss": 0.86660123, "num_input_tokens_seen": 176631950, "step": 8215, "time_per_iteration": 2.6413228511810303 }, { "auxiliary_loss_clip": 0.01108159, "auxiliary_loss_mlp": 0.01033701, "balance_loss_clip": 1.04206347, "balance_loss_mlp": 1.02092147, "epoch": 0.4939726439200361, "flos": 31176351486720.0, "grad_norm": 1.5888417840027016, "language_loss": 0.83245987, "learning_rate": 2.136011800934292e-06, "loss": 0.8538785, "num_input_tokens_seen": 176653060, "step": 8216, "time_per_iteration": 2.67913818359375 }, { "auxiliary_loss_clip": 0.01097989, "auxiliary_loss_mlp": 0.01034559, "balance_loss_clip": 1.04419255, "balance_loss_mlp": 1.02112412, "epoch": 0.49403276717270406, "flos": 22674213233280.0, "grad_norm": 2.8019860461659087, "language_loss": 0.74546432, "learning_rate": 2.1356232383118442e-06, "loss": 0.76678985, "num_input_tokens_seen": 176673895, "step": 8217, "time_per_iteration": 2.686866283416748 }, { "auxiliary_loss_clip": 0.0112431, "auxiliary_loss_mlp": 0.00771315, "balance_loss_clip": 1.04717755, "balance_loss_mlp": 1.00011575, "epoch": 0.494092890425372, "flos": 20741070147840.0, "grad_norm": 1.5679905275329922, "language_loss": 0.78933907, "learning_rate": 2.1352346705465494e-06, "loss": 0.80829537, "num_input_tokens_seen": 176692550, "step": 8218, "time_per_iteration": 2.6126081943511963 }, { "auxiliary_loss_clip": 0.01073156, "auxiliary_loss_mlp": 0.00770777, "balance_loss_clip": 1.03962803, "balance_loss_mlp": 1.000103, "epoch": 0.49415301367804, "flos": 18369026778240.0, "grad_norm": 2.059466953332075, "language_loss": 0.77003837, "learning_rate": 2.134846097653142e-06, "loss": 0.78847766, "num_input_tokens_seen": 176709335, "step": 8219, "time_per_iteration": 2.705432176589966 }, { "auxiliary_loss_clip": 0.01103123, "auxiliary_loss_mlp": 0.01034129, "balance_loss_clip": 1.04458046, "balance_loss_mlp": 1.02009845, "epoch": 0.49421313693070795, "flos": 17530620451200.0, "grad_norm": 1.9177646932354293, "language_loss": 0.62838733, "learning_rate": 2.134457519646357e-06, "loss": 0.64975989, "num_input_tokens_seen": 176727715, "step": 8220, "time_per_iteration": 2.615745782852173 }, { "auxiliary_loss_clip": 0.01124834, "auxiliary_loss_mlp": 0.01032605, "balance_loss_clip": 1.04509032, "balance_loss_mlp": 1.01844347, "epoch": 0.4942732601833759, "flos": 20812173120000.0, "grad_norm": 1.9687050610151906, "language_loss": 0.72233951, "learning_rate": 2.1340689365409296e-06, "loss": 0.74391389, "num_input_tokens_seen": 176747530, "step": 8221, "time_per_iteration": 2.6178054809570312 }, { "auxiliary_loss_clip": 0.01085939, "auxiliary_loss_mlp": 0.01035447, "balance_loss_clip": 1.04544675, "balance_loss_mlp": 1.02218497, "epoch": 0.4943333834360439, "flos": 15048941794560.0, "grad_norm": 1.861092907129918, "language_loss": 0.792252, "learning_rate": 2.133680348351595e-06, "loss": 0.81346589, "num_input_tokens_seen": 176765260, "step": 8222, "time_per_iteration": 2.679504632949829 }, { "auxiliary_loss_clip": 0.01115599, "auxiliary_loss_mlp": 0.01036496, "balance_loss_clip": 1.04686999, "balance_loss_mlp": 1.022048, "epoch": 0.49439350668871185, "flos": 16070420764800.0, "grad_norm": 2.9899447612273784, "language_loss": 0.72679973, "learning_rate": 2.133291755093088e-06, "loss": 0.7483207, "num_input_tokens_seen": 176781770, "step": 8223, "time_per_iteration": 2.581552028656006 }, { "auxiliary_loss_clip": 0.01116938, "auxiliary_loss_mlp": 0.01040425, "balance_loss_clip": 1.04635167, "balance_loss_mlp": 1.0257324, "epoch": 0.4944536299413798, "flos": 20880079781760.0, "grad_norm": 2.0609443486265784, "language_loss": 0.75248039, "learning_rate": 2.132903156780144e-06, "loss": 0.77405405, "num_input_tokens_seen": 176800655, "step": 8224, "time_per_iteration": 2.6427581310272217 }, { "auxiliary_loss_clip": 0.0110423, "auxiliary_loss_mlp": 0.01033189, "balance_loss_clip": 1.04815972, "balance_loss_mlp": 1.01925385, "epoch": 0.4945137531940478, "flos": 26608908856320.0, "grad_norm": 2.070444808683487, "language_loss": 0.6428299, "learning_rate": 2.1325145534274997e-06, "loss": 0.66420412, "num_input_tokens_seen": 176820610, "step": 8225, "time_per_iteration": 2.685084104537964 }, { "auxiliary_loss_clip": 0.01105728, "auxiliary_loss_mlp": 0.01034446, "balance_loss_clip": 1.04689407, "balance_loss_mlp": 1.02097511, "epoch": 0.49457387644671574, "flos": 23988148738560.0, "grad_norm": 2.0654038990553834, "language_loss": 0.76539797, "learning_rate": 2.1321259450498893e-06, "loss": 0.78679967, "num_input_tokens_seen": 176840520, "step": 8226, "time_per_iteration": 2.776888132095337 }, { "auxiliary_loss_clip": 0.01130995, "auxiliary_loss_mlp": 0.01043657, "balance_loss_clip": 1.04843736, "balance_loss_mlp": 1.02849376, "epoch": 0.49463399969938376, "flos": 26976598427520.0, "grad_norm": 1.7138853183765776, "language_loss": 0.71274078, "learning_rate": 2.131737331662051e-06, "loss": 0.7344873, "num_input_tokens_seen": 176860265, "step": 8227, "time_per_iteration": 2.6920416355133057 }, { "auxiliary_loss_clip": 0.01109805, "auxiliary_loss_mlp": 0.01042947, "balance_loss_clip": 1.04749131, "balance_loss_mlp": 1.02879047, "epoch": 0.49469412295205173, "flos": 29681534067840.0, "grad_norm": 1.5610614491128025, "language_loss": 0.7156117, "learning_rate": 2.131348713278718e-06, "loss": 0.73713928, "num_input_tokens_seen": 176882910, "step": 8228, "time_per_iteration": 2.7586421966552734 }, { "auxiliary_loss_clip": 0.01126513, "auxiliary_loss_mlp": 0.01030651, "balance_loss_clip": 1.04834974, "balance_loss_mlp": 1.01664948, "epoch": 0.4947542462047197, "flos": 24131791226880.0, "grad_norm": 1.7062154527873281, "language_loss": 0.83690989, "learning_rate": 2.1309600899146304e-06, "loss": 0.85848153, "num_input_tokens_seen": 176903030, "step": 8229, "time_per_iteration": 2.643385887145996 }, { "auxiliary_loss_clip": 0.01117283, "auxiliary_loss_mlp": 0.01035461, "balance_loss_clip": 1.04470325, "balance_loss_mlp": 1.0201304, "epoch": 0.49481436945738766, "flos": 20045049333120.0, "grad_norm": 1.8291146066570236, "language_loss": 0.74686736, "learning_rate": 2.1305714615845227e-06, "loss": 0.76839477, "num_input_tokens_seen": 176919025, "step": 8230, "time_per_iteration": 2.6726033687591553 }, { "auxiliary_loss_clip": 0.01112312, "auxiliary_loss_mlp": 0.0103259, "balance_loss_clip": 1.04797947, "balance_loss_mlp": 1.01941717, "epoch": 0.4948744927100556, "flos": 15669550005120.0, "grad_norm": 1.946821067065893, "language_loss": 0.79830235, "learning_rate": 2.1301828283031314e-06, "loss": 0.81975138, "num_input_tokens_seen": 176937945, "step": 8231, "time_per_iteration": 2.627202272415161 }, { "auxiliary_loss_clip": 0.01038701, "auxiliary_loss_mlp": 0.01000467, "balance_loss_clip": 1.02304196, "balance_loss_mlp": 0.99924535, "epoch": 0.4949346159627236, "flos": 68872071502080.0, "grad_norm": 0.7441317598934056, "language_loss": 0.60252988, "learning_rate": 2.1297941900851944e-06, "loss": 0.62292159, "num_input_tokens_seen": 177004575, "step": 8232, "time_per_iteration": 3.299022912979126 }, { "auxiliary_loss_clip": 0.01103975, "auxiliary_loss_mlp": 0.01036844, "balance_loss_clip": 1.04270494, "balance_loss_mlp": 1.0220201, "epoch": 0.49499473921539155, "flos": 24790285307520.0, "grad_norm": 1.6243536723265515, "language_loss": 0.69376481, "learning_rate": 2.1294055469454496e-06, "loss": 0.71517295, "num_input_tokens_seen": 177024155, "step": 8233, "time_per_iteration": 2.7124898433685303 }, { "auxiliary_loss_clip": 0.01069129, "auxiliary_loss_mlp": 0.01041459, "balance_loss_clip": 1.03902805, "balance_loss_mlp": 1.02584291, "epoch": 0.4950548624680595, "flos": 32707905540480.0, "grad_norm": 1.998308286461765, "language_loss": 0.66344726, "learning_rate": 2.129016898898633e-06, "loss": 0.68455309, "num_input_tokens_seen": 177046185, "step": 8234, "time_per_iteration": 2.7932980060577393 }, { "auxiliary_loss_clip": 0.01031932, "auxiliary_loss_mlp": 0.01001723, "balance_loss_clip": 1.02630067, "balance_loss_mlp": 1.00048304, "epoch": 0.4951149857207275, "flos": 50082173066880.0, "grad_norm": 0.7974470380945157, "language_loss": 0.58048564, "learning_rate": 2.128628245959482e-06, "loss": 0.60082221, "num_input_tokens_seen": 177099025, "step": 8235, "time_per_iteration": 3.095088481903076 }, { "auxiliary_loss_clip": 0.01096356, "auxiliary_loss_mlp": 0.01043085, "balance_loss_clip": 1.0431416, "balance_loss_mlp": 1.02861345, "epoch": 0.49517510897339545, "flos": 22236785406720.0, "grad_norm": 1.5745194755893521, "language_loss": 0.77200663, "learning_rate": 2.1282395881427355e-06, "loss": 0.793401, "num_input_tokens_seen": 177118365, "step": 8236, "time_per_iteration": 2.7678022384643555 }, { "auxiliary_loss_clip": 0.01081616, "auxiliary_loss_mlp": 0.01037859, "balance_loss_clip": 1.0420413, "balance_loss_mlp": 1.02397156, "epoch": 0.4952352322260634, "flos": 25374120969600.0, "grad_norm": 1.6979000405196067, "language_loss": 0.73080051, "learning_rate": 2.1278509254631315e-06, "loss": 0.75199521, "num_input_tokens_seen": 177136415, "step": 8237, "time_per_iteration": 2.764728307723999 }, { "auxiliary_loss_clip": 0.01124754, "auxiliary_loss_mlp": 0.01035631, "balance_loss_clip": 1.04693317, "balance_loss_mlp": 1.02215445, "epoch": 0.4952953554787314, "flos": 24608721035520.0, "grad_norm": 1.914497446494958, "language_loss": 0.75439888, "learning_rate": 2.127462257935406e-06, "loss": 0.77600276, "num_input_tokens_seen": 177155690, "step": 8238, "time_per_iteration": 2.66549015045166 }, { "auxiliary_loss_clip": 0.01084433, "auxiliary_loss_mlp": 0.0104692, "balance_loss_clip": 1.04372036, "balance_loss_mlp": 1.03062415, "epoch": 0.49535547873139935, "flos": 17311278049920.0, "grad_norm": 2.2478036902932508, "language_loss": 0.73706102, "learning_rate": 2.1270735855743008e-06, "loss": 0.75837457, "num_input_tokens_seen": 177173350, "step": 8239, "time_per_iteration": 2.703118324279785 }, { "auxiliary_loss_clip": 0.0104307, "auxiliary_loss_mlp": 0.01038928, "balance_loss_clip": 1.04188919, "balance_loss_mlp": 1.0223105, "epoch": 0.4954156019840673, "flos": 20740315962240.0, "grad_norm": 2.5033228354450667, "language_loss": 0.7926327, "learning_rate": 2.126684908394552e-06, "loss": 0.8134526, "num_input_tokens_seen": 177191115, "step": 8240, "time_per_iteration": 2.9256656169891357 }, { "auxiliary_loss_clip": 0.01116686, "auxiliary_loss_mlp": 0.01040866, "balance_loss_clip": 1.04832554, "balance_loss_mlp": 1.0278666, "epoch": 0.49547572523673533, "flos": 12820684567680.0, "grad_norm": 2.1558656465787367, "language_loss": 0.8547368, "learning_rate": 2.126296226410898e-06, "loss": 0.87631238, "num_input_tokens_seen": 177206155, "step": 8241, "time_per_iteration": 2.9096901416778564 }, { "auxiliary_loss_clip": 0.01067537, "auxiliary_loss_mlp": 0.01039414, "balance_loss_clip": 1.04159331, "balance_loss_mlp": 1.02591348, "epoch": 0.4955358484894033, "flos": 15597046402560.0, "grad_norm": 1.820610909823573, "language_loss": 0.77092397, "learning_rate": 2.1259075396380794e-06, "loss": 0.7919935, "num_input_tokens_seen": 177224815, "step": 8242, "time_per_iteration": 2.6902410984039307 }, { "auxiliary_loss_clip": 0.01104403, "auxiliary_loss_mlp": 0.00771127, "balance_loss_clip": 1.04569447, "balance_loss_mlp": 1.00017774, "epoch": 0.49559597174207126, "flos": 26464368528000.0, "grad_norm": 1.9730293387874334, "language_loss": 0.67737073, "learning_rate": 2.125518848090833e-06, "loss": 0.69612604, "num_input_tokens_seen": 177244490, "step": 8243, "time_per_iteration": 2.6972243785858154 }, { "auxiliary_loss_clip": 0.01112124, "auxiliary_loss_mlp": 0.01034088, "balance_loss_clip": 1.04816341, "balance_loss_mlp": 1.02076697, "epoch": 0.4956560949947392, "flos": 23148234040320.0, "grad_norm": 2.2375947263106526, "language_loss": 0.67908239, "learning_rate": 2.125130151783901e-06, "loss": 0.70054448, "num_input_tokens_seen": 177264340, "step": 8244, "time_per_iteration": 2.762528419494629 }, { "auxiliary_loss_clip": 0.01097015, "auxiliary_loss_mlp": 0.01040284, "balance_loss_clip": 1.04337358, "balance_loss_mlp": 1.02460194, "epoch": 0.4957162182474072, "flos": 20773461237120.0, "grad_norm": 1.8772229473228363, "language_loss": 0.74776495, "learning_rate": 2.12474145073202e-06, "loss": 0.76913798, "num_input_tokens_seen": 177283055, "step": 8245, "time_per_iteration": 2.7792561054229736 }, { "auxiliary_loss_clip": 0.01115174, "auxiliary_loss_mlp": 0.01036156, "balance_loss_clip": 1.04705966, "balance_loss_mlp": 1.02214909, "epoch": 0.49577634150007516, "flos": 18734202397440.0, "grad_norm": 1.8990901453025917, "language_loss": 0.8153336, "learning_rate": 2.1243527449499306e-06, "loss": 0.83684695, "num_input_tokens_seen": 177301140, "step": 8246, "time_per_iteration": 2.5740935802459717 }, { "auxiliary_loss_clip": 0.01090358, "auxiliary_loss_mlp": 0.0104326, "balance_loss_clip": 1.04562306, "balance_loss_mlp": 1.02767944, "epoch": 0.4958364647527431, "flos": 25554176870400.0, "grad_norm": 1.8707658617569873, "language_loss": 0.83808625, "learning_rate": 2.1239640344523733e-06, "loss": 0.85942245, "num_input_tokens_seen": 177323095, "step": 8247, "time_per_iteration": 4.410465955734253 }, { "auxiliary_loss_clip": 0.01102086, "auxiliary_loss_mlp": 0.01030712, "balance_loss_clip": 1.05016184, "balance_loss_mlp": 1.01716995, "epoch": 0.4958965880054111, "flos": 24425325169920.0, "grad_norm": 1.9625896451991354, "language_loss": 0.83650881, "learning_rate": 2.123575319254087e-06, "loss": 0.85783684, "num_input_tokens_seen": 177339845, "step": 8248, "time_per_iteration": 4.395894289016724 }, { "auxiliary_loss_clip": 0.01118567, "auxiliary_loss_mlp": 0.01032735, "balance_loss_clip": 1.04729056, "balance_loss_mlp": 1.01836419, "epoch": 0.49595671125807905, "flos": 25083460114560.0, "grad_norm": 1.8247689581014963, "language_loss": 0.73558569, "learning_rate": 2.123186599369812e-06, "loss": 0.75709867, "num_input_tokens_seen": 177359980, "step": 8249, "time_per_iteration": 4.36426305770874 }, { "auxiliary_loss_clip": 0.01110094, "auxiliary_loss_mlp": 0.01046161, "balance_loss_clip": 1.04773486, "balance_loss_mlp": 1.03169477, "epoch": 0.496016834510747, "flos": 16435883692800.0, "grad_norm": 1.900690676640245, "language_loss": 0.75902295, "learning_rate": 2.122797874814289e-06, "loss": 0.78058553, "num_input_tokens_seen": 177378580, "step": 8250, "time_per_iteration": 4.203567266464233 }, { "auxiliary_loss_clip": 0.011299, "auxiliary_loss_mlp": 0.01042712, "balance_loss_clip": 1.04861271, "balance_loss_mlp": 1.02788305, "epoch": 0.496076957763415, "flos": 23437925228160.0, "grad_norm": 1.7086851316152774, "language_loss": 0.69983917, "learning_rate": 2.1224091456022585e-06, "loss": 0.72156531, "num_input_tokens_seen": 177398790, "step": 8251, "time_per_iteration": 2.6825788021087646 }, { "auxiliary_loss_clip": 0.01092939, "auxiliary_loss_mlp": 0.00771421, "balance_loss_clip": 1.04950809, "balance_loss_mlp": 1.00016773, "epoch": 0.49613708101608295, "flos": 16909509450240.0, "grad_norm": 1.9257049963935782, "language_loss": 0.80088174, "learning_rate": 2.122020411748461e-06, "loss": 0.81952536, "num_input_tokens_seen": 177416515, "step": 8252, "time_per_iteration": 2.7017300128936768 }, { "auxiliary_loss_clip": 0.01130139, "auxiliary_loss_mlp": 0.01033677, "balance_loss_clip": 1.04937637, "balance_loss_mlp": 1.01769102, "epoch": 0.4961972042687509, "flos": 16618094409600.0, "grad_norm": 1.7413302103337327, "language_loss": 0.81005448, "learning_rate": 2.1216316732676363e-06, "loss": 0.83169258, "num_input_tokens_seen": 177434425, "step": 8253, "time_per_iteration": 2.5844311714172363 }, { "auxiliary_loss_clip": 0.01092121, "auxiliary_loss_mlp": 0.01031077, "balance_loss_clip": 1.04245412, "balance_loss_mlp": 1.01743925, "epoch": 0.49625732752141893, "flos": 28956749437440.0, "grad_norm": 1.4814612406319185, "language_loss": 0.67246485, "learning_rate": 2.1212429301745275e-06, "loss": 0.69369686, "num_input_tokens_seen": 177459675, "step": 8254, "time_per_iteration": 2.815851926803589 }, { "auxiliary_loss_clip": 0.01091336, "auxiliary_loss_mlp": 0.01052712, "balance_loss_clip": 1.04560924, "balance_loss_mlp": 1.03665471, "epoch": 0.4963174507740869, "flos": 23112359331840.0, "grad_norm": 1.7981030707772934, "language_loss": 0.74278247, "learning_rate": 2.1208541824838743e-06, "loss": 0.76422298, "num_input_tokens_seen": 177478895, "step": 8255, "time_per_iteration": 2.7599687576293945 }, { "auxiliary_loss_clip": 0.01098276, "auxiliary_loss_mlp": 0.01036505, "balance_loss_clip": 1.04286051, "balance_loss_mlp": 1.02203858, "epoch": 0.49637757402675486, "flos": 13917863450880.0, "grad_norm": 1.736601635944992, "language_loss": 0.81702995, "learning_rate": 2.1204654302104183e-06, "loss": 0.83837777, "num_input_tokens_seen": 177494920, "step": 8256, "time_per_iteration": 2.640913724899292 }, { "auxiliary_loss_clip": 0.01096211, "auxiliary_loss_mlp": 0.01033961, "balance_loss_clip": 1.04346132, "balance_loss_mlp": 1.02055597, "epoch": 0.49643769727942283, "flos": 22309001700480.0, "grad_norm": 1.6034861047711904, "language_loss": 0.81197649, "learning_rate": 2.120076673368901e-06, "loss": 0.83327824, "num_input_tokens_seen": 177515455, "step": 8257, "time_per_iteration": 2.724745512008667 }, { "auxiliary_loss_clip": 0.01133163, "auxiliary_loss_mlp": 0.01039711, "balance_loss_clip": 1.04763043, "balance_loss_mlp": 1.02435732, "epoch": 0.4964978205320908, "flos": 19500248776320.0, "grad_norm": 1.9280789180083706, "language_loss": 0.66280329, "learning_rate": 2.1196879119740647e-06, "loss": 0.68453205, "num_input_tokens_seen": 177534040, "step": 8258, "time_per_iteration": 2.570275068283081 }, { "auxiliary_loss_clip": 0.01110241, "auxiliary_loss_mlp": 0.01032185, "balance_loss_clip": 1.04361916, "balance_loss_mlp": 1.01942396, "epoch": 0.49655794378475876, "flos": 23436524597760.0, "grad_norm": 1.42579071834104, "language_loss": 0.77627164, "learning_rate": 2.1192991460406502e-06, "loss": 0.79769588, "num_input_tokens_seen": 177554510, "step": 8259, "time_per_iteration": 2.676722288131714 }, { "auxiliary_loss_clip": 0.01097253, "auxiliary_loss_mlp": 0.01038217, "balance_loss_clip": 1.04436278, "balance_loss_mlp": 1.02406085, "epoch": 0.4966180670374267, "flos": 26831124345600.0, "grad_norm": 1.5162865829701626, "language_loss": 0.78461975, "learning_rate": 2.1189103755834e-06, "loss": 0.80597448, "num_input_tokens_seen": 177575780, "step": 8260, "time_per_iteration": 2.7226130962371826 }, { "auxiliary_loss_clip": 0.01100503, "auxiliary_loss_mlp": 0.01035808, "balance_loss_clip": 1.04154015, "balance_loss_mlp": 1.02135992, "epoch": 0.4966781902900947, "flos": 22009326531840.0, "grad_norm": 3.0057343325073456, "language_loss": 0.76335442, "learning_rate": 2.1185216006170573e-06, "loss": 0.78471756, "num_input_tokens_seen": 177588965, "step": 8261, "time_per_iteration": 2.6477174758911133 }, { "auxiliary_loss_clip": 0.01071745, "auxiliary_loss_mlp": 0.0103251, "balance_loss_clip": 1.03892851, "balance_loss_mlp": 1.01939654, "epoch": 0.49673831354276266, "flos": 26213353309440.0, "grad_norm": 1.835251427236856, "language_loss": 0.89503151, "learning_rate": 2.1181328211563627e-06, "loss": 0.9160741, "num_input_tokens_seen": 177608425, "step": 8262, "time_per_iteration": 2.757200241088867 }, { "auxiliary_loss_clip": 0.01068117, "auxiliary_loss_mlp": 0.01035611, "balance_loss_clip": 1.04000998, "balance_loss_mlp": 1.0223608, "epoch": 0.4967984367954306, "flos": 23182277155200.0, "grad_norm": 1.5869779774184047, "language_loss": 0.73859417, "learning_rate": 2.11774403721606e-06, "loss": 0.7596314, "num_input_tokens_seen": 177628240, "step": 8263, "time_per_iteration": 2.799468994140625 }, { "auxiliary_loss_clip": 0.0108327, "auxiliary_loss_mlp": 0.01039108, "balance_loss_clip": 1.0480659, "balance_loss_mlp": 1.02325881, "epoch": 0.4968585600480986, "flos": 19281445079040.0, "grad_norm": 3.1164108836460036, "language_loss": 0.70163679, "learning_rate": 2.1173552488108923e-06, "loss": 0.72286057, "num_input_tokens_seen": 177645920, "step": 8264, "time_per_iteration": 2.720449447631836 }, { "auxiliary_loss_clip": 0.01098192, "auxiliary_loss_mlp": 0.01032461, "balance_loss_clip": 1.04328251, "balance_loss_mlp": 1.01837087, "epoch": 0.49691868330076655, "flos": 22528703237760.0, "grad_norm": 1.6446636391121152, "language_loss": 0.65104395, "learning_rate": 2.1169664559556007e-06, "loss": 0.67235053, "num_input_tokens_seen": 177667185, "step": 8265, "time_per_iteration": 2.683858633041382 }, { "auxiliary_loss_clip": 0.01028918, "auxiliary_loss_mlp": 0.01002907, "balance_loss_clip": 1.0220778, "balance_loss_mlp": 1.00148249, "epoch": 0.4969788065534345, "flos": 66577128675840.0, "grad_norm": 0.930084427968553, "language_loss": 0.53491867, "learning_rate": 2.1165776586649304e-06, "loss": 0.55523694, "num_input_tokens_seen": 177733020, "step": 8266, "time_per_iteration": 3.2566375732421875 }, { "auxiliary_loss_clip": 0.01113371, "auxiliary_loss_mlp": 0.01032636, "balance_loss_clip": 1.04611242, "balance_loss_mlp": 1.01834857, "epoch": 0.49703892980610254, "flos": 24059503105920.0, "grad_norm": 1.764439361537035, "language_loss": 0.79587245, "learning_rate": 2.1161888569536223e-06, "loss": 0.81733251, "num_input_tokens_seen": 177753370, "step": 8267, "time_per_iteration": 2.6278576850891113 }, { "auxiliary_loss_clip": 0.01102001, "auxiliary_loss_mlp": 0.01039107, "balance_loss_clip": 1.04590034, "balance_loss_mlp": 1.02316856, "epoch": 0.4970990530587705, "flos": 29126174912640.0, "grad_norm": 2.2169439003129385, "language_loss": 0.74835396, "learning_rate": 2.1158000508364223e-06, "loss": 0.76976496, "num_input_tokens_seen": 177771530, "step": 8268, "time_per_iteration": 2.734259843826294 }, { "auxiliary_loss_clip": 0.011141, "auxiliary_loss_mlp": 0.00771431, "balance_loss_clip": 1.04348183, "balance_loss_mlp": 1.00014162, "epoch": 0.49715917631143847, "flos": 46026167258880.0, "grad_norm": 4.0839840126254225, "language_loss": 0.68041855, "learning_rate": 2.115411240328073e-06, "loss": 0.69927382, "num_input_tokens_seen": 177796355, "step": 8269, "time_per_iteration": 2.90146541595459 }, { "auxiliary_loss_clip": 0.01097171, "auxiliary_loss_mlp": 0.01041712, "balance_loss_clip": 1.04262531, "balance_loss_mlp": 1.02837276, "epoch": 0.49721929956410643, "flos": 20191277600640.0, "grad_norm": 2.5681883642378436, "language_loss": 0.85533005, "learning_rate": 2.1150224254433167e-06, "loss": 0.87671888, "num_input_tokens_seen": 177814300, "step": 8270, "time_per_iteration": 2.8005404472351074 }, { "auxiliary_loss_clip": 0.01081529, "auxiliary_loss_mlp": 0.00771255, "balance_loss_clip": 1.04315615, "balance_loss_mlp": 1.00016665, "epoch": 0.4972794228167744, "flos": 21653560275840.0, "grad_norm": 1.8215552302583695, "language_loss": 0.70831466, "learning_rate": 2.114633606196899e-06, "loss": 0.72684252, "num_input_tokens_seen": 177833615, "step": 8271, "time_per_iteration": 2.91554594039917 }, { "auxiliary_loss_clip": 0.01112057, "auxiliary_loss_mlp": 0.01035877, "balance_loss_clip": 1.04666567, "balance_loss_mlp": 1.02128029, "epoch": 0.49733954606944236, "flos": 24279743347200.0, "grad_norm": 1.5312065445139798, "language_loss": 0.78403968, "learning_rate": 2.1142447826035635e-06, "loss": 0.80551904, "num_input_tokens_seen": 177855315, "step": 8272, "time_per_iteration": 2.6702592372894287 }, { "auxiliary_loss_clip": 0.01090488, "auxiliary_loss_mlp": 0.01040546, "balance_loss_clip": 1.0464623, "balance_loss_mlp": 1.02679515, "epoch": 0.4973996693221103, "flos": 37852575730560.0, "grad_norm": 2.547664660385474, "language_loss": 0.6682387, "learning_rate": 2.1138559546780544e-06, "loss": 0.68954909, "num_input_tokens_seen": 177875590, "step": 8273, "time_per_iteration": 2.8257791996002197 }, { "auxiliary_loss_clip": 0.01089829, "auxiliary_loss_mlp": 0.01037205, "balance_loss_clip": 1.04431605, "balance_loss_mlp": 1.02347827, "epoch": 0.4974597925747783, "flos": 21361426963200.0, "grad_norm": 1.5692617693087136, "language_loss": 0.78097814, "learning_rate": 2.1134671224351163e-06, "loss": 0.80224848, "num_input_tokens_seen": 177894175, "step": 8274, "time_per_iteration": 2.6786539554595947 }, { "auxiliary_loss_clip": 0.01087892, "auxiliary_loss_mlp": 0.01037968, "balance_loss_clip": 1.04171109, "balance_loss_mlp": 1.02315021, "epoch": 0.49751991582744626, "flos": 30738133560960.0, "grad_norm": 1.7539763145915706, "language_loss": 0.75727397, "learning_rate": 2.113078285889493e-06, "loss": 0.77853251, "num_input_tokens_seen": 177913920, "step": 8275, "time_per_iteration": 2.7289958000183105 }, { "auxiliary_loss_clip": 0.01117048, "auxiliary_loss_mlp": 0.01038819, "balance_loss_clip": 1.04600728, "balance_loss_mlp": 1.02240443, "epoch": 0.4975800390801142, "flos": 14100541044480.0, "grad_norm": 2.0869085379368717, "language_loss": 0.84277642, "learning_rate": 2.1126894450559303e-06, "loss": 0.86433506, "num_input_tokens_seen": 177930425, "step": 8276, "time_per_iteration": 2.612114667892456 }, { "auxiliary_loss_clip": 0.01122283, "auxiliary_loss_mlp": 0.00770821, "balance_loss_clip": 1.04578209, "balance_loss_mlp": 1.00012255, "epoch": 0.4976401623327822, "flos": 24207275658240.0, "grad_norm": 2.0722406374843283, "language_loss": 0.70213616, "learning_rate": 2.112300599949172e-06, "loss": 0.72106719, "num_input_tokens_seen": 177949885, "step": 8277, "time_per_iteration": 2.627364158630371 }, { "auxiliary_loss_clip": 0.01109969, "auxiliary_loss_mlp": 0.01038763, "balance_loss_clip": 1.04542017, "balance_loss_mlp": 1.02430928, "epoch": 0.49770028558545015, "flos": 21136769349120.0, "grad_norm": 1.855614041136712, "language_loss": 0.82644826, "learning_rate": 2.111911750583964e-06, "loss": 0.84793556, "num_input_tokens_seen": 177965720, "step": 8278, "time_per_iteration": 2.653998613357544 }, { "auxiliary_loss_clip": 0.01117237, "auxiliary_loss_mlp": 0.01041122, "balance_loss_clip": 1.04625261, "balance_loss_mlp": 1.02723408, "epoch": 0.4977604088381181, "flos": 16763927627520.0, "grad_norm": 2.0212653893375276, "language_loss": 0.67471039, "learning_rate": 2.111522896975052e-06, "loss": 0.69629395, "num_input_tokens_seen": 177983190, "step": 8279, "time_per_iteration": 2.607090473175049 }, { "auxiliary_loss_clip": 0.01115839, "auxiliary_loss_mlp": 0.01041996, "balance_loss_clip": 1.04406691, "balance_loss_mlp": 1.02692842, "epoch": 0.49782053209078614, "flos": 15703521292800.0, "grad_norm": 2.1427811758671527, "language_loss": 0.70507026, "learning_rate": 2.1111340391371794e-06, "loss": 0.72664863, "num_input_tokens_seen": 178000155, "step": 8280, "time_per_iteration": 2.636384963989258 }, { "auxiliary_loss_clip": 0.01090186, "auxiliary_loss_mlp": 0.01035589, "balance_loss_clip": 1.04237318, "balance_loss_mlp": 1.02177858, "epoch": 0.4978806553434541, "flos": 24753692327040.0, "grad_norm": 2.860421271049928, "language_loss": 0.64889467, "learning_rate": 2.1107451770850936e-06, "loss": 0.67015243, "num_input_tokens_seen": 178021060, "step": 8281, "time_per_iteration": 2.6961820125579834 }, { "auxiliary_loss_clip": 0.0111999, "auxiliary_loss_mlp": 0.01036047, "balance_loss_clip": 1.0478642, "balance_loss_mlp": 1.02102113, "epoch": 0.49794077859612207, "flos": 13115726881920.0, "grad_norm": 2.7426965878502845, "language_loss": 0.73226738, "learning_rate": 2.1103563108335387e-06, "loss": 0.75382769, "num_input_tokens_seen": 178038180, "step": 8282, "time_per_iteration": 2.7749152183532715 }, { "auxiliary_loss_clip": 0.01095648, "auxiliary_loss_mlp": 0.01033955, "balance_loss_clip": 1.04499686, "balance_loss_mlp": 1.02106822, "epoch": 0.49800090184879003, "flos": 27525133998720.0, "grad_norm": 1.749404235674241, "language_loss": 0.73327482, "learning_rate": 2.109967440397263e-06, "loss": 0.75457078, "num_input_tokens_seen": 178057565, "step": 8283, "time_per_iteration": 2.7039520740509033 }, { "auxiliary_loss_clip": 0.01068275, "auxiliary_loss_mlp": 0.01054525, "balance_loss_clip": 1.0405463, "balance_loss_mlp": 1.03883147, "epoch": 0.498061025101458, "flos": 19792489829760.0, "grad_norm": 2.5573951668279102, "language_loss": 0.7842927, "learning_rate": 2.1095785657910095e-06, "loss": 0.80552071, "num_input_tokens_seen": 178076965, "step": 8284, "time_per_iteration": 2.7534518241882324 }, { "auxiliary_loss_clip": 0.01103825, "auxiliary_loss_mlp": 0.0104233, "balance_loss_clip": 1.045488, "balance_loss_mlp": 1.02733326, "epoch": 0.49812114835412596, "flos": 29893909230720.0, "grad_norm": 1.7317298938274186, "language_loss": 0.73607123, "learning_rate": 2.109189687029526e-06, "loss": 0.75753278, "num_input_tokens_seen": 178095105, "step": 8285, "time_per_iteration": 2.696913719177246 }, { "auxiliary_loss_clip": 0.01114659, "auxiliary_loss_mlp": 0.01033722, "balance_loss_clip": 1.0496074, "balance_loss_mlp": 1.01902318, "epoch": 0.49818127160679393, "flos": 23147048891520.0, "grad_norm": 1.6428187648074233, "language_loss": 0.74194658, "learning_rate": 2.1088008041275598e-06, "loss": 0.76343036, "num_input_tokens_seen": 178114505, "step": 8286, "time_per_iteration": 4.164494752883911 }, { "auxiliary_loss_clip": 0.01106668, "auxiliary_loss_mlp": 0.0104423, "balance_loss_clip": 1.04752493, "balance_loss_mlp": 1.02986491, "epoch": 0.4982413948594619, "flos": 21652806090240.0, "grad_norm": 1.7990587687461415, "language_loss": 0.85529351, "learning_rate": 2.1084119170998545e-06, "loss": 0.87680244, "num_input_tokens_seen": 178131595, "step": 8287, "time_per_iteration": 4.236407279968262 }, { "auxiliary_loss_clip": 0.01076576, "auxiliary_loss_mlp": 0.01032511, "balance_loss_clip": 1.04194725, "balance_loss_mlp": 1.01822948, "epoch": 0.49830151811212986, "flos": 32486982940800.0, "grad_norm": 1.6860437652999367, "language_loss": 0.72530627, "learning_rate": 2.108023025961159e-06, "loss": 0.74639714, "num_input_tokens_seen": 178152055, "step": 8288, "time_per_iteration": 4.404609680175781 }, { "auxiliary_loss_clip": 0.01106449, "auxiliary_loss_mlp": 0.01040352, "balance_loss_clip": 1.04326916, "balance_loss_mlp": 1.02459288, "epoch": 0.4983616413647978, "flos": 18142358002560.0, "grad_norm": 3.334734045415943, "language_loss": 0.79885554, "learning_rate": 2.10763413072622e-06, "loss": 0.82032353, "num_input_tokens_seen": 178168150, "step": 8289, "time_per_iteration": 2.6629836559295654 }, { "auxiliary_loss_clip": 0.01114454, "auxiliary_loss_mlp": 0.0103885, "balance_loss_clip": 1.0446074, "balance_loss_mlp": 1.02460992, "epoch": 0.4984217646174658, "flos": 19718836992000.0, "grad_norm": 2.0640091139098256, "language_loss": 0.72874933, "learning_rate": 2.107245231409784e-06, "loss": 0.75028241, "num_input_tokens_seen": 178186150, "step": 8290, "time_per_iteration": 4.18574333190918 }, { "auxiliary_loss_clip": 0.0112064, "auxiliary_loss_mlp": 0.01040925, "balance_loss_clip": 1.04972208, "balance_loss_mlp": 1.02428377, "epoch": 0.49848188787013376, "flos": 24936549488640.0, "grad_norm": 1.4927804425375188, "language_loss": 0.8397218, "learning_rate": 2.106856328026598e-06, "loss": 0.86133754, "num_input_tokens_seen": 178207665, "step": 8291, "time_per_iteration": 2.716386556625366 }, { "auxiliary_loss_clip": 0.01103944, "auxiliary_loss_mlp": 0.01046379, "balance_loss_clip": 1.04420066, "balance_loss_mlp": 1.02930808, "epoch": 0.4985420111228017, "flos": 22382439056640.0, "grad_norm": 1.6316694600084898, "language_loss": 0.67022264, "learning_rate": 2.106467420591409e-06, "loss": 0.69172579, "num_input_tokens_seen": 178226325, "step": 8292, "time_per_iteration": 2.7027721405029297 }, { "auxiliary_loss_clip": 0.01127175, "auxiliary_loss_mlp": 0.01039323, "balance_loss_clip": 1.04806566, "balance_loss_mlp": 1.02625203, "epoch": 0.4986021343754697, "flos": 16216469464320.0, "grad_norm": 1.6633361946509924, "language_loss": 0.66995132, "learning_rate": 2.106078509118965e-06, "loss": 0.6916163, "num_input_tokens_seen": 178244960, "step": 8293, "time_per_iteration": 2.5719261169433594 }, { "auxiliary_loss_clip": 0.01111406, "auxiliary_loss_mlp": 0.01029657, "balance_loss_clip": 1.04379749, "balance_loss_mlp": 1.01533389, "epoch": 0.4986622576281377, "flos": 23403594804480.0, "grad_norm": 1.8610494318021187, "language_loss": 0.82020485, "learning_rate": 2.1056895936240133e-06, "loss": 0.84161556, "num_input_tokens_seen": 178265400, "step": 8294, "time_per_iteration": 2.6504080295562744 }, { "auxiliary_loss_clip": 0.01116097, "auxiliary_loss_mlp": 0.01031185, "balance_loss_clip": 1.04479063, "balance_loss_mlp": 1.01604557, "epoch": 0.49872238088080567, "flos": 19974556892160.0, "grad_norm": 2.2309244250260183, "language_loss": 0.72901344, "learning_rate": 2.1053006741213016e-06, "loss": 0.75048614, "num_input_tokens_seen": 178284535, "step": 8295, "time_per_iteration": 2.6195027828216553 }, { "auxiliary_loss_clip": 0.01059073, "auxiliary_loss_mlp": 0.01038092, "balance_loss_clip": 1.03994107, "balance_loss_mlp": 1.02466345, "epoch": 0.49878250413347364, "flos": 22893016930560.0, "grad_norm": 1.8092757241660187, "language_loss": 0.67607826, "learning_rate": 2.1049117506255775e-06, "loss": 0.69704998, "num_input_tokens_seen": 178302425, "step": 8296, "time_per_iteration": 2.755263090133667 }, { "auxiliary_loss_clip": 0.01104221, "auxiliary_loss_mlp": 0.01042078, "balance_loss_clip": 1.04649234, "balance_loss_mlp": 1.02715254, "epoch": 0.4988426273861416, "flos": 32598449821440.0, "grad_norm": 2.862724254512052, "language_loss": 0.64573205, "learning_rate": 2.1045228231515895e-06, "loss": 0.66719502, "num_input_tokens_seen": 178323065, "step": 8297, "time_per_iteration": 2.77134108543396 }, { "auxiliary_loss_clip": 0.01068772, "auxiliary_loss_mlp": 0.01035076, "balance_loss_clip": 1.04186463, "balance_loss_mlp": 1.02241552, "epoch": 0.49890275063880957, "flos": 20923604087040.0, "grad_norm": 1.6802177929429785, "language_loss": 0.70005518, "learning_rate": 2.1041338917140857e-06, "loss": 0.72109365, "num_input_tokens_seen": 178343985, "step": 8298, "time_per_iteration": 2.7644965648651123 }, { "auxiliary_loss_clip": 0.01123634, "auxiliary_loss_mlp": 0.01037158, "balance_loss_clip": 1.04611015, "balance_loss_mlp": 1.02383053, "epoch": 0.49896287389147753, "flos": 18624459369600.0, "grad_norm": 2.15895128631453, "language_loss": 0.85060012, "learning_rate": 2.103744956327814e-06, "loss": 0.87220806, "num_input_tokens_seen": 178362345, "step": 8299, "time_per_iteration": 2.6582682132720947 }, { "auxiliary_loss_clip": 0.0109908, "auxiliary_loss_mlp": 0.01042644, "balance_loss_clip": 1.04576635, "balance_loss_mlp": 1.02676535, "epoch": 0.4990229971441455, "flos": 24826555065600.0, "grad_norm": 3.5746156367417177, "language_loss": 0.69598472, "learning_rate": 2.1033560170075234e-06, "loss": 0.71740198, "num_input_tokens_seen": 178383190, "step": 8300, "time_per_iteration": 2.725041151046753 }, { "auxiliary_loss_clip": 0.01026277, "auxiliary_loss_mlp": 0.01006258, "balance_loss_clip": 1.02488732, "balance_loss_mlp": 1.00483894, "epoch": 0.49908312039681346, "flos": 71384525136000.0, "grad_norm": 0.7557607717879434, "language_loss": 0.51092541, "learning_rate": 2.1029670737679623e-06, "loss": 0.53125077, "num_input_tokens_seen": 178444250, "step": 8301, "time_per_iteration": 3.2866220474243164 }, { "auxiliary_loss_clip": 0.01096877, "auxiliary_loss_mlp": 0.01045659, "balance_loss_clip": 1.04223108, "balance_loss_mlp": 1.03140736, "epoch": 0.4991432436494814, "flos": 19828651847040.0, "grad_norm": 1.7177443948136444, "language_loss": 0.84648693, "learning_rate": 2.102578126623879e-06, "loss": 0.86791229, "num_input_tokens_seen": 178463250, "step": 8302, "time_per_iteration": 2.66215181350708 }, { "auxiliary_loss_clip": 0.01112659, "auxiliary_loss_mlp": 0.01034193, "balance_loss_clip": 1.04628754, "balance_loss_mlp": 1.02111602, "epoch": 0.4992033669021494, "flos": 15121912273920.0, "grad_norm": 5.640508686379792, "language_loss": 0.68928391, "learning_rate": 2.102189175590024e-06, "loss": 0.71075243, "num_input_tokens_seen": 178481340, "step": 8303, "time_per_iteration": 2.6031181812286377 }, { "auxiliary_loss_clip": 0.01126853, "auxiliary_loss_mlp": 0.01035164, "balance_loss_clip": 1.04641497, "balance_loss_mlp": 1.02095485, "epoch": 0.49926349015481736, "flos": 31207952476800.0, "grad_norm": 1.6560759996443648, "language_loss": 0.72727203, "learning_rate": 2.101800220681144e-06, "loss": 0.74889231, "num_input_tokens_seen": 178501545, "step": 8304, "time_per_iteration": 2.706022262573242 }, { "auxiliary_loss_clip": 0.01116141, "auxiliary_loss_mlp": 0.01037357, "balance_loss_clip": 1.0475409, "balance_loss_mlp": 1.02420211, "epoch": 0.4993236134074853, "flos": 24900207903360.0, "grad_norm": 2.1644384364092684, "language_loss": 0.81342846, "learning_rate": 2.10141126191199e-06, "loss": 0.83496344, "num_input_tokens_seen": 178519700, "step": 8305, "time_per_iteration": 2.6671528816223145 }, { "auxiliary_loss_clip": 0.01024768, "auxiliary_loss_mlp": 0.01003944, "balance_loss_clip": 1.02671385, "balance_loss_mlp": 1.00258529, "epoch": 0.4993837366601533, "flos": 70420573797120.0, "grad_norm": 0.7597400638433706, "language_loss": 0.56867081, "learning_rate": 2.1010222992973107e-06, "loss": 0.58895797, "num_input_tokens_seen": 178576740, "step": 8306, "time_per_iteration": 3.322448492050171 }, { "auxiliary_loss_clip": 0.01127996, "auxiliary_loss_mlp": 0.01039143, "balance_loss_clip": 1.04948568, "balance_loss_mlp": 1.02432525, "epoch": 0.4994438599128213, "flos": 15961216440960.0, "grad_norm": 2.2302114161499236, "language_loss": 0.82741839, "learning_rate": 2.1006333328518556e-06, "loss": 0.84908974, "num_input_tokens_seen": 178594745, "step": 8307, "time_per_iteration": 2.583996295928955 }, { "auxiliary_loss_clip": 0.01126994, "auxiliary_loss_mlp": 0.0103608, "balance_loss_clip": 1.04805601, "balance_loss_mlp": 1.02157855, "epoch": 0.4995039831654893, "flos": 27928303228800.0, "grad_norm": 1.7094622949229625, "language_loss": 0.60939324, "learning_rate": 2.1002443625903748e-06, "loss": 0.63102394, "num_input_tokens_seen": 178614110, "step": 8308, "time_per_iteration": 2.6170315742492676 }, { "auxiliary_loss_clip": 0.01120806, "auxiliary_loss_mlp": 0.01031842, "balance_loss_clip": 1.04421234, "balance_loss_mlp": 1.01890182, "epoch": 0.49956410641815724, "flos": 24204797619840.0, "grad_norm": 1.8375312667766532, "language_loss": 0.74889386, "learning_rate": 2.0998553885276168e-06, "loss": 0.77042031, "num_input_tokens_seen": 178634170, "step": 8309, "time_per_iteration": 2.6147258281707764 }, { "auxiliary_loss_clip": 0.01102514, "auxiliary_loss_mlp": 0.0103405, "balance_loss_clip": 1.04401636, "balance_loss_mlp": 1.02106261, "epoch": 0.4996242296708252, "flos": 16180127879040.0, "grad_norm": 3.148005555228763, "language_loss": 0.79502416, "learning_rate": 2.0994664106783335e-06, "loss": 0.8163898, "num_input_tokens_seen": 178651775, "step": 8310, "time_per_iteration": 2.6420629024505615 }, { "auxiliary_loss_clip": 0.01111922, "auxiliary_loss_mlp": 0.01040825, "balance_loss_clip": 1.04564738, "balance_loss_mlp": 1.02757514, "epoch": 0.49968435292349317, "flos": 16873527000960.0, "grad_norm": 1.4976626914983278, "language_loss": 0.70989597, "learning_rate": 2.0990774290572735e-06, "loss": 0.73142344, "num_input_tokens_seen": 178669720, "step": 8311, "time_per_iteration": 2.5778110027313232 }, { "auxiliary_loss_clip": 0.01098554, "auxiliary_loss_mlp": 0.01036482, "balance_loss_clip": 1.04628289, "balance_loss_mlp": 1.02355957, "epoch": 0.49974447617616113, "flos": 14939521989120.0, "grad_norm": 2.0443790290482498, "language_loss": 0.77375191, "learning_rate": 2.098688443679187e-06, "loss": 0.79510236, "num_input_tokens_seen": 178686765, "step": 8312, "time_per_iteration": 2.6517751216888428 }, { "auxiliary_loss_clip": 0.01095231, "auxiliary_loss_mlp": 0.01035354, "balance_loss_clip": 1.04635751, "balance_loss_mlp": 1.02135265, "epoch": 0.4998045994288291, "flos": 26651535321600.0, "grad_norm": 1.7937215644313522, "language_loss": 0.84479403, "learning_rate": 2.0982994545588256e-06, "loss": 0.86609983, "num_input_tokens_seen": 178705845, "step": 8313, "time_per_iteration": 2.7882683277130127 }, { "auxiliary_loss_clip": 0.01098533, "auxiliary_loss_mlp": 0.01032666, "balance_loss_clip": 1.04393864, "balance_loss_mlp": 1.01856351, "epoch": 0.49986472268149706, "flos": 20953768533120.0, "grad_norm": 1.8469644022391951, "language_loss": 0.80625784, "learning_rate": 2.097910461710939e-06, "loss": 0.82756978, "num_input_tokens_seen": 178723410, "step": 8314, "time_per_iteration": 2.6792070865631104 }, { "auxiliary_loss_clip": 0.01093189, "auxiliary_loss_mlp": 0.00772869, "balance_loss_clip": 1.04282761, "balance_loss_mlp": 1.00018048, "epoch": 0.49992484593416503, "flos": 22783884433920.0, "grad_norm": 1.9116629548957604, "language_loss": 0.79824436, "learning_rate": 2.0975214651502773e-06, "loss": 0.8169049, "num_input_tokens_seen": 178743560, "step": 8315, "time_per_iteration": 2.885185718536377 }, { "auxiliary_loss_clip": 0.01126333, "auxiliary_loss_mlp": 0.01033363, "balance_loss_clip": 1.04775071, "balance_loss_mlp": 1.02025628, "epoch": 0.499984969186833, "flos": 46786970252160.0, "grad_norm": 1.6207947092177402, "language_loss": 0.74976832, "learning_rate": 2.0971324648915926e-06, "loss": 0.77136528, "num_input_tokens_seen": 178767225, "step": 8316, "time_per_iteration": 2.865182399749756 }, { "auxiliary_loss_clip": 0.01104962, "auxiliary_loss_mlp": 0.0103454, "balance_loss_clip": 1.04472423, "balance_loss_mlp": 1.02195168, "epoch": 0.500045092439501, "flos": 25556978131200.0, "grad_norm": 1.839667572981257, "language_loss": 0.81122506, "learning_rate": 2.0967434609496343e-06, "loss": 0.83262014, "num_input_tokens_seen": 178786810, "step": 8317, "time_per_iteration": 2.781627893447876 }, { "auxiliary_loss_clip": 0.011005, "auxiliary_loss_mlp": 0.01038819, "balance_loss_clip": 1.04331255, "balance_loss_mlp": 1.02368522, "epoch": 0.5001052156921689, "flos": 20704764476160.0, "grad_norm": 1.6607654789374993, "language_loss": 0.83369392, "learning_rate": 2.0963544533391548e-06, "loss": 0.8550871, "num_input_tokens_seen": 178805660, "step": 8318, "time_per_iteration": 2.790937662124634 }, { "auxiliary_loss_clip": 0.01114137, "auxiliary_loss_mlp": 0.01032915, "balance_loss_clip": 1.04552984, "balance_loss_mlp": 1.01974225, "epoch": 0.500165338944837, "flos": 21251109317760.0, "grad_norm": 1.7594247797212967, "language_loss": 0.81800634, "learning_rate": 2.0959654420749045e-06, "loss": 0.83947688, "num_input_tokens_seen": 178824780, "step": 8319, "time_per_iteration": 2.6710760593414307 }, { "auxiliary_loss_clip": 0.01080263, "auxiliary_loss_mlp": 0.01030013, "balance_loss_clip": 1.03828013, "balance_loss_mlp": 1.01689363, "epoch": 0.5002254621975049, "flos": 27854398995840.0, "grad_norm": 1.5279258864896563, "language_loss": 0.71943277, "learning_rate": 2.095576427171635e-06, "loss": 0.7405355, "num_input_tokens_seen": 178845640, "step": 8320, "time_per_iteration": 2.7864880561828613 }, { "auxiliary_loss_clip": 0.01093478, "auxiliary_loss_mlp": 0.01044698, "balance_loss_clip": 1.04542255, "balance_loss_mlp": 1.02964222, "epoch": 0.5002855854501729, "flos": 15551941898880.0, "grad_norm": 2.783304711521318, "language_loss": 0.76481223, "learning_rate": 2.0951874086440978e-06, "loss": 0.78619403, "num_input_tokens_seen": 178862290, "step": 8321, "time_per_iteration": 2.7580785751342773 }, { "auxiliary_loss_clip": 0.01115908, "auxiliary_loss_mlp": 0.00771212, "balance_loss_clip": 1.04681301, "balance_loss_mlp": 1.00017464, "epoch": 0.5003457087028408, "flos": 16107408794880.0, "grad_norm": 6.807525102727238, "language_loss": 0.82965297, "learning_rate": 2.0947983865070455e-06, "loss": 0.84852415, "num_input_tokens_seen": 178879805, "step": 8322, "time_per_iteration": 2.6580779552459717 }, { "auxiliary_loss_clip": 0.01117442, "auxiliary_loss_mlp": 0.0103527, "balance_loss_clip": 1.0458411, "balance_loss_mlp": 1.02163804, "epoch": 0.5004058319555088, "flos": 22710518904960.0, "grad_norm": 2.2579769372834257, "language_loss": 0.73329234, "learning_rate": 2.094409360775228e-06, "loss": 0.75481945, "num_input_tokens_seen": 178896985, "step": 8323, "time_per_iteration": 2.6743083000183105 }, { "auxiliary_loss_clip": 0.01086486, "auxiliary_loss_mlp": 0.01036398, "balance_loss_clip": 1.04470778, "balance_loss_mlp": 1.02264738, "epoch": 0.5004659552081767, "flos": 30117956313600.0, "grad_norm": 1.846103580376976, "language_loss": 0.69483137, "learning_rate": 2.0940203314633977e-06, "loss": 0.71606022, "num_input_tokens_seen": 178920605, "step": 8324, "time_per_iteration": 2.783973217010498 }, { "auxiliary_loss_clip": 0.01106501, "auxiliary_loss_mlp": 0.00771259, "balance_loss_clip": 1.0422833, "balance_loss_mlp": 1.0000751, "epoch": 0.5005260784608447, "flos": 18624710764800.0, "grad_norm": 3.4520936591258224, "language_loss": 0.72325313, "learning_rate": 2.0936312985863077e-06, "loss": 0.74203074, "num_input_tokens_seen": 178937760, "step": 8325, "time_per_iteration": 4.274277448654175 }, { "auxiliary_loss_clip": 0.01089915, "auxiliary_loss_mlp": 0.01041836, "balance_loss_clip": 1.04158878, "balance_loss_mlp": 1.02669656, "epoch": 0.5005862017135126, "flos": 24859987649280.0, "grad_norm": 1.7422514730064806, "language_loss": 0.73518062, "learning_rate": 2.093242262158709e-06, "loss": 0.7564981, "num_input_tokens_seen": 178957985, "step": 8326, "time_per_iteration": 4.3523108959198 }, { "auxiliary_loss_clip": 0.01094661, "auxiliary_loss_mlp": 0.01032547, "balance_loss_clip": 1.04201293, "balance_loss_mlp": 1.01984525, "epoch": 0.5006463249661807, "flos": 18734381965440.0, "grad_norm": 1.5476902232241379, "language_loss": 0.78111005, "learning_rate": 2.0928532221953544e-06, "loss": 0.80238211, "num_input_tokens_seen": 178977070, "step": 8327, "time_per_iteration": 4.4682557582855225 }, { "auxiliary_loss_clip": 0.01128169, "auxiliary_loss_mlp": 0.01040162, "balance_loss_clip": 1.04810429, "balance_loss_mlp": 1.02641153, "epoch": 0.5007064482188487, "flos": 13042145871360.0, "grad_norm": 2.1714411479157296, "language_loss": 0.88089001, "learning_rate": 2.092464178710997e-06, "loss": 0.90257335, "num_input_tokens_seen": 178994175, "step": 8328, "time_per_iteration": 2.5710413455963135 }, { "auxiliary_loss_clip": 0.01091641, "auxiliary_loss_mlp": 0.01034728, "balance_loss_clip": 1.04136801, "balance_loss_mlp": 1.02050591, "epoch": 0.5007665714715166, "flos": 21288671965440.0, "grad_norm": 2.863428491996577, "language_loss": 0.73827946, "learning_rate": 2.092075131720388e-06, "loss": 0.75954318, "num_input_tokens_seen": 179013710, "step": 8329, "time_per_iteration": 2.7770020961761475 }, { "auxiliary_loss_clip": 0.01124061, "auxiliary_loss_mlp": 0.0103094, "balance_loss_clip": 1.04667771, "balance_loss_mlp": 1.01824427, "epoch": 0.5008266947241846, "flos": 29754576374400.0, "grad_norm": 1.6131098934363575, "language_loss": 0.79715234, "learning_rate": 2.091686081238281e-06, "loss": 0.81870234, "num_input_tokens_seen": 179035255, "step": 8330, "time_per_iteration": 4.167505979537964 }, { "auxiliary_loss_clip": 0.01021039, "auxiliary_loss_mlp": 0.00752271, "balance_loss_clip": 1.02094173, "balance_loss_mlp": 0.9997682, "epoch": 0.5008868179768525, "flos": 63557829204480.0, "grad_norm": 0.7263095406539528, "language_loss": 0.5601325, "learning_rate": 2.0912970272794282e-06, "loss": 0.5778656, "num_input_tokens_seen": 179090915, "step": 8331, "time_per_iteration": 3.008077621459961 }, { "auxiliary_loss_clip": 0.01112181, "auxiliary_loss_mlp": 0.01035155, "balance_loss_clip": 1.04617071, "balance_loss_mlp": 1.02216136, "epoch": 0.5009469412295205, "flos": 27375637593600.0, "grad_norm": 2.025315423078993, "language_loss": 0.65264666, "learning_rate": 2.0909079698585833e-06, "loss": 0.67412001, "num_input_tokens_seen": 179109160, "step": 8332, "time_per_iteration": 2.6730518341064453 }, { "auxiliary_loss_clip": 0.01120357, "auxiliary_loss_mlp": 0.01033936, "balance_loss_clip": 1.04410577, "balance_loss_mlp": 1.02124023, "epoch": 0.5010070644821885, "flos": 27378833904000.0, "grad_norm": 1.5954618594032755, "language_loss": 0.75023079, "learning_rate": 2.0905189089904993e-06, "loss": 0.7717737, "num_input_tokens_seen": 179130610, "step": 8333, "time_per_iteration": 2.685154914855957 }, { "auxiliary_loss_clip": 0.01125291, "auxiliary_loss_mlp": 0.01035107, "balance_loss_clip": 1.04558921, "balance_loss_mlp": 1.02145159, "epoch": 0.5010671877348565, "flos": 20662748542080.0, "grad_norm": 1.9338828530124208, "language_loss": 0.80424768, "learning_rate": 2.090129844689929e-06, "loss": 0.82585168, "num_input_tokens_seen": 179147860, "step": 8334, "time_per_iteration": 2.627230405807495 }, { "auxiliary_loss_clip": 0.01037349, "auxiliary_loss_mlp": 0.01004574, "balance_loss_clip": 1.02146554, "balance_loss_mlp": 1.00316703, "epoch": 0.5011273109875244, "flos": 59128645000320.0, "grad_norm": 0.8902108893007158, "language_loss": 0.62708843, "learning_rate": 2.089740776971626e-06, "loss": 0.64750767, "num_input_tokens_seen": 179210490, "step": 8335, "time_per_iteration": 3.2171308994293213 }, { "auxiliary_loss_clip": 0.01110054, "auxiliary_loss_mlp": 0.01029223, "balance_loss_clip": 1.04289985, "balance_loss_mlp": 1.01612818, "epoch": 0.5011874342401924, "flos": 25336342840320.0, "grad_norm": 1.3859166459285381, "language_loss": 0.79553854, "learning_rate": 2.0893517058503435e-06, "loss": 0.81693137, "num_input_tokens_seen": 179231360, "step": 8336, "time_per_iteration": 2.6930394172668457 }, { "auxiliary_loss_clip": 0.01082861, "auxiliary_loss_mlp": 0.01032761, "balance_loss_clip": 1.03948808, "balance_loss_mlp": 1.01899827, "epoch": 0.5012475574928603, "flos": 20229953569920.0, "grad_norm": 2.2337029404169457, "language_loss": 0.80255198, "learning_rate": 2.088962631340836e-06, "loss": 0.82370824, "num_input_tokens_seen": 179250625, "step": 8337, "time_per_iteration": 2.725379467010498 }, { "auxiliary_loss_clip": 0.01129165, "auxiliary_loss_mlp": 0.01038167, "balance_loss_clip": 1.04644942, "balance_loss_mlp": 1.0239507, "epoch": 0.5013076807455283, "flos": 22710123855360.0, "grad_norm": 2.0126131839523835, "language_loss": 0.79470736, "learning_rate": 2.0885735534578555e-06, "loss": 0.81638074, "num_input_tokens_seen": 179267360, "step": 8338, "time_per_iteration": 2.6641087532043457 }, { "auxiliary_loss_clip": 0.01100565, "auxiliary_loss_mlp": 0.01029861, "balance_loss_clip": 1.04381251, "balance_loss_mlp": 1.01617527, "epoch": 0.5013678039981962, "flos": 24245161528320.0, "grad_norm": 1.6605427604759349, "language_loss": 0.85052264, "learning_rate": 2.0881844722161583e-06, "loss": 0.87182683, "num_input_tokens_seen": 179289810, "step": 8339, "time_per_iteration": 2.7899603843688965 }, { "auxiliary_loss_clip": 0.0111167, "auxiliary_loss_mlp": 0.01037127, "balance_loss_clip": 1.04381561, "balance_loss_mlp": 1.02343023, "epoch": 0.5014279272508643, "flos": 26176688501760.0, "grad_norm": 1.4822129376950433, "language_loss": 0.70713747, "learning_rate": 2.0877953876304962e-06, "loss": 0.72862542, "num_input_tokens_seen": 179310620, "step": 8340, "time_per_iteration": 2.773681402206421 }, { "auxiliary_loss_clip": 0.01088541, "auxiliary_loss_mlp": 0.01043525, "balance_loss_clip": 1.04147744, "balance_loss_mlp": 1.02764666, "epoch": 0.5014880505035323, "flos": 21430446946560.0, "grad_norm": 1.9911594693512178, "language_loss": 0.78301972, "learning_rate": 2.0874062997156245e-06, "loss": 0.80434036, "num_input_tokens_seen": 179329005, "step": 8341, "time_per_iteration": 2.7607786655426025 }, { "auxiliary_loss_clip": 0.01096808, "auxiliary_loss_mlp": 0.01038511, "balance_loss_clip": 1.04584622, "balance_loss_mlp": 1.02391934, "epoch": 0.5015481737562002, "flos": 15770745596160.0, "grad_norm": 4.243666050944008, "language_loss": 0.89054161, "learning_rate": 2.0870172084862975e-06, "loss": 0.9118948, "num_input_tokens_seen": 179343785, "step": 8342, "time_per_iteration": 2.7108232975006104 }, { "auxiliary_loss_clip": 0.01103427, "auxiliary_loss_mlp": 0.01036162, "balance_loss_clip": 1.04467797, "balance_loss_mlp": 1.02273893, "epoch": 0.5016082970088682, "flos": 26830801123200.0, "grad_norm": 1.768885433843204, "language_loss": 0.76325786, "learning_rate": 2.0866281139572682e-06, "loss": 0.78465378, "num_input_tokens_seen": 179364070, "step": 8343, "time_per_iteration": 2.6551196575164795 }, { "auxiliary_loss_clip": 0.01113632, "auxiliary_loss_mlp": 0.01028707, "balance_loss_clip": 1.04612589, "balance_loss_mlp": 1.01574898, "epoch": 0.5016684202615361, "flos": 21470595373440.0, "grad_norm": 1.8502078003194165, "language_loss": 0.6725269, "learning_rate": 2.086239016143293e-06, "loss": 0.6939503, "num_input_tokens_seen": 179384225, "step": 8344, "time_per_iteration": 2.634850263595581 }, { "auxiliary_loss_clip": 0.01104392, "auxiliary_loss_mlp": 0.0103805, "balance_loss_clip": 1.04439509, "balance_loss_mlp": 1.025056, "epoch": 0.5017285435142042, "flos": 26246821806720.0, "grad_norm": 2.403480744645997, "language_loss": 0.75519335, "learning_rate": 2.0858499150591258e-06, "loss": 0.77661783, "num_input_tokens_seen": 179402595, "step": 8345, "time_per_iteration": 2.7551872730255127 }, { "auxiliary_loss_clip": 0.01111042, "auxiliary_loss_mlp": 0.01031467, "balance_loss_clip": 1.04757214, "balance_loss_mlp": 1.01661348, "epoch": 0.5017886667668721, "flos": 20777555387520.0, "grad_norm": 2.18282722391055, "language_loss": 0.78664625, "learning_rate": 2.0854608107195203e-06, "loss": 0.80807132, "num_input_tokens_seen": 179419635, "step": 8346, "time_per_iteration": 2.661569833755493 }, { "auxiliary_loss_clip": 0.01102528, "auxiliary_loss_mlp": 0.00770029, "balance_loss_clip": 1.04322028, "balance_loss_mlp": 1.00006032, "epoch": 0.5018487900195401, "flos": 20156408472960.0, "grad_norm": 1.5952257408001917, "language_loss": 0.69384575, "learning_rate": 2.0850717031392333e-06, "loss": 0.71257138, "num_input_tokens_seen": 179438770, "step": 8347, "time_per_iteration": 2.7273542881011963 }, { "auxiliary_loss_clip": 0.0108784, "auxiliary_loss_mlp": 0.01037703, "balance_loss_clip": 1.04173744, "balance_loss_mlp": 1.02352858, "epoch": 0.501908913272208, "flos": 18150689957760.0, "grad_norm": 1.852088117198485, "language_loss": 0.70635176, "learning_rate": 2.0846825923330174e-06, "loss": 0.72760713, "num_input_tokens_seen": 179457475, "step": 8348, "time_per_iteration": 2.7395875453948975 }, { "auxiliary_loss_clip": 0.01110808, "auxiliary_loss_mlp": 0.01035347, "balance_loss_clip": 1.04538929, "balance_loss_mlp": 1.02306843, "epoch": 0.501969036524876, "flos": 23112287504640.0, "grad_norm": 1.775170825025465, "language_loss": 0.74760187, "learning_rate": 2.0842934783156303e-06, "loss": 0.76906341, "num_input_tokens_seen": 179478140, "step": 8349, "time_per_iteration": 2.6996099948883057 }, { "auxiliary_loss_clip": 0.01112401, "auxiliary_loss_mlp": 0.01034238, "balance_loss_clip": 1.0427202, "balance_loss_mlp": 1.01971805, "epoch": 0.5020291597775439, "flos": 11363214314880.0, "grad_norm": 2.078287176668375, "language_loss": 0.63625813, "learning_rate": 2.0839043611018266e-06, "loss": 0.6577245, "num_input_tokens_seen": 179494325, "step": 8350, "time_per_iteration": 2.6264822483062744 }, { "auxiliary_loss_clip": 0.01015981, "auxiliary_loss_mlp": 0.01015388, "balance_loss_clip": 1.01908755, "balance_loss_mlp": 1.01377916, "epoch": 0.5020892830302119, "flos": 64011094928640.0, "grad_norm": 0.7752505604108973, "language_loss": 0.59761232, "learning_rate": 2.0835152407063597e-06, "loss": 0.617926, "num_input_tokens_seen": 179553545, "step": 8351, "time_per_iteration": 3.4168505668640137 }, { "auxiliary_loss_clip": 0.01100468, "auxiliary_loss_mlp": 0.0103649, "balance_loss_clip": 1.04387021, "balance_loss_mlp": 1.02232814, "epoch": 0.5021494062828799, "flos": 23732859801600.0, "grad_norm": 1.746970205481512, "language_loss": 0.74981982, "learning_rate": 2.0831261171439873e-06, "loss": 0.77118939, "num_input_tokens_seen": 179573645, "step": 8352, "time_per_iteration": 2.7219762802124023 }, { "auxiliary_loss_clip": 0.01097371, "auxiliary_loss_mlp": 0.0103593, "balance_loss_clip": 1.04458284, "balance_loss_mlp": 1.02211952, "epoch": 0.5022095295355479, "flos": 21576747041280.0, "grad_norm": 1.6929263676943664, "language_loss": 0.71971965, "learning_rate": 2.082736990429464e-06, "loss": 0.74105263, "num_input_tokens_seen": 179591435, "step": 8353, "time_per_iteration": 2.6912848949432373 }, { "auxiliary_loss_clip": 0.01123337, "auxiliary_loss_mlp": 0.01037374, "balance_loss_clip": 1.05196476, "balance_loss_mlp": 1.02265787, "epoch": 0.5022696527882159, "flos": 21397229844480.0, "grad_norm": 1.8297806631316527, "language_loss": 0.74025398, "learning_rate": 2.0823478605775455e-06, "loss": 0.76186109, "num_input_tokens_seen": 179609955, "step": 8354, "time_per_iteration": 2.7325775623321533 }, { "auxiliary_loss_clip": 0.0110051, "auxiliary_loss_mlp": 0.01042571, "balance_loss_clip": 1.04367399, "balance_loss_mlp": 1.02817094, "epoch": 0.5023297760408838, "flos": 27160712565120.0, "grad_norm": 1.8324523966840642, "language_loss": 0.72395205, "learning_rate": 2.0819587276029884e-06, "loss": 0.74538279, "num_input_tokens_seen": 179630875, "step": 8355, "time_per_iteration": 2.717954158782959 }, { "auxiliary_loss_clip": 0.01117118, "auxiliary_loss_mlp": 0.01041207, "balance_loss_clip": 1.0459739, "balance_loss_mlp": 1.02644253, "epoch": 0.5023898992935518, "flos": 26213820186240.0, "grad_norm": 1.6992540953340016, "language_loss": 0.81400853, "learning_rate": 2.081569591520548e-06, "loss": 0.83559179, "num_input_tokens_seen": 179649835, "step": 8356, "time_per_iteration": 2.7149479389190674 }, { "auxiliary_loss_clip": 0.01117006, "auxiliary_loss_mlp": 0.01044256, "balance_loss_clip": 1.04384911, "balance_loss_mlp": 1.02906859, "epoch": 0.5024500225462197, "flos": 13440323111040.0, "grad_norm": 2.281950898223197, "language_loss": 0.76235557, "learning_rate": 2.0811804523449803e-06, "loss": 0.78396809, "num_input_tokens_seen": 179667605, "step": 8357, "time_per_iteration": 2.6641504764556885 }, { "auxiliary_loss_clip": 0.01115092, "auxiliary_loss_mlp": 0.01038737, "balance_loss_clip": 1.04538774, "balance_loss_mlp": 1.02369308, "epoch": 0.5025101457988878, "flos": 21579584215680.0, "grad_norm": 1.606830870939079, "language_loss": 0.766074, "learning_rate": 2.0807913100910417e-06, "loss": 0.78761232, "num_input_tokens_seen": 179686910, "step": 8358, "time_per_iteration": 2.715304136276245 }, { "auxiliary_loss_clip": 0.01101769, "auxiliary_loss_mlp": 0.0103829, "balance_loss_clip": 1.04243326, "balance_loss_mlp": 1.02330494, "epoch": 0.5025702690515557, "flos": 24645134448000.0, "grad_norm": 2.4091387510851354, "language_loss": 0.72286153, "learning_rate": 2.0804021647734887e-06, "loss": 0.7442621, "num_input_tokens_seen": 179706395, "step": 8359, "time_per_iteration": 2.7783002853393555 }, { "auxiliary_loss_clip": 0.01097913, "auxiliary_loss_mlp": 0.01045718, "balance_loss_clip": 1.04463625, "balance_loss_mlp": 1.03208613, "epoch": 0.5026303923042237, "flos": 22090162089600.0, "grad_norm": 1.9040983502257391, "language_loss": 0.76839483, "learning_rate": 2.080013016407077e-06, "loss": 0.7898311, "num_input_tokens_seen": 179725735, "step": 8360, "time_per_iteration": 2.6632778644561768 }, { "auxiliary_loss_clip": 0.01085631, "auxiliary_loss_mlp": 0.01038787, "balance_loss_clip": 1.04737091, "balance_loss_mlp": 1.02541208, "epoch": 0.5026905155568916, "flos": 23697200574720.0, "grad_norm": 1.9221287440607566, "language_loss": 0.7667141, "learning_rate": 2.0796238650065645e-06, "loss": 0.78795838, "num_input_tokens_seen": 179746150, "step": 8361, "time_per_iteration": 2.7411348819732666 }, { "auxiliary_loss_clip": 0.01096697, "auxiliary_loss_mlp": 0.01034867, "balance_loss_clip": 1.04426289, "balance_loss_mlp": 1.01988244, "epoch": 0.5027506388095596, "flos": 25812410722560.0, "grad_norm": 1.5686217043676736, "language_loss": 0.85069525, "learning_rate": 2.0792347105867065e-06, "loss": 0.87201089, "num_input_tokens_seen": 179767550, "step": 8362, "time_per_iteration": 2.827319622039795 }, { "auxiliary_loss_clip": 0.01102707, "auxiliary_loss_mlp": 0.01033879, "balance_loss_clip": 1.0435946, "balance_loss_mlp": 1.02022946, "epoch": 0.5028107620622275, "flos": 27526606456320.0, "grad_norm": 1.54737690881779, "language_loss": 0.78134143, "learning_rate": 2.0788455531622605e-06, "loss": 0.80270725, "num_input_tokens_seen": 179790075, "step": 8363, "time_per_iteration": 2.76174259185791 }, { "auxiliary_loss_clip": 0.01111576, "auxiliary_loss_mlp": 0.01035086, "balance_loss_clip": 1.04562223, "balance_loss_mlp": 1.02060819, "epoch": 0.5028708853148955, "flos": 24534278098560.0, "grad_norm": 3.229087026174198, "language_loss": 0.75995886, "learning_rate": 2.0784563927479838e-06, "loss": 0.78142548, "num_input_tokens_seen": 179806515, "step": 8364, "time_per_iteration": 4.35154914855957 }, { "auxiliary_loss_clip": 0.01124922, "auxiliary_loss_mlp": 0.01030963, "balance_loss_clip": 1.04685044, "balance_loss_mlp": 1.01810658, "epoch": 0.5029310085675635, "flos": 20813609664000.0, "grad_norm": 1.5241312757107228, "language_loss": 0.69465041, "learning_rate": 2.0780672293586317e-06, "loss": 0.71620929, "num_input_tokens_seen": 179826450, "step": 8365, "time_per_iteration": 2.619415283203125 }, { "auxiliary_loss_clip": 0.01103666, "auxiliary_loss_mlp": 0.01034829, "balance_loss_clip": 1.04435158, "balance_loss_mlp": 1.0207144, "epoch": 0.5029911318202315, "flos": 22342470197760.0, "grad_norm": 1.4884180792885182, "language_loss": 0.73293805, "learning_rate": 2.0776780630089635e-06, "loss": 0.75432301, "num_input_tokens_seen": 179846770, "step": 8366, "time_per_iteration": 4.228264331817627 }, { "auxiliary_loss_clip": 0.01113401, "auxiliary_loss_mlp": 0.01032302, "balance_loss_clip": 1.04693627, "balance_loss_mlp": 1.0189749, "epoch": 0.5030512550728995, "flos": 24352713826560.0, "grad_norm": 1.4343945223262573, "language_loss": 0.7806654, "learning_rate": 2.077288893713735e-06, "loss": 0.80212247, "num_input_tokens_seen": 179866585, "step": 8367, "time_per_iteration": 4.1336071491241455 }, { "auxiliary_loss_clip": 0.01113589, "auxiliary_loss_mlp": 0.01031042, "balance_loss_clip": 1.0443697, "balance_loss_mlp": 1.01778555, "epoch": 0.5031113783255674, "flos": 18259930195200.0, "grad_norm": 1.686368676940742, "language_loss": 0.69880998, "learning_rate": 2.0768997214877035e-06, "loss": 0.72025627, "num_input_tokens_seen": 179885575, "step": 8368, "time_per_iteration": 2.5836374759674072 }, { "auxiliary_loss_clip": 0.01036914, "auxiliary_loss_mlp": 0.01003217, "balance_loss_clip": 1.0201298, "balance_loss_mlp": 1.00156045, "epoch": 0.5031715015782354, "flos": 57253173200640.0, "grad_norm": 0.8467965026864039, "language_loss": 0.63315928, "learning_rate": 2.0765105463456274e-06, "loss": 0.65356052, "num_input_tokens_seen": 179939650, "step": 8369, "time_per_iteration": 4.438805103302002 }, { "auxiliary_loss_clip": 0.011076, "auxiliary_loss_mlp": 0.01034663, "balance_loss_clip": 1.04427028, "balance_loss_mlp": 1.0215379, "epoch": 0.5032316248309033, "flos": 27527360641920.0, "grad_norm": 2.0752589468807043, "language_loss": 0.60782373, "learning_rate": 2.076121368302263e-06, "loss": 0.62924629, "num_input_tokens_seen": 179961765, "step": 8370, "time_per_iteration": 2.65816330909729 }, { "auxiliary_loss_clip": 0.01076531, "auxiliary_loss_mlp": 0.01043773, "balance_loss_clip": 1.04144311, "balance_loss_mlp": 1.02868104, "epoch": 0.5032917480835714, "flos": 34495825939200.0, "grad_norm": 1.8954281033433134, "language_loss": 0.68462563, "learning_rate": 2.0757321873723695e-06, "loss": 0.70582867, "num_input_tokens_seen": 179983015, "step": 8371, "time_per_iteration": 2.8479132652282715 }, { "auxiliary_loss_clip": 0.01097422, "auxiliary_loss_mlp": 0.01034396, "balance_loss_clip": 1.04120922, "balance_loss_mlp": 1.019364, "epoch": 0.5033518713362393, "flos": 33656773167360.0, "grad_norm": 1.6611598690743674, "language_loss": 0.67656618, "learning_rate": 2.0753430035707042e-06, "loss": 0.69788438, "num_input_tokens_seen": 180003210, "step": 8372, "time_per_iteration": 2.767489194869995 }, { "auxiliary_loss_clip": 0.01085092, "auxiliary_loss_mlp": 0.01043333, "balance_loss_clip": 1.04139996, "balance_loss_mlp": 1.02714443, "epoch": 0.5034119945889073, "flos": 28185495586560.0, "grad_norm": 1.9018001021824607, "language_loss": 0.66726547, "learning_rate": 2.0749538169120235e-06, "loss": 0.68854976, "num_input_tokens_seen": 180025530, "step": 8373, "time_per_iteration": 2.7779579162597656 }, { "auxiliary_loss_clip": 0.0109703, "auxiliary_loss_mlp": 0.01035617, "balance_loss_clip": 1.04184651, "balance_loss_mlp": 1.02208042, "epoch": 0.5034721178415752, "flos": 21358697529600.0, "grad_norm": 1.7065424378128664, "language_loss": 0.74679291, "learning_rate": 2.0745646274110872e-06, "loss": 0.76811939, "num_input_tokens_seen": 180043180, "step": 8374, "time_per_iteration": 2.673182487487793 }, { "auxiliary_loss_clip": 0.01100104, "auxiliary_loss_mlp": 0.01040932, "balance_loss_clip": 1.04264212, "balance_loss_mlp": 1.02604842, "epoch": 0.5035322410942432, "flos": 22674823764480.0, "grad_norm": 1.5424981365737231, "language_loss": 0.68154198, "learning_rate": 2.0741754350826525e-06, "loss": 0.70295238, "num_input_tokens_seen": 180062905, "step": 8375, "time_per_iteration": 2.6842665672302246 }, { "auxiliary_loss_clip": 0.01077033, "auxiliary_loss_mlp": 0.01034566, "balance_loss_clip": 1.04517126, "balance_loss_mlp": 1.0195334, "epoch": 0.5035923643469111, "flos": 19828723674240.0, "grad_norm": 3.5828954699990656, "language_loss": 0.79316169, "learning_rate": 2.0737862399414777e-06, "loss": 0.81427765, "num_input_tokens_seen": 180082000, "step": 8376, "time_per_iteration": 2.7780654430389404 }, { "auxiliary_loss_clip": 0.01117369, "auxiliary_loss_mlp": 0.00771622, "balance_loss_clip": 1.04441619, "balance_loss_mlp": 1.00016475, "epoch": 0.5036524875995791, "flos": 30514625182080.0, "grad_norm": 2.6140774214814693, "language_loss": 0.59478593, "learning_rate": 2.0733970420023213e-06, "loss": 0.61367583, "num_input_tokens_seen": 180101340, "step": 8377, "time_per_iteration": 2.8071539402008057 }, { "auxiliary_loss_clip": 0.01101437, "auxiliary_loss_mlp": 0.01036815, "balance_loss_clip": 1.04309344, "balance_loss_mlp": 1.02237928, "epoch": 0.5037126108522471, "flos": 14720574637440.0, "grad_norm": 2.0235166884987663, "language_loss": 0.76598781, "learning_rate": 2.0730078412799425e-06, "loss": 0.78737032, "num_input_tokens_seen": 180119160, "step": 8378, "time_per_iteration": 2.7332303524017334 }, { "auxiliary_loss_clip": 0.01086538, "auxiliary_loss_mlp": 0.01035008, "balance_loss_clip": 1.04592919, "balance_loss_mlp": 1.02190685, "epoch": 0.5037727341049151, "flos": 25297702784640.0, "grad_norm": 1.7029006786118923, "language_loss": 0.75000858, "learning_rate": 2.0726186377890985e-06, "loss": 0.77122402, "num_input_tokens_seen": 180138730, "step": 8379, "time_per_iteration": 2.8803420066833496 }, { "auxiliary_loss_clip": 0.0111301, "auxiliary_loss_mlp": 0.01035016, "balance_loss_clip": 1.04890418, "balance_loss_mlp": 1.02151, "epoch": 0.5038328573575831, "flos": 28541764632960.0, "grad_norm": 2.071075437448324, "language_loss": 0.67026305, "learning_rate": 2.072229431544548e-06, "loss": 0.69174337, "num_input_tokens_seen": 180158810, "step": 8380, "time_per_iteration": 2.7347092628479004 }, { "auxiliary_loss_clip": 0.01070606, "auxiliary_loss_mlp": 0.01037412, "balance_loss_clip": 1.04154301, "balance_loss_mlp": 1.02420914, "epoch": 0.503892980610251, "flos": 31649869503360.0, "grad_norm": 1.7540511910669407, "language_loss": 0.63245583, "learning_rate": 2.071840222561051e-06, "loss": 0.65353596, "num_input_tokens_seen": 180179700, "step": 8381, "time_per_iteration": 2.836247444152832 }, { "auxiliary_loss_clip": 0.01101604, "auxiliary_loss_mlp": 0.01039312, "balance_loss_clip": 1.04428375, "balance_loss_mlp": 1.02624631, "epoch": 0.503953103862919, "flos": 27089358197760.0, "grad_norm": 1.4852984664170332, "language_loss": 0.67586917, "learning_rate": 2.071451010853365e-06, "loss": 0.69727832, "num_input_tokens_seen": 180199890, "step": 8382, "time_per_iteration": 2.776895523071289 }, { "auxiliary_loss_clip": 0.01115945, "auxiliary_loss_mlp": 0.010349, "balance_loss_clip": 1.04923749, "balance_loss_mlp": 1.02039194, "epoch": 0.5040132271155869, "flos": 15632957024640.0, "grad_norm": 2.370012953933875, "language_loss": 0.62379169, "learning_rate": 2.0710617964362506e-06, "loss": 0.64530009, "num_input_tokens_seen": 180217840, "step": 8383, "time_per_iteration": 2.7200045585632324 }, { "auxiliary_loss_clip": 0.0108883, "auxiliary_loss_mlp": 0.01037077, "balance_loss_clip": 1.04611087, "balance_loss_mlp": 1.02349341, "epoch": 0.504073350368255, "flos": 13590106824960.0, "grad_norm": 1.70449565256652, "language_loss": 0.66918409, "learning_rate": 2.070672579324465e-06, "loss": 0.69044316, "num_input_tokens_seen": 180236465, "step": 8384, "time_per_iteration": 2.7442476749420166 }, { "auxiliary_loss_clip": 0.01108405, "auxiliary_loss_mlp": 0.01040675, "balance_loss_clip": 1.04502487, "balance_loss_mlp": 1.02765775, "epoch": 0.5041334736209229, "flos": 29058160510080.0, "grad_norm": 3.2853523964565072, "language_loss": 0.7103979, "learning_rate": 2.0702833595327674e-06, "loss": 0.73188871, "num_input_tokens_seen": 180258025, "step": 8385, "time_per_iteration": 2.7480194568634033 }, { "auxiliary_loss_clip": 0.01110668, "auxiliary_loss_mlp": 0.01029456, "balance_loss_clip": 1.0450182, "balance_loss_mlp": 1.01644468, "epoch": 0.5041935968735909, "flos": 24608361899520.0, "grad_norm": 1.9814049774657359, "language_loss": 0.83344412, "learning_rate": 2.069894137075919e-06, "loss": 0.8548454, "num_input_tokens_seen": 180277825, "step": 8386, "time_per_iteration": 2.703789234161377 }, { "auxiliary_loss_clip": 0.01108831, "auxiliary_loss_mlp": 0.01037004, "balance_loss_clip": 1.04437232, "balance_loss_mlp": 1.02313972, "epoch": 0.5042537201262588, "flos": 26286934320000.0, "grad_norm": 1.592773103928685, "language_loss": 0.66832674, "learning_rate": 2.0695049119686766e-06, "loss": 0.68978512, "num_input_tokens_seen": 180300465, "step": 8387, "time_per_iteration": 2.8348472118377686 }, { "auxiliary_loss_clip": 0.0106703, "auxiliary_loss_mlp": 0.01033704, "balance_loss_clip": 1.03972006, "balance_loss_mlp": 1.02091861, "epoch": 0.5043138433789268, "flos": 22017371178240.0, "grad_norm": 1.386335560273684, "language_loss": 0.80273068, "learning_rate": 2.0691156842258016e-06, "loss": 0.82373804, "num_input_tokens_seen": 180321050, "step": 8388, "time_per_iteration": 2.8797311782836914 }, { "auxiliary_loss_clip": 0.01112016, "auxiliary_loss_mlp": 0.01032606, "balance_loss_clip": 1.04459918, "balance_loss_mlp": 1.01927233, "epoch": 0.5043739666315947, "flos": 28767104605440.0, "grad_norm": 2.1659708262729436, "language_loss": 0.69815123, "learning_rate": 2.0687264538620537e-06, "loss": 0.7195974, "num_input_tokens_seen": 180338870, "step": 8389, "time_per_iteration": 2.7739861011505127 }, { "auxiliary_loss_clip": 0.01090981, "auxiliary_loss_mlp": 0.01040643, "balance_loss_clip": 1.04124045, "balance_loss_mlp": 1.02756596, "epoch": 0.5044340898842627, "flos": 27599253713280.0, "grad_norm": 1.6276843858059296, "language_loss": 0.6986587, "learning_rate": 2.068337220892191e-06, "loss": 0.71997494, "num_input_tokens_seen": 180361285, "step": 8390, "time_per_iteration": 2.844275712966919 }, { "auxiliary_loss_clip": 0.01033792, "auxiliary_loss_mlp": 0.01003101, "balance_loss_clip": 1.02656126, "balance_loss_mlp": 1.00192666, "epoch": 0.5044942131369307, "flos": 67458050749440.0, "grad_norm": 0.9139771068710668, "language_loss": 0.52933067, "learning_rate": 2.067947985330974e-06, "loss": 0.54969966, "num_input_tokens_seen": 180415170, "step": 8391, "time_per_iteration": 3.054262638092041 }, { "auxiliary_loss_clip": 0.01015619, "auxiliary_loss_mlp": 0.01001074, "balance_loss_clip": 1.02201819, "balance_loss_mlp": 0.99963111, "epoch": 0.5045543363895987, "flos": 58630849390080.0, "grad_norm": 0.853635093218063, "language_loss": 0.60675329, "learning_rate": 2.0675587471931628e-06, "loss": 0.62692022, "num_input_tokens_seen": 180468060, "step": 8392, "time_per_iteration": 3.0727028846740723 }, { "auxiliary_loss_clip": 0.01085218, "auxiliary_loss_mlp": 0.01036141, "balance_loss_clip": 1.04148042, "balance_loss_mlp": 1.02351034, "epoch": 0.5046144596422667, "flos": 22526620248960.0, "grad_norm": 2.343143032045354, "language_loss": 0.84343797, "learning_rate": 2.067169506493517e-06, "loss": 0.86465156, "num_input_tokens_seen": 180486610, "step": 8393, "time_per_iteration": 2.7260749340057373 }, { "auxiliary_loss_clip": 0.01087949, "auxiliary_loss_mlp": 0.01033749, "balance_loss_clip": 1.04098725, "balance_loss_mlp": 1.02107096, "epoch": 0.5046745828949346, "flos": 27454246508160.0, "grad_norm": 1.8418334138160795, "language_loss": 0.50936127, "learning_rate": 2.0667802632467974e-06, "loss": 0.53057826, "num_input_tokens_seen": 180508135, "step": 8394, "time_per_iteration": 2.827000617980957 }, { "auxiliary_loss_clip": 0.01121524, "auxiliary_loss_mlp": 0.0103809, "balance_loss_clip": 1.04323471, "balance_loss_mlp": 1.02311766, "epoch": 0.5047347061476026, "flos": 17274541415040.0, "grad_norm": 1.5679941994223312, "language_loss": 0.75414777, "learning_rate": 2.0663910174677627e-06, "loss": 0.7757439, "num_input_tokens_seen": 180527000, "step": 8395, "time_per_iteration": 2.6535708904266357 }, { "auxiliary_loss_clip": 0.01106012, "auxiliary_loss_mlp": 0.01041618, "balance_loss_clip": 1.04312563, "balance_loss_mlp": 1.02860057, "epoch": 0.5047948294002705, "flos": 16649515831680.0, "grad_norm": 2.0910564250698562, "language_loss": 0.68781769, "learning_rate": 2.0660017691711737e-06, "loss": 0.70929396, "num_input_tokens_seen": 180544715, "step": 8396, "time_per_iteration": 2.700747013092041 }, { "auxiliary_loss_clip": 0.01111788, "auxiliary_loss_mlp": 0.0103291, "balance_loss_clip": 1.04604292, "balance_loss_mlp": 1.02059579, "epoch": 0.5048549526529386, "flos": 26865706164480.0, "grad_norm": 3.479269791703844, "language_loss": 0.78899479, "learning_rate": 2.065612518371792e-06, "loss": 0.81044173, "num_input_tokens_seen": 180565365, "step": 8397, "time_per_iteration": 2.716320514678955 }, { "auxiliary_loss_clip": 0.01078686, "auxiliary_loss_mlp": 0.01033767, "balance_loss_clip": 1.04137075, "balance_loss_mlp": 1.02079701, "epoch": 0.5049150759056065, "flos": 21833939399040.0, "grad_norm": 3.435063442023246, "language_loss": 0.66291559, "learning_rate": 2.065223265084376e-06, "loss": 0.68404007, "num_input_tokens_seen": 180586670, "step": 8398, "time_per_iteration": 2.773245334625244 }, { "auxiliary_loss_clip": 0.01113858, "auxiliary_loss_mlp": 0.00770983, "balance_loss_clip": 1.04783058, "balance_loss_mlp": 1.00018215, "epoch": 0.5049751991582745, "flos": 21685807710720.0, "grad_norm": 1.5640615321007765, "language_loss": 0.720043, "learning_rate": 2.064834009323688e-06, "loss": 0.73889136, "num_input_tokens_seen": 180605085, "step": 8399, "time_per_iteration": 2.697688341140747 }, { "auxiliary_loss_clip": 0.01091578, "auxiliary_loss_mlp": 0.01053063, "balance_loss_clip": 1.04215539, "balance_loss_mlp": 1.03741038, "epoch": 0.5050353224109424, "flos": 21359379888000.0, "grad_norm": 3.5795224523825695, "language_loss": 0.81615806, "learning_rate": 2.0644447511044878e-06, "loss": 0.8376044, "num_input_tokens_seen": 180624370, "step": 8400, "time_per_iteration": 2.7172608375549316 }, { "auxiliary_loss_clip": 0.01084985, "auxiliary_loss_mlp": 0.01039311, "balance_loss_clip": 1.04359269, "balance_loss_mlp": 1.02413547, "epoch": 0.5050954456636104, "flos": 22820082364800.0, "grad_norm": 1.9975954417395212, "language_loss": 0.78901821, "learning_rate": 2.0640554904415362e-06, "loss": 0.81026119, "num_input_tokens_seen": 180642450, "step": 8401, "time_per_iteration": 2.790361166000366 }, { "auxiliary_loss_clip": 0.01125612, "auxiliary_loss_mlp": 0.00770602, "balance_loss_clip": 1.04576373, "balance_loss_mlp": 1.00024748, "epoch": 0.5051555689162783, "flos": 30448226891520.0, "grad_norm": 1.6142524162989784, "language_loss": 0.70102769, "learning_rate": 2.063666227349593e-06, "loss": 0.7199899, "num_input_tokens_seen": 180665250, "step": 8402, "time_per_iteration": 2.6950721740722656 }, { "auxiliary_loss_clip": 0.01112822, "auxiliary_loss_mlp": 0.00771289, "balance_loss_clip": 1.04341567, "balance_loss_mlp": 1.00022268, "epoch": 0.5052156921689464, "flos": 21287953693440.0, "grad_norm": 2.3922403816883433, "language_loss": 0.69298434, "learning_rate": 2.063276961843422e-06, "loss": 0.71182549, "num_input_tokens_seen": 180687425, "step": 8403, "time_per_iteration": 4.257136344909668 }, { "auxiliary_loss_clip": 0.01109967, "auxiliary_loss_mlp": 0.01043124, "balance_loss_clip": 1.04455948, "balance_loss_mlp": 1.03021932, "epoch": 0.5052758154216143, "flos": 25081305298560.0, "grad_norm": 1.6578366313908228, "language_loss": 0.85693455, "learning_rate": 2.062887693937781e-06, "loss": 0.87846541, "num_input_tokens_seen": 180708725, "step": 8404, "time_per_iteration": 2.725935459136963 }, { "auxiliary_loss_clip": 0.01087696, "auxiliary_loss_mlp": 0.00769912, "balance_loss_clip": 1.04370379, "balance_loss_mlp": 1.00018847, "epoch": 0.5053359386742823, "flos": 20885502735360.0, "grad_norm": 1.5507323053673605, "language_loss": 0.75329977, "learning_rate": 2.0624984236474322e-06, "loss": 0.77187586, "num_input_tokens_seen": 180727990, "step": 8405, "time_per_iteration": 4.237490653991699 }, { "auxiliary_loss_clip": 0.01124188, "auxiliary_loss_mlp": 0.01031903, "balance_loss_clip": 1.04560125, "balance_loss_mlp": 1.01756775, "epoch": 0.5053960619269503, "flos": 37743335493120.0, "grad_norm": 1.5851552924914987, "language_loss": 0.73046809, "learning_rate": 2.0621091509871378e-06, "loss": 0.75202894, "num_input_tokens_seen": 180749765, "step": 8406, "time_per_iteration": 4.387450218200684 }, { "auxiliary_loss_clip": 0.0108276, "auxiliary_loss_mlp": 0.01031932, "balance_loss_clip": 1.04293895, "balance_loss_mlp": 1.01945066, "epoch": 0.5054561851796182, "flos": 23513840622720.0, "grad_norm": 1.8244341787972256, "language_loss": 0.76631331, "learning_rate": 2.0617198759716568e-06, "loss": 0.78746021, "num_input_tokens_seen": 180769580, "step": 8407, "time_per_iteration": 2.765031099319458 }, { "auxiliary_loss_clip": 0.01085678, "auxiliary_loss_mlp": 0.01030739, "balance_loss_clip": 1.04038286, "balance_loss_mlp": 1.01838887, "epoch": 0.5055163084322862, "flos": 30410233280640.0, "grad_norm": 1.769865286909125, "language_loss": 0.63482308, "learning_rate": 2.0613305986157535e-06, "loss": 0.65598726, "num_input_tokens_seen": 180790295, "step": 8408, "time_per_iteration": 2.7497997283935547 }, { "auxiliary_loss_clip": 0.01094613, "auxiliary_loss_mlp": 0.01046938, "balance_loss_clip": 1.04494774, "balance_loss_mlp": 1.03097582, "epoch": 0.5055764316849541, "flos": 20259651139200.0, "grad_norm": 1.9259425074827412, "language_loss": 0.63427341, "learning_rate": 2.0609413189341865e-06, "loss": 0.655689, "num_input_tokens_seen": 180807875, "step": 8409, "time_per_iteration": 4.083381652832031 }, { "auxiliary_loss_clip": 0.01099903, "auxiliary_loss_mlp": 0.01029856, "balance_loss_clip": 1.04535913, "balance_loss_mlp": 1.01790488, "epoch": 0.5056365549376222, "flos": 26070895969920.0, "grad_norm": 2.0381050127162528, "language_loss": 0.71175253, "learning_rate": 2.0605520369417193e-06, "loss": 0.73305017, "num_input_tokens_seen": 180831300, "step": 8410, "time_per_iteration": 2.7279632091522217 }, { "auxiliary_loss_clip": 0.01097675, "auxiliary_loss_mlp": 0.0104194, "balance_loss_clip": 1.04237318, "balance_loss_mlp": 1.02787888, "epoch": 0.5056966781902901, "flos": 19279074781440.0, "grad_norm": 1.4485544779958848, "language_loss": 0.79037184, "learning_rate": 2.060162752653113e-06, "loss": 0.81176794, "num_input_tokens_seen": 180849055, "step": 8411, "time_per_iteration": 2.6837332248687744 }, { "auxiliary_loss_clip": 0.01125313, "auxiliary_loss_mlp": 0.01039106, "balance_loss_clip": 1.04655755, "balance_loss_mlp": 1.02372837, "epoch": 0.5057568014429581, "flos": 21323325611520.0, "grad_norm": 1.8986612146492552, "language_loss": 0.81808418, "learning_rate": 2.0597734660831285e-06, "loss": 0.83972836, "num_input_tokens_seen": 180867395, "step": 8412, "time_per_iteration": 2.615809679031372 }, { "auxiliary_loss_clip": 0.01103779, "auxiliary_loss_mlp": 0.01041145, "balance_loss_clip": 1.04390502, "balance_loss_mlp": 1.02739954, "epoch": 0.505816924695626, "flos": 17493596507520.0, "grad_norm": 1.9029826105260268, "language_loss": 0.80660832, "learning_rate": 2.0593841772465283e-06, "loss": 0.82805753, "num_input_tokens_seen": 180886670, "step": 8413, "time_per_iteration": 2.7692911624908447 }, { "auxiliary_loss_clip": 0.0109162, "auxiliary_loss_mlp": 0.00771431, "balance_loss_clip": 1.04406643, "balance_loss_mlp": 1.00020945, "epoch": 0.505877047948294, "flos": 21142084561920.0, "grad_norm": 1.9410580169313951, "language_loss": 0.80582374, "learning_rate": 2.0589948861580737e-06, "loss": 0.82445419, "num_input_tokens_seen": 180904645, "step": 8414, "time_per_iteration": 2.6970348358154297 }, { "auxiliary_loss_clip": 0.01107406, "auxiliary_loss_mlp": 0.01030257, "balance_loss_clip": 1.03923571, "balance_loss_mlp": 1.0169946, "epoch": 0.5059371712009619, "flos": 36350036887680.0, "grad_norm": 2.0609800291463225, "language_loss": 0.62233627, "learning_rate": 2.058605592832528e-06, "loss": 0.64371288, "num_input_tokens_seen": 180922340, "step": 8415, "time_per_iteration": 2.7197422981262207 }, { "auxiliary_loss_clip": 0.01087332, "auxiliary_loss_mlp": 0.01032316, "balance_loss_clip": 1.04092574, "balance_loss_mlp": 1.01899433, "epoch": 0.50599729445363, "flos": 22673387220480.0, "grad_norm": 1.6231002317718672, "language_loss": 0.81935573, "learning_rate": 2.0582162972846515e-06, "loss": 0.84055215, "num_input_tokens_seen": 180941350, "step": 8416, "time_per_iteration": 2.782719612121582 }, { "auxiliary_loss_clip": 0.01091272, "auxiliary_loss_mlp": 0.0103737, "balance_loss_clip": 1.04698849, "balance_loss_mlp": 1.02498984, "epoch": 0.5060574177062979, "flos": 22747866071040.0, "grad_norm": 1.5803053727793945, "language_loss": 0.78981423, "learning_rate": 2.0578269995292078e-06, "loss": 0.81110072, "num_input_tokens_seen": 180960720, "step": 8417, "time_per_iteration": 2.7089340686798096 }, { "auxiliary_loss_clip": 0.01070059, "auxiliary_loss_mlp": 0.01039058, "balance_loss_clip": 1.0394783, "balance_loss_mlp": 1.02599227, "epoch": 0.5061175409589659, "flos": 21653201139840.0, "grad_norm": 1.8562945560748794, "language_loss": 0.62433213, "learning_rate": 2.0574376995809588e-06, "loss": 0.64542329, "num_input_tokens_seen": 180979725, "step": 8418, "time_per_iteration": 2.719282388687134 }, { "auxiliary_loss_clip": 0.0109094, "auxiliary_loss_mlp": 0.01035325, "balance_loss_clip": 1.04258347, "balance_loss_mlp": 1.02194929, "epoch": 0.5061776642116339, "flos": 21616249023360.0, "grad_norm": 2.2787836153634724, "language_loss": 0.77394211, "learning_rate": 2.0570483974546653e-06, "loss": 0.79520482, "num_input_tokens_seen": 180998980, "step": 8419, "time_per_iteration": 2.741727113723755 }, { "auxiliary_loss_clip": 0.01062039, "auxiliary_loss_mlp": 0.0103574, "balance_loss_clip": 1.04027188, "balance_loss_mlp": 1.02160168, "epoch": 0.5062377874643018, "flos": 24426294837120.0, "grad_norm": 1.7570247471688223, "language_loss": 0.77180004, "learning_rate": 2.0566590931650917e-06, "loss": 0.79277784, "num_input_tokens_seen": 181019165, "step": 8420, "time_per_iteration": 2.8240675926208496 }, { "auxiliary_loss_clip": 0.01123562, "auxiliary_loss_mlp": 0.01036164, "balance_loss_clip": 1.04462767, "balance_loss_mlp": 1.02188277, "epoch": 0.5062979107169698, "flos": 22524429519360.0, "grad_norm": 1.730716034871051, "language_loss": 0.77317429, "learning_rate": 2.056269786726999e-06, "loss": 0.79477155, "num_input_tokens_seen": 181037110, "step": 8421, "time_per_iteration": 2.6797008514404297 }, { "auxiliary_loss_clip": 0.01106529, "auxiliary_loss_mlp": 0.01032526, "balance_loss_clip": 1.04212284, "balance_loss_mlp": 1.01860261, "epoch": 0.5063580339696377, "flos": 24571984400640.0, "grad_norm": 1.4584078249019805, "language_loss": 0.66635919, "learning_rate": 2.0558804781551512e-06, "loss": 0.68774974, "num_input_tokens_seen": 181057775, "step": 8422, "time_per_iteration": 2.80218505859375 }, { "auxiliary_loss_clip": 0.01123775, "auxiliary_loss_mlp": 0.01032917, "balance_loss_clip": 1.04679537, "balance_loss_mlp": 1.01939869, "epoch": 0.5064181572223058, "flos": 22596143022720.0, "grad_norm": 1.7069001340883154, "language_loss": 0.818717, "learning_rate": 2.05549116746431e-06, "loss": 0.84028399, "num_input_tokens_seen": 181078260, "step": 8423, "time_per_iteration": 2.6722168922424316 }, { "auxiliary_loss_clip": 0.01124994, "auxiliary_loss_mlp": 0.00771759, "balance_loss_clip": 1.04458904, "balance_loss_mlp": 1.00021005, "epoch": 0.5064782804749737, "flos": 25994944661760.0, "grad_norm": 1.7762047243227106, "language_loss": 0.74689841, "learning_rate": 2.055101854669237e-06, "loss": 0.76586592, "num_input_tokens_seen": 181098755, "step": 8424, "time_per_iteration": 2.657538652420044 }, { "auxiliary_loss_clip": 0.01121266, "auxiliary_loss_mlp": 0.01037955, "balance_loss_clip": 1.04494393, "balance_loss_mlp": 1.02427602, "epoch": 0.5065384037276417, "flos": 28553041503360.0, "grad_norm": 1.7147939268792267, "language_loss": 0.71541035, "learning_rate": 2.0547125397846975e-06, "loss": 0.73700261, "num_input_tokens_seen": 181121570, "step": 8425, "time_per_iteration": 2.6696951389312744 }, { "auxiliary_loss_clip": 0.0108314, "auxiliary_loss_mlp": 0.01043142, "balance_loss_clip": 1.04042649, "balance_loss_mlp": 1.02828813, "epoch": 0.5065985269803096, "flos": 22966023323520.0, "grad_norm": 1.7834107132976578, "language_loss": 0.7868796, "learning_rate": 2.0543232228254524e-06, "loss": 0.80814242, "num_input_tokens_seen": 181140240, "step": 8426, "time_per_iteration": 2.702861785888672 }, { "auxiliary_loss_clip": 0.01116039, "auxiliary_loss_mlp": 0.01039376, "balance_loss_clip": 1.0481956, "balance_loss_mlp": 1.0255599, "epoch": 0.5066586502329776, "flos": 21608563512960.0, "grad_norm": 2.9338643206598713, "language_loss": 0.7762264, "learning_rate": 2.053933903806265e-06, "loss": 0.79778051, "num_input_tokens_seen": 181158630, "step": 8427, "time_per_iteration": 2.5964066982269287 }, { "auxiliary_loss_clip": 0.0112123, "auxiliary_loss_mlp": 0.01028788, "balance_loss_clip": 1.04505837, "balance_loss_mlp": 1.014763, "epoch": 0.5067187734856455, "flos": 20339912079360.0, "grad_norm": 2.519773325925209, "language_loss": 0.71591479, "learning_rate": 2.0535445827418997e-06, "loss": 0.73741496, "num_input_tokens_seen": 181176405, "step": 8428, "time_per_iteration": 2.5878183841705322 }, { "auxiliary_loss_clip": 0.01105053, "auxiliary_loss_mlp": 0.00769921, "balance_loss_clip": 1.041857, "balance_loss_mlp": 1.00016701, "epoch": 0.5067788967383136, "flos": 28841080665600.0, "grad_norm": 1.637474951892814, "language_loss": 0.83266222, "learning_rate": 2.0531552596471168e-06, "loss": 0.85141206, "num_input_tokens_seen": 181197595, "step": 8429, "time_per_iteration": 2.6528842449188232 }, { "auxiliary_loss_clip": 0.01094205, "auxiliary_loss_mlp": 0.0103555, "balance_loss_clip": 1.04527116, "balance_loss_mlp": 1.02068472, "epoch": 0.5068390199909815, "flos": 32450174478720.0, "grad_norm": 1.986559953193462, "language_loss": 0.73507559, "learning_rate": 2.052765934536682e-06, "loss": 0.75637317, "num_input_tokens_seen": 181218560, "step": 8430, "time_per_iteration": 2.8031511306762695 }, { "auxiliary_loss_clip": 0.01057925, "auxiliary_loss_mlp": 0.01041942, "balance_loss_clip": 1.03520572, "balance_loss_mlp": 1.02702332, "epoch": 0.5068991432436495, "flos": 23146582014720.0, "grad_norm": 2.0458094547910766, "language_loss": 0.77132332, "learning_rate": 2.0523766074253575e-06, "loss": 0.79232198, "num_input_tokens_seen": 181237095, "step": 8431, "time_per_iteration": 2.7593939304351807 }, { "auxiliary_loss_clip": 0.01108688, "auxiliary_loss_mlp": 0.01035857, "balance_loss_clip": 1.04256523, "balance_loss_mlp": 1.02171338, "epoch": 0.5069592664963174, "flos": 19936096404480.0, "grad_norm": 1.5904348009832192, "language_loss": 0.72110546, "learning_rate": 2.0519872783279074e-06, "loss": 0.74255085, "num_input_tokens_seen": 181255940, "step": 8432, "time_per_iteration": 2.6104278564453125 }, { "auxiliary_loss_clip": 0.0100252, "auxiliary_loss_mlp": 0.01010781, "balance_loss_clip": 1.01845694, "balance_loss_mlp": 1.00870693, "epoch": 0.5070193897489854, "flos": 65793771941760.0, "grad_norm": 0.7570764213883562, "language_loss": 0.63648349, "learning_rate": 2.0515979472590945e-06, "loss": 0.65661651, "num_input_tokens_seen": 181316945, "step": 8433, "time_per_iteration": 3.395040273666382 }, { "auxiliary_loss_clip": 0.01089015, "auxiliary_loss_mlp": 0.01040915, "balance_loss_clip": 1.04288781, "balance_loss_mlp": 1.02685428, "epoch": 0.5070795130016534, "flos": 17275331514240.0, "grad_norm": 2.2603713431070194, "language_loss": 0.78218484, "learning_rate": 2.051208614233681e-06, "loss": 0.80348414, "num_input_tokens_seen": 181335555, "step": 8434, "time_per_iteration": 2.705864667892456 }, { "auxiliary_loss_clip": 0.01099616, "auxiliary_loss_mlp": 0.01035206, "balance_loss_clip": 1.04088449, "balance_loss_mlp": 1.02169967, "epoch": 0.5071396362543213, "flos": 21069940095360.0, "grad_norm": 1.6177485307205706, "language_loss": 0.70698971, "learning_rate": 2.0508192792664326e-06, "loss": 0.72833788, "num_input_tokens_seen": 181354580, "step": 8435, "time_per_iteration": 2.699631929397583 }, { "auxiliary_loss_clip": 0.01115814, "auxiliary_loss_mlp": 0.01036717, "balance_loss_clip": 1.04539943, "balance_loss_mlp": 1.02220905, "epoch": 0.5071997595069894, "flos": 23144822248320.0, "grad_norm": 1.8141877812584497, "language_loss": 0.72254074, "learning_rate": 2.050429942372112e-06, "loss": 0.74406612, "num_input_tokens_seen": 181374320, "step": 8436, "time_per_iteration": 2.6646859645843506 }, { "auxiliary_loss_clip": 0.01124514, "auxiliary_loss_mlp": 0.01034184, "balance_loss_clip": 1.04597569, "balance_loss_mlp": 1.01978946, "epoch": 0.5072598827596573, "flos": 22747183712640.0, "grad_norm": 1.5423854267163515, "language_loss": 0.83801168, "learning_rate": 2.050040603565483e-06, "loss": 0.85959864, "num_input_tokens_seen": 181392190, "step": 8437, "time_per_iteration": 2.6614348888397217 }, { "auxiliary_loss_clip": 0.01110359, "auxiliary_loss_mlp": 0.01028112, "balance_loss_clip": 1.04387856, "balance_loss_mlp": 1.01448607, "epoch": 0.5073200060123253, "flos": 22566301799040.0, "grad_norm": 2.7232019997829924, "language_loss": 0.80638587, "learning_rate": 2.049651262861309e-06, "loss": 0.82777059, "num_input_tokens_seen": 181413890, "step": 8438, "time_per_iteration": 2.6778056621551514 }, { "auxiliary_loss_clip": 0.01081177, "auxiliary_loss_mlp": 0.01037532, "balance_loss_clip": 1.04218078, "balance_loss_mlp": 1.02235103, "epoch": 0.5073801292649932, "flos": 25806341324160.0, "grad_norm": 1.4751942737164592, "language_loss": 0.7943362, "learning_rate": 2.0492619202743543e-06, "loss": 0.81552327, "num_input_tokens_seen": 181433240, "step": 8439, "time_per_iteration": 2.694603443145752 }, { "auxiliary_loss_clip": 0.01088705, "auxiliary_loss_mlp": 0.0077357, "balance_loss_clip": 1.04178834, "balance_loss_mlp": 1.00020123, "epoch": 0.5074402525176612, "flos": 25373941401600.0, "grad_norm": 1.5360675692672114, "language_loss": 0.71413541, "learning_rate": 2.048872575819383e-06, "loss": 0.7327581, "num_input_tokens_seen": 181453535, "step": 8440, "time_per_iteration": 2.68709397315979 }, { "auxiliary_loss_clip": 0.01096271, "auxiliary_loss_mlp": 0.01036596, "balance_loss_clip": 1.04103327, "balance_loss_mlp": 1.0227561, "epoch": 0.5075003757703291, "flos": 26064431521920.0, "grad_norm": 1.6763306182018036, "language_loss": 0.7087847, "learning_rate": 2.048483229511158e-06, "loss": 0.73011339, "num_input_tokens_seen": 181474195, "step": 8441, "time_per_iteration": 2.728649377822876 }, { "auxiliary_loss_clip": 0.01113949, "auxiliary_loss_mlp": 0.00771406, "balance_loss_clip": 1.04312265, "balance_loss_mlp": 1.00021851, "epoch": 0.5075604990229972, "flos": 21835447770240.0, "grad_norm": 1.794299641086803, "language_loss": 0.63846874, "learning_rate": 2.0480938813644445e-06, "loss": 0.65732235, "num_input_tokens_seen": 181494000, "step": 8442, "time_per_iteration": 4.1495561599731445 }, { "auxiliary_loss_clip": 0.01065064, "auxiliary_loss_mlp": 0.01028245, "balance_loss_clip": 1.03900802, "balance_loss_mlp": 1.01582956, "epoch": 0.5076206222756651, "flos": 31978703537280.0, "grad_norm": 1.7729718848020288, "language_loss": 0.7149542, "learning_rate": 2.047704531394006e-06, "loss": 0.73588729, "num_input_tokens_seen": 181515955, "step": 8443, "time_per_iteration": 2.84781551361084 }, { "auxiliary_loss_clip": 0.01033895, "auxiliary_loss_mlp": 0.01036606, "balance_loss_clip": 1.03034997, "balance_loss_mlp": 1.02093554, "epoch": 0.5076807455283331, "flos": 36904031326080.0, "grad_norm": 1.237062481884337, "language_loss": 0.62134659, "learning_rate": 2.047315179614607e-06, "loss": 0.64205158, "num_input_tokens_seen": 181540225, "step": 8444, "time_per_iteration": 3.2103631496429443 }, { "auxiliary_loss_clip": 0.01086312, "auxiliary_loss_mlp": 0.01030312, "balance_loss_clip": 1.04043984, "balance_loss_mlp": 1.0172112, "epoch": 0.507740868781001, "flos": 29862415981440.0, "grad_norm": 1.7245082556223335, "language_loss": 0.64173615, "learning_rate": 2.046925826041012e-06, "loss": 0.66290236, "num_input_tokens_seen": 181560125, "step": 8445, "time_per_iteration": 4.46838903427124 }, { "auxiliary_loss_clip": 0.01013224, "auxiliary_loss_mlp": 0.01008254, "balance_loss_clip": 1.02398801, "balance_loss_mlp": 1.00686538, "epoch": 0.507800992033669, "flos": 61918974247680.0, "grad_norm": 0.8265855466772786, "language_loss": 0.61854541, "learning_rate": 2.0465364706879845e-06, "loss": 0.63876021, "num_input_tokens_seen": 181618830, "step": 8446, "time_per_iteration": 3.267681121826172 }, { "auxiliary_loss_clip": 0.01080886, "auxiliary_loss_mlp": 0.01028563, "balance_loss_clip": 1.0391748, "balance_loss_mlp": 1.0157063, "epoch": 0.507861115286337, "flos": 20700490757760.0, "grad_norm": 1.574417237275669, "language_loss": 0.8065623, "learning_rate": 2.04614711357029e-06, "loss": 0.82765681, "num_input_tokens_seen": 181637120, "step": 8447, "time_per_iteration": 2.761584758758545 }, { "auxiliary_loss_clip": 0.01111406, "auxiliary_loss_mlp": 0.01031653, "balance_loss_clip": 1.04490948, "balance_loss_mlp": 1.01859963, "epoch": 0.507921238539005, "flos": 30847050576000.0, "grad_norm": 1.8510365938740598, "language_loss": 0.70990604, "learning_rate": 2.0457577547026916e-06, "loss": 0.73133665, "num_input_tokens_seen": 181659965, "step": 8448, "time_per_iteration": 4.335421085357666 }, { "auxiliary_loss_clip": 0.01121931, "auxiliary_loss_mlp": 0.00769587, "balance_loss_clip": 1.04565167, "balance_loss_mlp": 1.00020599, "epoch": 0.507981361791673, "flos": 35700197984640.0, "grad_norm": 3.0099403095172557, "language_loss": 0.71958399, "learning_rate": 2.045368394099955e-06, "loss": 0.73849922, "num_input_tokens_seen": 181685290, "step": 8449, "time_per_iteration": 2.7780673503875732 }, { "auxiliary_loss_clip": 0.01094628, "auxiliary_loss_mlp": 0.0103001, "balance_loss_clip": 1.04017317, "balance_loss_mlp": 1.01767778, "epoch": 0.5080414850443409, "flos": 27161466750720.0, "grad_norm": 1.5810099588314865, "language_loss": 0.73045403, "learning_rate": 2.044979031776844e-06, "loss": 0.7517004, "num_input_tokens_seen": 181706080, "step": 8450, "time_per_iteration": 2.744396448135376 }, { "auxiliary_loss_clip": 0.01123333, "auxiliary_loss_mlp": 0.01027947, "balance_loss_clip": 1.04468369, "balance_loss_mlp": 1.01485837, "epoch": 0.5081016082970089, "flos": 27085192220160.0, "grad_norm": 1.7103931675901212, "language_loss": 0.77190459, "learning_rate": 2.0445896677481234e-06, "loss": 0.79341733, "num_input_tokens_seen": 181724805, "step": 8451, "time_per_iteration": 2.683182716369629 }, { "auxiliary_loss_clip": 0.01122238, "auxiliary_loss_mlp": 0.01037138, "balance_loss_clip": 1.04372776, "balance_loss_mlp": 1.02413273, "epoch": 0.5081617315496768, "flos": 22856531690880.0, "grad_norm": 1.9627256153454082, "language_loss": 0.85055304, "learning_rate": 2.044200302028559e-06, "loss": 0.87214684, "num_input_tokens_seen": 181743725, "step": 8452, "time_per_iteration": 2.684624671936035 }, { "auxiliary_loss_clip": 0.01126785, "auxiliary_loss_mlp": 0.01034895, "balance_loss_clip": 1.04584098, "balance_loss_mlp": 1.02078056, "epoch": 0.5082218548023448, "flos": 16281898087680.0, "grad_norm": 4.065129026902181, "language_loss": 0.77099299, "learning_rate": 2.0438109346329143e-06, "loss": 0.79260981, "num_input_tokens_seen": 181757720, "step": 8453, "time_per_iteration": 2.572178602218628 }, { "auxiliary_loss_clip": 0.01084848, "auxiliary_loss_mlp": 0.01032198, "balance_loss_clip": 1.04113591, "balance_loss_mlp": 1.02010989, "epoch": 0.5082819780550127, "flos": 24460768915200.0, "grad_norm": 1.6244227223176155, "language_loss": 0.76530403, "learning_rate": 2.0434215655759544e-06, "loss": 0.78647447, "num_input_tokens_seen": 181778545, "step": 8454, "time_per_iteration": 2.8153836727142334 }, { "auxiliary_loss_clip": 0.01097667, "auxiliary_loss_mlp": 0.01036941, "balance_loss_clip": 1.03992426, "balance_loss_mlp": 1.02275562, "epoch": 0.5083421013076808, "flos": 23403271582080.0, "grad_norm": 1.5351507829324025, "language_loss": 0.89199609, "learning_rate": 2.0430321948724446e-06, "loss": 0.91334224, "num_input_tokens_seen": 181799495, "step": 8455, "time_per_iteration": 2.7793357372283936 }, { "auxiliary_loss_clip": 0.01106838, "auxiliary_loss_mlp": 0.00772606, "balance_loss_clip": 1.04346323, "balance_loss_mlp": 1.00026703, "epoch": 0.5084022245603487, "flos": 23872695448320.0, "grad_norm": 1.6166334009695327, "language_loss": 0.62119138, "learning_rate": 2.042642822537149e-06, "loss": 0.63998592, "num_input_tokens_seen": 181818400, "step": 8456, "time_per_iteration": 2.7200372219085693 }, { "auxiliary_loss_clip": 0.01034029, "auxiliary_loss_mlp": 0.01006279, "balance_loss_clip": 1.01840019, "balance_loss_mlp": 1.00490177, "epoch": 0.5084623478130167, "flos": 62873336655360.0, "grad_norm": 0.8116383799523507, "language_loss": 0.6243, "learning_rate": 2.0422534485848343e-06, "loss": 0.64470303, "num_input_tokens_seen": 181875975, "step": 8457, "time_per_iteration": 3.087890625 }, { "auxiliary_loss_clip": 0.01113045, "auxiliary_loss_mlp": 0.01032551, "balance_loss_clip": 1.0439477, "balance_loss_mlp": 1.01853776, "epoch": 0.5085224710656846, "flos": 22346133384960.0, "grad_norm": 1.6206653077395385, "language_loss": 0.67609936, "learning_rate": 2.0418640730302644e-06, "loss": 0.6975553, "num_input_tokens_seen": 181896450, "step": 8458, "time_per_iteration": 2.6950957775115967 }, { "auxiliary_loss_clip": 0.011096, "auxiliary_loss_mlp": 0.01034441, "balance_loss_clip": 1.04140186, "balance_loss_mlp": 1.01998079, "epoch": 0.5085825943183526, "flos": 26066263115520.0, "grad_norm": 1.6983738136244226, "language_loss": 0.77766174, "learning_rate": 2.0414746958882043e-06, "loss": 0.79910213, "num_input_tokens_seen": 181916770, "step": 8459, "time_per_iteration": 2.699784278869629 }, { "auxiliary_loss_clip": 0.01127851, "auxiliary_loss_mlp": 0.01035156, "balance_loss_clip": 1.04686987, "balance_loss_mlp": 1.02099431, "epoch": 0.5086427175710206, "flos": 17420733768960.0, "grad_norm": 10.198892862393663, "language_loss": 0.8050856, "learning_rate": 2.0410853171734196e-06, "loss": 0.82671559, "num_input_tokens_seen": 181932710, "step": 8460, "time_per_iteration": 2.632998466491699 }, { "auxiliary_loss_clip": 0.01101605, "auxiliary_loss_mlp": 0.01038577, "balance_loss_clip": 1.04293346, "balance_loss_mlp": 1.0255115, "epoch": 0.5087028408236886, "flos": 20631758083200.0, "grad_norm": 1.5613520556763807, "language_loss": 0.68347144, "learning_rate": 2.0406959369006754e-06, "loss": 0.70487332, "num_input_tokens_seen": 181950665, "step": 8461, "time_per_iteration": 2.7463462352752686 }, { "auxiliary_loss_clip": 0.01118492, "auxiliary_loss_mlp": 0.01030227, "balance_loss_clip": 1.04215729, "balance_loss_mlp": 1.01677442, "epoch": 0.5087629640763566, "flos": 25593822506880.0, "grad_norm": 1.9214201788253797, "language_loss": 0.76016432, "learning_rate": 2.0403065550847375e-06, "loss": 0.7816515, "num_input_tokens_seen": 181971270, "step": 8462, "time_per_iteration": 2.780043363571167 }, { "auxiliary_loss_clip": 0.01081215, "auxiliary_loss_mlp": 0.01037857, "balance_loss_clip": 1.0401057, "balance_loss_mlp": 1.02322388, "epoch": 0.5088230873290245, "flos": 13261631927040.0, "grad_norm": 2.117801536001897, "language_loss": 0.81441897, "learning_rate": 2.0399171717403706e-06, "loss": 0.83560967, "num_input_tokens_seen": 181988410, "step": 8463, "time_per_iteration": 2.7101564407348633 }, { "auxiliary_loss_clip": 0.0110518, "auxiliary_loss_mlp": 0.01035062, "balance_loss_clip": 1.04148602, "balance_loss_mlp": 1.02201426, "epoch": 0.5088832105816925, "flos": 20043469134720.0, "grad_norm": 2.6576302734312733, "language_loss": 0.76305163, "learning_rate": 2.039527786882341e-06, "loss": 0.78445399, "num_input_tokens_seen": 182006530, "step": 8464, "time_per_iteration": 2.6081295013427734 }, { "auxiliary_loss_clip": 0.01034964, "auxiliary_loss_mlp": 0.0100043, "balance_loss_clip": 1.01882601, "balance_loss_mlp": 0.99929708, "epoch": 0.5089433338343604, "flos": 67422179018880.0, "grad_norm": 0.6843560168430419, "language_loss": 0.59347767, "learning_rate": 2.0391384005254133e-06, "loss": 0.61383158, "num_input_tokens_seen": 182074240, "step": 8465, "time_per_iteration": 3.308885097503662 }, { "auxiliary_loss_clip": 0.0111949, "auxiliary_loss_mlp": 0.01033343, "balance_loss_clip": 1.04262543, "balance_loss_mlp": 1.0203197, "epoch": 0.5090034570870284, "flos": 22710339336960.0, "grad_norm": 2.5778248190048787, "language_loss": 0.80206662, "learning_rate": 2.038749012684354e-06, "loss": 0.82359493, "num_input_tokens_seen": 182093360, "step": 8466, "time_per_iteration": 2.6912481784820557 }, { "auxiliary_loss_clip": 0.01107512, "auxiliary_loss_mlp": 0.0102939, "balance_loss_clip": 1.03987598, "balance_loss_mlp": 1.01634204, "epoch": 0.5090635803396963, "flos": 20445812352000.0, "grad_norm": 1.5056043379234754, "language_loss": 0.78307688, "learning_rate": 2.0383596233739286e-06, "loss": 0.80444586, "num_input_tokens_seen": 182110170, "step": 8467, "time_per_iteration": 2.61828875541687 }, { "auxiliary_loss_clip": 0.01119026, "auxiliary_loss_mlp": 0.01034745, "balance_loss_clip": 1.04424381, "balance_loss_mlp": 1.02226961, "epoch": 0.5091237035923644, "flos": 23768878164480.0, "grad_norm": 1.9340722959801353, "language_loss": 0.74676347, "learning_rate": 2.0379702326089013e-06, "loss": 0.76830113, "num_input_tokens_seen": 182129570, "step": 8468, "time_per_iteration": 2.6233344078063965 }, { "auxiliary_loss_clip": 0.01119943, "auxiliary_loss_mlp": 0.01029058, "balance_loss_clip": 1.04366863, "balance_loss_mlp": 1.01651728, "epoch": 0.5091838268450323, "flos": 18327908684160.0, "grad_norm": 1.884666390581893, "language_loss": 0.77613342, "learning_rate": 2.03758084040404e-06, "loss": 0.7976234, "num_input_tokens_seen": 182147565, "step": 8469, "time_per_iteration": 2.579117774963379 }, { "auxiliary_loss_clip": 0.01107532, "auxiliary_loss_mlp": 0.01038411, "balance_loss_clip": 1.04521155, "balance_loss_mlp": 1.02425504, "epoch": 0.5092439500977003, "flos": 29057621806080.0, "grad_norm": 1.5718905230515574, "language_loss": 0.69481277, "learning_rate": 2.037191446774109e-06, "loss": 0.71627223, "num_input_tokens_seen": 182169695, "step": 8470, "time_per_iteration": 2.6437594890594482 }, { "auxiliary_loss_clip": 0.01096004, "auxiliary_loss_mlp": 0.01045395, "balance_loss_clip": 1.04067326, "balance_loss_mlp": 1.02993393, "epoch": 0.5093040733503682, "flos": 13553908894080.0, "grad_norm": 2.534594931806725, "language_loss": 0.73583853, "learning_rate": 2.0368020517338745e-06, "loss": 0.75725245, "num_input_tokens_seen": 182186385, "step": 8471, "time_per_iteration": 2.6213905811309814 }, { "auxiliary_loss_clip": 0.01043282, "auxiliary_loss_mlp": 0.00999685, "balance_loss_clip": 1.01733398, "balance_loss_mlp": 0.99825424, "epoch": 0.5093641966030362, "flos": 68906617407360.0, "grad_norm": 0.7545989611287492, "language_loss": 0.58065605, "learning_rate": 2.036412655298103e-06, "loss": 0.60108572, "num_input_tokens_seen": 182247095, "step": 8472, "time_per_iteration": 3.1640241146087646 }, { "auxiliary_loss_clip": 0.01069354, "auxiliary_loss_mlp": 0.01036283, "balance_loss_clip": 1.03772914, "balance_loss_mlp": 1.0235815, "epoch": 0.5094243198557042, "flos": 21580948932480.0, "grad_norm": 2.4665832849090994, "language_loss": 0.68956393, "learning_rate": 2.03602325748156e-06, "loss": 0.71062028, "num_input_tokens_seen": 182266380, "step": 8473, "time_per_iteration": 2.806593179702759 }, { "auxiliary_loss_clip": 0.01097364, "auxiliary_loss_mlp": 0.01035609, "balance_loss_clip": 1.04190159, "balance_loss_mlp": 1.02250814, "epoch": 0.5094844431083722, "flos": 28840721529600.0, "grad_norm": 1.8851162187904098, "language_loss": 0.85464561, "learning_rate": 2.0356338582990105e-06, "loss": 0.87597537, "num_input_tokens_seen": 182284685, "step": 8474, "time_per_iteration": 2.7467737197875977 }, { "auxiliary_loss_clip": 0.01097916, "auxiliary_loss_mlp": 0.01035284, "balance_loss_clip": 1.04213905, "balance_loss_mlp": 1.02201009, "epoch": 0.5095445663610402, "flos": 14976114969600.0, "grad_norm": 2.1580860587409867, "language_loss": 0.65563238, "learning_rate": 2.035244457765222e-06, "loss": 0.6769644, "num_input_tokens_seen": 182301810, "step": 8475, "time_per_iteration": 2.653343439102173 }, { "auxiliary_loss_clip": 0.01101978, "auxiliary_loss_mlp": 0.01044707, "balance_loss_clip": 1.04155195, "balance_loss_mlp": 1.03043771, "epoch": 0.5096046896137081, "flos": 20777088510720.0, "grad_norm": 2.3692417745384886, "language_loss": 0.82122153, "learning_rate": 2.0348550558949605e-06, "loss": 0.84268838, "num_input_tokens_seen": 182320285, "step": 8476, "time_per_iteration": 2.735163927078247 }, { "auxiliary_loss_clip": 0.01069648, "auxiliary_loss_mlp": 0.01043833, "balance_loss_clip": 1.03814852, "balance_loss_mlp": 1.02698851, "epoch": 0.5096648128663761, "flos": 23185078416000.0, "grad_norm": 5.724576330634238, "language_loss": 0.80651575, "learning_rate": 2.0344656527029917e-06, "loss": 0.82765061, "num_input_tokens_seen": 182339465, "step": 8477, "time_per_iteration": 2.8972108364105225 }, { "auxiliary_loss_clip": 0.01096525, "auxiliary_loss_mlp": 0.01028419, "balance_loss_clip": 1.04044962, "balance_loss_mlp": 1.01321959, "epoch": 0.509724936119044, "flos": 22309432663680.0, "grad_norm": 1.8365176357872317, "language_loss": 0.6178633, "learning_rate": 2.034076248204082e-06, "loss": 0.63911271, "num_input_tokens_seen": 182358375, "step": 8478, "time_per_iteration": 2.77237606048584 }, { "auxiliary_loss_clip": 0.01105596, "auxiliary_loss_mlp": 0.01039662, "balance_loss_clip": 1.04185414, "balance_loss_mlp": 1.02667403, "epoch": 0.509785059371712, "flos": 26287077974400.0, "grad_norm": 1.8436515105252975, "language_loss": 0.66209054, "learning_rate": 2.0336868424129968e-06, "loss": 0.68354309, "num_input_tokens_seen": 182377935, "step": 8479, "time_per_iteration": 2.667865514755249 }, { "auxiliary_loss_clip": 0.01108822, "auxiliary_loss_mlp": 0.01036542, "balance_loss_clip": 1.0434258, "balance_loss_mlp": 1.02382231, "epoch": 0.50984518262438, "flos": 22964586779520.0, "grad_norm": 1.5755275700627138, "language_loss": 0.69447386, "learning_rate": 2.0332974353445037e-06, "loss": 0.71592748, "num_input_tokens_seen": 182396440, "step": 8480, "time_per_iteration": 2.630505323410034 }, { "auxiliary_loss_clip": 0.01124122, "auxiliary_loss_mlp": 0.0103478, "balance_loss_clip": 1.04386926, "balance_loss_mlp": 1.02133346, "epoch": 0.509905305877048, "flos": 26213389223040.0, "grad_norm": 1.7899171043779052, "language_loss": 0.79267204, "learning_rate": 2.0329080270133688e-06, "loss": 0.81426102, "num_input_tokens_seen": 182415890, "step": 8481, "time_per_iteration": 2.6193926334381104 }, { "auxiliary_loss_clip": 0.01104496, "auxiliary_loss_mlp": 0.01034587, "balance_loss_clip": 1.04157507, "balance_loss_mlp": 1.02124786, "epoch": 0.5099654291297159, "flos": 20340055733760.0, "grad_norm": 1.468990392476105, "language_loss": 0.83301556, "learning_rate": 2.0325186174343578e-06, "loss": 0.85440642, "num_input_tokens_seen": 182434235, "step": 8482, "time_per_iteration": 4.149403095245361 }, { "auxiliary_loss_clip": 0.01113898, "auxiliary_loss_mlp": 0.00771464, "balance_loss_clip": 1.04287457, "balance_loss_mlp": 1.00025356, "epoch": 0.5100255523823839, "flos": 29054820545280.0, "grad_norm": 1.9010351115161617, "language_loss": 0.85379988, "learning_rate": 2.032129206622238e-06, "loss": 0.87265354, "num_input_tokens_seen": 182454360, "step": 8483, "time_per_iteration": 2.7000234127044678 }, { "auxiliary_loss_clip": 0.01109801, "auxiliary_loss_mlp": 0.01033991, "balance_loss_clip": 1.04242575, "balance_loss_mlp": 1.0214082, "epoch": 0.5100856756350518, "flos": 22455912326400.0, "grad_norm": 2.079288328100567, "language_loss": 0.82931423, "learning_rate": 2.031739794591775e-06, "loss": 0.85075212, "num_input_tokens_seen": 182471940, "step": 8484, "time_per_iteration": 4.3401288986206055 }, { "auxiliary_loss_clip": 0.01095037, "auxiliary_loss_mlp": 0.01033642, "balance_loss_clip": 1.0400697, "balance_loss_mlp": 1.0194087, "epoch": 0.5101457988877198, "flos": 19171055606400.0, "grad_norm": 2.530206097433835, "language_loss": 0.81594586, "learning_rate": 2.031350381357736e-06, "loss": 0.83723271, "num_input_tokens_seen": 182490685, "step": 8485, "time_per_iteration": 2.6573400497436523 }, { "auxiliary_loss_clip": 0.01092909, "auxiliary_loss_mlp": 0.01038281, "balance_loss_clip": 1.03726983, "balance_loss_mlp": 1.02494788, "epoch": 0.5102059221403878, "flos": 14866371941760.0, "grad_norm": 1.9374375358888782, "language_loss": 0.74155819, "learning_rate": 2.0309609669348874e-06, "loss": 0.76287007, "num_input_tokens_seen": 182508325, "step": 8486, "time_per_iteration": 2.676863670349121 }, { "auxiliary_loss_clip": 0.01078995, "auxiliary_loss_mlp": 0.01037671, "balance_loss_clip": 1.03769588, "balance_loss_mlp": 1.0228231, "epoch": 0.5102660453930558, "flos": 22961103160320.0, "grad_norm": 1.4946123985675848, "language_loss": 0.70439661, "learning_rate": 2.0305715513379953e-06, "loss": 0.72556329, "num_input_tokens_seen": 182527020, "step": 8487, "time_per_iteration": 2.740612030029297 }, { "auxiliary_loss_clip": 0.01099488, "auxiliary_loss_mlp": 0.01039832, "balance_loss_clip": 1.04223216, "balance_loss_mlp": 1.02521729, "epoch": 0.5103261686457238, "flos": 23149311448320.0, "grad_norm": 2.286550245787084, "language_loss": 0.73022705, "learning_rate": 2.030182134581827e-06, "loss": 0.75162029, "num_input_tokens_seen": 182543505, "step": 8488, "time_per_iteration": 4.345505714416504 }, { "auxiliary_loss_clip": 0.01081446, "auxiliary_loss_mlp": 0.00771801, "balance_loss_clip": 1.04138601, "balance_loss_mlp": 1.00030088, "epoch": 0.5103862918983917, "flos": 14319237000960.0, "grad_norm": 1.7726796746163496, "language_loss": 0.69465196, "learning_rate": 2.0297927166811503e-06, "loss": 0.71318448, "num_input_tokens_seen": 182562250, "step": 8489, "time_per_iteration": 2.7057676315307617 }, { "auxiliary_loss_clip": 0.01096056, "auxiliary_loss_mlp": 0.01035357, "balance_loss_clip": 1.04011536, "balance_loss_mlp": 1.02176082, "epoch": 0.5104464151510597, "flos": 25848536826240.0, "grad_norm": 2.097372581248088, "language_loss": 0.73219633, "learning_rate": 2.0294032976507297e-06, "loss": 0.75351048, "num_input_tokens_seen": 182581910, "step": 8490, "time_per_iteration": 2.7062344551086426 }, { "auxiliary_loss_clip": 0.01093699, "auxiliary_loss_mlp": 0.01030609, "balance_loss_clip": 1.04015577, "balance_loss_mlp": 1.01796126, "epoch": 0.5105065384037276, "flos": 21652913831040.0, "grad_norm": 1.454492701867694, "language_loss": 0.80228478, "learning_rate": 2.0290138775053337e-06, "loss": 0.82352787, "num_input_tokens_seen": 182601350, "step": 8491, "time_per_iteration": 2.670520782470703 }, { "auxiliary_loss_clip": 0.01108835, "auxiliary_loss_mlp": 0.01031094, "balance_loss_clip": 1.04258561, "balance_loss_mlp": 1.01813614, "epoch": 0.5105666616563956, "flos": 22491571553280.0, "grad_norm": 1.8545470770344947, "language_loss": 0.78970987, "learning_rate": 2.028624456259728e-06, "loss": 0.81110907, "num_input_tokens_seen": 182619660, "step": 8492, "time_per_iteration": 2.681852102279663 }, { "auxiliary_loss_clip": 0.01088193, "auxiliary_loss_mlp": 0.01045644, "balance_loss_clip": 1.04025435, "balance_loss_mlp": 1.03187561, "epoch": 0.5106267849090635, "flos": 22455768672000.0, "grad_norm": 1.9312934890574833, "language_loss": 0.77364743, "learning_rate": 2.0282350339286804e-06, "loss": 0.79498577, "num_input_tokens_seen": 182639815, "step": 8493, "time_per_iteration": 2.71234393119812 }, { "auxiliary_loss_clip": 0.01079322, "auxiliary_loss_mlp": 0.01035175, "balance_loss_clip": 1.04074192, "balance_loss_mlp": 1.02040458, "epoch": 0.5106869081617316, "flos": 23547093638400.0, "grad_norm": 1.7772442138937719, "language_loss": 0.84122825, "learning_rate": 2.0278456105269574e-06, "loss": 0.86237323, "num_input_tokens_seen": 182659655, "step": 8494, "time_per_iteration": 2.737844944000244 }, { "auxiliary_loss_clip": 0.0112627, "auxiliary_loss_mlp": 0.01037758, "balance_loss_clip": 1.04641843, "balance_loss_mlp": 1.02502632, "epoch": 0.5107470314143995, "flos": 26792987080320.0, "grad_norm": 1.9716326999087717, "language_loss": 0.78846836, "learning_rate": 2.027456186069326e-06, "loss": 0.81010866, "num_input_tokens_seen": 182677075, "step": 8495, "time_per_iteration": 2.5992324352264404 }, { "auxiliary_loss_clip": 0.01088486, "auxiliary_loss_mlp": 0.0103671, "balance_loss_clip": 1.04210663, "balance_loss_mlp": 1.02254176, "epoch": 0.5108071546670675, "flos": 25739691638400.0, "grad_norm": 1.7860993635097173, "language_loss": 0.78245926, "learning_rate": 2.0270667605705535e-06, "loss": 0.80371118, "num_input_tokens_seen": 182699625, "step": 8496, "time_per_iteration": 2.764511823654175 }, { "auxiliary_loss_clip": 0.01107232, "auxiliary_loss_mlp": 0.01031296, "balance_loss_clip": 1.04186177, "balance_loss_mlp": 1.01885021, "epoch": 0.5108672779197354, "flos": 18697537589760.0, "grad_norm": 2.583960220786706, "language_loss": 0.78615016, "learning_rate": 2.0266773340454066e-06, "loss": 0.80753547, "num_input_tokens_seen": 182717020, "step": 8497, "time_per_iteration": 2.614715337753296 }, { "auxiliary_loss_clip": 0.01119749, "auxiliary_loss_mlp": 0.01032774, "balance_loss_clip": 1.04238069, "balance_loss_mlp": 1.01958323, "epoch": 0.5109274011724034, "flos": 26688164215680.0, "grad_norm": 1.8043712312754003, "language_loss": 0.81731009, "learning_rate": 2.0262879065086525e-06, "loss": 0.83883524, "num_input_tokens_seen": 182736955, "step": 8498, "time_per_iteration": 2.670713186264038 }, { "auxiliary_loss_clip": 0.01086895, "auxiliary_loss_mlp": 0.00771568, "balance_loss_clip": 1.03893542, "balance_loss_mlp": 1.00021791, "epoch": 0.5109875244250714, "flos": 22784028088320.0, "grad_norm": 1.9502410959783398, "language_loss": 0.70963287, "learning_rate": 2.0258984779750584e-06, "loss": 0.72821754, "num_input_tokens_seen": 182757620, "step": 8499, "time_per_iteration": 2.6890597343444824 }, { "auxiliary_loss_clip": 0.01063023, "auxiliary_loss_mlp": 0.01039504, "balance_loss_clip": 1.03797197, "balance_loss_mlp": 1.0247463, "epoch": 0.5110476476777394, "flos": 35588515622400.0, "grad_norm": 1.532594294583486, "language_loss": 0.72400367, "learning_rate": 2.0255090484593914e-06, "loss": 0.74502897, "num_input_tokens_seen": 182780195, "step": 8500, "time_per_iteration": 2.8889389038085938 }, { "auxiliary_loss_clip": 0.01113898, "auxiliary_loss_mlp": 0.01039834, "balance_loss_clip": 1.04150367, "balance_loss_mlp": 1.0244801, "epoch": 0.5111077709304074, "flos": 19280798634240.0, "grad_norm": 2.6334939898019867, "language_loss": 0.62424856, "learning_rate": 2.0251196179764183e-06, "loss": 0.64578593, "num_input_tokens_seen": 182795765, "step": 8501, "time_per_iteration": 2.564922571182251 }, { "auxiliary_loss_clip": 0.01120814, "auxiliary_loss_mlp": 0.01040593, "balance_loss_clip": 1.04017985, "balance_loss_mlp": 1.0265801, "epoch": 0.5111678941830753, "flos": 20668207409280.0, "grad_norm": 2.184561184824311, "language_loss": 0.87622821, "learning_rate": 2.024730186540907e-06, "loss": 0.89784235, "num_input_tokens_seen": 182813120, "step": 8502, "time_per_iteration": 2.6287243366241455 }, { "auxiliary_loss_clip": 0.01106628, "auxiliary_loss_mlp": 0.01038615, "balance_loss_clip": 1.04065216, "balance_loss_mlp": 1.02592492, "epoch": 0.5112280174357433, "flos": 26287903987200.0, "grad_norm": 1.480449524900748, "language_loss": 0.82794261, "learning_rate": 2.0243407541676253e-06, "loss": 0.84939504, "num_input_tokens_seen": 182835745, "step": 8503, "time_per_iteration": 2.682711124420166 }, { "auxiliary_loss_clip": 0.01025632, "auxiliary_loss_mlp": 0.01004613, "balance_loss_clip": 1.0205853, "balance_loss_mlp": 1.00336099, "epoch": 0.5112881406884112, "flos": 59474247707520.0, "grad_norm": 0.8583626669635097, "language_loss": 0.63898063, "learning_rate": 2.023951320871339e-06, "loss": 0.65928316, "num_input_tokens_seen": 182892540, "step": 8504, "time_per_iteration": 3.216397523880005 }, { "auxiliary_loss_clip": 0.01091882, "auxiliary_loss_mlp": 0.00771622, "balance_loss_clip": 1.04488444, "balance_loss_mlp": 1.00014472, "epoch": 0.5113482639410792, "flos": 26468857728000.0, "grad_norm": 1.826391287063558, "language_loss": 0.84206301, "learning_rate": 2.023561886666816e-06, "loss": 0.86069804, "num_input_tokens_seen": 182911515, "step": 8505, "time_per_iteration": 2.8032052516937256 }, { "auxiliary_loss_clip": 0.0110904, "auxiliary_loss_mlp": 0.01030264, "balance_loss_clip": 1.04468179, "balance_loss_mlp": 1.01698971, "epoch": 0.5114083871937471, "flos": 29895848565120.0, "grad_norm": 1.983310033112748, "language_loss": 0.75608075, "learning_rate": 2.0231724515688246e-06, "loss": 0.77747381, "num_input_tokens_seen": 182930860, "step": 8506, "time_per_iteration": 2.699448347091675 }, { "auxiliary_loss_clip": 0.01122646, "auxiliary_loss_mlp": 0.01034693, "balance_loss_clip": 1.04428148, "balance_loss_mlp": 1.01986337, "epoch": 0.5114685104464152, "flos": 24314576561280.0, "grad_norm": 1.918965700593569, "language_loss": 0.58023655, "learning_rate": 2.022783015592131e-06, "loss": 0.60180998, "num_input_tokens_seen": 182949960, "step": 8507, "time_per_iteration": 2.5828280448913574 }, { "auxiliary_loss_clip": 0.01114406, "auxiliary_loss_mlp": 0.01042669, "balance_loss_clip": 1.04659033, "balance_loss_mlp": 1.02820277, "epoch": 0.5115286336990831, "flos": 17019288391680.0, "grad_norm": 1.7197846358145388, "language_loss": 0.85691231, "learning_rate": 2.022393578751503e-06, "loss": 0.87848306, "num_input_tokens_seen": 182968085, "step": 8508, "time_per_iteration": 2.691185235977173 }, { "auxiliary_loss_clip": 0.01090388, "auxiliary_loss_mlp": 0.00770619, "balance_loss_clip": 1.04480338, "balance_loss_mlp": 1.00018072, "epoch": 0.5115887569517511, "flos": 23659386531840.0, "grad_norm": 1.8624731533798382, "language_loss": 0.72326827, "learning_rate": 2.022004141061709e-06, "loss": 0.74187839, "num_input_tokens_seen": 182987275, "step": 8509, "time_per_iteration": 2.7239418029785156 }, { "auxiliary_loss_clip": 0.01120525, "auxiliary_loss_mlp": 0.00770526, "balance_loss_clip": 1.04470599, "balance_loss_mlp": 1.00009036, "epoch": 0.511648880204419, "flos": 16107193313280.0, "grad_norm": 2.5868792605641477, "language_loss": 0.76204944, "learning_rate": 2.0216147025375153e-06, "loss": 0.78095996, "num_input_tokens_seen": 183004700, "step": 8510, "time_per_iteration": 2.6135294437408447 }, { "auxiliary_loss_clip": 0.0112199, "auxiliary_loss_mlp": 0.01035525, "balance_loss_clip": 1.04560411, "balance_loss_mlp": 1.022668, "epoch": 0.511709003457087, "flos": 32634970974720.0, "grad_norm": 4.709097064233808, "language_loss": 0.70997655, "learning_rate": 2.0212252631936907e-06, "loss": 0.73155165, "num_input_tokens_seen": 183025830, "step": 8511, "time_per_iteration": 2.7760493755340576 }, { "auxiliary_loss_clip": 0.01095679, "auxiliary_loss_mlp": 0.01029146, "balance_loss_clip": 1.04216874, "balance_loss_mlp": 1.01593149, "epoch": 0.511769126709755, "flos": 21762082241280.0, "grad_norm": 2.953853433531297, "language_loss": 0.66357356, "learning_rate": 2.020835823045001e-06, "loss": 0.68482178, "num_input_tokens_seen": 183045140, "step": 8512, "time_per_iteration": 2.723987340927124 }, { "auxiliary_loss_clip": 0.01060265, "auxiliary_loss_mlp": 0.01037084, "balance_loss_clip": 1.0384953, "balance_loss_mlp": 1.02158666, "epoch": 0.511829249962423, "flos": 23915357827200.0, "grad_norm": 1.7575723482240548, "language_loss": 0.67203867, "learning_rate": 2.0204463821062146e-06, "loss": 0.69301212, "num_input_tokens_seen": 183063935, "step": 8513, "time_per_iteration": 2.759958505630493 }, { "auxiliary_loss_clip": 0.01083159, "auxiliary_loss_mlp": 0.01036169, "balance_loss_clip": 1.04507256, "balance_loss_mlp": 1.02201903, "epoch": 0.511889373215091, "flos": 23727005884800.0, "grad_norm": 2.3341144576485116, "language_loss": 0.68508673, "learning_rate": 2.0200569403921e-06, "loss": 0.70627999, "num_input_tokens_seen": 183084135, "step": 8514, "time_per_iteration": 2.7791545391082764 }, { "auxiliary_loss_clip": 0.01119085, "auxiliary_loss_mlp": 0.0102933, "balance_loss_clip": 1.04411948, "balance_loss_mlp": 1.01689076, "epoch": 0.5119494964677589, "flos": 28111519526400.0, "grad_norm": 1.6536407135597841, "language_loss": 0.66139281, "learning_rate": 2.019667497917424e-06, "loss": 0.68287694, "num_input_tokens_seen": 183104570, "step": 8515, "time_per_iteration": 2.6567435264587402 }, { "auxiliary_loss_clip": 0.01109629, "auxiliary_loss_mlp": 0.01035907, "balance_loss_clip": 1.04417586, "balance_loss_mlp": 1.02317524, "epoch": 0.5120096197204269, "flos": 24973214296320.0, "grad_norm": 1.939516836327544, "language_loss": 0.7526269, "learning_rate": 2.019278054696955e-06, "loss": 0.77408224, "num_input_tokens_seen": 183123850, "step": 8516, "time_per_iteration": 2.7218270301818848 }, { "auxiliary_loss_clip": 0.01093123, "auxiliary_loss_mlp": 0.01039766, "balance_loss_clip": 1.04275799, "balance_loss_mlp": 1.02562129, "epoch": 0.5120697429730948, "flos": 17968012364160.0, "grad_norm": 2.066446678045309, "language_loss": 0.78090644, "learning_rate": 2.0188886107454595e-06, "loss": 0.80223525, "num_input_tokens_seen": 183141725, "step": 8517, "time_per_iteration": 2.6922826766967773 }, { "auxiliary_loss_clip": 0.01114661, "auxiliary_loss_mlp": 0.01034987, "balance_loss_clip": 1.0449543, "balance_loss_mlp": 1.02086043, "epoch": 0.5121298662257628, "flos": 23292343405440.0, "grad_norm": 1.7160803061965533, "language_loss": 0.74111056, "learning_rate": 2.0184991660777063e-06, "loss": 0.76260698, "num_input_tokens_seen": 183161300, "step": 8518, "time_per_iteration": 2.6781773567199707 }, { "auxiliary_loss_clip": 0.01107849, "auxiliary_loss_mlp": 0.0104112, "balance_loss_clip": 1.04497719, "balance_loss_mlp": 1.02699947, "epoch": 0.5121899894784308, "flos": 17311062568320.0, "grad_norm": 1.7790366802945887, "language_loss": 0.78405094, "learning_rate": 2.0181097207084625e-06, "loss": 0.80554068, "num_input_tokens_seen": 183180495, "step": 8519, "time_per_iteration": 2.634488582611084 }, { "auxiliary_loss_clip": 0.01126735, "auxiliary_loss_mlp": 0.01036152, "balance_loss_clip": 1.04811025, "balance_loss_mlp": 1.02241898, "epoch": 0.5122501127310988, "flos": 24930085040640.0, "grad_norm": 1.8142627745056843, "language_loss": 0.79518384, "learning_rate": 2.017720274652497e-06, "loss": 0.81681275, "num_input_tokens_seen": 183200330, "step": 8520, "time_per_iteration": 2.6977620124816895 }, { "auxiliary_loss_clip": 0.01104965, "auxiliary_loss_mlp": 0.01041606, "balance_loss_clip": 1.0438292, "balance_loss_mlp": 1.02683616, "epoch": 0.5123102359837667, "flos": 18442859184000.0, "grad_norm": 2.180675544150299, "language_loss": 0.81294155, "learning_rate": 2.0173308279245765e-06, "loss": 0.83440727, "num_input_tokens_seen": 183218230, "step": 8521, "time_per_iteration": 4.264198303222656 }, { "auxiliary_loss_clip": 0.0111372, "auxiliary_loss_mlp": 0.01032737, "balance_loss_clip": 1.04381251, "balance_loss_mlp": 1.01808071, "epoch": 0.5123703592364347, "flos": 26684860164480.0, "grad_norm": 1.8350455385455566, "language_loss": 0.68333864, "learning_rate": 2.0169413805394692e-06, "loss": 0.70480323, "num_input_tokens_seen": 183236735, "step": 8522, "time_per_iteration": 2.755563735961914 }, { "auxiliary_loss_clip": 0.0109986, "auxiliary_loss_mlp": 0.01043615, "balance_loss_clip": 1.04744244, "balance_loss_mlp": 1.02636552, "epoch": 0.5124304824891026, "flos": 28803948981120.0, "grad_norm": 1.6735611690288588, "language_loss": 0.61849087, "learning_rate": 2.0165519325119433e-06, "loss": 0.6399256, "num_input_tokens_seen": 183257550, "step": 8523, "time_per_iteration": 2.752614974975586 }, { "auxiliary_loss_clip": 0.01088964, "auxiliary_loss_mlp": 0.01041136, "balance_loss_clip": 1.04488027, "balance_loss_mlp": 1.02776718, "epoch": 0.5124906057417706, "flos": 21761830846080.0, "grad_norm": 2.1631882282248966, "language_loss": 0.7807008, "learning_rate": 2.0161624838567656e-06, "loss": 0.80200177, "num_input_tokens_seen": 183275515, "step": 8524, "time_per_iteration": 5.938940763473511 }, { "auxiliary_loss_clip": 0.0110059, "auxiliary_loss_mlp": 0.01035868, "balance_loss_clip": 1.04444933, "balance_loss_mlp": 1.02287436, "epoch": 0.5125507289944387, "flos": 18880538405760.0, "grad_norm": 2.5285806743725834, "language_loss": 0.7489953, "learning_rate": 2.015773034588706e-06, "loss": 0.77035987, "num_input_tokens_seen": 183293880, "step": 8525, "time_per_iteration": 2.6603550910949707 }, { "auxiliary_loss_clip": 0.01100341, "auxiliary_loss_mlp": 0.01045872, "balance_loss_clip": 1.04424882, "balance_loss_mlp": 1.02996945, "epoch": 0.5126108522471066, "flos": 35627838036480.0, "grad_norm": 1.6545403659553666, "language_loss": 0.74193799, "learning_rate": 2.015383584722531e-06, "loss": 0.76340014, "num_input_tokens_seen": 183315860, "step": 8526, "time_per_iteration": 2.7631187438964844 }, { "auxiliary_loss_clip": 0.01117967, "auxiliary_loss_mlp": 0.010412, "balance_loss_clip": 1.04805541, "balance_loss_mlp": 1.02755094, "epoch": 0.5126709754997746, "flos": 20190918464640.0, "grad_norm": 1.7970307477050764, "language_loss": 0.65624464, "learning_rate": 2.0149941342730088e-06, "loss": 0.6778363, "num_input_tokens_seen": 183335480, "step": 8527, "time_per_iteration": 4.185753107070923 }, { "auxiliary_loss_clip": 0.01099112, "auxiliary_loss_mlp": 0.01038782, "balance_loss_clip": 1.04767573, "balance_loss_mlp": 1.02663493, "epoch": 0.5127310987524425, "flos": 18588548747520.0, "grad_norm": 1.4652981434759074, "language_loss": 0.74246556, "learning_rate": 2.014604683254908e-06, "loss": 0.76384449, "num_input_tokens_seen": 183354395, "step": 8528, "time_per_iteration": 2.647268056869507 }, { "auxiliary_loss_clip": 0.01110552, "auxiliary_loss_mlp": 0.01034843, "balance_loss_clip": 1.04382324, "balance_loss_mlp": 1.02143764, "epoch": 0.5127912220051105, "flos": 22454691264000.0, "grad_norm": 1.6345499952693072, "language_loss": 0.82838154, "learning_rate": 2.014215231682995e-06, "loss": 0.84983552, "num_input_tokens_seen": 183372980, "step": 8529, "time_per_iteration": 2.6546859741210938 }, { "auxiliary_loss_clip": 0.0107231, "auxiliary_loss_mlp": 0.01034968, "balance_loss_clip": 1.04131067, "balance_loss_mlp": 1.02149725, "epoch": 0.5128513452577784, "flos": 19093703667840.0, "grad_norm": 2.6019709601767866, "language_loss": 0.73687661, "learning_rate": 2.01382577957204e-06, "loss": 0.75794935, "num_input_tokens_seen": 183390160, "step": 8530, "time_per_iteration": 2.754840612411499 }, { "auxiliary_loss_clip": 0.01018433, "auxiliary_loss_mlp": 0.01003338, "balance_loss_clip": 1.02142978, "balance_loss_mlp": 1.00163293, "epoch": 0.5129114685104464, "flos": 67892285243520.0, "grad_norm": 0.7482622882096543, "language_loss": 0.60775113, "learning_rate": 2.0134363269368095e-06, "loss": 0.62796879, "num_input_tokens_seen": 183455280, "step": 8531, "time_per_iteration": 3.331425666809082 }, { "auxiliary_loss_clip": 0.01096599, "auxiliary_loss_mlp": 0.01039227, "balance_loss_clip": 1.04599643, "balance_loss_mlp": 1.02387309, "epoch": 0.5129715917631144, "flos": 20449152316800.0, "grad_norm": 1.6723134032232012, "language_loss": 0.76866412, "learning_rate": 2.0130468737920725e-06, "loss": 0.79002237, "num_input_tokens_seen": 183473955, "step": 8532, "time_per_iteration": 2.8071939945220947 }, { "auxiliary_loss_clip": 0.0110043, "auxiliary_loss_mlp": 0.01036596, "balance_loss_clip": 1.0434345, "balance_loss_mlp": 1.02273178, "epoch": 0.5130317150157824, "flos": 35116146840960.0, "grad_norm": 4.28948987854823, "language_loss": 0.67031407, "learning_rate": 2.012657420152597e-06, "loss": 0.69168431, "num_input_tokens_seen": 183497195, "step": 8533, "time_per_iteration": 2.7799179553985596 }, { "auxiliary_loss_clip": 0.01094678, "auxiliary_loss_mlp": 0.01039401, "balance_loss_clip": 1.04602468, "balance_loss_mlp": 1.02452362, "epoch": 0.5130918382684503, "flos": 19791627903360.0, "grad_norm": 1.9915175591272611, "language_loss": 0.8200537, "learning_rate": 2.01226796603315e-06, "loss": 0.84139454, "num_input_tokens_seen": 183513675, "step": 8534, "time_per_iteration": 2.6692066192626953 }, { "auxiliary_loss_clip": 0.01111793, "auxiliary_loss_mlp": 0.01038613, "balance_loss_clip": 1.04316652, "balance_loss_mlp": 1.02398574, "epoch": 0.5131519615211183, "flos": 26323096337280.0, "grad_norm": 1.4683279633381257, "language_loss": 0.63850307, "learning_rate": 2.0118785114485017e-06, "loss": 0.66000712, "num_input_tokens_seen": 183535165, "step": 8535, "time_per_iteration": 2.6881463527679443 }, { "auxiliary_loss_clip": 0.01118055, "auxiliary_loss_mlp": 0.01031488, "balance_loss_clip": 1.04930139, "balance_loss_mlp": 1.01707554, "epoch": 0.5132120847737862, "flos": 19171917532800.0, "grad_norm": 1.558826189326605, "language_loss": 0.69832361, "learning_rate": 2.011489056413418e-06, "loss": 0.71981907, "num_input_tokens_seen": 183553780, "step": 8536, "time_per_iteration": 2.7181568145751953 }, { "auxiliary_loss_clip": 0.01116762, "auxiliary_loss_mlp": 0.01038725, "balance_loss_clip": 1.04751253, "balance_loss_mlp": 1.02378178, "epoch": 0.5132722080264542, "flos": 20230420446720.0, "grad_norm": 1.9464397996960447, "language_loss": 0.70725036, "learning_rate": 2.011099600942669e-06, "loss": 0.72880518, "num_input_tokens_seen": 183572285, "step": 8537, "time_per_iteration": 2.6996657848358154 }, { "auxiliary_loss_clip": 0.01080908, "auxiliary_loss_mlp": 0.01034474, "balance_loss_clip": 1.04291606, "balance_loss_mlp": 1.02007353, "epoch": 0.5133323312791223, "flos": 16469459930880.0, "grad_norm": 1.8282608051087097, "language_loss": 0.8028723, "learning_rate": 2.0107101450510214e-06, "loss": 0.82402611, "num_input_tokens_seen": 183589330, "step": 8538, "time_per_iteration": 2.752685308456421 }, { "auxiliary_loss_clip": 0.01113197, "auxiliary_loss_mlp": 0.01031357, "balance_loss_clip": 1.0443325, "balance_loss_mlp": 1.01739144, "epoch": 0.5133924545317902, "flos": 26068094709120.0, "grad_norm": 2.0083592119837403, "language_loss": 0.78388107, "learning_rate": 2.0103206887532437e-06, "loss": 0.80532658, "num_input_tokens_seen": 183609205, "step": 8539, "time_per_iteration": 2.6856329441070557 }, { "auxiliary_loss_clip": 0.0109867, "auxiliary_loss_mlp": 0.01033877, "balance_loss_clip": 1.04138374, "balance_loss_mlp": 1.01994729, "epoch": 0.5134525777844582, "flos": 29131023248640.0, "grad_norm": 1.7382927125385157, "language_loss": 0.76111883, "learning_rate": 2.009931232064105e-06, "loss": 0.78244424, "num_input_tokens_seen": 183629985, "step": 8540, "time_per_iteration": 2.780198574066162 }, { "auxiliary_loss_clip": 0.01074682, "auxiliary_loss_mlp": 0.01038818, "balance_loss_clip": 1.04355264, "balance_loss_mlp": 1.02344, "epoch": 0.5135127010371261, "flos": 17454776883840.0, "grad_norm": 1.7132610384814069, "language_loss": 0.746566, "learning_rate": 2.0095417749983724e-06, "loss": 0.76770097, "num_input_tokens_seen": 183648220, "step": 8541, "time_per_iteration": 2.6982674598693848 }, { "auxiliary_loss_clip": 0.01060333, "auxiliary_loss_mlp": 0.01039276, "balance_loss_clip": 1.0412941, "balance_loss_mlp": 1.02475083, "epoch": 0.5135728242897941, "flos": 21944975316480.0, "grad_norm": 1.5289233613121331, "language_loss": 0.70432508, "learning_rate": 2.0091523175708162e-06, "loss": 0.72532117, "num_input_tokens_seen": 183668230, "step": 8542, "time_per_iteration": 2.783440113067627 }, { "auxiliary_loss_clip": 0.01102439, "auxiliary_loss_mlp": 0.01029643, "balance_loss_clip": 1.04426861, "balance_loss_mlp": 1.01601708, "epoch": 0.513632947542462, "flos": 22674859678080.0, "grad_norm": 1.886898343071389, "language_loss": 0.79691696, "learning_rate": 2.0087628597962023e-06, "loss": 0.81823772, "num_input_tokens_seen": 183687800, "step": 8543, "time_per_iteration": 2.906564950942993 }, { "auxiliary_loss_clip": 0.01101285, "auxiliary_loss_mlp": 0.01044679, "balance_loss_clip": 1.04514194, "balance_loss_mlp": 1.03012979, "epoch": 0.51369307079513, "flos": 29457163762560.0, "grad_norm": 1.7217499667212701, "language_loss": 0.67941636, "learning_rate": 2.008373401689299e-06, "loss": 0.700876, "num_input_tokens_seen": 183709025, "step": 8544, "time_per_iteration": 2.815377950668335 }, { "auxiliary_loss_clip": 0.01086355, "auxiliary_loss_mlp": 0.01049073, "balance_loss_clip": 1.03878117, "balance_loss_mlp": 1.03430903, "epoch": 0.513753194047798, "flos": 18989347680000.0, "grad_norm": 2.2112374430559214, "language_loss": 0.72265953, "learning_rate": 2.0079839432648765e-06, "loss": 0.74401385, "num_input_tokens_seen": 183725740, "step": 8545, "time_per_iteration": 2.7677536010742188 }, { "auxiliary_loss_clip": 0.01115821, "auxiliary_loss_mlp": 0.01045255, "balance_loss_clip": 1.04458177, "balance_loss_mlp": 1.03013897, "epoch": 0.513813317300466, "flos": 17821855923840.0, "grad_norm": 2.431720560794894, "language_loss": 0.82277304, "learning_rate": 2.0075944845377016e-06, "loss": 0.84438378, "num_input_tokens_seen": 183743995, "step": 8546, "time_per_iteration": 2.6764519214630127 }, { "auxiliary_loss_clip": 0.01110159, "auxiliary_loss_mlp": 0.01037047, "balance_loss_clip": 1.0421015, "balance_loss_mlp": 1.02272379, "epoch": 0.5138734405531339, "flos": 24061191045120.0, "grad_norm": 1.829642419824105, "language_loss": 0.73038638, "learning_rate": 2.007205025522544e-06, "loss": 0.75185841, "num_input_tokens_seen": 183764150, "step": 8547, "time_per_iteration": 2.664536714553833 }, { "auxiliary_loss_clip": 0.01112692, "auxiliary_loss_mlp": 0.01048016, "balance_loss_clip": 1.04215682, "balance_loss_mlp": 1.03369892, "epoch": 0.5139335638058019, "flos": 26097253574400.0, "grad_norm": 1.6776951969003835, "language_loss": 0.73548347, "learning_rate": 2.0068155662341702e-06, "loss": 0.75709057, "num_input_tokens_seen": 183783280, "step": 8548, "time_per_iteration": 2.6639697551727295 }, { "auxiliary_loss_clip": 0.01086334, "auxiliary_loss_mlp": 0.01037281, "balance_loss_clip": 1.03931546, "balance_loss_mlp": 1.02296984, "epoch": 0.5139936870584698, "flos": 18917095472640.0, "grad_norm": 1.6001321585074282, "language_loss": 0.82261604, "learning_rate": 2.0064261066873495e-06, "loss": 0.84385222, "num_input_tokens_seen": 183800725, "step": 8549, "time_per_iteration": 2.748581886291504 }, { "auxiliary_loss_clip": 0.01115178, "auxiliary_loss_mlp": 0.01033379, "balance_loss_clip": 1.04665935, "balance_loss_mlp": 1.0205524, "epoch": 0.5140538103111378, "flos": 16144001775360.0, "grad_norm": 1.9742432137522015, "language_loss": 0.71977437, "learning_rate": 2.0060366468968504e-06, "loss": 0.74125993, "num_input_tokens_seen": 183818735, "step": 8550, "time_per_iteration": 2.651068687438965 }, { "auxiliary_loss_clip": 0.01112958, "auxiliary_loss_mlp": 0.01041915, "balance_loss_clip": 1.04612732, "balance_loss_mlp": 1.02725196, "epoch": 0.5141139335638057, "flos": 22420145358720.0, "grad_norm": 1.8069208573649895, "language_loss": 0.75043917, "learning_rate": 2.0056471868774408e-06, "loss": 0.77198792, "num_input_tokens_seen": 183840015, "step": 8551, "time_per_iteration": 2.7058589458465576 }, { "auxiliary_loss_clip": 0.01093993, "auxiliary_loss_mlp": 0.01037756, "balance_loss_clip": 1.0459106, "balance_loss_mlp": 1.0240587, "epoch": 0.5141740568164738, "flos": 27089645506560.0, "grad_norm": 1.6630090206247619, "language_loss": 0.69182396, "learning_rate": 2.0052577266438897e-06, "loss": 0.71314144, "num_input_tokens_seen": 183860145, "step": 8552, "time_per_iteration": 2.7040834426879883 }, { "auxiliary_loss_clip": 0.01114038, "auxiliary_loss_mlp": 0.01039378, "balance_loss_clip": 1.04381299, "balance_loss_mlp": 1.02445841, "epoch": 0.5142341800691418, "flos": 24973250209920.0, "grad_norm": 2.1567314432200364, "language_loss": 0.753088, "learning_rate": 2.004868266210965e-06, "loss": 0.7746222, "num_input_tokens_seen": 183880540, "step": 8553, "time_per_iteration": 2.6321310997009277 }, { "auxiliary_loss_clip": 0.01125852, "auxiliary_loss_mlp": 0.0104126, "balance_loss_clip": 1.04767513, "balance_loss_mlp": 1.02800989, "epoch": 0.5142943033218097, "flos": 20704513080960.0, "grad_norm": 1.7807872167537822, "language_loss": 0.67740041, "learning_rate": 2.004478805593435e-06, "loss": 0.69907153, "num_input_tokens_seen": 183900895, "step": 8554, "time_per_iteration": 2.5353291034698486 }, { "auxiliary_loss_clip": 0.01118225, "auxiliary_loss_mlp": 0.01040414, "balance_loss_clip": 1.04483485, "balance_loss_mlp": 1.02390337, "epoch": 0.5143544265744777, "flos": 22925479847040.0, "grad_norm": 1.822401657137422, "language_loss": 0.73321033, "learning_rate": 2.004089344806068e-06, "loss": 0.75479674, "num_input_tokens_seen": 183920335, "step": 8555, "time_per_iteration": 2.8193295001983643 }, { "auxiliary_loss_clip": 0.01089525, "auxiliary_loss_mlp": 0.01039524, "balance_loss_clip": 1.04645813, "balance_loss_mlp": 1.02570128, "epoch": 0.5144145498271456, "flos": 15921391236480.0, "grad_norm": 2.4707318139003327, "language_loss": 0.74175709, "learning_rate": 2.003699883863633e-06, "loss": 0.76304758, "num_input_tokens_seen": 183936220, "step": 8556, "time_per_iteration": 2.721573829650879 }, { "auxiliary_loss_clip": 0.0109284, "auxiliary_loss_mlp": 0.01036355, "balance_loss_clip": 1.04400861, "balance_loss_mlp": 1.02320015, "epoch": 0.5144746730798136, "flos": 19681238430720.0, "grad_norm": 1.790105253554859, "language_loss": 0.85782719, "learning_rate": 2.003310422780898e-06, "loss": 0.87911922, "num_input_tokens_seen": 183953250, "step": 8557, "time_per_iteration": 2.70686674118042 }, { "auxiliary_loss_clip": 0.01106764, "auxiliary_loss_mlp": 0.01043673, "balance_loss_clip": 1.04357624, "balance_loss_mlp": 1.0292908, "epoch": 0.5145347963324816, "flos": 23914711382400.0, "grad_norm": 1.6124493392185149, "language_loss": 0.88770819, "learning_rate": 2.0029209615726307e-06, "loss": 0.90921259, "num_input_tokens_seen": 183973865, "step": 8558, "time_per_iteration": 2.7256360054016113 }, { "auxiliary_loss_clip": 0.01123218, "auxiliary_loss_mlp": 0.00770892, "balance_loss_clip": 1.04631722, "balance_loss_mlp": 1.00014222, "epoch": 0.5145949195851496, "flos": 18260002022400.0, "grad_norm": 2.0888380287595196, "language_loss": 0.65300936, "learning_rate": 2.002531500253602e-06, "loss": 0.67195046, "num_input_tokens_seen": 183992555, "step": 8559, "time_per_iteration": 2.64591646194458 }, { "auxiliary_loss_clip": 0.01108519, "auxiliary_loss_mlp": 0.00771269, "balance_loss_clip": 1.04542136, "balance_loss_mlp": 1.00025797, "epoch": 0.5146550428378175, "flos": 26213425136640.0, "grad_norm": 1.9572467781311524, "language_loss": 0.63094109, "learning_rate": 2.002142038838577e-06, "loss": 0.64973897, "num_input_tokens_seen": 184010825, "step": 8560, "time_per_iteration": 4.225303888320923 }, { "auxiliary_loss_clip": 0.0112394, "auxiliary_loss_mlp": 0.01031949, "balance_loss_clip": 1.04584384, "balance_loss_mlp": 1.01820433, "epoch": 0.5147151660904855, "flos": 22674177319680.0, "grad_norm": 1.85112269234195, "language_loss": 0.70142567, "learning_rate": 2.0017525773423265e-06, "loss": 0.72298455, "num_input_tokens_seen": 184030155, "step": 8561, "time_per_iteration": 2.6462759971618652 }, { "auxiliary_loss_clip": 0.01099376, "auxiliary_loss_mlp": 0.01032154, "balance_loss_clip": 1.04134226, "balance_loss_mlp": 1.01888585, "epoch": 0.5147752893431534, "flos": 24972388283520.0, "grad_norm": 1.6885707870282478, "language_loss": 0.66502726, "learning_rate": 2.0013631157796177e-06, "loss": 0.6863426, "num_input_tokens_seen": 184051440, "step": 8562, "time_per_iteration": 2.6790151596069336 }, { "auxiliary_loss_clip": 0.01118509, "auxiliary_loss_mlp": 0.01035134, "balance_loss_clip": 1.04731929, "balance_loss_mlp": 1.02153838, "epoch": 0.5148354125958214, "flos": 22744669760640.0, "grad_norm": 1.6641105551237323, "language_loss": 0.77625287, "learning_rate": 2.0009736541652188e-06, "loss": 0.79778934, "num_input_tokens_seen": 184070205, "step": 8563, "time_per_iteration": 5.86843466758728 }, { "auxiliary_loss_clip": 0.01117165, "auxiliary_loss_mlp": 0.01035106, "balance_loss_clip": 1.04520798, "balance_loss_mlp": 1.01931095, "epoch": 0.5148955358484893, "flos": 23068763199360.0, "grad_norm": 1.8668644890701778, "language_loss": 0.82346904, "learning_rate": 2.0005841925139e-06, "loss": 0.84499174, "num_input_tokens_seen": 184087345, "step": 8564, "time_per_iteration": 2.6531171798706055 }, { "auxiliary_loss_clip": 0.01105481, "auxiliary_loss_mlp": 0.01035772, "balance_loss_clip": 1.04333782, "balance_loss_mlp": 1.02130592, "epoch": 0.5149556591011574, "flos": 20340127560960.0, "grad_norm": 1.6929228826937828, "language_loss": 0.73255026, "learning_rate": 2.0001947308404283e-06, "loss": 0.75396281, "num_input_tokens_seen": 184107110, "step": 8565, "time_per_iteration": 2.8100740909576416 }, { "auxiliary_loss_clip": 0.0111614, "auxiliary_loss_mlp": 0.01036767, "balance_loss_clip": 1.04448807, "balance_loss_mlp": 1.02056694, "epoch": 0.5150157823538254, "flos": 22638230784000.0, "grad_norm": 2.0356075529568596, "language_loss": 0.68441874, "learning_rate": 1.9998052691595715e-06, "loss": 0.70594788, "num_input_tokens_seen": 184127105, "step": 8566, "time_per_iteration": 4.174206972122192 }, { "auxiliary_loss_clip": 0.01126685, "auxiliary_loss_mlp": 0.00772285, "balance_loss_clip": 1.04328656, "balance_loss_mlp": 1.00031221, "epoch": 0.5150759056064933, "flos": 26067627832320.0, "grad_norm": 1.624621701105177, "language_loss": 0.78153682, "learning_rate": 1.9994158074861005e-06, "loss": 0.80052656, "num_input_tokens_seen": 184148060, "step": 8567, "time_per_iteration": 2.6405906677246094 }, { "auxiliary_loss_clip": 0.01115866, "auxiliary_loss_mlp": 0.01034427, "balance_loss_clip": 1.0444839, "balance_loss_mlp": 1.01929939, "epoch": 0.5151360288591613, "flos": 25952641418880.0, "grad_norm": 2.181301277452511, "language_loss": 0.79243255, "learning_rate": 1.9990263458347806e-06, "loss": 0.81393552, "num_input_tokens_seen": 184166175, "step": 8568, "time_per_iteration": 2.6806869506835938 }, { "auxiliary_loss_clip": 0.01100678, "auxiliary_loss_mlp": 0.01033449, "balance_loss_clip": 1.04264474, "balance_loss_mlp": 1.02017546, "epoch": 0.5151961521118292, "flos": 18507246312960.0, "grad_norm": 2.356580017264164, "language_loss": 0.9131906, "learning_rate": 1.9986368842203825e-06, "loss": 0.93453181, "num_input_tokens_seen": 184182600, "step": 8569, "time_per_iteration": 2.6493630409240723 }, { "auxiliary_loss_clip": 0.01128863, "auxiliary_loss_mlp": 0.01034527, "balance_loss_clip": 1.04688525, "balance_loss_mlp": 1.0198164, "epoch": 0.5152562753644973, "flos": 22233696837120.0, "grad_norm": 2.0115285980006967, "language_loss": 0.76725376, "learning_rate": 1.998247422657674e-06, "loss": 0.78888762, "num_input_tokens_seen": 184202020, "step": 8570, "time_per_iteration": 2.6327102184295654 }, { "auxiliary_loss_clip": 0.01115897, "auxiliary_loss_mlp": 0.01044719, "balance_loss_clip": 1.04504037, "balance_loss_mlp": 1.02880454, "epoch": 0.5153163986171652, "flos": 38436555047040.0, "grad_norm": 1.735564613465363, "language_loss": 0.73986542, "learning_rate": 1.9978579611614227e-06, "loss": 0.76147163, "num_input_tokens_seen": 184224850, "step": 8571, "time_per_iteration": 2.879904270172119 }, { "auxiliary_loss_clip": 0.01031454, "auxiliary_loss_mlp": 0.01001432, "balance_loss_clip": 1.02375364, "balance_loss_mlp": 1.00009048, "epoch": 0.5153765218698332, "flos": 66384503015040.0, "grad_norm": 0.7786581254678329, "language_loss": 0.52855021, "learning_rate": 1.9974684997463984e-06, "loss": 0.54887909, "num_input_tokens_seen": 184288520, "step": 8572, "time_per_iteration": 3.2987639904022217 }, { "auxiliary_loss_clip": 0.01112833, "auxiliary_loss_mlp": 0.01038733, "balance_loss_clip": 1.04641247, "balance_loss_mlp": 1.02542353, "epoch": 0.5154366451225011, "flos": 24024669891840.0, "grad_norm": 1.82770535610101, "language_loss": 0.76185274, "learning_rate": 1.9970790384273687e-06, "loss": 0.78336841, "num_input_tokens_seen": 184308565, "step": 8573, "time_per_iteration": 2.6767003536224365 }, { "auxiliary_loss_clip": 0.01111651, "auxiliary_loss_mlp": 0.01028763, "balance_loss_clip": 1.04382682, "balance_loss_mlp": 1.01498199, "epoch": 0.5154967683751691, "flos": 23468843859840.0, "grad_norm": 2.7144169534848976, "language_loss": 0.77198601, "learning_rate": 1.996689577219102e-06, "loss": 0.7933901, "num_input_tokens_seen": 184326795, "step": 8574, "time_per_iteration": 2.6607704162597656 }, { "auxiliary_loss_clip": 0.01099994, "auxiliary_loss_mlp": 0.01033635, "balance_loss_clip": 1.04476404, "balance_loss_mlp": 1.02018237, "epoch": 0.515556891627837, "flos": 23805650712960.0, "grad_norm": 3.244613949266341, "language_loss": 0.8558231, "learning_rate": 1.996300116136367e-06, "loss": 0.87715936, "num_input_tokens_seen": 184345990, "step": 8575, "time_per_iteration": 2.6699635982513428 }, { "auxiliary_loss_clip": 0.01113561, "auxiliary_loss_mlp": 0.0103516, "balance_loss_clip": 1.04307377, "balance_loss_mlp": 1.02077138, "epoch": 0.515617014880505, "flos": 19828544106240.0, "grad_norm": 1.6301780240264319, "language_loss": 0.76920515, "learning_rate": 1.995910655193932e-06, "loss": 0.79069233, "num_input_tokens_seen": 184366300, "step": 8576, "time_per_iteration": 2.7603139877319336 }, { "auxiliary_loss_clip": 0.01078348, "auxiliary_loss_mlp": 0.00773356, "balance_loss_clip": 1.04196084, "balance_loss_mlp": 1.00032973, "epoch": 0.515677138133173, "flos": 14245907385600.0, "grad_norm": 2.480047069773859, "language_loss": 0.76414418, "learning_rate": 1.9955211944065654e-06, "loss": 0.78266126, "num_input_tokens_seen": 184383030, "step": 8577, "time_per_iteration": 2.694549083709717 }, { "auxiliary_loss_clip": 0.01099471, "auxiliary_loss_mlp": 0.01044811, "balance_loss_clip": 1.04260516, "balance_loss_mlp": 1.0279547, "epoch": 0.515737261385841, "flos": 28289707920000.0, "grad_norm": 1.7162174586848327, "language_loss": 0.80910254, "learning_rate": 1.9951317337890353e-06, "loss": 0.83054537, "num_input_tokens_seen": 184403410, "step": 8578, "time_per_iteration": 2.740527391433716 }, { "auxiliary_loss_clip": 0.01121615, "auxiliary_loss_mlp": 0.01032969, "balance_loss_clip": 1.04364657, "balance_loss_mlp": 1.01914644, "epoch": 0.515797384638509, "flos": 27891925729920.0, "grad_norm": 1.8526777225789184, "language_loss": 0.75880611, "learning_rate": 1.9947422733561105e-06, "loss": 0.780352, "num_input_tokens_seen": 184423830, "step": 8579, "time_per_iteration": 2.6643004417419434 }, { "auxiliary_loss_clip": 0.01087857, "auxiliary_loss_mlp": 0.01032352, "balance_loss_clip": 1.04332745, "balance_loss_mlp": 1.01849377, "epoch": 0.5158575078911769, "flos": 23040071210880.0, "grad_norm": 3.647152473791378, "language_loss": 0.7862978, "learning_rate": 1.994352813122559e-06, "loss": 0.80749989, "num_input_tokens_seen": 184445050, "step": 8580, "time_per_iteration": 2.74796986579895 }, { "auxiliary_loss_clip": 0.01086006, "auxiliary_loss_mlp": 0.0104917, "balance_loss_clip": 1.04050803, "balance_loss_mlp": 1.03265989, "epoch": 0.5159176311438449, "flos": 12641346938880.0, "grad_norm": 2.0718752995567966, "language_loss": 0.73151392, "learning_rate": 1.99396335310315e-06, "loss": 0.75286567, "num_input_tokens_seen": 184460775, "step": 8581, "time_per_iteration": 2.6738648414611816 }, { "auxiliary_loss_clip": 0.01114558, "auxiliary_loss_mlp": 0.01033417, "balance_loss_clip": 1.0463438, "balance_loss_mlp": 1.01976788, "epoch": 0.5159777543965128, "flos": 15558154951680.0, "grad_norm": 2.080206363710033, "language_loss": 0.74150515, "learning_rate": 1.9935738933126508e-06, "loss": 0.76298487, "num_input_tokens_seen": 184477365, "step": 8582, "time_per_iteration": 2.649186134338379 }, { "auxiliary_loss_clip": 0.01085634, "auxiliary_loss_mlp": 0.0103519, "balance_loss_clip": 1.04351485, "balance_loss_mlp": 1.02202952, "epoch": 0.5160378776491809, "flos": 23221671396480.0, "grad_norm": 4.912834420865202, "language_loss": 0.65803373, "learning_rate": 1.99318443376583e-06, "loss": 0.67924196, "num_input_tokens_seen": 184497045, "step": 8583, "time_per_iteration": 2.7025017738342285 }, { "auxiliary_loss_clip": 0.0111508, "auxiliary_loss_mlp": 0.01037055, "balance_loss_clip": 1.04503357, "balance_loss_mlp": 1.02199888, "epoch": 0.5160980009018488, "flos": 21944616180480.0, "grad_norm": 1.4135833939266678, "language_loss": 0.76130998, "learning_rate": 1.9927949744774568e-06, "loss": 0.78283131, "num_input_tokens_seen": 184517675, "step": 8584, "time_per_iteration": 2.662471294403076 }, { "auxiliary_loss_clip": 0.01093144, "auxiliary_loss_mlp": 0.01043062, "balance_loss_clip": 1.0425117, "balance_loss_mlp": 1.02877474, "epoch": 0.5161581241545168, "flos": 22784064001920.0, "grad_norm": 2.700643227023907, "language_loss": 0.79112214, "learning_rate": 1.9924055154622983e-06, "loss": 0.81248415, "num_input_tokens_seen": 184537745, "step": 8585, "time_per_iteration": 2.727789878845215 }, { "auxiliary_loss_clip": 0.01105983, "auxiliary_loss_mlp": 0.01033747, "balance_loss_clip": 1.0444293, "balance_loss_mlp": 1.02064013, "epoch": 0.5162182474071847, "flos": 19675384513920.0, "grad_norm": 2.398879690546405, "language_loss": 0.81236124, "learning_rate": 1.9920160567351238e-06, "loss": 0.83375853, "num_input_tokens_seen": 184553630, "step": 8586, "time_per_iteration": 2.6371195316314697 }, { "auxiliary_loss_clip": 0.01106215, "auxiliary_loss_mlp": 0.0103541, "balance_loss_clip": 1.04690671, "balance_loss_mlp": 1.02083015, "epoch": 0.5162783706598527, "flos": 20046198568320.0, "grad_norm": 1.819724898525227, "language_loss": 0.71372288, "learning_rate": 1.991626598310701e-06, "loss": 0.73513913, "num_input_tokens_seen": 184573530, "step": 8587, "time_per_iteration": 2.7760136127471924 }, { "auxiliary_loss_clip": 0.01038098, "auxiliary_loss_mlp": 0.01008101, "balance_loss_clip": 1.02063632, "balance_loss_mlp": 1.00669408, "epoch": 0.5163384939125206, "flos": 69959553713280.0, "grad_norm": 0.7288340121404665, "language_loss": 0.57740283, "learning_rate": 1.9912371402037984e-06, "loss": 0.59786481, "num_input_tokens_seen": 184637875, "step": 8588, "time_per_iteration": 3.183241844177246 }, { "auxiliary_loss_clip": 0.01101129, "auxiliary_loss_mlp": 0.01040283, "balance_loss_clip": 1.04456651, "balance_loss_mlp": 1.02572727, "epoch": 0.5163986171651886, "flos": 17417034668160.0, "grad_norm": 1.7775907605960104, "language_loss": 0.75007761, "learning_rate": 1.990847682429185e-06, "loss": 0.77149177, "num_input_tokens_seen": 184656125, "step": 8589, "time_per_iteration": 2.8228790760040283 }, { "auxiliary_loss_clip": 0.01117201, "auxiliary_loss_mlp": 0.01029876, "balance_loss_clip": 1.04574263, "balance_loss_mlp": 1.01678646, "epoch": 0.5164587404178566, "flos": 21322679166720.0, "grad_norm": 1.76753328713407, "language_loss": 0.67530292, "learning_rate": 1.990458225001627e-06, "loss": 0.69677365, "num_input_tokens_seen": 184675920, "step": 8590, "time_per_iteration": 2.6443076133728027 }, { "auxiliary_loss_clip": 0.0104106, "auxiliary_loss_mlp": 0.01004207, "balance_loss_clip": 1.02416718, "balance_loss_mlp": 1.00274086, "epoch": 0.5165188636705246, "flos": 68057149691520.0, "grad_norm": 1.576071766619913, "language_loss": 0.55832803, "learning_rate": 1.990068767935895e-06, "loss": 0.57878071, "num_input_tokens_seen": 184730520, "step": 8591, "time_per_iteration": 3.062364101409912 }, { "auxiliary_loss_clip": 0.01096175, "auxiliary_loss_mlp": 0.0102813, "balance_loss_clip": 1.04139185, "balance_loss_mlp": 1.01549983, "epoch": 0.5165789869231926, "flos": 19385657412480.0, "grad_norm": 1.5710435869577224, "language_loss": 0.81707442, "learning_rate": 1.9896793112467566e-06, "loss": 0.83831745, "num_input_tokens_seen": 184748340, "step": 8592, "time_per_iteration": 2.6631641387939453 }, { "auxiliary_loss_clip": 0.01108366, "auxiliary_loss_mlp": 0.01031712, "balance_loss_clip": 1.04346967, "balance_loss_mlp": 1.01837873, "epoch": 0.5166391101758605, "flos": 20960197067520.0, "grad_norm": 2.447309188835127, "language_loss": 0.83472121, "learning_rate": 1.989289854948979e-06, "loss": 0.85612202, "num_input_tokens_seen": 184766615, "step": 8593, "time_per_iteration": 2.6486148834228516 }, { "auxiliary_loss_clip": 0.01097046, "auxiliary_loss_mlp": 0.01044386, "balance_loss_clip": 1.04197097, "balance_loss_mlp": 1.02946699, "epoch": 0.5166992334285285, "flos": 29462407148160.0, "grad_norm": 2.3092045349550374, "language_loss": 0.69423366, "learning_rate": 1.9889003990573314e-06, "loss": 0.71564794, "num_input_tokens_seen": 184788075, "step": 8594, "time_per_iteration": 2.7182230949401855 }, { "auxiliary_loss_clip": 0.01082123, "auxiliary_loss_mlp": 0.01030642, "balance_loss_clip": 1.04193354, "balance_loss_mlp": 1.01663446, "epoch": 0.5167593566811964, "flos": 20304360593280.0, "grad_norm": 1.4197237581629922, "language_loss": 0.77434355, "learning_rate": 1.988510943586582e-06, "loss": 0.79547119, "num_input_tokens_seen": 184808710, "step": 8595, "time_per_iteration": 2.7374019622802734 }, { "auxiliary_loss_clip": 0.01123588, "auxiliary_loss_mlp": 0.01039202, "balance_loss_clip": 1.0457046, "balance_loss_mlp": 1.02551079, "epoch": 0.5168194799338645, "flos": 14611370313600.0, "grad_norm": 1.5026096017220443, "language_loss": 0.650635, "learning_rate": 1.9881214885514986e-06, "loss": 0.67226291, "num_input_tokens_seen": 184826475, "step": 8596, "time_per_iteration": 2.581263542175293 }, { "auxiliary_loss_clip": 0.01083842, "auxiliary_loss_mlp": 0.01032453, "balance_loss_clip": 1.0427258, "balance_loss_mlp": 1.01740873, "epoch": 0.5168796031865324, "flos": 25007257411200.0, "grad_norm": 1.5566562133380693, "language_loss": 0.75481033, "learning_rate": 1.9877320339668492e-06, "loss": 0.77597326, "num_input_tokens_seen": 184845245, "step": 8597, "time_per_iteration": 2.741926670074463 }, { "auxiliary_loss_clip": 0.01124007, "auxiliary_loss_mlp": 0.01026784, "balance_loss_clip": 1.04456997, "balance_loss_mlp": 1.01349235, "epoch": 0.5169397264392004, "flos": 26939969533440.0, "grad_norm": 1.5821649734534613, "language_loss": 0.81177652, "learning_rate": 1.987342579847403e-06, "loss": 0.83328438, "num_input_tokens_seen": 184866605, "step": 8598, "time_per_iteration": 2.690035343170166 }, { "auxiliary_loss_clip": 0.01071801, "auxiliary_loss_mlp": 0.01046328, "balance_loss_clip": 1.03745472, "balance_loss_mlp": 1.03122449, "epoch": 0.5169998496918683, "flos": 25407804948480.0, "grad_norm": 1.4930779887062733, "language_loss": 0.75179017, "learning_rate": 1.9869531262079273e-06, "loss": 0.77297151, "num_input_tokens_seen": 184886945, "step": 8599, "time_per_iteration": 2.8392081260681152 }, { "auxiliary_loss_clip": 0.01105064, "auxiliary_loss_mlp": 0.01033083, "balance_loss_clip": 1.04534984, "balance_loss_mlp": 1.02013683, "epoch": 0.5170599729445363, "flos": 24680793674880.0, "grad_norm": 2.7626803107212825, "language_loss": 0.72095126, "learning_rate": 1.9865636730631904e-06, "loss": 0.7423327, "num_input_tokens_seen": 184905590, "step": 8600, "time_per_iteration": 4.393568515777588 }, { "auxiliary_loss_clip": 0.01085277, "auxiliary_loss_mlp": 0.01034751, "balance_loss_clip": 1.03932548, "balance_loss_mlp": 1.02074337, "epoch": 0.5171200961972042, "flos": 20994455664000.0, "grad_norm": 1.381905387614244, "language_loss": 0.73886168, "learning_rate": 1.9861742204279602e-06, "loss": 0.76006198, "num_input_tokens_seen": 184925555, "step": 8601, "time_per_iteration": 2.7736306190490723 }, { "auxiliary_loss_clip": 0.01114158, "auxiliary_loss_mlp": 0.01040835, "balance_loss_clip": 1.04510868, "balance_loss_mlp": 1.02620816, "epoch": 0.5171802194498722, "flos": 22745639427840.0, "grad_norm": 2.1013626788591817, "language_loss": 0.83703583, "learning_rate": 1.9857847683170045e-06, "loss": 0.85858572, "num_input_tokens_seen": 184944490, "step": 8602, "time_per_iteration": 4.306191444396973 }, { "auxiliary_loss_clip": 0.01124659, "auxiliary_loss_mlp": 0.01033871, "balance_loss_clip": 1.04496753, "balance_loss_mlp": 1.01937509, "epoch": 0.5172403427025402, "flos": 28176732668160.0, "grad_norm": 1.7451034136925476, "language_loss": 0.74647379, "learning_rate": 1.9853953167450926e-06, "loss": 0.76805902, "num_input_tokens_seen": 184963190, "step": 8603, "time_per_iteration": 2.73425030708313 }, { "auxiliary_loss_clip": 0.01101467, "auxiliary_loss_mlp": 0.01037433, "balance_loss_clip": 1.04518127, "balance_loss_mlp": 1.02431369, "epoch": 0.5173004659552082, "flos": 20337829090560.0, "grad_norm": 2.1792209860390503, "language_loss": 0.72349811, "learning_rate": 1.9850058657269915e-06, "loss": 0.74488711, "num_input_tokens_seen": 184981220, "step": 8604, "time_per_iteration": 2.740248441696167 }, { "auxiliary_loss_clip": 0.01107237, "auxiliary_loss_mlp": 0.01042176, "balance_loss_clip": 1.04422593, "balance_loss_mlp": 1.02716208, "epoch": 0.5173605892078762, "flos": 19063323740160.0, "grad_norm": 1.7719196350127329, "language_loss": 0.85052991, "learning_rate": 1.984616415277469e-06, "loss": 0.87202406, "num_input_tokens_seen": 184998810, "step": 8605, "time_per_iteration": 4.264687538146973 }, { "auxiliary_loss_clip": 0.01107777, "auxiliary_loss_mlp": 0.01027945, "balance_loss_clip": 1.04396403, "balance_loss_mlp": 1.01552308, "epoch": 0.5174207124605441, "flos": 27995168396160.0, "grad_norm": 1.6794634480750013, "language_loss": 0.64467752, "learning_rate": 1.984226965411294e-06, "loss": 0.6660347, "num_input_tokens_seen": 185021185, "step": 8606, "time_per_iteration": 2.7390646934509277 }, { "auxiliary_loss_clip": 0.01096289, "auxiliary_loss_mlp": 0.01031967, "balance_loss_clip": 1.04330635, "balance_loss_mlp": 1.01885414, "epoch": 0.5174808357132121, "flos": 19496657416320.0, "grad_norm": 1.503605725156866, "language_loss": 0.77918422, "learning_rate": 1.983837516143234e-06, "loss": 0.80046678, "num_input_tokens_seen": 185038465, "step": 8607, "time_per_iteration": 2.718864917755127 }, { "auxiliary_loss_clip": 0.01114878, "auxiliary_loss_mlp": 0.01036994, "balance_loss_clip": 1.04531431, "balance_loss_mlp": 1.0226177, "epoch": 0.51754095896588, "flos": 22784171742720.0, "grad_norm": 2.7158797821524585, "language_loss": 0.72334993, "learning_rate": 1.983448067488057e-06, "loss": 0.74486864, "num_input_tokens_seen": 185057340, "step": 8608, "time_per_iteration": 2.767817258834839 }, { "auxiliary_loss_clip": 0.01119837, "auxiliary_loss_mlp": 0.01034295, "balance_loss_clip": 1.04469681, "balance_loss_mlp": 1.01979923, "epoch": 0.5176010822185481, "flos": 22669257156480.0, "grad_norm": 1.8609844806921267, "language_loss": 0.8623482, "learning_rate": 1.983058619460531e-06, "loss": 0.88388956, "num_input_tokens_seen": 185074935, "step": 8609, "time_per_iteration": 2.8063855171203613 }, { "auxiliary_loss_clip": 0.01111694, "auxiliary_loss_mlp": 0.01037765, "balance_loss_clip": 1.04306316, "balance_loss_mlp": 1.02484906, "epoch": 0.517661205471216, "flos": 23951196622080.0, "grad_norm": 2.050130502752804, "language_loss": 0.73473549, "learning_rate": 1.9826691720754237e-06, "loss": 0.75623012, "num_input_tokens_seen": 185095050, "step": 8610, "time_per_iteration": 2.740083694458008 }, { "auxiliary_loss_clip": 0.01129954, "auxiliary_loss_mlp": 0.01038598, "balance_loss_clip": 1.04616904, "balance_loss_mlp": 1.02353036, "epoch": 0.517721328723884, "flos": 15596076735360.0, "grad_norm": 2.3590336184711926, "language_loss": 0.67205131, "learning_rate": 1.9822797253475034e-06, "loss": 0.69373685, "num_input_tokens_seen": 185112275, "step": 8611, "time_per_iteration": 2.648165464401245 }, { "auxiliary_loss_clip": 0.01122336, "auxiliary_loss_mlp": 0.01039403, "balance_loss_clip": 1.0434556, "balance_loss_mlp": 1.02535403, "epoch": 0.5177814519765519, "flos": 20960197067520.0, "grad_norm": 2.3905761842565485, "language_loss": 0.77420157, "learning_rate": 1.9818902792915373e-06, "loss": 0.79581904, "num_input_tokens_seen": 185132165, "step": 8612, "time_per_iteration": 2.663339376449585 }, { "auxiliary_loss_clip": 0.01114318, "auxiliary_loss_mlp": 0.01040798, "balance_loss_clip": 1.04297137, "balance_loss_mlp": 1.02688015, "epoch": 0.5178415752292199, "flos": 17967832796160.0, "grad_norm": 2.1474229546439174, "language_loss": 0.8168264, "learning_rate": 1.981500833922294e-06, "loss": 0.83837759, "num_input_tokens_seen": 185151025, "step": 8613, "time_per_iteration": 2.6589057445526123 }, { "auxiliary_loss_clip": 0.01128171, "auxiliary_loss_mlp": 0.01042961, "balance_loss_clip": 1.04804301, "balance_loss_mlp": 1.02832222, "epoch": 0.5179016984818878, "flos": 17821496787840.0, "grad_norm": 2.274335348251239, "language_loss": 0.66216785, "learning_rate": 1.981111389254541e-06, "loss": 0.6838792, "num_input_tokens_seen": 185168455, "step": 8614, "time_per_iteration": 2.692133903503418 }, { "auxiliary_loss_clip": 0.01100612, "auxiliary_loss_mlp": 0.01034486, "balance_loss_clip": 1.04462051, "balance_loss_mlp": 1.01982355, "epoch": 0.5179618217345558, "flos": 17820455293440.0, "grad_norm": 2.0015033819610055, "language_loss": 0.8693983, "learning_rate": 1.9807219453030453e-06, "loss": 0.89074928, "num_input_tokens_seen": 185184415, "step": 8615, "time_per_iteration": 2.690483808517456 }, { "auxiliary_loss_clip": 0.01113112, "auxiliary_loss_mlp": 0.01044655, "balance_loss_clip": 1.04499412, "balance_loss_mlp": 1.03147638, "epoch": 0.5180219449872238, "flos": 22522131048960.0, "grad_norm": 1.8105595259457619, "language_loss": 0.8084923, "learning_rate": 1.9803325020825763e-06, "loss": 0.83007002, "num_input_tokens_seen": 185202910, "step": 8616, "time_per_iteration": 2.6410508155822754 }, { "auxiliary_loss_clip": 0.01120148, "auxiliary_loss_mlp": 0.00772211, "balance_loss_clip": 1.04987717, "balance_loss_mlp": 1.00035763, "epoch": 0.5180820682398918, "flos": 23915465568000.0, "grad_norm": 2.1203191332986675, "language_loss": 0.75104189, "learning_rate": 1.9799430596079e-06, "loss": 0.76996547, "num_input_tokens_seen": 185223085, "step": 8617, "time_per_iteration": 2.6979870796203613 }, { "auxiliary_loss_clip": 0.01126304, "auxiliary_loss_mlp": 0.01042481, "balance_loss_clip": 1.04557788, "balance_loss_mlp": 1.02717435, "epoch": 0.5181421914925598, "flos": 16979930064000.0, "grad_norm": 1.6549706674723104, "language_loss": 0.70240247, "learning_rate": 1.979553617893785e-06, "loss": 0.72409028, "num_input_tokens_seen": 185241295, "step": 8618, "time_per_iteration": 2.6166911125183105 }, { "auxiliary_loss_clip": 0.01038523, "auxiliary_loss_mlp": 0.01004843, "balance_loss_clip": 1.02117562, "balance_loss_mlp": 1.00342429, "epoch": 0.5182023147452277, "flos": 66059870872320.0, "grad_norm": 0.9503620431523022, "language_loss": 0.67223799, "learning_rate": 1.979164176954999e-06, "loss": 0.69267166, "num_input_tokens_seen": 185298295, "step": 8619, "time_per_iteration": 3.186922550201416 }, { "auxiliary_loss_clip": 0.01079843, "auxiliary_loss_mlp": 0.01035858, "balance_loss_clip": 1.04400134, "balance_loss_mlp": 1.02230954, "epoch": 0.5182624379978957, "flos": 18187749815040.0, "grad_norm": 1.8983764009380637, "language_loss": 0.79863739, "learning_rate": 1.97877473680631e-06, "loss": 0.8197943, "num_input_tokens_seen": 185317000, "step": 8620, "time_per_iteration": 2.8446528911590576 }, { "auxiliary_loss_clip": 0.01060893, "auxiliary_loss_mlp": 0.00772403, "balance_loss_clip": 1.04089034, "balance_loss_mlp": 1.00029039, "epoch": 0.5183225612505636, "flos": 14026708638720.0, "grad_norm": 2.0819192927399586, "language_loss": 0.82402205, "learning_rate": 1.9783852974624846e-06, "loss": 0.84235501, "num_input_tokens_seen": 185331185, "step": 8621, "time_per_iteration": 2.753957509994507 }, { "auxiliary_loss_clip": 0.01097265, "auxiliary_loss_mlp": 0.010405, "balance_loss_clip": 1.03958249, "balance_loss_mlp": 1.02750611, "epoch": 0.5183826845032317, "flos": 23659781581440.0, "grad_norm": 2.428940739700658, "language_loss": 0.65491748, "learning_rate": 1.9779958589382905e-06, "loss": 0.67629516, "num_input_tokens_seen": 185348955, "step": 8622, "time_per_iteration": 2.7421741485595703 }, { "auxiliary_loss_clip": 0.01106105, "auxiliary_loss_mlp": 0.01044986, "balance_loss_clip": 1.04371572, "balance_loss_mlp": 1.03016257, "epoch": 0.5184428077558996, "flos": 15888605097600.0, "grad_norm": 2.083884784089921, "language_loss": 0.60552382, "learning_rate": 1.977606421248497e-06, "loss": 0.62703472, "num_input_tokens_seen": 185367330, "step": 8623, "time_per_iteration": 2.690345048904419 }, { "auxiliary_loss_clip": 0.0112578, "auxiliary_loss_mlp": 0.01032047, "balance_loss_clip": 1.04534173, "balance_loss_mlp": 1.01890421, "epoch": 0.5185029310085676, "flos": 21030833162880.0, "grad_norm": 1.609281256747452, "language_loss": 0.76150465, "learning_rate": 1.9772169844078685e-06, "loss": 0.78308284, "num_input_tokens_seen": 185385060, "step": 8624, "time_per_iteration": 2.613788366317749 }, { "auxiliary_loss_clip": 0.0107795, "auxiliary_loss_mlp": 0.01043066, "balance_loss_clip": 1.03900456, "balance_loss_mlp": 1.02859426, "epoch": 0.5185630542612355, "flos": 26542690133760.0, "grad_norm": 2.373822325498003, "language_loss": 0.70952767, "learning_rate": 1.9768275484311756e-06, "loss": 0.73073781, "num_input_tokens_seen": 185403745, "step": 8625, "time_per_iteration": 2.7548205852508545 }, { "auxiliary_loss_clip": 0.01100948, "auxiliary_loss_mlp": 0.0103515, "balance_loss_clip": 1.04119349, "balance_loss_mlp": 1.02260327, "epoch": 0.5186231775139035, "flos": 20668422890880.0, "grad_norm": 1.9009704883002407, "language_loss": 0.67718256, "learning_rate": 1.976438113333184e-06, "loss": 0.69854349, "num_input_tokens_seen": 185422620, "step": 8626, "time_per_iteration": 2.731328248977661 }, { "auxiliary_loss_clip": 0.0111085, "auxiliary_loss_mlp": 0.01033689, "balance_loss_clip": 1.04271841, "balance_loss_mlp": 1.02022982, "epoch": 0.5186833007665714, "flos": 20885502735360.0, "grad_norm": 1.960489278080422, "language_loss": 0.70780122, "learning_rate": 1.9760486791286612e-06, "loss": 0.72924662, "num_input_tokens_seen": 185439380, "step": 8627, "time_per_iteration": 2.6464414596557617 }, { "auxiliary_loss_clip": 0.011279, "auxiliary_loss_mlp": 0.00772067, "balance_loss_clip": 1.04576206, "balance_loss_mlp": 1.00029826, "epoch": 0.5187434240192395, "flos": 20886903365760.0, "grad_norm": 2.0333805073835007, "language_loss": 0.7303592, "learning_rate": 1.9756592458323753e-06, "loss": 0.74935889, "num_input_tokens_seen": 185458830, "step": 8628, "time_per_iteration": 2.7327346801757812 }, { "auxiliary_loss_clip": 0.01102356, "auxiliary_loss_mlp": 0.01031961, "balance_loss_clip": 1.04561651, "balance_loss_mlp": 1.01927686, "epoch": 0.5188035472719074, "flos": 19859929614720.0, "grad_norm": 1.6190117042724865, "language_loss": 0.77354944, "learning_rate": 1.9752698134590927e-06, "loss": 0.79489267, "num_input_tokens_seen": 185477270, "step": 8629, "time_per_iteration": 2.77992582321167 }, { "auxiliary_loss_clip": 0.01115143, "auxiliary_loss_mlp": 0.01034186, "balance_loss_clip": 1.04428935, "balance_loss_mlp": 1.01932621, "epoch": 0.5188636705245754, "flos": 21138313633920.0, "grad_norm": 2.228815370750346, "language_loss": 0.75078702, "learning_rate": 1.9748803820235815e-06, "loss": 0.77228034, "num_input_tokens_seen": 185495795, "step": 8630, "time_per_iteration": 2.6749987602233887 }, { "auxiliary_loss_clip": 0.01112188, "auxiliary_loss_mlp": 0.01038971, "balance_loss_clip": 1.04358792, "balance_loss_mlp": 1.02446306, "epoch": 0.5189237937772434, "flos": 22419786222720.0, "grad_norm": 2.002083188679526, "language_loss": 0.80665708, "learning_rate": 1.9744909515406093e-06, "loss": 0.82816863, "num_input_tokens_seen": 185514885, "step": 8631, "time_per_iteration": 2.7432682514190674 }, { "auxiliary_loss_clip": 0.01114617, "auxiliary_loss_mlp": 0.01034953, "balance_loss_clip": 1.04478788, "balance_loss_mlp": 1.02031374, "epoch": 0.5189839170299113, "flos": 25446696399360.0, "grad_norm": 1.4933919289773454, "language_loss": 0.74756616, "learning_rate": 1.974101522024942e-06, "loss": 0.76906186, "num_input_tokens_seen": 185537155, "step": 8632, "time_per_iteration": 2.726018190383911 }, { "auxiliary_loss_clip": 0.01093075, "auxiliary_loss_mlp": 0.01033441, "balance_loss_clip": 1.04612803, "balance_loss_mlp": 1.01946926, "epoch": 0.5190440402825793, "flos": 18587722734720.0, "grad_norm": 1.8814471450767234, "language_loss": 0.78911304, "learning_rate": 1.9737120934913477e-06, "loss": 0.81037819, "num_input_tokens_seen": 185555520, "step": 8633, "time_per_iteration": 2.715510606765747 }, { "auxiliary_loss_clip": 0.0111596, "auxiliary_loss_mlp": 0.01028973, "balance_loss_clip": 1.04581857, "balance_loss_mlp": 1.01619983, "epoch": 0.5191041635352472, "flos": 21908633731200.0, "grad_norm": 5.606824878452593, "language_loss": 0.80551088, "learning_rate": 1.9733226659545936e-06, "loss": 0.82696015, "num_input_tokens_seen": 185573855, "step": 8634, "time_per_iteration": 2.6477181911468506 }, { "auxiliary_loss_clip": 0.01122619, "auxiliary_loss_mlp": 0.0103901, "balance_loss_clip": 1.04603028, "balance_loss_mlp": 1.02571273, "epoch": 0.5191642867879153, "flos": 27527971173120.0, "grad_norm": 1.5734156514364543, "language_loss": 0.69467652, "learning_rate": 1.9729332394294467e-06, "loss": 0.71629286, "num_input_tokens_seen": 185595145, "step": 8635, "time_per_iteration": 2.713585615158081 }, { "auxiliary_loss_clip": 0.01102259, "auxiliary_loss_mlp": 0.01035772, "balance_loss_clip": 1.0431217, "balance_loss_mlp": 1.02210498, "epoch": 0.5192244100405832, "flos": 15705999331200.0, "grad_norm": 1.6343728145872918, "language_loss": 0.77876496, "learning_rate": 1.9725438139306742e-06, "loss": 0.80014527, "num_input_tokens_seen": 185613320, "step": 8636, "time_per_iteration": 2.6876139640808105 }, { "auxiliary_loss_clip": 0.01127572, "auxiliary_loss_mlp": 0.01032982, "balance_loss_clip": 1.04695189, "balance_loss_mlp": 1.01938009, "epoch": 0.5192845332932512, "flos": 12057080313600.0, "grad_norm": 2.1121159964360596, "language_loss": 0.71433318, "learning_rate": 1.9721543894730425e-06, "loss": 0.73593867, "num_input_tokens_seen": 185630730, "step": 8637, "time_per_iteration": 2.6093368530273438 }, { "auxiliary_loss_clip": 0.01088299, "auxiliary_loss_mlp": 0.01033237, "balance_loss_clip": 1.04357982, "balance_loss_mlp": 1.01999319, "epoch": 0.5193446565459191, "flos": 18953185662720.0, "grad_norm": 2.05486546466365, "language_loss": 0.76026344, "learning_rate": 1.9717649660713194e-06, "loss": 0.78147888, "num_input_tokens_seen": 185648515, "step": 8638, "time_per_iteration": 2.680696725845337 }, { "auxiliary_loss_clip": 0.0109108, "auxiliary_loss_mlp": 0.01028733, "balance_loss_clip": 1.04291189, "balance_loss_mlp": 1.01578116, "epoch": 0.5194047797985871, "flos": 20374960775040.0, "grad_norm": 13.373516582231533, "language_loss": 0.74382144, "learning_rate": 1.971375543740272e-06, "loss": 0.7650196, "num_input_tokens_seen": 185665220, "step": 8639, "time_per_iteration": 4.318557500839233 }, { "auxiliary_loss_clip": 0.01123361, "auxiliary_loss_mlp": 0.01032334, "balance_loss_clip": 1.04529893, "balance_loss_mlp": 1.01838636, "epoch": 0.519464903051255, "flos": 24353001135360.0, "grad_norm": 1.5657899745454023, "language_loss": 0.77311909, "learning_rate": 1.9709861224946665e-06, "loss": 0.79467607, "num_input_tokens_seen": 185683750, "step": 8640, "time_per_iteration": 2.5864639282226562 }, { "auxiliary_loss_clip": 0.01082849, "auxiliary_loss_mlp": 0.0103216, "balance_loss_clip": 1.04260516, "balance_loss_mlp": 1.01930904, "epoch": 0.519525026303923, "flos": 14061829161600.0, "grad_norm": 2.0170540453425714, "language_loss": 0.66183293, "learning_rate": 1.97059670234927e-06, "loss": 0.68298292, "num_input_tokens_seen": 185700625, "step": 8641, "time_per_iteration": 2.692979574203491 }, { "auxiliary_loss_clip": 0.01123177, "auxiliary_loss_mlp": 0.01034193, "balance_loss_clip": 1.04594493, "balance_loss_mlp": 1.02172363, "epoch": 0.519585149556591, "flos": 28835873193600.0, "grad_norm": 1.7554954360005686, "language_loss": 0.76535702, "learning_rate": 1.97020728331885e-06, "loss": 0.78693068, "num_input_tokens_seen": 185721155, "step": 8642, "time_per_iteration": 5.96128249168396 }, { "auxiliary_loss_clip": 0.0112288, "auxiliary_loss_mlp": 0.01031224, "balance_loss_clip": 1.04584873, "balance_loss_mlp": 1.01806307, "epoch": 0.519645272809259, "flos": 25373007648000.0, "grad_norm": 2.255175934024536, "language_loss": 0.83165199, "learning_rate": 1.9698178654181726e-06, "loss": 0.85319304, "num_input_tokens_seen": 185740990, "step": 8643, "time_per_iteration": 2.81384539604187 }, { "auxiliary_loss_clip": 0.01126122, "auxiliary_loss_mlp": 0.01041822, "balance_loss_clip": 1.04520261, "balance_loss_mlp": 1.02785623, "epoch": 0.519705396061927, "flos": 25372863993600.0, "grad_norm": 2.2020503225508645, "language_loss": 0.7044059, "learning_rate": 1.969428448662004e-06, "loss": 0.72608531, "num_input_tokens_seen": 185762235, "step": 8644, "time_per_iteration": 2.7107033729553223 }, { "auxiliary_loss_clip": 0.01111108, "auxiliary_loss_mlp": 0.00770711, "balance_loss_clip": 1.04354811, "balance_loss_mlp": 1.00015676, "epoch": 0.5197655193145949, "flos": 28476228268800.0, "grad_norm": 1.5309653957313616, "language_loss": 0.80272603, "learning_rate": 1.9690390330651133e-06, "loss": 0.82154423, "num_input_tokens_seen": 185783415, "step": 8645, "time_per_iteration": 4.246826171875 }, { "auxiliary_loss_clip": 0.01122573, "auxiliary_loss_mlp": 0.01033869, "balance_loss_clip": 1.04362488, "balance_loss_mlp": 1.02058911, "epoch": 0.5198256425672629, "flos": 20009138711040.0, "grad_norm": 1.7778396167930446, "language_loss": 0.7800498, "learning_rate": 1.968649618642264e-06, "loss": 0.80161417, "num_input_tokens_seen": 185801345, "step": 8646, "time_per_iteration": 2.630892276763916 }, { "auxiliary_loss_clip": 0.01117401, "auxiliary_loss_mlp": 0.01035003, "balance_loss_clip": 1.04832959, "balance_loss_mlp": 1.02218235, "epoch": 0.5198857658199308, "flos": 19828867328640.0, "grad_norm": 1.6794769864367036, "language_loss": 0.65647638, "learning_rate": 1.9682602054082252e-06, "loss": 0.67800039, "num_input_tokens_seen": 185820815, "step": 8647, "time_per_iteration": 2.6543033123016357 }, { "auxiliary_loss_clip": 0.01127292, "auxiliary_loss_mlp": 0.01036653, "balance_loss_clip": 1.04618931, "balance_loss_mlp": 1.02208591, "epoch": 0.5199458890725989, "flos": 24461918150400.0, "grad_norm": 1.7193073170603235, "language_loss": 0.71425897, "learning_rate": 1.967870793377763e-06, "loss": 0.73589844, "num_input_tokens_seen": 185841450, "step": 8648, "time_per_iteration": 2.6632113456726074 }, { "auxiliary_loss_clip": 0.0110717, "auxiliary_loss_mlp": 0.01035133, "balance_loss_clip": 1.0474503, "balance_loss_mlp": 1.02016664, "epoch": 0.5200060123252668, "flos": 23404779953280.0, "grad_norm": 2.0932120653926853, "language_loss": 0.64383608, "learning_rate": 1.967481382565642e-06, "loss": 0.66525912, "num_input_tokens_seen": 185859935, "step": 8649, "time_per_iteration": 2.708676815032959 }, { "auxiliary_loss_clip": 0.01101881, "auxiliary_loss_mlp": 0.01035641, "balance_loss_clip": 1.04480278, "balance_loss_mlp": 1.02039409, "epoch": 0.5200661355779348, "flos": 17201355454080.0, "grad_norm": 2.0779038173518978, "language_loss": 0.70331943, "learning_rate": 1.9670919729866315e-06, "loss": 0.72469461, "num_input_tokens_seen": 185876795, "step": 8650, "time_per_iteration": 2.650996446609497 }, { "auxiliary_loss_clip": 0.01123307, "auxiliary_loss_mlp": 0.01030812, "balance_loss_clip": 1.04483724, "balance_loss_mlp": 1.01754415, "epoch": 0.5201262588306027, "flos": 18515075477760.0, "grad_norm": 1.793577075652819, "language_loss": 0.77560079, "learning_rate": 1.966702564655496e-06, "loss": 0.79714197, "num_input_tokens_seen": 185895570, "step": 8651, "time_per_iteration": 2.6181790828704834 }, { "auxiliary_loss_clip": 0.01068752, "auxiliary_loss_mlp": 0.01040289, "balance_loss_clip": 1.04241145, "balance_loss_mlp": 1.02557862, "epoch": 0.5201863820832707, "flos": 18619395552000.0, "grad_norm": 1.579276828195563, "language_loss": 0.78716815, "learning_rate": 1.966313157587003e-06, "loss": 0.80825853, "num_input_tokens_seen": 185913700, "step": 8652, "time_per_iteration": 2.81169056892395 }, { "auxiliary_loss_clip": 0.01087589, "auxiliary_loss_mlp": 0.0103997, "balance_loss_clip": 1.04238617, "balance_loss_mlp": 1.02496183, "epoch": 0.5202465053359386, "flos": 22857142222080.0, "grad_norm": 2.456126746607985, "language_loss": 0.70069832, "learning_rate": 1.9659237517959187e-06, "loss": 0.7219739, "num_input_tokens_seen": 185932460, "step": 8653, "time_per_iteration": 2.8110082149505615 }, { "auxiliary_loss_clip": 0.01094035, "auxiliary_loss_mlp": 0.01042704, "balance_loss_clip": 1.04702687, "balance_loss_mlp": 1.02864337, "epoch": 0.5203066285886067, "flos": 21981532383360.0, "grad_norm": 1.546190224311193, "language_loss": 0.78555804, "learning_rate": 1.965534347297008e-06, "loss": 0.80692542, "num_input_tokens_seen": 185952030, "step": 8654, "time_per_iteration": 2.8240180015563965 }, { "auxiliary_loss_clip": 0.01115002, "auxiliary_loss_mlp": 0.01046231, "balance_loss_clip": 1.04417038, "balance_loss_mlp": 1.03130579, "epoch": 0.5203667518412746, "flos": 20233329448320.0, "grad_norm": 1.7757606906195533, "language_loss": 0.84137118, "learning_rate": 1.9651449441050393e-06, "loss": 0.86298347, "num_input_tokens_seen": 185973130, "step": 8655, "time_per_iteration": 2.767338752746582 }, { "auxiliary_loss_clip": 0.01113773, "auxiliary_loss_mlp": 0.01038813, "balance_loss_clip": 1.04705739, "balance_loss_mlp": 1.02643943, "epoch": 0.5204268750939426, "flos": 15705460627200.0, "grad_norm": 2.3853440972465227, "language_loss": 0.66374946, "learning_rate": 1.9647555422347777e-06, "loss": 0.68527532, "num_input_tokens_seen": 185990200, "step": 8656, "time_per_iteration": 2.6653099060058594 }, { "auxiliary_loss_clip": 0.01083984, "auxiliary_loss_mlp": 0.01043204, "balance_loss_clip": 1.04517853, "balance_loss_mlp": 1.02981043, "epoch": 0.5204869983466105, "flos": 27449469999360.0, "grad_norm": 1.9804929730339849, "language_loss": 0.73262924, "learning_rate": 1.9643661417009893e-06, "loss": 0.75390112, "num_input_tokens_seen": 186009880, "step": 8657, "time_per_iteration": 2.8447728157043457 }, { "auxiliary_loss_clip": 0.01091042, "auxiliary_loss_mlp": 0.01039275, "balance_loss_clip": 1.0432241, "balance_loss_mlp": 1.02489877, "epoch": 0.5205471215992785, "flos": 20595452411520.0, "grad_norm": 1.769785544944644, "language_loss": 0.71705246, "learning_rate": 1.9639767425184408e-06, "loss": 0.73835564, "num_input_tokens_seen": 186026680, "step": 8658, "time_per_iteration": 2.8423781394958496 }, { "auxiliary_loss_clip": 0.01123437, "auxiliary_loss_mlp": 0.01039751, "balance_loss_clip": 1.04425454, "balance_loss_mlp": 1.02607751, "epoch": 0.5206072448519465, "flos": 22127904305280.0, "grad_norm": 1.7936056694778655, "language_loss": 0.83181685, "learning_rate": 1.963587344701897e-06, "loss": 0.85344875, "num_input_tokens_seen": 186046920, "step": 8659, "time_per_iteration": 2.662799596786499 }, { "auxiliary_loss_clip": 0.01103478, "auxiliary_loss_mlp": 0.01045743, "balance_loss_clip": 1.043998, "balance_loss_mlp": 1.02959061, "epoch": 0.5206673681046144, "flos": 18330422636160.0, "grad_norm": 1.9906097398392346, "language_loss": 0.75777173, "learning_rate": 1.9631979482661253e-06, "loss": 0.77926397, "num_input_tokens_seen": 186062090, "step": 8660, "time_per_iteration": 2.6635682582855225 }, { "auxiliary_loss_clip": 0.01123245, "auxiliary_loss_mlp": 0.01039579, "balance_loss_clip": 1.04523396, "balance_loss_mlp": 1.02638865, "epoch": 0.5207274913572825, "flos": 20230240878720.0, "grad_norm": 1.836365427627734, "language_loss": 0.77897781, "learning_rate": 1.9628085532258906e-06, "loss": 0.80060601, "num_input_tokens_seen": 186081135, "step": 8661, "time_per_iteration": 2.6036980152130127 }, { "auxiliary_loss_clip": 0.01101785, "auxiliary_loss_mlp": 0.0103675, "balance_loss_clip": 1.04206395, "balance_loss_mlp": 1.02354193, "epoch": 0.5207876146099504, "flos": 22127042378880.0, "grad_norm": 1.6821546298299666, "language_loss": 0.70456815, "learning_rate": 1.9624191595959603e-06, "loss": 0.72595346, "num_input_tokens_seen": 186099700, "step": 8662, "time_per_iteration": 2.6941347122192383 }, { "auxiliary_loss_clip": 0.01108537, "auxiliary_loss_mlp": 0.01034478, "balance_loss_clip": 1.04286838, "balance_loss_mlp": 1.01910543, "epoch": 0.5208477378626184, "flos": 23878908501120.0, "grad_norm": 1.571076572398917, "language_loss": 0.69488823, "learning_rate": 1.962029767391098e-06, "loss": 0.71631837, "num_input_tokens_seen": 186119740, "step": 8663, "time_per_iteration": 2.648148536682129 }, { "auxiliary_loss_clip": 0.01096912, "auxiliary_loss_mlp": 0.00772823, "balance_loss_clip": 1.04340351, "balance_loss_mlp": 1.00029683, "epoch": 0.5209078611152863, "flos": 20961525870720.0, "grad_norm": 1.508064062466455, "language_loss": 0.77011776, "learning_rate": 1.961640376626072e-06, "loss": 0.78881508, "num_input_tokens_seen": 186140645, "step": 8664, "time_per_iteration": 2.713656187057495 }, { "auxiliary_loss_clip": 0.01099911, "auxiliary_loss_mlp": 0.01035751, "balance_loss_clip": 1.04555953, "balance_loss_mlp": 1.02207136, "epoch": 0.5209679843679543, "flos": 20667740532480.0, "grad_norm": 2.174055653698437, "language_loss": 0.76443201, "learning_rate": 1.961250987315646e-06, "loss": 0.78578866, "num_input_tokens_seen": 186160130, "step": 8665, "time_per_iteration": 2.6254820823669434 }, { "auxiliary_loss_clip": 0.0111827, "auxiliary_loss_mlp": 0.0103845, "balance_loss_clip": 1.04986227, "balance_loss_mlp": 1.02577186, "epoch": 0.5210281076206222, "flos": 20227295963520.0, "grad_norm": 1.6491776532454103, "language_loss": 0.72156572, "learning_rate": 1.960861599474586e-06, "loss": 0.74313289, "num_input_tokens_seen": 186179485, "step": 8666, "time_per_iteration": 2.680417060852051 }, { "auxiliary_loss_clip": 0.01108853, "auxiliary_loss_mlp": 0.01038135, "balance_loss_clip": 1.04408336, "balance_loss_mlp": 1.02222097, "epoch": 0.5210882308732903, "flos": 16069989801600.0, "grad_norm": 2.5838170040517583, "language_loss": 0.68477565, "learning_rate": 1.9604722131176592e-06, "loss": 0.70624554, "num_input_tokens_seen": 186197140, "step": 8667, "time_per_iteration": 2.665583372116089 }, { "auxiliary_loss_clip": 0.01089337, "auxiliary_loss_mlp": 0.01039011, "balance_loss_clip": 1.05282402, "balance_loss_mlp": 1.02584982, "epoch": 0.5211483541259582, "flos": 24825298089600.0, "grad_norm": 1.3808961063616443, "language_loss": 0.81199509, "learning_rate": 1.960082828259629e-06, "loss": 0.83327854, "num_input_tokens_seen": 186216800, "step": 8668, "time_per_iteration": 2.802410125732422 }, { "auxiliary_loss_clip": 0.01105597, "auxiliary_loss_mlp": 0.01031995, "balance_loss_clip": 1.04507339, "balance_loss_mlp": 1.01803613, "epoch": 0.5212084773786262, "flos": 20370651143040.0, "grad_norm": 2.086648647266329, "language_loss": 0.63722765, "learning_rate": 1.9596934449152623e-06, "loss": 0.65860361, "num_input_tokens_seen": 186235320, "step": 8669, "time_per_iteration": 2.681579113006592 }, { "auxiliary_loss_clip": 0.01102666, "auxiliary_loss_mlp": 0.00771955, "balance_loss_clip": 1.04595864, "balance_loss_mlp": 1.00027704, "epoch": 0.5212686006312941, "flos": 23145468693120.0, "grad_norm": 1.5766402887224458, "language_loss": 0.66502392, "learning_rate": 1.959304063099325e-06, "loss": 0.68377018, "num_input_tokens_seen": 186254460, "step": 8670, "time_per_iteration": 2.7425742149353027 }, { "auxiliary_loss_clip": 0.01085453, "auxiliary_loss_mlp": 0.01033651, "balance_loss_clip": 1.04303861, "balance_loss_mlp": 1.02063334, "epoch": 0.5213287238839621, "flos": 27774030314880.0, "grad_norm": 2.122031398938641, "language_loss": 0.76534224, "learning_rate": 1.9589146828265806e-06, "loss": 0.78653324, "num_input_tokens_seen": 186269465, "step": 8671, "time_per_iteration": 2.7530081272125244 }, { "auxiliary_loss_clip": 0.01096106, "auxiliary_loss_mlp": 0.01041463, "balance_loss_clip": 1.04865241, "balance_loss_mlp": 1.02665734, "epoch": 0.5213888471366301, "flos": 19937676602880.0, "grad_norm": 2.569347871916013, "language_loss": 0.78284293, "learning_rate": 1.958525304111796e-06, "loss": 0.80421865, "num_input_tokens_seen": 186288660, "step": 8672, "time_per_iteration": 2.7782974243164062 }, { "auxiliary_loss_clip": 0.01085385, "auxiliary_loss_mlp": 0.01032995, "balance_loss_clip": 1.04014993, "balance_loss_mlp": 1.02035856, "epoch": 0.521448970389298, "flos": 16982731324800.0, "grad_norm": 1.8835859039826313, "language_loss": 0.72004962, "learning_rate": 1.958135926969736e-06, "loss": 0.74123341, "num_input_tokens_seen": 186305760, "step": 8673, "time_per_iteration": 2.7094011306762695 }, { "auxiliary_loss_clip": 0.01108751, "auxiliary_loss_mlp": 0.01034613, "balance_loss_clip": 1.04249072, "balance_loss_mlp": 1.02049243, "epoch": 0.5215090936419661, "flos": 18989706816000.0, "grad_norm": 1.4914552209414809, "language_loss": 0.74901187, "learning_rate": 1.957746551415166e-06, "loss": 0.77044559, "num_input_tokens_seen": 186324135, "step": 8674, "time_per_iteration": 2.6582236289978027 }, { "auxiliary_loss_clip": 0.01097767, "auxiliary_loss_mlp": 0.0103511, "balance_loss_clip": 1.0421474, "balance_loss_mlp": 1.02030408, "epoch": 0.521569216894634, "flos": 16143427157760.0, "grad_norm": 2.0310628766426615, "language_loss": 0.86121237, "learning_rate": 1.9573571774628506e-06, "loss": 0.88254112, "num_input_tokens_seen": 186340205, "step": 8675, "time_per_iteration": 2.659674882888794 }, { "auxiliary_loss_clip": 0.01022959, "auxiliary_loss_mlp": 0.01006796, "balance_loss_clip": 1.01756668, "balance_loss_mlp": 1.00524664, "epoch": 0.521629340147302, "flos": 57579493282560.0, "grad_norm": 0.8681331347139113, "language_loss": 0.63129932, "learning_rate": 1.9569678051275556e-06, "loss": 0.65159684, "num_input_tokens_seen": 186396940, "step": 8676, "time_per_iteration": 3.205299139022827 }, { "auxiliary_loss_clip": 0.01111064, "auxiliary_loss_mlp": 0.01030098, "balance_loss_clip": 1.04485834, "balance_loss_mlp": 1.0172416, "epoch": 0.5216894633999699, "flos": 26796901662720.0, "grad_norm": 1.5700830686566873, "language_loss": 0.68696839, "learning_rate": 1.956578434424046e-06, "loss": 0.70837998, "num_input_tokens_seen": 186418680, "step": 8677, "time_per_iteration": 2.7582013607025146 }, { "auxiliary_loss_clip": 0.0111011, "auxiliary_loss_mlp": 0.01032255, "balance_loss_clip": 1.04261422, "balance_loss_mlp": 1.01857519, "epoch": 0.5217495866526379, "flos": 26358719650560.0, "grad_norm": 1.8246312930355708, "language_loss": 0.65474886, "learning_rate": 1.956189065367086e-06, "loss": 0.67617249, "num_input_tokens_seen": 186438265, "step": 8678, "time_per_iteration": 4.216279029846191 }, { "auxiliary_loss_clip": 0.01101119, "auxiliary_loss_mlp": 0.01036814, "balance_loss_clip": 1.03927827, "balance_loss_mlp": 1.02188301, "epoch": 0.5218097099053058, "flos": 23584009841280.0, "grad_norm": 2.0476762683914287, "language_loss": 0.67981493, "learning_rate": 1.9557996979714414e-06, "loss": 0.70119429, "num_input_tokens_seen": 186456870, "step": 8679, "time_per_iteration": 2.7411186695098877 }, { "auxiliary_loss_clip": 0.01125585, "auxiliary_loss_mlp": 0.01038661, "balance_loss_clip": 1.04630351, "balance_loss_mlp": 1.02463043, "epoch": 0.5218698331579739, "flos": 18077396256000.0, "grad_norm": 1.6988813784316565, "language_loss": 0.66861475, "learning_rate": 1.9554103322518764e-06, "loss": 0.69025725, "num_input_tokens_seen": 186476425, "step": 8680, "time_per_iteration": 2.656953811645508 }, { "auxiliary_loss_clip": 0.0112586, "auxiliary_loss_mlp": 0.01039387, "balance_loss_clip": 1.04645705, "balance_loss_mlp": 1.02533197, "epoch": 0.5219299564106418, "flos": 19281121856640.0, "grad_norm": 2.024829019659845, "language_loss": 0.83280826, "learning_rate": 1.955020968223156e-06, "loss": 0.85446072, "num_input_tokens_seen": 186492555, "step": 8681, "time_per_iteration": 4.351206541061401 }, { "auxiliary_loss_clip": 0.01098299, "auxiliary_loss_mlp": 0.01033401, "balance_loss_clip": 1.0424881, "balance_loss_mlp": 1.02001929, "epoch": 0.5219900796633098, "flos": 26651355753600.0, "grad_norm": 2.0563808347758563, "language_loss": 0.77594543, "learning_rate": 1.9546316059000454e-06, "loss": 0.79726237, "num_input_tokens_seen": 186513190, "step": 8682, "time_per_iteration": 2.836205005645752 }, { "auxiliary_loss_clip": 0.01084257, "auxiliary_loss_mlp": 0.01048472, "balance_loss_clip": 1.03948176, "balance_loss_mlp": 1.03558517, "epoch": 0.5220502029159777, "flos": 34312717382400.0, "grad_norm": 1.4694894100116993, "language_loss": 0.68905342, "learning_rate": 1.9542422452973082e-06, "loss": 0.71038067, "num_input_tokens_seen": 186534830, "step": 8683, "time_per_iteration": 2.8703176975250244 }, { "auxiliary_loss_clip": 0.01091474, "auxiliary_loss_mlp": 0.01042368, "balance_loss_clip": 1.04399586, "balance_loss_mlp": 1.02824771, "epoch": 0.5221103261686457, "flos": 22156488552960.0, "grad_norm": 1.7170989726331638, "language_loss": 0.76116288, "learning_rate": 1.9538528864297104e-06, "loss": 0.78250128, "num_input_tokens_seen": 186554390, "step": 8684, "time_per_iteration": 2.8443922996520996 }, { "auxiliary_loss_clip": 0.0110091, "auxiliary_loss_mlp": 0.00771126, "balance_loss_clip": 1.0387888, "balance_loss_mlp": 1.00024819, "epoch": 0.5221704494213137, "flos": 19208402772480.0, "grad_norm": 1.8259321745961588, "language_loss": 0.75595027, "learning_rate": 1.9534635293120153e-06, "loss": 0.7746706, "num_input_tokens_seen": 186572360, "step": 8685, "time_per_iteration": 4.343646049499512 }, { "auxiliary_loss_clip": 0.01101598, "auxiliary_loss_mlp": 0.01041734, "balance_loss_clip": 1.04539514, "balance_loss_mlp": 1.02856123, "epoch": 0.5222305726739817, "flos": 19354056422400.0, "grad_norm": 1.8098495762940472, "language_loss": 0.80820441, "learning_rate": 1.9530741739589876e-06, "loss": 0.82963777, "num_input_tokens_seen": 186590655, "step": 8686, "time_per_iteration": 2.9524481296539307 }, { "auxiliary_loss_clip": 0.01102372, "auxiliary_loss_mlp": 0.01034624, "balance_loss_clip": 1.04477715, "balance_loss_mlp": 1.02207708, "epoch": 0.5222906959266497, "flos": 27814789272960.0, "grad_norm": 1.5584733304526452, "language_loss": 0.69955659, "learning_rate": 1.9526848203853927e-06, "loss": 0.72092646, "num_input_tokens_seen": 186610345, "step": 8687, "time_per_iteration": 2.8442130088806152 }, { "auxiliary_loss_clip": 0.01119347, "auxiliary_loss_mlp": 0.01033746, "balance_loss_clip": 1.04286504, "balance_loss_mlp": 1.02110982, "epoch": 0.5223508191793176, "flos": 12712988615040.0, "grad_norm": 2.218511460216324, "language_loss": 0.83229095, "learning_rate": 1.9522954686059936e-06, "loss": 0.85382187, "num_input_tokens_seen": 186624360, "step": 8688, "time_per_iteration": 2.6338348388671875 }, { "auxiliary_loss_clip": 0.01111374, "auxiliary_loss_mlp": 0.00771369, "balance_loss_clip": 1.04469848, "balance_loss_mlp": 1.00028682, "epoch": 0.5224109424319856, "flos": 15632238752640.0, "grad_norm": 2.3403806989505744, "language_loss": 0.73484588, "learning_rate": 1.9519061186355558e-06, "loss": 0.75367332, "num_input_tokens_seen": 186638680, "step": 8689, "time_per_iteration": 2.7219626903533936 }, { "auxiliary_loss_clip": 0.01098413, "auxiliary_loss_mlp": 0.01039301, "balance_loss_clip": 1.04080057, "balance_loss_mlp": 1.02569962, "epoch": 0.5224710656846535, "flos": 15742233175680.0, "grad_norm": 1.8348188856486891, "language_loss": 0.83713108, "learning_rate": 1.9515167704888417e-06, "loss": 0.85850823, "num_input_tokens_seen": 186655840, "step": 8690, "time_per_iteration": 2.7358436584472656 }, { "auxiliary_loss_clip": 0.01088108, "auxiliary_loss_mlp": 0.01042101, "balance_loss_clip": 1.04381537, "balance_loss_mlp": 1.0276053, "epoch": 0.5225311889373215, "flos": 26030998938240.0, "grad_norm": 2.015928049267595, "language_loss": 0.79080188, "learning_rate": 1.9511274241806173e-06, "loss": 0.81210393, "num_input_tokens_seen": 186674150, "step": 8691, "time_per_iteration": 2.813861131668091 }, { "auxiliary_loss_clip": 0.01120671, "auxiliary_loss_mlp": 0.01040201, "balance_loss_clip": 1.04700625, "balance_loss_mlp": 1.02552676, "epoch": 0.5225913121899894, "flos": 18369278173440.0, "grad_norm": 2.3023499072102194, "language_loss": 0.76491982, "learning_rate": 1.950738079725646e-06, "loss": 0.78652847, "num_input_tokens_seen": 186690675, "step": 8692, "time_per_iteration": 2.73480224609375 }, { "auxiliary_loss_clip": 0.01108877, "auxiliary_loss_mlp": 0.01039055, "balance_loss_clip": 1.04479527, "balance_loss_mlp": 1.02631116, "epoch": 0.5226514354426575, "flos": 29273516501760.0, "grad_norm": 1.6247734368015925, "language_loss": 0.72325015, "learning_rate": 1.950348737138691e-06, "loss": 0.7447294, "num_input_tokens_seen": 186710380, "step": 8693, "time_per_iteration": 2.782871723175049 }, { "auxiliary_loss_clip": 0.01126187, "auxiliary_loss_mlp": 0.01042643, "balance_loss_clip": 1.04384446, "balance_loss_mlp": 1.02753901, "epoch": 0.5227115586953254, "flos": 22853299466880.0, "grad_norm": 7.53216872329228, "language_loss": 0.8220976, "learning_rate": 1.949959396434517e-06, "loss": 0.84378588, "num_input_tokens_seen": 186729135, "step": 8694, "time_per_iteration": 2.6748385429382324 }, { "auxiliary_loss_clip": 0.01013741, "auxiliary_loss_mlp": 0.01003883, "balance_loss_clip": 1.02031374, "balance_loss_mlp": 1.00224972, "epoch": 0.5227716819479934, "flos": 57474419022720.0, "grad_norm": 0.775564151874101, "language_loss": 0.55647832, "learning_rate": 1.949570057627888e-06, "loss": 0.57665455, "num_input_tokens_seen": 186791115, "step": 8695, "time_per_iteration": 3.345134973526001 }, { "auxiliary_loss_clip": 0.01061261, "auxiliary_loss_mlp": 0.01041707, "balance_loss_clip": 1.04356098, "balance_loss_mlp": 1.0283134, "epoch": 0.5228318052006613, "flos": 13808264077440.0, "grad_norm": 1.8671615474987673, "language_loss": 0.732638, "learning_rate": 1.9491807207335672e-06, "loss": 0.75366765, "num_input_tokens_seen": 186808660, "step": 8696, "time_per_iteration": 2.782350540161133 }, { "auxiliary_loss_clip": 0.01099328, "auxiliary_loss_mlp": 0.01039177, "balance_loss_clip": 1.0429219, "balance_loss_mlp": 1.02538478, "epoch": 0.5228919284533293, "flos": 15596184476160.0, "grad_norm": 1.7190001113055795, "language_loss": 0.71068561, "learning_rate": 1.948791385766319e-06, "loss": 0.73207062, "num_input_tokens_seen": 186825900, "step": 8697, "time_per_iteration": 2.781651735305786 }, { "auxiliary_loss_clip": 0.01092255, "auxiliary_loss_mlp": 0.01037704, "balance_loss_clip": 1.04413819, "balance_loss_mlp": 1.02498996, "epoch": 0.5229520517059973, "flos": 22491499726080.0, "grad_norm": 1.9475868659159346, "language_loss": 0.80332339, "learning_rate": 1.948402052740906e-06, "loss": 0.82462299, "num_input_tokens_seen": 186843735, "step": 8698, "time_per_iteration": 2.7078070640563965 }, { "auxiliary_loss_clip": 0.01110911, "auxiliary_loss_mlp": 0.01038923, "balance_loss_clip": 1.04286766, "balance_loss_mlp": 1.02576292, "epoch": 0.5230121749586653, "flos": 22090880361600.0, "grad_norm": 1.6510046342053804, "language_loss": 0.74265802, "learning_rate": 1.948012721672093e-06, "loss": 0.7641564, "num_input_tokens_seen": 186862440, "step": 8699, "time_per_iteration": 2.667205333709717 }, { "auxiliary_loss_clip": 0.01113513, "auxiliary_loss_mlp": 0.00773315, "balance_loss_clip": 1.04171407, "balance_loss_mlp": 1.00029182, "epoch": 0.5230722982113333, "flos": 22127150119680.0, "grad_norm": 1.8535119798273105, "language_loss": 0.73102427, "learning_rate": 1.947623392574642e-06, "loss": 0.74989247, "num_input_tokens_seen": 186880940, "step": 8700, "time_per_iteration": 2.7250688076019287 }, { "auxiliary_loss_clip": 0.01100202, "auxiliary_loss_mlp": 0.01039746, "balance_loss_clip": 1.04480553, "balance_loss_mlp": 1.02510738, "epoch": 0.5231324214640012, "flos": 25009268572800.0, "grad_norm": 1.8378710861613805, "language_loss": 0.67156309, "learning_rate": 1.947234065463318e-06, "loss": 0.69296253, "num_input_tokens_seen": 186900785, "step": 8701, "time_per_iteration": 2.830300807952881 }, { "auxiliary_loss_clip": 0.0110603, "auxiliary_loss_mlp": 0.00771586, "balance_loss_clip": 1.04569697, "balance_loss_mlp": 1.0002594, "epoch": 0.5231925447166692, "flos": 25740517651200.0, "grad_norm": 1.7245960424067608, "language_loss": 0.66710031, "learning_rate": 1.9468447403528826e-06, "loss": 0.68587643, "num_input_tokens_seen": 186920895, "step": 8702, "time_per_iteration": 2.725583791732788 }, { "auxiliary_loss_clip": 0.01100659, "auxiliary_loss_mlp": 0.01039254, "balance_loss_clip": 1.04362679, "balance_loss_mlp": 1.02464485, "epoch": 0.5232526679693371, "flos": 21433930565760.0, "grad_norm": 1.7906940342438376, "language_loss": 0.76647937, "learning_rate": 1.946455417258101e-06, "loss": 0.78787845, "num_input_tokens_seen": 186940605, "step": 8703, "time_per_iteration": 2.7585973739624023 }, { "auxiliary_loss_clip": 0.01117607, "auxiliary_loss_mlp": 0.01043637, "balance_loss_clip": 1.04529738, "balance_loss_mlp": 1.02807403, "epoch": 0.5233127912220051, "flos": 35298393471360.0, "grad_norm": 2.3077994186551036, "language_loss": 0.76945215, "learning_rate": 1.9460660961937348e-06, "loss": 0.79106462, "num_input_tokens_seen": 186960820, "step": 8704, "time_per_iteration": 2.8613169193267822 }, { "auxiliary_loss_clip": 0.01102832, "auxiliary_loss_mlp": 0.0104096, "balance_loss_clip": 1.04692268, "balance_loss_mlp": 1.02798438, "epoch": 0.523372914474673, "flos": 17051320344960.0, "grad_norm": 1.8023730932949449, "language_loss": 0.78725791, "learning_rate": 1.9456767771745474e-06, "loss": 0.80869591, "num_input_tokens_seen": 186976240, "step": 8705, "time_per_iteration": 2.741025924682617 }, { "auxiliary_loss_clip": 0.01106252, "auxiliary_loss_mlp": 0.01037077, "balance_loss_clip": 1.04467177, "balance_loss_mlp": 1.02273059, "epoch": 0.5234330377273411, "flos": 18406302117120.0, "grad_norm": 2.80572723928073, "language_loss": 0.69824338, "learning_rate": 1.9452874602153027e-06, "loss": 0.71967667, "num_input_tokens_seen": 186992855, "step": 8706, "time_per_iteration": 2.6872975826263428 }, { "auxiliary_loss_clip": 0.01035877, "auxiliary_loss_mlp": 0.01013693, "balance_loss_clip": 1.01881003, "balance_loss_mlp": 1.01213157, "epoch": 0.523493160980009, "flos": 65850296970240.0, "grad_norm": 0.6808139995313122, "language_loss": 0.52465838, "learning_rate": 1.9448981453307623e-06, "loss": 0.54515409, "num_input_tokens_seen": 187051205, "step": 8707, "time_per_iteration": 3.2341713905334473 }, { "auxiliary_loss_clip": 0.01098509, "auxiliary_loss_mlp": 0.0103739, "balance_loss_clip": 1.04139447, "balance_loss_mlp": 1.02380002, "epoch": 0.523553284232677, "flos": 21872076664320.0, "grad_norm": 1.6877057679435725, "language_loss": 0.74618769, "learning_rate": 1.9445088325356904e-06, "loss": 0.76754665, "num_input_tokens_seen": 187070540, "step": 8708, "time_per_iteration": 2.8342666625976562 }, { "auxiliary_loss_clip": 0.0109528, "auxiliary_loss_mlp": 0.01031158, "balance_loss_clip": 1.04457259, "balance_loss_mlp": 1.01772881, "epoch": 0.5236134074853449, "flos": 20848191482880.0, "grad_norm": 1.566541485414049, "language_loss": 0.7730183, "learning_rate": 1.944119521844849e-06, "loss": 0.79428267, "num_input_tokens_seen": 187089975, "step": 8709, "time_per_iteration": 2.708807945251465 }, { "auxiliary_loss_clip": 0.01074175, "auxiliary_loss_mlp": 0.0103878, "balance_loss_clip": 1.03733826, "balance_loss_mlp": 1.02211428, "epoch": 0.5236735307380129, "flos": 25520421064320.0, "grad_norm": 2.041376547108184, "language_loss": 0.83508044, "learning_rate": 1.9437302132730003e-06, "loss": 0.85620999, "num_input_tokens_seen": 187108775, "step": 8710, "time_per_iteration": 2.7781410217285156 }, { "auxiliary_loss_clip": 0.01093974, "auxiliary_loss_mlp": 0.01031634, "balance_loss_clip": 1.04229414, "balance_loss_mlp": 1.01794267, "epoch": 0.523733653990681, "flos": 23583112001280.0, "grad_norm": 2.2254848949827983, "language_loss": 0.69715381, "learning_rate": 1.943340906834908e-06, "loss": 0.7184099, "num_input_tokens_seen": 187128830, "step": 8711, "time_per_iteration": 2.7991995811462402 }, { "auxiliary_loss_clip": 0.01114283, "auxiliary_loss_mlp": 0.01039219, "balance_loss_clip": 1.04482269, "balance_loss_mlp": 1.02475893, "epoch": 0.5237937772433489, "flos": 21106245767040.0, "grad_norm": 2.0479693285364764, "language_loss": 0.8319692, "learning_rate": 1.9429516025453345e-06, "loss": 0.85350424, "num_input_tokens_seen": 187149570, "step": 8712, "time_per_iteration": 2.6913018226623535 }, { "auxiliary_loss_clip": 0.01126488, "auxiliary_loss_mlp": 0.01042299, "balance_loss_clip": 1.04477775, "balance_loss_mlp": 1.02704, "epoch": 0.5238539004960169, "flos": 19172887200000.0, "grad_norm": 2.12392132979159, "language_loss": 0.69795638, "learning_rate": 1.9425623004190415e-06, "loss": 0.71964419, "num_input_tokens_seen": 187170575, "step": 8713, "time_per_iteration": 2.6037533283233643 }, { "auxiliary_loss_clip": 0.01087813, "auxiliary_loss_mlp": 0.01040708, "balance_loss_clip": 1.03908944, "balance_loss_mlp": 1.02369666, "epoch": 0.5239140237486848, "flos": 17888218300800.0, "grad_norm": 2.8914750795344233, "language_loss": 0.76703346, "learning_rate": 1.9421730004707925e-06, "loss": 0.78831869, "num_input_tokens_seen": 187187190, "step": 8714, "time_per_iteration": 2.717984676361084 }, { "auxiliary_loss_clip": 0.01086969, "auxiliary_loss_mlp": 0.01044306, "balance_loss_clip": 1.0413481, "balance_loss_mlp": 1.02729511, "epoch": 0.5239741470013528, "flos": 17930413802880.0, "grad_norm": 1.9287276707329408, "language_loss": 0.7608462, "learning_rate": 1.9417837027153483e-06, "loss": 0.78215897, "num_input_tokens_seen": 187204350, "step": 8715, "time_per_iteration": 2.6999671459198 }, { "auxiliary_loss_clip": 0.01099192, "auxiliary_loss_mlp": 0.01035578, "balance_loss_clip": 1.0417552, "balance_loss_mlp": 1.02110636, "epoch": 0.5240342702540207, "flos": 30993386584320.0, "grad_norm": 2.1294970054785622, "language_loss": 0.71165496, "learning_rate": 1.9413944071674723e-06, "loss": 0.73300266, "num_input_tokens_seen": 187225605, "step": 8716, "time_per_iteration": 2.744347333908081 }, { "auxiliary_loss_clip": 0.01121973, "auxiliary_loss_mlp": 0.0103854, "balance_loss_clip": 1.04380643, "balance_loss_mlp": 1.02563596, "epoch": 0.5240943935066887, "flos": 25005066681600.0, "grad_norm": 3.2118480553546087, "language_loss": 0.87086689, "learning_rate": 1.941005113841926e-06, "loss": 0.89247203, "num_input_tokens_seen": 187241335, "step": 8717, "time_per_iteration": 4.158156394958496 }, { "auxiliary_loss_clip": 0.01109045, "auxiliary_loss_mlp": 0.01035747, "balance_loss_clip": 1.0454371, "balance_loss_mlp": 1.02164412, "epoch": 0.5241545167593566, "flos": 23659099223040.0, "grad_norm": 1.880090780763199, "language_loss": 0.61121464, "learning_rate": 1.9406158227534723e-06, "loss": 0.63266253, "num_input_tokens_seen": 187259925, "step": 8718, "time_per_iteration": 2.671760320663452 }, { "auxiliary_loss_clip": 0.01094217, "auxiliary_loss_mlp": 0.010389, "balance_loss_clip": 1.04272294, "balance_loss_mlp": 1.02387953, "epoch": 0.5242146400120247, "flos": 23400398494080.0, "grad_norm": 1.8098933087704439, "language_loss": 0.72060192, "learning_rate": 1.940226533916872e-06, "loss": 0.74193311, "num_input_tokens_seen": 187279035, "step": 8719, "time_per_iteration": 2.815864324569702 }, { "auxiliary_loss_clip": 0.01109147, "auxiliary_loss_mlp": 0.01029429, "balance_loss_clip": 1.04305363, "balance_loss_mlp": 1.01676893, "epoch": 0.5242747632646926, "flos": 17749065012480.0, "grad_norm": 1.9600898858885738, "language_loss": 0.73258477, "learning_rate": 1.9398372473468877e-06, "loss": 0.7539705, "num_input_tokens_seen": 187297555, "step": 8720, "time_per_iteration": 4.34027624130249 }, { "auxiliary_loss_clip": 0.01110975, "auxiliary_loss_mlp": 0.01037749, "balance_loss_clip": 1.042588, "balance_loss_mlp": 1.02323568, "epoch": 0.5243348865173606, "flos": 32597731549440.0, "grad_norm": 1.7136870064395262, "language_loss": 0.7059021, "learning_rate": 1.939447963058281e-06, "loss": 0.72738934, "num_input_tokens_seen": 187320265, "step": 8721, "time_per_iteration": 4.457958698272705 }, { "auxiliary_loss_clip": 0.01064422, "auxiliary_loss_mlp": 0.0103891, "balance_loss_clip": 1.03628516, "balance_loss_mlp": 1.02399719, "epoch": 0.5243950097700285, "flos": 25484115392640.0, "grad_norm": 1.8741175153878353, "language_loss": 0.86506796, "learning_rate": 1.939058681065813e-06, "loss": 0.88610125, "num_input_tokens_seen": 187338045, "step": 8722, "time_per_iteration": 2.851713180541992 }, { "auxiliary_loss_clip": 0.01122948, "auxiliary_loss_mlp": 0.01033308, "balance_loss_clip": 1.0449574, "balance_loss_mlp": 1.01830578, "epoch": 0.5244551330226965, "flos": 15268391936640.0, "grad_norm": 1.8614764349338224, "language_loss": 0.79853708, "learning_rate": 1.938669401384247e-06, "loss": 0.82009959, "num_input_tokens_seen": 187356040, "step": 8723, "time_per_iteration": 2.567403554916382 }, { "auxiliary_loss_clip": 0.01111191, "auxiliary_loss_mlp": 0.0104214, "balance_loss_clip": 1.04611158, "balance_loss_mlp": 1.02747166, "epoch": 0.5245152562753645, "flos": 22237108629120.0, "grad_norm": 2.070314434964904, "language_loss": 0.75515735, "learning_rate": 1.9382801240283426e-06, "loss": 0.77669066, "num_input_tokens_seen": 187374185, "step": 8724, "time_per_iteration": 4.372815847396851 }, { "auxiliary_loss_clip": 0.01128433, "auxiliary_loss_mlp": 0.01038668, "balance_loss_clip": 1.04391563, "balance_loss_mlp": 1.02228856, "epoch": 0.5245753795280325, "flos": 29426460612480.0, "grad_norm": 1.7393951886603523, "language_loss": 0.70450562, "learning_rate": 1.9378908490128625e-06, "loss": 0.72617668, "num_input_tokens_seen": 187396640, "step": 8725, "time_per_iteration": 2.691462278366089 }, { "auxiliary_loss_clip": 0.01014562, "auxiliary_loss_mlp": 0.0100467, "balance_loss_clip": 1.01748943, "balance_loss_mlp": 1.0025723, "epoch": 0.5246355027807005, "flos": 58834392785280.0, "grad_norm": 0.751972672191828, "language_loss": 0.55635381, "learning_rate": 1.937501576352568e-06, "loss": 0.57654613, "num_input_tokens_seen": 187455945, "step": 8726, "time_per_iteration": 3.2482144832611084 }, { "auxiliary_loss_clip": 0.01023582, "auxiliary_loss_mlp": 0.01000951, "balance_loss_clip": 1.02279115, "balance_loss_mlp": 0.9995268, "epoch": 0.5246956260333684, "flos": 64526592965760.0, "grad_norm": 0.7878423938979384, "language_loss": 0.58313322, "learning_rate": 1.937112306062219e-06, "loss": 0.60337853, "num_input_tokens_seen": 187519975, "step": 8727, "time_per_iteration": 3.2606794834136963 }, { "auxiliary_loss_clip": 0.01114413, "auxiliary_loss_mlp": 0.01036047, "balance_loss_clip": 1.0418663, "balance_loss_mlp": 1.02111006, "epoch": 0.5247557492860364, "flos": 24533631653760.0, "grad_norm": 1.3167349097133665, "language_loss": 0.70678449, "learning_rate": 1.9367230381565786e-06, "loss": 0.72828913, "num_input_tokens_seen": 187541775, "step": 8728, "time_per_iteration": 2.6979823112487793 }, { "auxiliary_loss_clip": 0.01110188, "auxiliary_loss_mlp": 0.01029551, "balance_loss_clip": 1.04107904, "balance_loss_mlp": 1.01636648, "epoch": 0.5248158725387043, "flos": 18806131382400.0, "grad_norm": 1.4052080718589413, "language_loss": 0.69816244, "learning_rate": 1.9363337726504062e-06, "loss": 0.71955991, "num_input_tokens_seen": 187560425, "step": 8729, "time_per_iteration": 2.6898272037506104 }, { "auxiliary_loss_clip": 0.01084395, "auxiliary_loss_mlp": 0.01034673, "balance_loss_clip": 1.04138565, "balance_loss_mlp": 1.02001655, "epoch": 0.5248759957913723, "flos": 20955851521920.0, "grad_norm": 1.9953537122640765, "language_loss": 0.83565557, "learning_rate": 1.935944509558464e-06, "loss": 0.85684621, "num_input_tokens_seen": 187579930, "step": 8730, "time_per_iteration": 2.719953775405884 }, { "auxiliary_loss_clip": 0.01087481, "auxiliary_loss_mlp": 0.01037052, "balance_loss_clip": 1.04011822, "balance_loss_mlp": 1.02177548, "epoch": 0.5249361190440403, "flos": 18660980522880.0, "grad_norm": 2.0205964009231816, "language_loss": 0.79403269, "learning_rate": 1.9355552488955125e-06, "loss": 0.81527805, "num_input_tokens_seen": 187595365, "step": 8731, "time_per_iteration": 2.741563081741333 }, { "auxiliary_loss_clip": 0.01105082, "auxiliary_loss_mlp": 0.01030893, "balance_loss_clip": 1.03996611, "balance_loss_mlp": 1.0172075, "epoch": 0.5249962422967083, "flos": 24863327614080.0, "grad_norm": 1.917738069421625, "language_loss": 0.83558822, "learning_rate": 1.935165990676312e-06, "loss": 0.85694802, "num_input_tokens_seen": 187614715, "step": 8732, "time_per_iteration": 2.672537326812744 }, { "auxiliary_loss_clip": 0.01109755, "auxiliary_loss_mlp": 0.01037546, "balance_loss_clip": 1.04267287, "balance_loss_mlp": 1.0239923, "epoch": 0.5250563655493762, "flos": 15262681674240.0, "grad_norm": 1.7357983281517446, "language_loss": 0.77602309, "learning_rate": 1.9347767349156237e-06, "loss": 0.79749608, "num_input_tokens_seen": 187630745, "step": 8733, "time_per_iteration": 2.651329278945923 }, { "auxiliary_loss_clip": 0.01126312, "auxiliary_loss_mlp": 0.01036227, "balance_loss_clip": 1.04450274, "balance_loss_mlp": 1.02157617, "epoch": 0.5251164888020442, "flos": 18625177641600.0, "grad_norm": 1.892740616554097, "language_loss": 0.8202911, "learning_rate": 1.934387481628208e-06, "loss": 0.84191644, "num_input_tokens_seen": 187648200, "step": 8734, "time_per_iteration": 2.608727216720581 }, { "auxiliary_loss_clip": 0.01091339, "auxiliary_loss_mlp": 0.01028225, "balance_loss_clip": 1.04116642, "balance_loss_mlp": 1.01467109, "epoch": 0.5251766120547121, "flos": 29710764760320.0, "grad_norm": 1.3668287037138613, "language_loss": 0.76932037, "learning_rate": 1.933998230828826e-06, "loss": 0.79051596, "num_input_tokens_seen": 187669205, "step": 8735, "time_per_iteration": 2.703274965286255 }, { "auxiliary_loss_clip": 0.01112983, "auxiliary_loss_mlp": 0.01038692, "balance_loss_clip": 1.04413259, "balance_loss_mlp": 1.02544188, "epoch": 0.5252367353073801, "flos": 23440295525760.0, "grad_norm": 1.7627870360178364, "language_loss": 0.80808437, "learning_rate": 1.9336089825322376e-06, "loss": 0.82960117, "num_input_tokens_seen": 187690890, "step": 8736, "time_per_iteration": 2.6869864463806152 }, { "auxiliary_loss_clip": 0.01124902, "auxiliary_loss_mlp": 0.0103679, "balance_loss_clip": 1.04460597, "balance_loss_mlp": 1.02199018, "epoch": 0.5252968585600482, "flos": 30810708990720.0, "grad_norm": 2.2019442049314626, "language_loss": 0.69824821, "learning_rate": 1.9332197367532033e-06, "loss": 0.71986508, "num_input_tokens_seen": 187713045, "step": 8737, "time_per_iteration": 2.694178342819214 }, { "auxiliary_loss_clip": 0.01101601, "auxiliary_loss_mlp": 0.01038957, "balance_loss_clip": 1.04274702, "balance_loss_mlp": 1.02473521, "epoch": 0.5253569818127161, "flos": 20628274464000.0, "grad_norm": 1.4444028137471083, "language_loss": 0.77386785, "learning_rate": 1.9328304935064833e-06, "loss": 0.79527342, "num_input_tokens_seen": 187733640, "step": 8738, "time_per_iteration": 2.7655301094055176 }, { "auxiliary_loss_clip": 0.01012696, "auxiliary_loss_mlp": 0.00752303, "balance_loss_clip": 1.01498532, "balance_loss_mlp": 0.99995118, "epoch": 0.5254171050653841, "flos": 63428695810560.0, "grad_norm": 0.7418872270660203, "language_loss": 0.54437888, "learning_rate": 1.932441252806837e-06, "loss": 0.56202877, "num_input_tokens_seen": 187792930, "step": 8739, "time_per_iteration": 3.183931350708008 }, { "auxiliary_loss_clip": 0.01093164, "auxiliary_loss_mlp": 0.01039099, "balance_loss_clip": 1.03987527, "balance_loss_mlp": 1.02572989, "epoch": 0.525477228318052, "flos": 34670782108800.0, "grad_norm": 1.6115423077763054, "language_loss": 0.84719479, "learning_rate": 1.9320520146690263e-06, "loss": 0.8685174, "num_input_tokens_seen": 187812495, "step": 8740, "time_per_iteration": 2.8701846599578857 }, { "auxiliary_loss_clip": 0.01106251, "auxiliary_loss_mlp": 0.00771888, "balance_loss_clip": 1.03936994, "balance_loss_mlp": 1.00030541, "epoch": 0.52553735157072, "flos": 17930844766080.0, "grad_norm": 2.112576285349714, "language_loss": 0.69466913, "learning_rate": 1.9316627791078093e-06, "loss": 0.71345055, "num_input_tokens_seen": 187829685, "step": 8741, "time_per_iteration": 2.721233606338501 }, { "auxiliary_loss_clip": 0.01101687, "auxiliary_loss_mlp": 0.0103584, "balance_loss_clip": 1.04140949, "balance_loss_mlp": 1.02171421, "epoch": 0.5255974748233879, "flos": 9940864584960.0, "grad_norm": 1.8031333880336204, "language_loss": 0.66328311, "learning_rate": 1.931273546137947e-06, "loss": 0.68465841, "num_input_tokens_seen": 187846495, "step": 8742, "time_per_iteration": 2.695504903793335 }, { "auxiliary_loss_clip": 0.01086092, "auxiliary_loss_mlp": 0.01042238, "balance_loss_clip": 1.03882444, "balance_loss_mlp": 1.02666903, "epoch": 0.5256575980760559, "flos": 16868427269760.0, "grad_norm": 1.9909144400242709, "language_loss": 0.63219392, "learning_rate": 1.9308843157741983e-06, "loss": 0.65347725, "num_input_tokens_seen": 187862010, "step": 8743, "time_per_iteration": 2.712376832962036 }, { "auxiliary_loss_clip": 0.0102969, "auxiliary_loss_mlp": 0.01008337, "balance_loss_clip": 1.01230693, "balance_loss_mlp": 1.00641751, "epoch": 0.5257177213287239, "flos": 62386210362240.0, "grad_norm": 0.7739828883360421, "language_loss": 0.5410347, "learning_rate": 1.930495088031323e-06, "loss": 0.56141496, "num_input_tokens_seen": 187922730, "step": 8744, "time_per_iteration": 3.281756639480591 }, { "auxiliary_loss_clip": 0.01106094, "auxiliary_loss_mlp": 0.01037818, "balance_loss_clip": 1.04534447, "balance_loss_mlp": 1.02202296, "epoch": 0.5257778445813919, "flos": 20776908942720.0, "grad_norm": 2.5030900138953274, "language_loss": 0.75859022, "learning_rate": 1.9301058629240814e-06, "loss": 0.7800293, "num_input_tokens_seen": 187940160, "step": 8745, "time_per_iteration": 2.642817258834839 }, { "auxiliary_loss_clip": 0.01110515, "auxiliary_loss_mlp": 0.0104281, "balance_loss_clip": 1.04153466, "balance_loss_mlp": 1.02948213, "epoch": 0.5258379678340598, "flos": 17018606033280.0, "grad_norm": 1.7823830080970366, "language_loss": 0.8089028, "learning_rate": 1.9297166404672324e-06, "loss": 0.83043599, "num_input_tokens_seen": 187958625, "step": 8746, "time_per_iteration": 2.5678205490112305 }, { "auxiliary_loss_clip": 0.01108698, "auxiliary_loss_mlp": 0.01036575, "balance_loss_clip": 1.04006267, "balance_loss_mlp": 1.02191806, "epoch": 0.5258980910867278, "flos": 21068754946560.0, "grad_norm": 2.1394959039376475, "language_loss": 0.75231433, "learning_rate": 1.9293274206755353e-06, "loss": 0.77376711, "num_input_tokens_seen": 187977575, "step": 8747, "time_per_iteration": -0.009610652923583984 }, { "auxiliary_loss_clip": 0.0105854, "auxiliary_loss_mlp": 0.01033909, "balance_loss_clip": 1.03949201, "balance_loss_mlp": 1.01987767, "epoch": 0.5259582143393957, "flos": 18004461690240.0, "grad_norm": 2.0175880820051058, "language_loss": 0.82632613, "learning_rate": 1.9289382035637505e-06, "loss": 0.84725058, "num_input_tokens_seen": 187996650, "step": 8748, "time_per_iteration": 2.7604665756225586 }, { "auxiliary_loss_clip": 0.01099486, "auxiliary_loss_mlp": 0.01033119, "balance_loss_clip": 1.03856742, "balance_loss_mlp": 1.01846862, "epoch": 0.5260183375920637, "flos": 22783848520320.0, "grad_norm": 2.328081087853481, "language_loss": 0.80873966, "learning_rate": 1.9285489891466345e-06, "loss": 0.83006573, "num_input_tokens_seen": 188013510, "step": 8749, "time_per_iteration": 2.6853184700012207 }, { "auxiliary_loss_clip": 0.01109749, "auxiliary_loss_mlp": 0.01040189, "balance_loss_clip": 1.04381132, "balance_loss_mlp": 1.02556193, "epoch": 0.5260784608447318, "flos": 27052406081280.0, "grad_norm": 1.7699462129252088, "language_loss": 0.72291499, "learning_rate": 1.9281597774389487e-06, "loss": 0.74441439, "num_input_tokens_seen": 188032085, "step": 8750, "time_per_iteration": 2.6771364212036133 }, { "auxiliary_loss_clip": 0.01098374, "auxiliary_loss_mlp": 0.01037371, "balance_loss_clip": 1.03887165, "balance_loss_mlp": 1.02362585, "epoch": 0.5261385840973997, "flos": 20662820369280.0, "grad_norm": 1.3348346616556535, "language_loss": 0.76186317, "learning_rate": 1.9277705684554517e-06, "loss": 0.78322065, "num_input_tokens_seen": 188050590, "step": 8751, "time_per_iteration": 2.7016804218292236 }, { "auxiliary_loss_clip": 0.01119796, "auxiliary_loss_mlp": 0.01039709, "balance_loss_clip": 1.04339051, "balance_loss_mlp": 1.02622056, "epoch": 0.5261987073500677, "flos": 23622649896960.0, "grad_norm": 1.7424279065253616, "language_loss": 0.75831163, "learning_rate": 1.927381362210902e-06, "loss": 0.77990663, "num_input_tokens_seen": 188071620, "step": 8752, "time_per_iteration": 2.7128703594207764 }, { "auxiliary_loss_clip": 0.01112565, "auxiliary_loss_mlp": 0.01033514, "balance_loss_clip": 1.04177046, "balance_loss_mlp": 1.01780224, "epoch": 0.5262588306027356, "flos": 27636241743360.0, "grad_norm": 2.1757268908288707, "language_loss": 0.67754769, "learning_rate": 1.926992158720058e-06, "loss": 0.69900852, "num_input_tokens_seen": 188091740, "step": 8753, "time_per_iteration": 2.678269147872925 }, { "auxiliary_loss_clip": 0.01111599, "auxiliary_loss_mlp": 0.01034275, "balance_loss_clip": 1.04266751, "balance_loss_mlp": 1.02072084, "epoch": 0.5263189538554036, "flos": 21759711943680.0, "grad_norm": 1.6342208992061138, "language_loss": 0.84114075, "learning_rate": 1.9266029579976785e-06, "loss": 0.86259949, "num_input_tokens_seen": 188111165, "step": 8754, "time_per_iteration": 2.6858248710632324 }, { "auxiliary_loss_clip": 0.01109767, "auxiliary_loss_mlp": 0.01035863, "balance_loss_clip": 1.04159164, "balance_loss_mlp": 1.02159333, "epoch": 0.5263790771080715, "flos": 14276359140480.0, "grad_norm": 2.0064086672514323, "language_loss": 0.87360156, "learning_rate": 1.926213760058522e-06, "loss": 0.89505792, "num_input_tokens_seen": 188127825, "step": 8755, "time_per_iteration": 2.5783674716949463 }, { "auxiliary_loss_clip": 0.01007681, "auxiliary_loss_mlp": 0.01000927, "balance_loss_clip": 1.01328659, "balance_loss_mlp": 0.99918669, "epoch": 0.5264392003607395, "flos": 65806413528960.0, "grad_norm": 0.7404552494369754, "language_loss": 0.5880959, "learning_rate": 1.9258245649173477e-06, "loss": 0.60818201, "num_input_tokens_seen": 188194050, "step": 8756, "time_per_iteration": 3.308302402496338 }, { "auxiliary_loss_clip": 0.01094156, "auxiliary_loss_mlp": 0.01036712, "balance_loss_clip": 1.0415833, "balance_loss_mlp": 1.02182269, "epoch": 0.5264993236134075, "flos": 21032413361280.0, "grad_norm": 1.6572717697992079, "language_loss": 0.70703959, "learning_rate": 1.925435372588913e-06, "loss": 0.72834826, "num_input_tokens_seen": 188212565, "step": 8757, "time_per_iteration": 4.195650100708008 }, { "auxiliary_loss_clip": 0.0110952, "auxiliary_loss_mlp": 0.01040036, "balance_loss_clip": 1.04061294, "balance_loss_mlp": 1.02590346, "epoch": 0.5265594468660755, "flos": 16618202150400.0, "grad_norm": 2.0494500796269577, "language_loss": 0.88039553, "learning_rate": 1.9250461830879768e-06, "loss": 0.90189111, "num_input_tokens_seen": 188229505, "step": 8758, "time_per_iteration": 2.63089656829834 }, { "auxiliary_loss_clip": 0.01061465, "auxiliary_loss_mlp": 0.01037569, "balance_loss_clip": 1.03887105, "balance_loss_mlp": 1.02301979, "epoch": 0.5266195701187434, "flos": 24134125610880.0, "grad_norm": 1.4473751902891179, "language_loss": 0.75895298, "learning_rate": 1.9246569964292965e-06, "loss": 0.77994329, "num_input_tokens_seen": 188250395, "step": 8759, "time_per_iteration": 4.702188968658447 }, { "auxiliary_loss_clip": 0.01098136, "auxiliary_loss_mlp": 0.01030934, "balance_loss_clip": 1.04185557, "balance_loss_mlp": 1.0181073, "epoch": 0.5266796933714114, "flos": 15844111125120.0, "grad_norm": 1.7900777891811301, "language_loss": 0.71485013, "learning_rate": 1.9242678126276307e-06, "loss": 0.73614085, "num_input_tokens_seen": 188266785, "step": 8760, "time_per_iteration": 4.256975412368774 }, { "auxiliary_loss_clip": 0.01098696, "auxiliary_loss_mlp": 0.01040967, "balance_loss_clip": 1.04177952, "balance_loss_mlp": 1.02593493, "epoch": 0.5267398166240793, "flos": 20951434149120.0, "grad_norm": 2.6157951761776697, "language_loss": 0.75801802, "learning_rate": 1.923878631697736e-06, "loss": 0.77941465, "num_input_tokens_seen": 188282525, "step": 8761, "time_per_iteration": 2.685028553009033 }, { "auxiliary_loss_clip": 0.01104735, "auxiliary_loss_mlp": 0.00771727, "balance_loss_clip": 1.03871739, "balance_loss_mlp": 1.00023258, "epoch": 0.5267999398767473, "flos": 20996394998400.0, "grad_norm": 1.8739254444127986, "language_loss": 0.70466101, "learning_rate": 1.923489453654373e-06, "loss": 0.72342563, "num_input_tokens_seen": 188301395, "step": 8762, "time_per_iteration": 2.727120876312256 }, { "auxiliary_loss_clip": 0.01014324, "auxiliary_loss_mlp": 0.00999661, "balance_loss_clip": 1.00980198, "balance_loss_mlp": 0.99816543, "epoch": 0.5268600631294152, "flos": 66849401767680.0, "grad_norm": 0.9282030794038212, "language_loss": 0.65443593, "learning_rate": 1.9231002785122963e-06, "loss": 0.67457575, "num_input_tokens_seen": 188357665, "step": 8763, "time_per_iteration": 3.109525203704834 }, { "auxiliary_loss_clip": 0.01109455, "auxiliary_loss_mlp": 0.01030406, "balance_loss_clip": 1.04166603, "balance_loss_mlp": 1.01676226, "epoch": 0.5269201863820833, "flos": 17165552572800.0, "grad_norm": 1.6243900815433006, "language_loss": 0.71050072, "learning_rate": 1.922711106286265e-06, "loss": 0.73189938, "num_input_tokens_seen": 188376935, "step": 8764, "time_per_iteration": 4.168430328369141 }, { "auxiliary_loss_clip": 0.01080487, "auxiliary_loss_mlp": 0.01033821, "balance_loss_clip": 1.03809977, "balance_loss_mlp": 1.01832938, "epoch": 0.5269803096347513, "flos": 20522589672960.0, "grad_norm": 1.5962933914095123, "language_loss": 0.74318087, "learning_rate": 1.9223219369910368e-06, "loss": 0.76432389, "num_input_tokens_seen": 188394995, "step": 8765, "time_per_iteration": 2.7441658973693848 }, { "auxiliary_loss_clip": 0.01098499, "auxiliary_loss_mlp": 0.01037098, "balance_loss_clip": 1.03631091, "balance_loss_mlp": 1.02200055, "epoch": 0.5270404328874192, "flos": 27230989524480.0, "grad_norm": 1.60818818085183, "language_loss": 0.85403508, "learning_rate": 1.9219327706413677e-06, "loss": 0.87539107, "num_input_tokens_seen": 188415475, "step": 8766, "time_per_iteration": 2.7902116775512695 }, { "auxiliary_loss_clip": 0.0112556, "auxiliary_loss_mlp": 0.01039583, "balance_loss_clip": 1.0449605, "balance_loss_mlp": 1.02492046, "epoch": 0.5271005561400872, "flos": 23110491824640.0, "grad_norm": 1.780636206979604, "language_loss": 0.79070592, "learning_rate": 1.921543607252017e-06, "loss": 0.81235737, "num_input_tokens_seen": 188435665, "step": 8767, "time_per_iteration": 2.6986846923828125 }, { "auxiliary_loss_clip": 0.01114967, "auxiliary_loss_mlp": 0.01039357, "balance_loss_clip": 1.04406393, "balance_loss_mlp": 1.02407432, "epoch": 0.5271606793927551, "flos": 22564793427840.0, "grad_norm": 1.6576657234027676, "language_loss": 0.73513746, "learning_rate": 1.9211544468377394e-06, "loss": 0.75668073, "num_input_tokens_seen": 188455405, "step": 8768, "time_per_iteration": 2.695497989654541 }, { "auxiliary_loss_clip": 0.01092606, "auxiliary_loss_mlp": 0.01048135, "balance_loss_clip": 1.03795791, "balance_loss_mlp": 1.03445613, "epoch": 0.5272208026454231, "flos": 18764259102720.0, "grad_norm": 1.9012673693956994, "language_loss": 0.7428031, "learning_rate": 1.9207652894132933e-06, "loss": 0.76421046, "num_input_tokens_seen": 188472940, "step": 8769, "time_per_iteration": 2.7763235569000244 }, { "auxiliary_loss_clip": 0.01082308, "auxiliary_loss_mlp": 0.0104049, "balance_loss_clip": 1.03746688, "balance_loss_mlp": 1.02675128, "epoch": 0.5272809258980911, "flos": 20412164286720.0, "grad_norm": 1.8328085669464766, "language_loss": 0.7360974, "learning_rate": 1.920376134993436e-06, "loss": 0.75732535, "num_input_tokens_seen": 188493035, "step": 8770, "time_per_iteration": 2.7274930477142334 }, { "auxiliary_loss_clip": 0.011224, "auxiliary_loss_mlp": 0.01035685, "balance_loss_clip": 1.04366255, "balance_loss_mlp": 1.02199364, "epoch": 0.5273410491507591, "flos": 28256742213120.0, "grad_norm": 1.7661010025178618, "language_loss": 0.68258119, "learning_rate": 1.9199869835929224e-06, "loss": 0.704162, "num_input_tokens_seen": 188513860, "step": 8771, "time_per_iteration": 2.6751418113708496 }, { "auxiliary_loss_clip": 0.01109367, "auxiliary_loss_mlp": 0.01038799, "balance_loss_clip": 1.0429647, "balance_loss_mlp": 1.02500653, "epoch": 0.527401172403427, "flos": 22455158140800.0, "grad_norm": 1.9220412670697933, "language_loss": 0.76438117, "learning_rate": 1.9195978352265115e-06, "loss": 0.78586286, "num_input_tokens_seen": 188533345, "step": 8772, "time_per_iteration": 2.7865138053894043 }, { "auxiliary_loss_clip": 0.01107055, "auxiliary_loss_mlp": 0.01047604, "balance_loss_clip": 1.04159784, "balance_loss_mlp": 1.03290582, "epoch": 0.527461295656095, "flos": 21031084558080.0, "grad_norm": 2.1683746410962472, "language_loss": 0.65569091, "learning_rate": 1.9192086899089585e-06, "loss": 0.67723751, "num_input_tokens_seen": 188551550, "step": 8773, "time_per_iteration": 2.648556709289551 }, { "auxiliary_loss_clip": 0.01089634, "auxiliary_loss_mlp": 0.01040938, "balance_loss_clip": 1.04127073, "balance_loss_mlp": 1.02838576, "epoch": 0.5275214189087629, "flos": 26322018929280.0, "grad_norm": 1.7479537399696432, "language_loss": 0.85893595, "learning_rate": 1.91881954765502e-06, "loss": 0.88024169, "num_input_tokens_seen": 188571615, "step": 8774, "time_per_iteration": 2.8036038875579834 }, { "auxiliary_loss_clip": 0.01088366, "auxiliary_loss_mlp": 0.01035546, "balance_loss_clip": 1.03889024, "balance_loss_mlp": 1.02204525, "epoch": 0.5275815421614309, "flos": 20047024581120.0, "grad_norm": 1.657417688760408, "language_loss": 0.80199802, "learning_rate": 1.9184304084794523e-06, "loss": 0.82323706, "num_input_tokens_seen": 188591965, "step": 8775, "time_per_iteration": 2.7011687755584717 }, { "auxiliary_loss_clip": 0.01096581, "auxiliary_loss_mlp": 0.01042615, "balance_loss_clip": 1.03883219, "balance_loss_mlp": 1.02843523, "epoch": 0.5276416654140988, "flos": 21432206712960.0, "grad_norm": 1.7666023716485497, "language_loss": 0.83578467, "learning_rate": 1.918041272397012e-06, "loss": 0.85717654, "num_input_tokens_seen": 188610675, "step": 8776, "time_per_iteration": 2.6593801975250244 }, { "auxiliary_loss_clip": 0.01093105, "auxiliary_loss_mlp": 0.01036597, "balance_loss_clip": 1.04135871, "balance_loss_mlp": 1.0225482, "epoch": 0.5277017886667669, "flos": 17165085696000.0, "grad_norm": 1.7073238735749807, "language_loss": 0.67856812, "learning_rate": 1.9176521394224547e-06, "loss": 0.6998651, "num_input_tokens_seen": 188628235, "step": 8777, "time_per_iteration": 2.684119462966919 }, { "auxiliary_loss_clip": 0.01098291, "auxiliary_loss_mlp": 0.01042578, "balance_loss_clip": 1.0435065, "balance_loss_mlp": 1.02887487, "epoch": 0.5277619119194349, "flos": 20448146736000.0, "grad_norm": 1.6906001817136074, "language_loss": 0.8258512, "learning_rate": 1.9172630095705358e-06, "loss": 0.84725994, "num_input_tokens_seen": 188648925, "step": 8778, "time_per_iteration": 2.682415723800659 }, { "auxiliary_loss_clip": 0.01111904, "auxiliary_loss_mlp": 0.01042858, "balance_loss_clip": 1.04339361, "balance_loss_mlp": 1.02807617, "epoch": 0.5278220351721028, "flos": 24061083304320.0, "grad_norm": 2.7851808389493913, "language_loss": 0.79809994, "learning_rate": 1.916873882856013e-06, "loss": 0.81964755, "num_input_tokens_seen": 188668125, "step": 8779, "time_per_iteration": 2.6585779190063477 }, { "auxiliary_loss_clip": 0.01105817, "auxiliary_loss_mlp": 0.01036255, "balance_loss_clip": 1.04011083, "balance_loss_mlp": 1.02326131, "epoch": 0.5278821584247708, "flos": 24642907804800.0, "grad_norm": 2.3801118784221487, "language_loss": 0.76782715, "learning_rate": 1.9164847592936406e-06, "loss": 0.78924787, "num_input_tokens_seen": 188684410, "step": 8780, "time_per_iteration": 2.64528489112854 }, { "auxiliary_loss_clip": 0.01092369, "auxiliary_loss_mlp": 0.01031311, "balance_loss_clip": 1.04324102, "balance_loss_mlp": 1.01723862, "epoch": 0.5279422816774387, "flos": 35408244240000.0, "grad_norm": 1.6460087018057796, "language_loss": 0.7001918, "learning_rate": 1.916095638898174e-06, "loss": 0.72142857, "num_input_tokens_seen": 188706130, "step": 8781, "time_per_iteration": 2.8247299194335938 }, { "auxiliary_loss_clip": 0.01107498, "auxiliary_loss_mlp": 0.01040285, "balance_loss_clip": 1.04195011, "balance_loss_mlp": 1.02773809, "epoch": 0.5280024049301068, "flos": 22967028904320.0, "grad_norm": 1.5355974889681627, "language_loss": 0.72236538, "learning_rate": 1.9157065216843696e-06, "loss": 0.7438432, "num_input_tokens_seen": 188725030, "step": 8782, "time_per_iteration": 2.6150832176208496 }, { "auxiliary_loss_clip": 0.01090709, "auxiliary_loss_mlp": 0.01033496, "balance_loss_clip": 1.03973758, "balance_loss_mlp": 1.0204308, "epoch": 0.5280625281827747, "flos": 21507619317120.0, "grad_norm": 1.8366229943518229, "language_loss": 0.68489599, "learning_rate": 1.915317407666982e-06, "loss": 0.70613807, "num_input_tokens_seen": 188744325, "step": 8783, "time_per_iteration": 2.7228338718414307 }, { "auxiliary_loss_clip": 0.01120029, "auxiliary_loss_mlp": 0.01042206, "balance_loss_clip": 1.04475784, "balance_loss_mlp": 1.02599382, "epoch": 0.5281226514354427, "flos": 31208167958400.0, "grad_norm": 1.8621065563663965, "language_loss": 0.69557488, "learning_rate": 1.9149282968607674e-06, "loss": 0.71719718, "num_input_tokens_seen": 188765100, "step": 8784, "time_per_iteration": 2.756030797958374 }, { "auxiliary_loss_clip": 0.01124818, "auxiliary_loss_mlp": 0.01034258, "balance_loss_clip": 1.04128921, "balance_loss_mlp": 1.01935077, "epoch": 0.5281827746881106, "flos": 25077821679360.0, "grad_norm": 3.8002246773271238, "language_loss": 0.7503646, "learning_rate": 1.91453918928048e-06, "loss": 0.77195537, "num_input_tokens_seen": 188783995, "step": 8785, "time_per_iteration": 2.6486949920654297 }, { "auxiliary_loss_clip": 0.01110957, "auxiliary_loss_mlp": 0.01035187, "balance_loss_clip": 1.04315662, "balance_loss_mlp": 1.02070904, "epoch": 0.5282428979407786, "flos": 20631255292800.0, "grad_norm": 1.5855662273934061, "language_loss": 0.83260286, "learning_rate": 1.9141500849408745e-06, "loss": 0.85406423, "num_input_tokens_seen": 188803120, "step": 8786, "time_per_iteration": 2.6352970600128174 }, { "auxiliary_loss_clip": 0.01083443, "auxiliary_loss_mlp": 0.01025911, "balance_loss_clip": 1.04014111, "balance_loss_mlp": 1.0136745, "epoch": 0.5283030211934465, "flos": 22419391173120.0, "grad_norm": 2.305341017618089, "language_loss": 0.82486933, "learning_rate": 1.9137609838567076e-06, "loss": 0.84596282, "num_input_tokens_seen": 188820960, "step": 8787, "time_per_iteration": 2.712639570236206 }, { "auxiliary_loss_clip": 0.01066097, "auxiliary_loss_mlp": 0.01026546, "balance_loss_clip": 1.03866088, "balance_loss_mlp": 1.01387453, "epoch": 0.5283631444461145, "flos": 23615467176960.0, "grad_norm": 1.663088771358256, "language_loss": 0.83609009, "learning_rate": 1.9133718860427316e-06, "loss": 0.85701656, "num_input_tokens_seen": 188837165, "step": 8788, "time_per_iteration": 2.7158761024475098 }, { "auxiliary_loss_clip": 0.01087908, "auxiliary_loss_mlp": 0.01041692, "balance_loss_clip": 1.04602289, "balance_loss_mlp": 1.02696919, "epoch": 0.5284232676987825, "flos": 32671994918400.0, "grad_norm": 1.8980499308542007, "language_loss": 0.75046682, "learning_rate": 1.9129827915137027e-06, "loss": 0.77176291, "num_input_tokens_seen": 188858555, "step": 8789, "time_per_iteration": 2.806339979171753 }, { "auxiliary_loss_clip": 0.01113755, "auxiliary_loss_mlp": 0.01037056, "balance_loss_clip": 1.04411733, "balance_loss_mlp": 1.02322817, "epoch": 0.5284833909514505, "flos": 26760919213440.0, "grad_norm": 1.5263217177178625, "language_loss": 0.69562709, "learning_rate": 1.9125937002843754e-06, "loss": 0.71713525, "num_input_tokens_seen": 188879050, "step": 8790, "time_per_iteration": 2.701814651489258 }, { "auxiliary_loss_clip": 0.01117978, "auxiliary_loss_mlp": 0.01029171, "balance_loss_clip": 1.04194212, "balance_loss_mlp": 1.01685631, "epoch": 0.5285435142041185, "flos": 22090700793600.0, "grad_norm": 1.472851859989372, "language_loss": 0.79096156, "learning_rate": 1.9122046123695036e-06, "loss": 0.812433, "num_input_tokens_seen": 188898885, "step": 8791, "time_per_iteration": 2.609342575073242 }, { "auxiliary_loss_clip": 0.01063984, "auxiliary_loss_mlp": 0.01029869, "balance_loss_clip": 1.04006243, "balance_loss_mlp": 1.01632702, "epoch": 0.5286036374567864, "flos": 20375463565440.0, "grad_norm": 2.747278304747908, "language_loss": 0.66302419, "learning_rate": 1.9118155277838423e-06, "loss": 0.6839627, "num_input_tokens_seen": 188917225, "step": 8792, "time_per_iteration": 2.713622570037842 }, { "auxiliary_loss_clip": 0.01090251, "auxiliary_loss_mlp": 0.01040633, "balance_loss_clip": 1.03743482, "balance_loss_mlp": 1.02670956, "epoch": 0.5286637607094544, "flos": 24352175122560.0, "grad_norm": 1.9116255636929125, "language_loss": 0.79727674, "learning_rate": 1.9114264465421443e-06, "loss": 0.81858563, "num_input_tokens_seen": 188936120, "step": 8793, "time_per_iteration": 2.6645493507385254 }, { "auxiliary_loss_clip": 0.01121499, "auxiliary_loss_mlp": 0.01045468, "balance_loss_clip": 1.04323554, "balance_loss_mlp": 1.03118658, "epoch": 0.5287238839621223, "flos": 17271165536640.0, "grad_norm": 2.655732529836172, "language_loss": 0.84749115, "learning_rate": 1.9110373686591645e-06, "loss": 0.86916077, "num_input_tokens_seen": 188953405, "step": 8794, "time_per_iteration": 2.8306803703308105 }, { "auxiliary_loss_clip": 0.01097868, "auxiliary_loss_mlp": 0.0103516, "balance_loss_clip": 1.03908813, "balance_loss_mlp": 1.02062225, "epoch": 0.5287840072147904, "flos": 17566890209280.0, "grad_norm": 2.1997369626435894, "language_loss": 0.676875, "learning_rate": 1.9106482941496564e-06, "loss": 0.69820529, "num_input_tokens_seen": 188971150, "step": 8795, "time_per_iteration": 2.703134059906006 }, { "auxiliary_loss_clip": 0.01098455, "auxiliary_loss_mlp": 0.010334, "balance_loss_clip": 1.04339266, "balance_loss_mlp": 1.01989961, "epoch": 0.5288441304674583, "flos": 18552099421440.0, "grad_norm": 2.036052201037856, "language_loss": 0.80291003, "learning_rate": 1.910259223028374e-06, "loss": 0.82422858, "num_input_tokens_seen": 188989550, "step": 8796, "time_per_iteration": 2.6733570098876953 }, { "auxiliary_loss_clip": 0.01079591, "auxiliary_loss_mlp": 0.01043571, "balance_loss_clip": 1.03867388, "balance_loss_mlp": 1.02758455, "epoch": 0.5289042537201263, "flos": 20814507504000.0, "grad_norm": 1.5572831824692925, "language_loss": 0.69010925, "learning_rate": 1.909870155310071e-06, "loss": 0.71134079, "num_input_tokens_seen": 189008795, "step": 8797, "time_per_iteration": 4.254164934158325 }, { "auxiliary_loss_clip": 0.01101135, "auxiliary_loss_mlp": 0.01036632, "balance_loss_clip": 1.04237545, "balance_loss_mlp": 1.02374518, "epoch": 0.5289643769727942, "flos": 15735265937280.0, "grad_norm": 1.6872492204324914, "language_loss": 0.82684171, "learning_rate": 1.9094810910095005e-06, "loss": 0.84821934, "num_input_tokens_seen": 189025540, "step": 8798, "time_per_iteration": 2.7167000770568848 }, { "auxiliary_loss_clip": 0.01096424, "auxiliary_loss_mlp": 0.00774405, "balance_loss_clip": 1.03896332, "balance_loss_mlp": 1.00029516, "epoch": 0.5290245002254622, "flos": 19537308633600.0, "grad_norm": 1.9585595365508919, "language_loss": 0.70825863, "learning_rate": 1.9090920301414166e-06, "loss": 0.72696698, "num_input_tokens_seen": 189044885, "step": 8799, "time_per_iteration": 4.350652694702148 }, { "auxiliary_loss_clip": 0.01111399, "auxiliary_loss_mlp": 0.01038005, "balance_loss_clip": 1.04659581, "balance_loss_mlp": 1.02507114, "epoch": 0.5290846234781301, "flos": 15815131827840.0, "grad_norm": 2.2031970702340704, "language_loss": 0.69286144, "learning_rate": 1.9087029727205716e-06, "loss": 0.71435547, "num_input_tokens_seen": 189061280, "step": 8800, "time_per_iteration": 4.109759569168091 }, { "auxiliary_loss_clip": 0.01017957, "auxiliary_loss_mlp": 0.01037827, "balance_loss_clip": 1.01865292, "balance_loss_mlp": 1.03631306, "epoch": 0.5291447467307981, "flos": 70057624821120.0, "grad_norm": 0.9935539305247675, "language_loss": 0.56959099, "learning_rate": 1.9083139187617193e-06, "loss": 0.59014881, "num_input_tokens_seen": 189114775, "step": 8801, "time_per_iteration": 3.1419920921325684 }, { "auxiliary_loss_clip": 0.01110756, "auxiliary_loss_mlp": 0.01036206, "balance_loss_clip": 1.04886377, "balance_loss_mlp": 1.02271795, "epoch": 0.529204869983466, "flos": 28364186770560.0, "grad_norm": 1.5688016044474997, "language_loss": 0.6425091, "learning_rate": 1.9079248682796123e-06, "loss": 0.6639787, "num_input_tokens_seen": 189134700, "step": 8802, "time_per_iteration": 2.7467000484466553 }, { "auxiliary_loss_clip": 0.01101463, "auxiliary_loss_mlp": 0.01031341, "balance_loss_clip": 1.04380429, "balance_loss_mlp": 1.01772761, "epoch": 0.5292649932361341, "flos": 33758830684800.0, "grad_norm": 3.351871019760029, "language_loss": 0.69098222, "learning_rate": 1.907535821289003e-06, "loss": 0.71231019, "num_input_tokens_seen": 189155365, "step": 8803, "time_per_iteration": 4.278867721557617 }, { "auxiliary_loss_clip": 0.01106005, "auxiliary_loss_mlp": 0.00770288, "balance_loss_clip": 1.04076648, "balance_loss_mlp": 1.00028872, "epoch": 0.5293251164888021, "flos": 20447679859200.0, "grad_norm": 1.7989646267917587, "language_loss": 0.76156348, "learning_rate": 1.9071467778046458e-06, "loss": 0.78032649, "num_input_tokens_seen": 189173885, "step": 8804, "time_per_iteration": 2.683661699295044 }, { "auxiliary_loss_clip": 0.01032487, "auxiliary_loss_mlp": 0.01019664, "balance_loss_clip": 1.01553822, "balance_loss_mlp": 1.01836514, "epoch": 0.52938523974147, "flos": 66545312204160.0, "grad_norm": 0.7526453486337231, "language_loss": 0.5290755, "learning_rate": 1.906757737841291e-06, "loss": 0.54959702, "num_input_tokens_seen": 189236515, "step": 8805, "time_per_iteration": 3.243603467941284 }, { "auxiliary_loss_clip": 0.0103203, "auxiliary_loss_mlp": 0.01016047, "balance_loss_clip": 1.01495409, "balance_loss_mlp": 1.01418769, "epoch": 0.529445362994138, "flos": 67151734542720.0, "grad_norm": 0.7522317031499139, "language_loss": 0.6378004, "learning_rate": 1.906368701413693e-06, "loss": 0.65828121, "num_input_tokens_seen": 189300500, "step": 8806, "time_per_iteration": 3.185899257659912 }, { "auxiliary_loss_clip": 0.01112977, "auxiliary_loss_mlp": 0.01034283, "balance_loss_clip": 1.04236031, "balance_loss_mlp": 1.02053213, "epoch": 0.5295054862468059, "flos": 17749316407680.0, "grad_norm": 1.5696878511475738, "language_loss": 0.72756052, "learning_rate": 1.9059796685366026e-06, "loss": 0.74903309, "num_input_tokens_seen": 189319745, "step": 8807, "time_per_iteration": 2.652667284011841 }, { "auxiliary_loss_clip": 0.01079975, "auxiliary_loss_mlp": 0.01030639, "balance_loss_clip": 1.04053009, "balance_loss_mlp": 1.01760888, "epoch": 0.529565609499474, "flos": 11397401084160.0, "grad_norm": 2.191041401806776, "language_loss": 0.69626606, "learning_rate": 1.9055906392247723e-06, "loss": 0.71737224, "num_input_tokens_seen": 189334550, "step": 8808, "time_per_iteration": 2.6991183757781982 }, { "auxiliary_loss_clip": 0.01109251, "auxiliary_loss_mlp": 0.01032489, "balance_loss_clip": 1.041991, "balance_loss_mlp": 1.01962066, "epoch": 0.5296257327521419, "flos": 17196363463680.0, "grad_norm": 1.8261828078243632, "language_loss": 0.8653447, "learning_rate": 1.9052016134929554e-06, "loss": 0.88676214, "num_input_tokens_seen": 189351735, "step": 8809, "time_per_iteration": 2.5995731353759766 }, { "auxiliary_loss_clip": 0.0111469, "auxiliary_loss_mlp": 0.01041403, "balance_loss_clip": 1.04281509, "balance_loss_mlp": 1.02607894, "epoch": 0.5296858560048099, "flos": 39964086777600.0, "grad_norm": 1.9222242916722383, "language_loss": 0.64388674, "learning_rate": 1.9048125913559016e-06, "loss": 0.66544765, "num_input_tokens_seen": 189373105, "step": 8810, "time_per_iteration": 2.776230573654175 }, { "auxiliary_loss_clip": 0.01119011, "auxiliary_loss_mlp": 0.01038636, "balance_loss_clip": 1.04296374, "balance_loss_mlp": 1.02509344, "epoch": 0.5297459792574778, "flos": 20961418129920.0, "grad_norm": 1.8063937788931883, "language_loss": 0.68213391, "learning_rate": 1.9044235728283646e-06, "loss": 0.70371044, "num_input_tokens_seen": 189394615, "step": 8811, "time_per_iteration": 2.684617757797241 }, { "auxiliary_loss_clip": 0.01007367, "auxiliary_loss_mlp": 0.0100546, "balance_loss_clip": 1.01854634, "balance_loss_mlp": 1.00402915, "epoch": 0.5298061025101458, "flos": 66523620389760.0, "grad_norm": 0.689972629111167, "language_loss": 0.53345251, "learning_rate": 1.9040345579250953e-06, "loss": 0.55358076, "num_input_tokens_seen": 189459750, "step": 8812, "time_per_iteration": 3.3905134201049805 }, { "auxiliary_loss_clip": 0.01023218, "auxiliary_loss_mlp": 0.01004548, "balance_loss_clip": 1.01716316, "balance_loss_mlp": 1.00321257, "epoch": 0.5298662257628137, "flos": 67662994775040.0, "grad_norm": 0.7359658604916758, "language_loss": 0.56288284, "learning_rate": 1.9036455466608453e-06, "loss": 0.58316052, "num_input_tokens_seen": 189527540, "step": 8813, "time_per_iteration": 3.2840702533721924 }, { "auxiliary_loss_clip": 0.01064136, "auxiliary_loss_mlp": 0.01033208, "balance_loss_clip": 1.0387466, "balance_loss_mlp": 1.01986289, "epoch": 0.5299263490154817, "flos": 19646405216640.0, "grad_norm": 1.8723589062576662, "language_loss": 0.81484783, "learning_rate": 1.9032565390503657e-06, "loss": 0.83582127, "num_input_tokens_seen": 189546900, "step": 8814, "time_per_iteration": 2.7889370918273926 }, { "auxiliary_loss_clip": 0.01129463, "auxiliary_loss_mlp": 0.01035706, "balance_loss_clip": 1.04835963, "balance_loss_mlp": 1.02225351, "epoch": 0.5299864722681497, "flos": 22055005653120.0, "grad_norm": 1.8736963674991467, "language_loss": 0.85159796, "learning_rate": 1.9028675351084076e-06, "loss": 0.87324965, "num_input_tokens_seen": 189566490, "step": 8815, "time_per_iteration": 2.588376998901367 }, { "auxiliary_loss_clip": 0.01119356, "auxiliary_loss_mlp": 0.01030819, "balance_loss_clip": 1.04443836, "balance_loss_mlp": 1.01802766, "epoch": 0.5300465955208177, "flos": 21763698353280.0, "grad_norm": 2.360835312755498, "language_loss": 0.66173548, "learning_rate": 1.9024785348497225e-06, "loss": 0.6832372, "num_input_tokens_seen": 189585580, "step": 8816, "time_per_iteration": 2.6367204189300537 }, { "auxiliary_loss_clip": 0.01098885, "auxiliary_loss_mlp": 0.01037316, "balance_loss_clip": 1.04165578, "balance_loss_mlp": 1.02370238, "epoch": 0.5301067187734857, "flos": 42996491735040.0, "grad_norm": 1.8428826452353317, "language_loss": 0.72204578, "learning_rate": 1.9020895382890611e-06, "loss": 0.74340779, "num_input_tokens_seen": 189608485, "step": 8817, "time_per_iteration": 2.8511815071105957 }, { "auxiliary_loss_clip": 0.01093351, "auxiliary_loss_mlp": 0.0103512, "balance_loss_clip": 1.03981018, "balance_loss_mlp": 1.01959896, "epoch": 0.5301668420261536, "flos": 20554298403840.0, "grad_norm": 1.7783802077805728, "language_loss": 0.65400332, "learning_rate": 1.9017005454411743e-06, "loss": 0.67528808, "num_input_tokens_seen": 189627815, "step": 8818, "time_per_iteration": 2.757228374481201 }, { "auxiliary_loss_clip": 0.01075022, "auxiliary_loss_mlp": 0.01033272, "balance_loss_clip": 1.04101062, "balance_loss_mlp": 1.01816273, "epoch": 0.5302269652788216, "flos": 17486665182720.0, "grad_norm": 1.8529738404346974, "language_loss": 0.75020683, "learning_rate": 1.9013115563208126e-06, "loss": 0.77128971, "num_input_tokens_seen": 189644850, "step": 8819, "time_per_iteration": 2.7458016872406006 }, { "auxiliary_loss_clip": 0.01088004, "auxiliary_loss_mlp": 0.01047287, "balance_loss_clip": 1.04190588, "balance_loss_mlp": 1.03143191, "epoch": 0.5302870885314895, "flos": 14574202715520.0, "grad_norm": 2.236781046268797, "language_loss": 0.81955135, "learning_rate": 1.9009225709427267e-06, "loss": 0.84090424, "num_input_tokens_seen": 189660945, "step": 8820, "time_per_iteration": 2.7917025089263916 }, { "auxiliary_loss_clip": 0.01101102, "auxiliary_loss_mlp": 0.01034433, "balance_loss_clip": 1.04137421, "balance_loss_mlp": 1.02192223, "epoch": 0.5303472117841576, "flos": 23438032968960.0, "grad_norm": 1.5105877277652986, "language_loss": 0.72733676, "learning_rate": 1.9005335893216667e-06, "loss": 0.74869215, "num_input_tokens_seen": 189680425, "step": 8821, "time_per_iteration": 2.664912462234497 }, { "auxiliary_loss_clip": 0.01092575, "auxiliary_loss_mlp": 0.01032249, "balance_loss_clip": 1.04237318, "balance_loss_mlp": 1.01958346, "epoch": 0.5304073350368255, "flos": 22709010533760.0, "grad_norm": 1.4432589414019072, "language_loss": 0.74112785, "learning_rate": 1.9001446114723824e-06, "loss": 0.76237607, "num_input_tokens_seen": 189700375, "step": 8822, "time_per_iteration": 2.7494471073150635 }, { "auxiliary_loss_clip": 0.01087967, "auxiliary_loss_mlp": 0.01034985, "balance_loss_clip": 1.03945005, "balance_loss_mlp": 1.02029884, "epoch": 0.5304674582894935, "flos": 27928554624000.0, "grad_norm": 1.6561028390766985, "language_loss": 0.67739707, "learning_rate": 1.8997556374096257e-06, "loss": 0.69862658, "num_input_tokens_seen": 189721225, "step": 8823, "time_per_iteration": 2.8298280239105225 }, { "auxiliary_loss_clip": 0.01127487, "auxiliary_loss_mlp": 0.01042695, "balance_loss_clip": 1.0455004, "balance_loss_mlp": 1.02722192, "epoch": 0.5305275815421614, "flos": 21250642440960.0, "grad_norm": 1.7679489191905855, "language_loss": 0.69459474, "learning_rate": 1.8993666671481444e-06, "loss": 0.71629655, "num_input_tokens_seen": 189740170, "step": 8824, "time_per_iteration": 2.7093706130981445 }, { "auxiliary_loss_clip": 0.01098459, "auxiliary_loss_mlp": 0.00770579, "balance_loss_clip": 1.04351103, "balance_loss_mlp": 1.00028551, "epoch": 0.5305877047948294, "flos": 17603088140160.0, "grad_norm": 2.079936946962719, "language_loss": 0.7578221, "learning_rate": 1.898977700702689e-06, "loss": 0.77651244, "num_input_tokens_seen": 189757890, "step": 8825, "time_per_iteration": 2.7240397930145264 }, { "auxiliary_loss_clip": 0.01042177, "auxiliary_loss_mlp": 0.01041743, "balance_loss_clip": 1.03510904, "balance_loss_mlp": 1.02771175, "epoch": 0.5306478280474973, "flos": 15195493284480.0, "grad_norm": 1.902170532497994, "language_loss": 0.85671568, "learning_rate": 1.8985887380880103e-06, "loss": 0.87755489, "num_input_tokens_seen": 189775390, "step": 8826, "time_per_iteration": 2.786893367767334 }, { "auxiliary_loss_clip": 0.0112111, "auxiliary_loss_mlp": 0.0103337, "balance_loss_clip": 1.04376101, "balance_loss_mlp": 1.01967907, "epoch": 0.5307079513001653, "flos": 15341218761600.0, "grad_norm": 1.3295158202050776, "language_loss": 0.64655942, "learning_rate": 1.8981997793188558e-06, "loss": 0.66810423, "num_input_tokens_seen": 189793975, "step": 8827, "time_per_iteration": 2.650259017944336 }, { "auxiliary_loss_clip": 0.01100521, "auxiliary_loss_mlp": 0.01041689, "balance_loss_clip": 1.04230511, "balance_loss_mlp": 1.02720535, "epoch": 0.5307680745528333, "flos": 43544452688640.0, "grad_norm": 1.5763280036459053, "language_loss": 0.60055244, "learning_rate": 1.8978108244099762e-06, "loss": 0.62197453, "num_input_tokens_seen": 189817870, "step": 8828, "time_per_iteration": 2.9273712635040283 }, { "auxiliary_loss_clip": 0.01115165, "auxiliary_loss_mlp": 0.01032955, "balance_loss_clip": 1.04400516, "balance_loss_mlp": 1.01779199, "epoch": 0.5308281978055013, "flos": 20048928001920.0, "grad_norm": 1.6623375431972864, "language_loss": 0.81171465, "learning_rate": 1.8974218733761208e-06, "loss": 0.83319587, "num_input_tokens_seen": 189837905, "step": 8829, "time_per_iteration": 2.6640090942382812 }, { "auxiliary_loss_clip": 0.01104846, "auxiliary_loss_mlp": 0.01035043, "balance_loss_clip": 1.043993, "balance_loss_mlp": 1.02136946, "epoch": 0.5308883210581693, "flos": 20703938463360.0, "grad_norm": 1.3895948203919835, "language_loss": 0.78245443, "learning_rate": 1.8970329262320375e-06, "loss": 0.80385327, "num_input_tokens_seen": 189856970, "step": 8830, "time_per_iteration": 2.736316680908203 }, { "auxiliary_loss_clip": 0.01111385, "auxiliary_loss_mlp": 0.01033264, "balance_loss_clip": 1.04335451, "balance_loss_mlp": 1.02036524, "epoch": 0.5309484443108372, "flos": 14355506759040.0, "grad_norm": 2.4391763831493165, "language_loss": 0.8031435, "learning_rate": 1.8966439829924768e-06, "loss": 0.82458997, "num_input_tokens_seen": 189872830, "step": 8831, "time_per_iteration": 2.6151957511901855 }, { "auxiliary_loss_clip": 0.01108777, "auxiliary_loss_mlp": 0.01032917, "balance_loss_clip": 1.0430057, "balance_loss_mlp": 1.01951742, "epoch": 0.5310085675635052, "flos": 20010503427840.0, "grad_norm": 4.592110703983282, "language_loss": 0.73025942, "learning_rate": 1.896255043672186e-06, "loss": 0.75167632, "num_input_tokens_seen": 189891635, "step": 8832, "time_per_iteration": 2.6464226245880127 }, { "auxiliary_loss_clip": 0.01089691, "auxiliary_loss_mlp": 0.01036866, "balance_loss_clip": 1.04126275, "balance_loss_mlp": 1.02198887, "epoch": 0.5310686908161731, "flos": 22127293774080.0, "grad_norm": 2.4188792138763513, "language_loss": 0.75694382, "learning_rate": 1.8958661082859143e-06, "loss": 0.77820939, "num_input_tokens_seen": 189909050, "step": 8833, "time_per_iteration": 2.757716178894043 }, { "auxiliary_loss_clip": 0.01087272, "auxiliary_loss_mlp": 0.01036493, "balance_loss_clip": 1.03743505, "balance_loss_mlp": 1.02260494, "epoch": 0.5311288140688412, "flos": 24717889445760.0, "grad_norm": 1.6684529348681687, "language_loss": 0.73618537, "learning_rate": 1.8954771768484103e-06, "loss": 0.75742298, "num_input_tokens_seen": 189927405, "step": 8834, "time_per_iteration": 2.7447376251220703 }, { "auxiliary_loss_clip": 0.01127832, "auxiliary_loss_mlp": 0.01042563, "balance_loss_clip": 1.04435921, "balance_loss_mlp": 1.02734029, "epoch": 0.5311889373215091, "flos": 24097712198400.0, "grad_norm": 1.9940250251862053, "language_loss": 0.77417272, "learning_rate": 1.8950882493744226e-06, "loss": 0.79587668, "num_input_tokens_seen": 189947740, "step": 8835, "time_per_iteration": 2.654860734939575 }, { "auxiliary_loss_clip": 0.01097251, "auxiliary_loss_mlp": 0.01046402, "balance_loss_clip": 1.04259109, "balance_loss_mlp": 1.03138208, "epoch": 0.5312490605741771, "flos": 22017012042240.0, "grad_norm": 2.4706637723930505, "language_loss": 0.72355223, "learning_rate": 1.8946993258786985e-06, "loss": 0.7449888, "num_input_tokens_seen": 189966495, "step": 8836, "time_per_iteration": 2.694772243499756 }, { "auxiliary_loss_clip": 0.01104585, "auxiliary_loss_mlp": 0.01040344, "balance_loss_clip": 1.04374099, "balance_loss_mlp": 1.02537167, "epoch": 0.531309183826845, "flos": 19390541662080.0, "grad_norm": 1.705704926785557, "language_loss": 0.81026083, "learning_rate": 1.894310406375987e-06, "loss": 0.8317101, "num_input_tokens_seen": 189985325, "step": 8837, "time_per_iteration": 4.218893527984619 }, { "auxiliary_loss_clip": 0.01107393, "auxiliary_loss_mlp": 0.01036209, "balance_loss_clip": 1.04489708, "balance_loss_mlp": 1.02216005, "epoch": 0.531369307079513, "flos": 20190056538240.0, "grad_norm": 1.8031911656804687, "language_loss": 0.8618502, "learning_rate": 1.893921490881035e-06, "loss": 0.88328624, "num_input_tokens_seen": 190003290, "step": 8838, "time_per_iteration": 4.327972888946533 }, { "auxiliary_loss_clip": 0.01097617, "auxiliary_loss_mlp": 0.01036447, "balance_loss_clip": 1.04136765, "balance_loss_mlp": 1.02366185, "epoch": 0.5314294303321809, "flos": 18880143356160.0, "grad_norm": 1.7768925166398193, "language_loss": 0.72961235, "learning_rate": 1.8935325794085906e-06, "loss": 0.75095296, "num_input_tokens_seen": 190023260, "step": 8839, "time_per_iteration": 4.2734081745147705 }, { "auxiliary_loss_clip": 0.0110159, "auxiliary_loss_mlp": 0.01042278, "balance_loss_clip": 1.04086304, "balance_loss_mlp": 1.02885473, "epoch": 0.531489553584849, "flos": 23040035297280.0, "grad_norm": 1.7238696185302183, "language_loss": 0.76902539, "learning_rate": 1.8931436719734023e-06, "loss": 0.79046404, "num_input_tokens_seen": 190042035, "step": 8840, "time_per_iteration": 2.708387613296509 }, { "auxiliary_loss_clip": 0.01085488, "auxiliary_loss_mlp": 0.01033907, "balance_loss_clip": 1.04072022, "balance_loss_mlp": 1.01934612, "epoch": 0.5315496768375169, "flos": 19790478668160.0, "grad_norm": 2.0047240823259385, "language_loss": 0.77301592, "learning_rate": 1.892754768590216e-06, "loss": 0.7942099, "num_input_tokens_seen": 190057545, "step": 8841, "time_per_iteration": 2.6982758045196533 }, { "auxiliary_loss_clip": 0.0102526, "auxiliary_loss_mlp": 0.01022764, "balance_loss_clip": 1.01826656, "balance_loss_mlp": 1.02119017, "epoch": 0.5316098000901849, "flos": 71023228185600.0, "grad_norm": 0.6981779601463162, "language_loss": 0.56741858, "learning_rate": 1.8923658692737793e-06, "loss": 0.58789885, "num_input_tokens_seen": 190123800, "step": 8842, "time_per_iteration": 4.895024299621582 }, { "auxiliary_loss_clip": 0.01102673, "auxiliary_loss_mlp": 0.01041259, "balance_loss_clip": 1.04331183, "balance_loss_mlp": 1.02621484, "epoch": 0.5316699233428529, "flos": 16435560470400.0, "grad_norm": 1.8735975877067965, "language_loss": 0.73998511, "learning_rate": 1.8919769740388407e-06, "loss": 0.76142448, "num_input_tokens_seen": 190141625, "step": 8843, "time_per_iteration": 2.66169810295105 }, { "auxiliary_loss_clip": 0.01023627, "auxiliary_loss_mlp": 0.0100589, "balance_loss_clip": 1.01690733, "balance_loss_mlp": 1.00456095, "epoch": 0.5317300465955208, "flos": 67420814302080.0, "grad_norm": 0.8814346849515853, "language_loss": 0.61057651, "learning_rate": 1.891588082900145e-06, "loss": 0.63087165, "num_input_tokens_seen": 190198110, "step": 8844, "time_per_iteration": 3.297545909881592 }, { "auxiliary_loss_clip": 0.01032752, "auxiliary_loss_mlp": 0.01005725, "balance_loss_clip": 1.01528263, "balance_loss_mlp": 1.00425863, "epoch": 0.5317901698481888, "flos": 59508075340800.0, "grad_norm": 0.8422745451421196, "language_loss": 0.62147105, "learning_rate": 1.8911991958724411e-06, "loss": 0.64185584, "num_input_tokens_seen": 190259950, "step": 8845, "time_per_iteration": 3.1747312545776367 }, { "auxiliary_loss_clip": 0.01088974, "auxiliary_loss_mlp": 0.01040872, "balance_loss_clip": 1.04063165, "balance_loss_mlp": 1.02521944, "epoch": 0.5318502931008567, "flos": 19129219240320.0, "grad_norm": 1.8386701394288745, "language_loss": 0.74980247, "learning_rate": 1.890810312970474e-06, "loss": 0.77110094, "num_input_tokens_seen": 190278265, "step": 8846, "time_per_iteration": 2.734652519226074 }, { "auxiliary_loss_clip": 0.01111858, "auxiliary_loss_mlp": 0.01034985, "balance_loss_clip": 1.04369533, "balance_loss_mlp": 1.0226109, "epoch": 0.5319104163535248, "flos": 24681045070080.0, "grad_norm": 1.562458752543025, "language_loss": 0.75478411, "learning_rate": 1.8904214342089903e-06, "loss": 0.77625251, "num_input_tokens_seen": 190298400, "step": 8847, "time_per_iteration": 2.7175981998443604 }, { "auxiliary_loss_clip": 0.0110005, "auxiliary_loss_mlp": 0.01032122, "balance_loss_clip": 1.04175198, "balance_loss_mlp": 1.0193609, "epoch": 0.5319705396061927, "flos": 19385513758080.0, "grad_norm": 1.5938668259379032, "language_loss": 0.87875456, "learning_rate": 1.8900325596027378e-06, "loss": 0.90007627, "num_input_tokens_seen": 190316235, "step": 8848, "time_per_iteration": 2.777731418609619 }, { "auxiliary_loss_clip": 0.01084561, "auxiliary_loss_mlp": 0.01041363, "balance_loss_clip": 1.04119325, "balance_loss_mlp": 1.02549624, "epoch": 0.5320306628588607, "flos": 18259319664000.0, "grad_norm": 2.1051582434291833, "language_loss": 0.74326992, "learning_rate": 1.8896436891664609e-06, "loss": 0.76452917, "num_input_tokens_seen": 190335060, "step": 8849, "time_per_iteration": 2.7248313426971436 }, { "auxiliary_loss_clip": 0.01107496, "auxiliary_loss_mlp": 0.01030316, "balance_loss_clip": 1.03895473, "balance_loss_mlp": 1.0154624, "epoch": 0.5320907861115286, "flos": 23732321097600.0, "grad_norm": 1.8915242874982603, "language_loss": 0.79657137, "learning_rate": 1.8892548229149066e-06, "loss": 0.81794947, "num_input_tokens_seen": 190353265, "step": 8850, "time_per_iteration": 2.7357401847839355 }, { "auxiliary_loss_clip": 0.01121659, "auxiliary_loss_mlp": 0.01031858, "balance_loss_clip": 1.04192996, "balance_loss_mlp": 1.01804209, "epoch": 0.5321509093641966, "flos": 34495251321600.0, "grad_norm": 1.633301633467878, "language_loss": 0.55076206, "learning_rate": 1.888865960862821e-06, "loss": 0.57229722, "num_input_tokens_seen": 190376575, "step": 8851, "time_per_iteration": 2.730081081390381 }, { "auxiliary_loss_clip": 0.01110617, "auxiliary_loss_mlp": 0.01036207, "balance_loss_clip": 1.04243159, "balance_loss_mlp": 1.0228914, "epoch": 0.5322110326168645, "flos": 20010934391040.0, "grad_norm": 1.5393101812132837, "language_loss": 0.68206942, "learning_rate": 1.8884771030249484e-06, "loss": 0.70353764, "num_input_tokens_seen": 190395185, "step": 8852, "time_per_iteration": 2.685267925262451 }, { "auxiliary_loss_clip": 0.01020981, "auxiliary_loss_mlp": 0.00752764, "balance_loss_clip": 1.01425028, "balance_loss_mlp": 0.99977398, "epoch": 0.5322711558695326, "flos": 64631164435200.0, "grad_norm": 0.7921902417648442, "language_loss": 0.62794167, "learning_rate": 1.8880882494160357e-06, "loss": 0.64567912, "num_input_tokens_seen": 190452595, "step": 8853, "time_per_iteration": 3.154197931289673 }, { "auxiliary_loss_clip": 0.01113411, "auxiliary_loss_mlp": 0.01027799, "balance_loss_clip": 1.04064846, "balance_loss_mlp": 1.01379788, "epoch": 0.5323312791222005, "flos": 14939342421120.0, "grad_norm": 2.437651920606879, "language_loss": 0.79789698, "learning_rate": 1.8876994000508278e-06, "loss": 0.81930912, "num_input_tokens_seen": 190469140, "step": 8854, "time_per_iteration": 2.6569535732269287 }, { "auxiliary_loss_clip": 0.01092841, "auxiliary_loss_mlp": 0.0102808, "balance_loss_clip": 1.0418992, "balance_loss_mlp": 1.01586115, "epoch": 0.5323914023748685, "flos": 23440834229760.0, "grad_norm": 1.7182223658194644, "language_loss": 0.73290253, "learning_rate": 1.8873105549440698e-06, "loss": 0.75411177, "num_input_tokens_seen": 190489015, "step": 8855, "time_per_iteration": 2.6984002590179443 }, { "auxiliary_loss_clip": 0.01095667, "auxiliary_loss_mlp": 0.0077104, "balance_loss_clip": 1.03969502, "balance_loss_mlp": 1.00030267, "epoch": 0.5324515256275365, "flos": 26286180134400.0, "grad_norm": 1.9960339019119333, "language_loss": 0.6505388, "learning_rate": 1.886921714110507e-06, "loss": 0.66920584, "num_input_tokens_seen": 190508065, "step": 8856, "time_per_iteration": 2.7057278156280518 }, { "auxiliary_loss_clip": 0.01100444, "auxiliary_loss_mlp": 0.0103908, "balance_loss_clip": 1.04079795, "balance_loss_mlp": 1.02341616, "epoch": 0.5325116488802044, "flos": 26870913636480.0, "grad_norm": 2.078757662178109, "language_loss": 0.77651089, "learning_rate": 1.8865328775648842e-06, "loss": 0.79790616, "num_input_tokens_seen": 190527045, "step": 8857, "time_per_iteration": 2.764199733734131 }, { "auxiliary_loss_clip": 0.01092407, "auxiliary_loss_mlp": 0.01034608, "balance_loss_clip": 1.04279578, "balance_loss_mlp": 1.02039194, "epoch": 0.5325717721328724, "flos": 25884734757120.0, "grad_norm": 2.3746118235231592, "language_loss": 0.70823711, "learning_rate": 1.8861440453219456e-06, "loss": 0.72950727, "num_input_tokens_seen": 190544075, "step": 8858, "time_per_iteration": 2.735534191131592 }, { "auxiliary_loss_clip": 0.01108427, "auxiliary_loss_mlp": 0.01040186, "balance_loss_clip": 1.0411067, "balance_loss_mlp": 1.02518916, "epoch": 0.5326318953855403, "flos": 21799321666560.0, "grad_norm": 1.83105211007431, "language_loss": 0.69232476, "learning_rate": 1.8857552173964367e-06, "loss": 0.71381092, "num_input_tokens_seen": 190566030, "step": 8859, "time_per_iteration": 2.773764133453369 }, { "auxiliary_loss_clip": 0.01109944, "auxiliary_loss_mlp": 0.01028838, "balance_loss_clip": 1.04517436, "balance_loss_mlp": 1.01671481, "epoch": 0.5326920186382084, "flos": 20922921728640.0, "grad_norm": 1.8423028887831514, "language_loss": 0.69617528, "learning_rate": 1.8853663938031013e-06, "loss": 0.71756315, "num_input_tokens_seen": 190585605, "step": 8860, "time_per_iteration": 2.689471483230591 }, { "auxiliary_loss_clip": 0.01102885, "auxiliary_loss_mlp": 0.01035827, "balance_loss_clip": 1.0451107, "balance_loss_mlp": 1.02258921, "epoch": 0.5327521418908763, "flos": 21433427775360.0, "grad_norm": 2.3281195979693297, "language_loss": 0.78340018, "learning_rate": 1.884977574556683e-06, "loss": 0.80478734, "num_input_tokens_seen": 190604625, "step": 8861, "time_per_iteration": 2.66679048538208 }, { "auxiliary_loss_clip": 0.01077125, "auxiliary_loss_mlp": 0.01040454, "balance_loss_clip": 1.03987145, "balance_loss_mlp": 1.02606571, "epoch": 0.5328122651435443, "flos": 21760250647680.0, "grad_norm": 1.7664447291359346, "language_loss": 0.85554659, "learning_rate": 1.8845887596719279e-06, "loss": 0.87672234, "num_input_tokens_seen": 190625060, "step": 8862, "time_per_iteration": 2.7928006649017334 }, { "auxiliary_loss_clip": 0.0109879, "auxiliary_loss_mlp": 0.01039782, "balance_loss_clip": 1.03952289, "balance_loss_mlp": 1.0237242, "epoch": 0.5328723883962122, "flos": 18296487262080.0, "grad_norm": 2.2696975914187116, "language_loss": 0.62147439, "learning_rate": 1.8841999491635778e-06, "loss": 0.64286011, "num_input_tokens_seen": 190643150, "step": 8863, "time_per_iteration": 2.685253381729126 }, { "auxiliary_loss_clip": 0.01098767, "auxiliary_loss_mlp": 0.01040661, "balance_loss_clip": 1.04511809, "balance_loss_mlp": 1.02661765, "epoch": 0.5329325116488802, "flos": 25374911068800.0, "grad_norm": 1.8529881391436633, "language_loss": 0.73310483, "learning_rate": 1.883811143046377e-06, "loss": 0.75449914, "num_input_tokens_seen": 190662725, "step": 8864, "time_per_iteration": 2.703639030456543 }, { "auxiliary_loss_clip": 0.01120661, "auxiliary_loss_mlp": 0.01035736, "balance_loss_clip": 1.04301071, "balance_loss_mlp": 1.02275968, "epoch": 0.5329926349015481, "flos": 25592098654080.0, "grad_norm": 1.6333657309737846, "language_loss": 0.64201105, "learning_rate": 1.8834223413350702e-06, "loss": 0.66357499, "num_input_tokens_seen": 190683680, "step": 8865, "time_per_iteration": 2.691087245941162 }, { "auxiliary_loss_clip": 0.01113033, "auxiliary_loss_mlp": 0.01029706, "balance_loss_clip": 1.0424211, "balance_loss_mlp": 1.01641965, "epoch": 0.5330527581542162, "flos": 22889605138560.0, "grad_norm": 3.0767575494694985, "language_loss": 0.78091645, "learning_rate": 1.8830335440443989e-06, "loss": 0.80234385, "num_input_tokens_seen": 190703350, "step": 8866, "time_per_iteration": 2.674612283706665 }, { "auxiliary_loss_clip": 0.01108068, "auxiliary_loss_mlp": 0.0103023, "balance_loss_clip": 1.04092908, "balance_loss_mlp": 1.01696241, "epoch": 0.5331128814068841, "flos": 16026752805120.0, "grad_norm": 1.842224927457961, "language_loss": 0.73840493, "learning_rate": 1.882644751189108e-06, "loss": 0.75978798, "num_input_tokens_seen": 190721170, "step": 8867, "time_per_iteration": 2.6963648796081543 }, { "auxiliary_loss_clip": 0.01098718, "auxiliary_loss_mlp": 0.01039247, "balance_loss_clip": 1.04040504, "balance_loss_mlp": 1.02402985, "epoch": 0.5331730046595521, "flos": 39344699629440.0, "grad_norm": 1.5703549780422514, "language_loss": 0.71881396, "learning_rate": 1.88225596278394e-06, "loss": 0.74019361, "num_input_tokens_seen": 190743795, "step": 8868, "time_per_iteration": 2.830118417739868 }, { "auxiliary_loss_clip": 0.01090763, "auxiliary_loss_mlp": 0.01034625, "balance_loss_clip": 1.04197335, "balance_loss_mlp": 1.0212791, "epoch": 0.5332331279122201, "flos": 24024382583040.0, "grad_norm": 5.550281122060094, "language_loss": 0.78397369, "learning_rate": 1.881867178843637e-06, "loss": 0.80522758, "num_input_tokens_seen": 190761560, "step": 8869, "time_per_iteration": 2.738565444946289 }, { "auxiliary_loss_clip": 0.01114623, "auxiliary_loss_mlp": 0.01037147, "balance_loss_clip": 1.0432862, "balance_loss_mlp": 1.02336633, "epoch": 0.533293251164888, "flos": 17129318728320.0, "grad_norm": 1.7588400416982446, "language_loss": 0.75840724, "learning_rate": 1.8814783993829434e-06, "loss": 0.77992487, "num_input_tokens_seen": 190778875, "step": 8870, "time_per_iteration": 2.598963499069214 }, { "auxiliary_loss_clip": 0.01100618, "auxiliary_loss_mlp": 0.01038316, "balance_loss_clip": 1.04231286, "balance_loss_mlp": 1.02373052, "epoch": 0.533353374417556, "flos": 22126360020480.0, "grad_norm": 5.617051153369423, "language_loss": 0.75663799, "learning_rate": 1.8810896244165997e-06, "loss": 0.7780273, "num_input_tokens_seen": 190799830, "step": 8871, "time_per_iteration": 2.7459628582000732 }, { "auxiliary_loss_clip": 0.01099152, "auxiliary_loss_mlp": 0.0103356, "balance_loss_clip": 1.04201055, "balance_loss_mlp": 1.0202924, "epoch": 0.533413497670224, "flos": 15011091838080.0, "grad_norm": 1.8041252581471448, "language_loss": 0.7247498, "learning_rate": 1.8807008539593498e-06, "loss": 0.74607694, "num_input_tokens_seen": 190817155, "step": 8872, "time_per_iteration": 2.6604373455047607 }, { "auxiliary_loss_clip": 0.01100126, "auxiliary_loss_mlp": 0.0104147, "balance_loss_clip": 1.04733372, "balance_loss_mlp": 1.02694392, "epoch": 0.533473620922892, "flos": 19609955890560.0, "grad_norm": 1.7875555414889834, "language_loss": 0.65306997, "learning_rate": 1.880312088025936e-06, "loss": 0.67448598, "num_input_tokens_seen": 190835240, "step": 8873, "time_per_iteration": 2.6587424278259277 }, { "auxiliary_loss_clip": 0.01098214, "auxiliary_loss_mlp": 0.0104372, "balance_loss_clip": 1.04254389, "balance_loss_mlp": 1.03035116, "epoch": 0.5335337441755599, "flos": 14282644020480.0, "grad_norm": 2.157272820213575, "language_loss": 0.80225539, "learning_rate": 1.879923326631099e-06, "loss": 0.82367474, "num_input_tokens_seen": 190851620, "step": 8874, "time_per_iteration": 2.723454475402832 }, { "auxiliary_loss_clip": 0.01112328, "auxiliary_loss_mlp": 0.01030163, "balance_loss_clip": 1.04300058, "balance_loss_mlp": 1.01653171, "epoch": 0.5335938674282279, "flos": 20814830726400.0, "grad_norm": 1.8315602861194333, "language_loss": 0.69789159, "learning_rate": 1.879534569789582e-06, "loss": 0.71931654, "num_input_tokens_seen": 190870545, "step": 8875, "time_per_iteration": 2.6051578521728516 }, { "auxiliary_loss_clip": 0.01045431, "auxiliary_loss_mlp": 0.01001312, "balance_loss_clip": 1.01922286, "balance_loss_mlp": 0.99979252, "epoch": 0.5336539906808958, "flos": 71396448451200.0, "grad_norm": 0.7211200965927701, "language_loss": 0.59631079, "learning_rate": 1.879145817516126e-06, "loss": 0.61677825, "num_input_tokens_seen": 190931995, "step": 8876, "time_per_iteration": 3.3114185333251953 }, { "auxiliary_loss_clip": 0.01113481, "auxiliary_loss_mlp": 0.01040016, "balance_loss_clip": 1.04467189, "balance_loss_mlp": 1.02705741, "epoch": 0.5337141139335638, "flos": 20152996680960.0, "grad_norm": 1.6786856291224888, "language_loss": 0.74847406, "learning_rate": 1.8787570698254727e-06, "loss": 0.77000904, "num_input_tokens_seen": 190949890, "step": 8877, "time_per_iteration": 4.474783182144165 }, { "auxiliary_loss_clip": 0.01030394, "auxiliary_loss_mlp": 0.01002162, "balance_loss_clip": 1.01585436, "balance_loss_mlp": 1.00046921, "epoch": 0.5337742371862317, "flos": 67728387484800.0, "grad_norm": 0.7582021069840851, "language_loss": 0.57155037, "learning_rate": 1.8783683267323629e-06, "loss": 0.59187591, "num_input_tokens_seen": 191008480, "step": 8878, "time_per_iteration": 4.623803615570068 }, { "auxiliary_loss_clip": 0.0112711, "auxiliary_loss_mlp": 0.0103613, "balance_loss_clip": 1.04414368, "balance_loss_mlp": 1.02169418, "epoch": 0.5338343604388998, "flos": 25008909436800.0, "grad_norm": 1.4672061419232192, "language_loss": 0.72301328, "learning_rate": 1.8779795882515395e-06, "loss": 0.74464571, "num_input_tokens_seen": 191028995, "step": 8879, "time_per_iteration": 2.646631956100464 }, { "auxiliary_loss_clip": 0.01126385, "auxiliary_loss_mlp": 0.01039416, "balance_loss_clip": 1.04535294, "balance_loss_mlp": 1.02487254, "epoch": 0.5338944836915677, "flos": 17601256546560.0, "grad_norm": 2.878615745391383, "language_loss": 0.83403212, "learning_rate": 1.8775908543977416e-06, "loss": 0.85569012, "num_input_tokens_seen": 191045285, "step": 8880, "time_per_iteration": 2.578953504562378 }, { "auxiliary_loss_clip": 0.01053817, "auxiliary_loss_mlp": 0.01036139, "balance_loss_clip": 1.03627348, "balance_loss_mlp": 1.02279377, "epoch": 0.5339546069442357, "flos": 21724124544000.0, "grad_norm": 1.3711441541735603, "language_loss": 0.79637486, "learning_rate": 1.8772021251857107e-06, "loss": 0.81727445, "num_input_tokens_seen": 191066105, "step": 8881, "time_per_iteration": 4.335238695144653 }, { "auxiliary_loss_clip": 0.0102058, "auxiliary_loss_mlp": 0.00999984, "balance_loss_clip": 1.01616335, "balance_loss_mlp": 0.99846381, "epoch": 0.5340147301969036, "flos": 69723583315200.0, "grad_norm": 0.7924040124288975, "language_loss": 0.59248376, "learning_rate": 1.8768134006301882e-06, "loss": 0.61268938, "num_input_tokens_seen": 191126315, "step": 8882, "time_per_iteration": 3.1252357959747314 }, { "auxiliary_loss_clip": 0.01025577, "auxiliary_loss_mlp": 0.01019116, "balance_loss_clip": 1.01780772, "balance_loss_mlp": 1.01768577, "epoch": 0.5340748534495716, "flos": 63880701580800.0, "grad_norm": 0.8651438881324313, "language_loss": 0.63574433, "learning_rate": 1.876424680745913e-06, "loss": 0.65619123, "num_input_tokens_seen": 191174240, "step": 8883, "time_per_iteration": 3.0245001316070557 }, { "auxiliary_loss_clip": 0.01079245, "auxiliary_loss_mlp": 0.01040102, "balance_loss_clip": 1.03873086, "balance_loss_mlp": 1.02523685, "epoch": 0.5341349767022396, "flos": 28694313694080.0, "grad_norm": 2.1049960022330385, "language_loss": 0.8200773, "learning_rate": 1.8760359655476272e-06, "loss": 0.8412708, "num_input_tokens_seen": 191193335, "step": 8884, "time_per_iteration": 2.8096158504486084 }, { "auxiliary_loss_clip": 0.01088886, "auxiliary_loss_mlp": 0.01042992, "balance_loss_clip": 1.0403688, "balance_loss_mlp": 1.02865684, "epoch": 0.5341950999549075, "flos": 16289691338880.0, "grad_norm": 1.6281705583461854, "language_loss": 0.72372848, "learning_rate": 1.8756472550500695e-06, "loss": 0.74504721, "num_input_tokens_seen": 191210900, "step": 8885, "time_per_iteration": 2.6555016040802 }, { "auxiliary_loss_clip": 0.01103878, "auxiliary_loss_mlp": 0.01037971, "balance_loss_clip": 1.04014146, "balance_loss_mlp": 1.02301598, "epoch": 0.5342552232075756, "flos": 14355650413440.0, "grad_norm": 2.9046192596208846, "language_loss": 0.79004246, "learning_rate": 1.87525854926798e-06, "loss": 0.81146097, "num_input_tokens_seen": 191226730, "step": 8886, "time_per_iteration": 2.6476478576660156 }, { "auxiliary_loss_clip": 0.01083524, "auxiliary_loss_mlp": 0.00772223, "balance_loss_clip": 1.04013681, "balance_loss_mlp": 1.00027037, "epoch": 0.5343153464602435, "flos": 30297976300800.0, "grad_norm": 1.5332505330022492, "language_loss": 0.750615, "learning_rate": 1.8748698482160996e-06, "loss": 0.76917243, "num_input_tokens_seen": 191250435, "step": 8887, "time_per_iteration": 2.7690041065216064 }, { "auxiliary_loss_clip": 0.01095123, "auxiliary_loss_mlp": 0.01034453, "balance_loss_clip": 1.03800249, "balance_loss_mlp": 1.02050543, "epoch": 0.5343754697129115, "flos": 15596292216960.0, "grad_norm": 2.322348043408552, "language_loss": 0.68717337, "learning_rate": 1.8744811519091663e-06, "loss": 0.70846909, "num_input_tokens_seen": 191268315, "step": 8888, "time_per_iteration": 2.631999969482422 }, { "auxiliary_loss_clip": 0.01118819, "auxiliary_loss_mlp": 0.01041785, "balance_loss_clip": 1.04266095, "balance_loss_mlp": 1.02738404, "epoch": 0.5344355929655794, "flos": 16909617191040.0, "grad_norm": 2.080624189448151, "language_loss": 0.77346873, "learning_rate": 1.8740924603619208e-06, "loss": 0.79507482, "num_input_tokens_seen": 191287000, "step": 8889, "time_per_iteration": 2.621675729751587 }, { "auxiliary_loss_clip": 0.01122598, "auxiliary_loss_mlp": 0.01042684, "balance_loss_clip": 1.04449213, "balance_loss_mlp": 1.02922511, "epoch": 0.5344957162182474, "flos": 16798186224000.0, "grad_norm": 2.052201989860069, "language_loss": 0.69323713, "learning_rate": 1.873703773589102e-06, "loss": 0.71489, "num_input_tokens_seen": 191304565, "step": 8890, "time_per_iteration": 2.6052801609039307 }, { "auxiliary_loss_clip": 0.01128191, "auxiliary_loss_mlp": 0.01052498, "balance_loss_clip": 1.04494905, "balance_loss_mlp": 1.0359515, "epoch": 0.5345558394709153, "flos": 12705590413440.0, "grad_norm": 2.21737658698942, "language_loss": 0.77022809, "learning_rate": 1.8733150916054483e-06, "loss": 0.79203498, "num_input_tokens_seen": 191318300, "step": 8891, "time_per_iteration": 2.533200263977051 }, { "auxiliary_loss_clip": 0.01103794, "auxiliary_loss_mlp": 0.01042349, "balance_loss_clip": 1.04030669, "balance_loss_mlp": 1.02807951, "epoch": 0.5346159627235834, "flos": 22455050400000.0, "grad_norm": 2.8109589169570857, "language_loss": 0.74259919, "learning_rate": 1.872926414425699e-06, "loss": 0.76406056, "num_input_tokens_seen": 191337925, "step": 8892, "time_per_iteration": 2.674466609954834 }, { "auxiliary_loss_clip": 0.01107598, "auxiliary_loss_mlp": 0.01038096, "balance_loss_clip": 1.04592252, "balance_loss_mlp": 1.02414215, "epoch": 0.5346760859762513, "flos": 22415763899520.0, "grad_norm": 1.9745937936433648, "language_loss": 0.87865257, "learning_rate": 1.8725377420645932e-06, "loss": 0.90010953, "num_input_tokens_seen": 191357120, "step": 8893, "time_per_iteration": 2.7012922763824463 }, { "auxiliary_loss_clip": 0.0111971, "auxiliary_loss_mlp": 0.01036459, "balance_loss_clip": 1.04291701, "balance_loss_mlp": 1.02377474, "epoch": 0.5347362092289193, "flos": 22816131868800.0, "grad_norm": 1.9421223728327293, "language_loss": 0.72379559, "learning_rate": 1.872149074536869e-06, "loss": 0.74535728, "num_input_tokens_seen": 191375395, "step": 8894, "time_per_iteration": 2.590670108795166 }, { "auxiliary_loss_clip": 0.01111441, "auxiliary_loss_mlp": 0.01031552, "balance_loss_clip": 1.04253268, "balance_loss_mlp": 1.01799238, "epoch": 0.5347963324815872, "flos": 23219480666880.0, "grad_norm": 1.965554622310178, "language_loss": 0.74611443, "learning_rate": 1.8717604118572648e-06, "loss": 0.76754439, "num_input_tokens_seen": 191395595, "step": 8895, "time_per_iteration": 2.6462347507476807 }, { "auxiliary_loss_clip": 0.01089565, "auxiliary_loss_mlp": 0.01036535, "balance_loss_clip": 1.04067063, "balance_loss_mlp": 1.02246881, "epoch": 0.5348564557342552, "flos": 22601350494720.0, "grad_norm": 1.8148089507657776, "language_loss": 0.76860476, "learning_rate": 1.8713717540405178e-06, "loss": 0.78986579, "num_input_tokens_seen": 191413730, "step": 8896, "time_per_iteration": 2.6798579692840576 }, { "auxiliary_loss_clip": 0.01093639, "auxiliary_loss_mlp": 0.01027964, "balance_loss_clip": 1.04279101, "balance_loss_mlp": 1.01502943, "epoch": 0.5349165789869232, "flos": 18002378701440.0, "grad_norm": 1.8518658883520687, "language_loss": 0.78188956, "learning_rate": 1.8709831011013676e-06, "loss": 0.80310559, "num_input_tokens_seen": 191432400, "step": 8897, "time_per_iteration": 2.6509950160980225 }, { "auxiliary_loss_clip": 0.01113143, "auxiliary_loss_mlp": 0.01032016, "balance_loss_clip": 1.04366183, "balance_loss_mlp": 1.01799703, "epoch": 0.5349767022395912, "flos": 17159770483200.0, "grad_norm": 1.7403204910626056, "language_loss": 0.75393677, "learning_rate": 1.8705944530545509e-06, "loss": 0.7753883, "num_input_tokens_seen": 191448855, "step": 8898, "time_per_iteration": 2.682753086090088 }, { "auxiliary_loss_clip": 0.01037971, "auxiliary_loss_mlp": 0.01005108, "balance_loss_clip": 1.0205543, "balance_loss_mlp": 1.00373161, "epoch": 0.5350368254922592, "flos": 70992058158720.0, "grad_norm": 0.9010106507685076, "language_loss": 0.57955837, "learning_rate": 1.8702058099148052e-06, "loss": 0.59998918, "num_input_tokens_seen": 191519690, "step": 8899, "time_per_iteration": 3.3475701808929443 }, { "auxiliary_loss_clip": 0.01101715, "auxiliary_loss_mlp": 0.0103468, "balance_loss_clip": 1.04445124, "balance_loss_mlp": 1.02107263, "epoch": 0.5350969487449271, "flos": 27417833095680.0, "grad_norm": 2.547752496503206, "language_loss": 0.69974548, "learning_rate": 1.869817171696868e-06, "loss": 0.72110939, "num_input_tokens_seen": 191539380, "step": 8900, "time_per_iteration": 2.7260618209838867 }, { "auxiliary_loss_clip": 0.01099442, "auxiliary_loss_mlp": 0.01035798, "balance_loss_clip": 1.03943968, "balance_loss_mlp": 1.02212465, "epoch": 0.5351570719975951, "flos": 19316134638720.0, "grad_norm": 1.7903210344042488, "language_loss": 0.71756148, "learning_rate": 1.8694285384154777e-06, "loss": 0.73891389, "num_input_tokens_seen": 191557400, "step": 8901, "time_per_iteration": 2.661510467529297 }, { "auxiliary_loss_clip": 0.01087314, "auxiliary_loss_mlp": 0.01036972, "balance_loss_clip": 1.03631806, "balance_loss_mlp": 1.02237511, "epoch": 0.535217195250263, "flos": 19828580019840.0, "grad_norm": 1.7989041746924002, "language_loss": 0.77021015, "learning_rate": 1.8690399100853699e-06, "loss": 0.791453, "num_input_tokens_seen": 191575860, "step": 8902, "time_per_iteration": 2.69665789604187 }, { "auxiliary_loss_clip": 0.01087231, "auxiliary_loss_mlp": 0.01041891, "balance_loss_clip": 1.04053831, "balance_loss_mlp": 1.0283792, "epoch": 0.535277318502931, "flos": 22127868391680.0, "grad_norm": 1.509633063766185, "language_loss": 0.70147592, "learning_rate": 1.868651286721281e-06, "loss": 0.72276717, "num_input_tokens_seen": 191595775, "step": 8903, "time_per_iteration": 2.676028251647949 }, { "auxiliary_loss_clip": 0.0111537, "auxiliary_loss_mlp": 0.00772296, "balance_loss_clip": 1.04395127, "balance_loss_mlp": 1.00028765, "epoch": 0.5353374417555989, "flos": 25045897466880.0, "grad_norm": 1.6001480056643833, "language_loss": 0.72911739, "learning_rate": 1.86826266833795e-06, "loss": 0.74799401, "num_input_tokens_seen": 191617785, "step": 8904, "time_per_iteration": 2.7466139793395996 }, { "auxiliary_loss_clip": 0.01099985, "auxiliary_loss_mlp": 0.01041546, "balance_loss_clip": 1.04453778, "balance_loss_mlp": 1.02705002, "epoch": 0.535397565008267, "flos": 19388710068480.0, "grad_norm": 1.8242307652956307, "language_loss": 0.73365581, "learning_rate": 1.8678740549501103e-06, "loss": 0.7550711, "num_input_tokens_seen": 191636900, "step": 8905, "time_per_iteration": 2.772406578063965 }, { "auxiliary_loss_clip": 0.01105525, "auxiliary_loss_mlp": 0.0103776, "balance_loss_clip": 1.04141188, "balance_loss_mlp": 1.02607787, "epoch": 0.5354576882609349, "flos": 21471205904640.0, "grad_norm": 1.6628200467542797, "language_loss": 0.83795619, "learning_rate": 1.8674854465725005e-06, "loss": 0.85938901, "num_input_tokens_seen": 191656720, "step": 8906, "time_per_iteration": 2.7151100635528564 }, { "auxiliary_loss_clip": 0.01115256, "auxiliary_loss_mlp": 0.00771962, "balance_loss_clip": 1.04406035, "balance_loss_mlp": 1.00027847, "epoch": 0.5355178115136029, "flos": 20777519473920.0, "grad_norm": 1.884591574516044, "language_loss": 0.74096596, "learning_rate": 1.8670968432198563e-06, "loss": 0.75983804, "num_input_tokens_seen": 191674445, "step": 8907, "time_per_iteration": 2.6978471279144287 }, { "auxiliary_loss_clip": 0.01106969, "auxiliary_loss_mlp": 0.01040191, "balance_loss_clip": 1.04144001, "balance_loss_mlp": 1.02508759, "epoch": 0.5355779347662708, "flos": 23514020190720.0, "grad_norm": 2.160786888323469, "language_loss": 0.76593792, "learning_rate": 1.866708244906912e-06, "loss": 0.7874096, "num_input_tokens_seen": 191695000, "step": 8908, "time_per_iteration": 2.6536221504211426 }, { "auxiliary_loss_clip": 0.01097449, "auxiliary_loss_mlp": 0.00772377, "balance_loss_clip": 1.04248428, "balance_loss_mlp": 1.00030112, "epoch": 0.5356380580189388, "flos": 20303211358080.0, "grad_norm": 3.03117864576072, "language_loss": 0.740637, "learning_rate": 1.8663196516484055e-06, "loss": 0.75933528, "num_input_tokens_seen": 191713295, "step": 8909, "time_per_iteration": 2.665473461151123 }, { "auxiliary_loss_clip": 0.01082798, "auxiliary_loss_mlp": 0.01042054, "balance_loss_clip": 1.0436362, "balance_loss_mlp": 1.02891159, "epoch": 0.5356981812716068, "flos": 21361642444800.0, "grad_norm": 2.1999922776778233, "language_loss": 0.84319562, "learning_rate": 1.8659310634590702e-06, "loss": 0.86444414, "num_input_tokens_seen": 191732725, "step": 8910, "time_per_iteration": 2.715521812438965 }, { "auxiliary_loss_clip": 0.01102329, "auxiliary_loss_mlp": 0.0103318, "balance_loss_clip": 1.04114723, "balance_loss_mlp": 1.01928067, "epoch": 0.5357583045242748, "flos": 23111246010240.0, "grad_norm": 1.6725390900013062, "language_loss": 0.81822705, "learning_rate": 1.8655424803536427e-06, "loss": 0.8395822, "num_input_tokens_seen": 191753765, "step": 8911, "time_per_iteration": 2.715254068374634 }, { "auxiliary_loss_clip": 0.0108401, "auxiliary_loss_mlp": 0.01044047, "balance_loss_clip": 1.04012454, "balance_loss_mlp": 1.03019536, "epoch": 0.5358184277769428, "flos": 21141761339520.0, "grad_norm": 5.639232337071921, "language_loss": 0.69078076, "learning_rate": 1.8651539023468585e-06, "loss": 0.71206129, "num_input_tokens_seen": 191773560, "step": 8912, "time_per_iteration": 2.6743216514587402 }, { "auxiliary_loss_clip": 0.01098459, "auxiliary_loss_mlp": 0.01036297, "balance_loss_clip": 1.04129279, "balance_loss_mlp": 1.02273059, "epoch": 0.5358785510296107, "flos": 16282400878080.0, "grad_norm": 2.041064157993178, "language_loss": 0.71507263, "learning_rate": 1.8647653294534509e-06, "loss": 0.73642015, "num_input_tokens_seen": 191791255, "step": 8913, "time_per_iteration": 2.6959731578826904 }, { "auxiliary_loss_clip": 0.01092724, "auxiliary_loss_mlp": 0.01039004, "balance_loss_clip": 1.04161441, "balance_loss_mlp": 1.02512836, "epoch": 0.5359386742822787, "flos": 16976877408000.0, "grad_norm": 1.9206134465038889, "language_loss": 0.72290546, "learning_rate": 1.864376761688156e-06, "loss": 0.74422276, "num_input_tokens_seen": 191809325, "step": 8914, "time_per_iteration": 2.678020477294922 }, { "auxiliary_loss_clip": 0.01104699, "auxiliary_loss_mlp": 0.01039806, "balance_loss_clip": 1.04611683, "balance_loss_mlp": 1.02468383, "epoch": 0.5359987975349466, "flos": 20812927305600.0, "grad_norm": 1.8719693529557881, "language_loss": 0.70668626, "learning_rate": 1.8639881990657079e-06, "loss": 0.72813135, "num_input_tokens_seen": 191829795, "step": 8915, "time_per_iteration": 2.653940200805664 }, { "auxiliary_loss_clip": 0.01094002, "auxiliary_loss_mlp": 0.01045487, "balance_loss_clip": 1.04047489, "balance_loss_mlp": 1.03118742, "epoch": 0.5360589207876146, "flos": 22199941031040.0, "grad_norm": 1.5982896811499068, "language_loss": 0.74664176, "learning_rate": 1.8635996416008408e-06, "loss": 0.76803666, "num_input_tokens_seen": 191850840, "step": 8916, "time_per_iteration": 4.3477959632873535 }, { "auxiliary_loss_clip": 0.01081313, "auxiliary_loss_mlp": 0.00772126, "balance_loss_clip": 1.04081666, "balance_loss_mlp": 1.00021815, "epoch": 0.5361190440402825, "flos": 31394365084800.0, "grad_norm": 1.8553858112595492, "language_loss": 0.72677946, "learning_rate": 1.863211089308289e-06, "loss": 0.74531382, "num_input_tokens_seen": 191869520, "step": 8917, "time_per_iteration": 2.808074712753296 }, { "auxiliary_loss_clip": 0.01102423, "auxiliary_loss_mlp": 0.01041518, "balance_loss_clip": 1.0441047, "balance_loss_mlp": 1.02715325, "epoch": 0.5361791672929506, "flos": 16069882060800.0, "grad_norm": 1.960367430660897, "language_loss": 0.71014392, "learning_rate": 1.8628225422027865e-06, "loss": 0.73158336, "num_input_tokens_seen": 191887240, "step": 8918, "time_per_iteration": 4.185984134674072 }, { "auxiliary_loss_clip": 0.01106012, "auxiliary_loss_mlp": 0.01036881, "balance_loss_clip": 1.0469594, "balance_loss_mlp": 1.02306461, "epoch": 0.5362392905456185, "flos": 20740926493440.0, "grad_norm": 1.4605213362212828, "language_loss": 0.74976659, "learning_rate": 1.862434000299067e-06, "loss": 0.77119553, "num_input_tokens_seen": 191905690, "step": 8919, "time_per_iteration": 2.694120407104492 }, { "auxiliary_loss_clip": 0.01093376, "auxiliary_loss_mlp": 0.01035953, "balance_loss_clip": 1.04010797, "balance_loss_mlp": 1.02207744, "epoch": 0.5362994137982865, "flos": 17340077779200.0, "grad_norm": 1.9976483392210334, "language_loss": 0.71690488, "learning_rate": 1.862045463611864e-06, "loss": 0.73819816, "num_input_tokens_seen": 191920725, "step": 8920, "time_per_iteration": 2.6273410320281982 }, { "auxiliary_loss_clip": 0.01105087, "auxiliary_loss_mlp": 0.01040608, "balance_loss_clip": 1.03961456, "balance_loss_mlp": 1.02532554, "epoch": 0.5363595370509544, "flos": 42813957795840.0, "grad_norm": 1.3877970230156793, "language_loss": 0.68828928, "learning_rate": 1.8616569321559105e-06, "loss": 0.70974618, "num_input_tokens_seen": 191944645, "step": 8921, "time_per_iteration": 4.31537938117981 }, { "auxiliary_loss_clip": 0.01114121, "auxiliary_loss_mlp": 0.01036194, "balance_loss_clip": 1.04631782, "balance_loss_mlp": 1.0227288, "epoch": 0.5364196603036224, "flos": 19171953446400.0, "grad_norm": 1.8336561717381605, "language_loss": 0.81926084, "learning_rate": 1.86126840594594e-06, "loss": 0.84076393, "num_input_tokens_seen": 191962265, "step": 8922, "time_per_iteration": 2.6045267581939697 }, { "auxiliary_loss_clip": 0.01117037, "auxiliary_loss_mlp": 0.01031036, "balance_loss_clip": 1.04637003, "balance_loss_mlp": 1.01782727, "epoch": 0.5364797835562904, "flos": 17931060247680.0, "grad_norm": 2.029402038210475, "language_loss": 0.76969302, "learning_rate": 1.860879884996686e-06, "loss": 0.79117376, "num_input_tokens_seen": 191978850, "step": 8923, "time_per_iteration": 2.627131223678589 }, { "auxiliary_loss_clip": 0.01097305, "auxiliary_loss_mlp": 0.01035531, "balance_loss_clip": 1.04099584, "balance_loss_mlp": 1.02144074, "epoch": 0.5365399068089584, "flos": 30228058477440.0, "grad_norm": 1.4696173336709724, "language_loss": 0.70680726, "learning_rate": 1.8604913693228804e-06, "loss": 0.72813559, "num_input_tokens_seen": 192002000, "step": 8924, "time_per_iteration": 2.7947139739990234 }, { "auxiliary_loss_clip": 0.01093943, "auxiliary_loss_mlp": 0.01040337, "balance_loss_clip": 1.0430336, "balance_loss_mlp": 1.02501917, "epoch": 0.5366000300616264, "flos": 24891696380160.0, "grad_norm": 2.0937693746484456, "language_loss": 0.87335229, "learning_rate": 1.8601028589392558e-06, "loss": 0.8946951, "num_input_tokens_seen": 192019100, "step": 8925, "time_per_iteration": 2.768362045288086 }, { "auxiliary_loss_clip": 0.01123484, "auxiliary_loss_mlp": 0.01031699, "balance_loss_clip": 1.04188776, "balance_loss_mlp": 1.01764417, "epoch": 0.5366601533142943, "flos": 29826649013760.0, "grad_norm": 1.5047259348419413, "language_loss": 0.77962756, "learning_rate": 1.8597143538605455e-06, "loss": 0.80117941, "num_input_tokens_seen": 192041660, "step": 8926, "time_per_iteration": 2.715451955795288 }, { "auxiliary_loss_clip": 0.01087054, "auxiliary_loss_mlp": 0.01032082, "balance_loss_clip": 1.04502523, "balance_loss_mlp": 1.01944578, "epoch": 0.5367202765669623, "flos": 27199352620800.0, "grad_norm": 1.5425961750104156, "language_loss": 0.66906953, "learning_rate": 1.85932585410148e-06, "loss": 0.69026089, "num_input_tokens_seen": 192063540, "step": 8927, "time_per_iteration": 2.7890443801879883 }, { "auxiliary_loss_clip": 0.0111207, "auxiliary_loss_mlp": 0.0103082, "balance_loss_clip": 1.04044211, "balance_loss_mlp": 1.01719475, "epoch": 0.5367803998196302, "flos": 20229953569920.0, "grad_norm": 1.7627850836145547, "language_loss": 0.73644257, "learning_rate": 1.8589373596767929e-06, "loss": 0.75787145, "num_input_tokens_seen": 192081760, "step": 8928, "time_per_iteration": 2.6679322719573975 }, { "auxiliary_loss_clip": 0.01097621, "auxiliary_loss_mlp": 0.01033753, "balance_loss_clip": 1.03983617, "balance_loss_mlp": 1.02038312, "epoch": 0.5368405230722982, "flos": 32154629374080.0, "grad_norm": 1.8947277080350169, "language_loss": 0.63138568, "learning_rate": 1.8585488706012154e-06, "loss": 0.65269947, "num_input_tokens_seen": 192101620, "step": 8929, "time_per_iteration": 2.77915620803833 }, { "auxiliary_loss_clip": 0.01112721, "auxiliary_loss_mlp": 0.01035038, "balance_loss_clip": 1.04284871, "balance_loss_mlp": 1.02102494, "epoch": 0.5369006463249661, "flos": 26247935128320.0, "grad_norm": 1.6504217106645076, "language_loss": 0.65814567, "learning_rate": 1.8581603868894781e-06, "loss": 0.67962325, "num_input_tokens_seen": 192121805, "step": 8930, "time_per_iteration": 2.671699285507202 }, { "auxiliary_loss_clip": 0.01070837, "auxiliary_loss_mlp": 0.01029378, "balance_loss_clip": 1.03888655, "balance_loss_mlp": 1.01519203, "epoch": 0.5369607695776342, "flos": 26211306234240.0, "grad_norm": 1.4657060850123025, "language_loss": 0.67106915, "learning_rate": 1.8577719085563136e-06, "loss": 0.69207126, "num_input_tokens_seen": 192141765, "step": 8931, "time_per_iteration": 2.791450023651123 }, { "auxiliary_loss_clip": 0.0107183, "auxiliary_loss_mlp": 0.01035308, "balance_loss_clip": 1.03937209, "balance_loss_mlp": 1.02028155, "epoch": 0.5370208928303021, "flos": 25009017177600.0, "grad_norm": 1.6675319791175172, "language_loss": 0.76147091, "learning_rate": 1.8573834356164525e-06, "loss": 0.78254229, "num_input_tokens_seen": 192161560, "step": 8932, "time_per_iteration": 2.817074775695801 }, { "auxiliary_loss_clip": 0.0108812, "auxiliary_loss_mlp": 0.01035166, "balance_loss_clip": 1.04271507, "balance_loss_mlp": 1.02086663, "epoch": 0.5370810160829701, "flos": 31792147274880.0, "grad_norm": 1.7321457922490968, "language_loss": 0.66103363, "learning_rate": 1.8569949680846261e-06, "loss": 0.68226647, "num_input_tokens_seen": 192180190, "step": 8933, "time_per_iteration": 2.7999963760375977 }, { "auxiliary_loss_clip": 0.01106374, "auxiliary_loss_mlp": 0.0077107, "balance_loss_clip": 1.04321599, "balance_loss_mlp": 1.00030327, "epoch": 0.537141139335638, "flos": 23842602829440.0, "grad_norm": 1.7096623259043264, "language_loss": 0.83137345, "learning_rate": 1.856606505975565e-06, "loss": 0.8501479, "num_input_tokens_seen": 192198855, "step": 8934, "time_per_iteration": 2.77140474319458 }, { "auxiliary_loss_clip": 0.01083657, "auxiliary_loss_mlp": 0.01038537, "balance_loss_clip": 1.03906775, "balance_loss_mlp": 1.02371967, "epoch": 0.537201262588306, "flos": 18508826511360.0, "grad_norm": 1.9684207217946548, "language_loss": 0.79907835, "learning_rate": 1.856218049303999e-06, "loss": 0.82030034, "num_input_tokens_seen": 192216555, "step": 8935, "time_per_iteration": 2.714343547821045 }, { "auxiliary_loss_clip": 0.01111571, "auxiliary_loss_mlp": 0.01041616, "balance_loss_clip": 1.04217649, "balance_loss_mlp": 1.02750206, "epoch": 0.537261385840974, "flos": 25662950231040.0, "grad_norm": 2.937428754588345, "language_loss": 0.84070867, "learning_rate": 1.855829598084659e-06, "loss": 0.86224055, "num_input_tokens_seen": 192236910, "step": 8936, "time_per_iteration": 2.6816179752349854 }, { "auxiliary_loss_clip": 0.01092497, "auxiliary_loss_mlp": 0.01030736, "balance_loss_clip": 1.04575956, "balance_loss_mlp": 1.018255, "epoch": 0.537321509093642, "flos": 40735017406080.0, "grad_norm": 1.2320449417851727, "language_loss": 0.72774732, "learning_rate": 1.8554411523322754e-06, "loss": 0.74897963, "num_input_tokens_seen": 192260790, "step": 8937, "time_per_iteration": 2.9294662475585938 }, { "auxiliary_loss_clip": 0.01097303, "auxiliary_loss_mlp": 0.0103947, "balance_loss_clip": 1.03866911, "balance_loss_mlp": 1.02411556, "epoch": 0.53738163234631, "flos": 17238487138560.0, "grad_norm": 2.4958463124017825, "language_loss": 0.82070464, "learning_rate": 1.8550527120615778e-06, "loss": 0.84207237, "num_input_tokens_seen": 192277230, "step": 8938, "time_per_iteration": 2.7016329765319824 }, { "auxiliary_loss_clip": 0.01128942, "auxiliary_loss_mlp": 0.01037787, "balance_loss_clip": 1.04445028, "balance_loss_mlp": 1.02425027, "epoch": 0.5374417555989779, "flos": 12821977457280.0, "grad_norm": 2.39037719214814, "language_loss": 0.80410939, "learning_rate": 1.8546642772872957e-06, "loss": 0.8257767, "num_input_tokens_seen": 192292840, "step": 8939, "time_per_iteration": 2.588257312774658 }, { "auxiliary_loss_clip": 0.01012372, "auxiliary_loss_mlp": 0.01007323, "balance_loss_clip": 1.01498079, "balance_loss_mlp": 1.00561845, "epoch": 0.5375018788516459, "flos": 67256018703360.0, "grad_norm": 0.706070728219951, "language_loss": 0.52408826, "learning_rate": 1.8542758480241589e-06, "loss": 0.5442853, "num_input_tokens_seen": 192358240, "step": 8940, "time_per_iteration": 3.276360273361206 }, { "auxiliary_loss_clip": 0.01083174, "auxiliary_loss_mlp": 0.01033229, "balance_loss_clip": 1.04148936, "balance_loss_mlp": 1.01995516, "epoch": 0.5375620021043138, "flos": 18114168804480.0, "grad_norm": 2.0987581231461725, "language_loss": 0.71804386, "learning_rate": 1.8538874242868965e-06, "loss": 0.73920786, "num_input_tokens_seen": 192377370, "step": 8941, "time_per_iteration": 2.732537269592285 }, { "auxiliary_loss_clip": 0.01092897, "auxiliary_loss_mlp": 0.01030607, "balance_loss_clip": 1.03881931, "balance_loss_mlp": 1.01767242, "epoch": 0.5376221253569818, "flos": 23149383275520.0, "grad_norm": 1.733585832372728, "language_loss": 0.79825974, "learning_rate": 1.853499006090237e-06, "loss": 0.81949472, "num_input_tokens_seen": 192396450, "step": 8942, "time_per_iteration": 2.723686695098877 }, { "auxiliary_loss_clip": 0.01126783, "auxiliary_loss_mlp": 0.01038334, "balance_loss_clip": 1.04432559, "balance_loss_mlp": 1.02416599, "epoch": 0.5376822486096497, "flos": 29972302663680.0, "grad_norm": 1.8527940596038397, "language_loss": 0.70161736, "learning_rate": 1.853110593448911e-06, "loss": 0.72326851, "num_input_tokens_seen": 192417390, "step": 8943, "time_per_iteration": 2.683830499649048 }, { "auxiliary_loss_clip": 0.01030181, "auxiliary_loss_mlp": 0.01002794, "balance_loss_clip": 1.01417148, "balance_loss_mlp": 1.00145841, "epoch": 0.5377423718623178, "flos": 54168950874240.0, "grad_norm": 0.8559023322108498, "language_loss": 0.5964179, "learning_rate": 1.852722186377645e-06, "loss": 0.61674768, "num_input_tokens_seen": 192478060, "step": 8944, "time_per_iteration": 3.195451498031616 }, { "auxiliary_loss_clip": 0.01075816, "auxiliary_loss_mlp": 0.01037224, "balance_loss_clip": 1.04020023, "balance_loss_mlp": 1.02198291, "epoch": 0.5378024951149857, "flos": 23257079228160.0, "grad_norm": 2.0363151234070567, "language_loss": 0.77896553, "learning_rate": 1.852333784891169e-06, "loss": 0.80009592, "num_input_tokens_seen": 192495985, "step": 8945, "time_per_iteration": 2.7992632389068604 }, { "auxiliary_loss_clip": 0.01114593, "auxiliary_loss_mlp": 0.01035525, "balance_loss_clip": 1.04309297, "balance_loss_mlp": 1.02173805, "epoch": 0.5378626183676537, "flos": 24024095274240.0, "grad_norm": 1.6722587357114949, "language_loss": 0.68561995, "learning_rate": 1.8519453890042112e-06, "loss": 0.70712113, "num_input_tokens_seen": 192515445, "step": 8946, "time_per_iteration": 2.6522717475891113 }, { "auxiliary_loss_clip": 0.01078154, "auxiliary_loss_mlp": 0.0104253, "balance_loss_clip": 1.04271758, "balance_loss_mlp": 1.02895761, "epoch": 0.5379227416203216, "flos": 27161789973120.0, "grad_norm": 1.8248631368800923, "language_loss": 0.76991701, "learning_rate": 1.851556998731498e-06, "loss": 0.79112387, "num_input_tokens_seen": 192536530, "step": 8947, "time_per_iteration": 2.796123743057251 }, { "auxiliary_loss_clip": 0.0111442, "auxiliary_loss_mlp": 0.01032597, "balance_loss_clip": 1.04487777, "balance_loss_mlp": 1.01940608, "epoch": 0.5379828648729896, "flos": 24681619687680.0, "grad_norm": 1.55307874766799, "language_loss": 0.60198331, "learning_rate": 1.8511686140877592e-06, "loss": 0.6234535, "num_input_tokens_seen": 192556075, "step": 8948, "time_per_iteration": 2.7054309844970703 }, { "auxiliary_loss_clip": 0.01082153, "auxiliary_loss_mlp": 0.01037517, "balance_loss_clip": 1.03970575, "balance_loss_mlp": 1.02415979, "epoch": 0.5380429881256577, "flos": 22523280284160.0, "grad_norm": 1.6281037537893495, "language_loss": 0.79697102, "learning_rate": 1.8507802350877205e-06, "loss": 0.81816769, "num_input_tokens_seen": 192575535, "step": 8949, "time_per_iteration": 2.8140738010406494 }, { "auxiliary_loss_clip": 0.01078335, "auxiliary_loss_mlp": 0.01042356, "balance_loss_clip": 1.03704572, "balance_loss_mlp": 1.02679944, "epoch": 0.5381031113783256, "flos": 26979543342720.0, "grad_norm": 2.0888170828860444, "language_loss": 0.77963328, "learning_rate": 1.850391861746111e-06, "loss": 0.80084026, "num_input_tokens_seen": 192594490, "step": 8950, "time_per_iteration": 2.7498505115509033 }, { "auxiliary_loss_clip": 0.01110071, "auxiliary_loss_mlp": 0.01029664, "balance_loss_clip": 1.05072141, "balance_loss_mlp": 1.01671791, "epoch": 0.5381632346309936, "flos": 24754087376640.0, "grad_norm": 1.5816580812213883, "language_loss": 0.72668755, "learning_rate": 1.8500034940776573e-06, "loss": 0.7480849, "num_input_tokens_seen": 192615650, "step": 8951, "time_per_iteration": 2.7927658557891846 }, { "auxiliary_loss_clip": 0.01122901, "auxiliary_loss_mlp": 0.00772698, "balance_loss_clip": 1.04232633, "balance_loss_mlp": 1.00031877, "epoch": 0.5382233578836615, "flos": 15560058372480.0, "grad_norm": 1.7038907930473366, "language_loss": 0.74791837, "learning_rate": 1.849615132097085e-06, "loss": 0.76687431, "num_input_tokens_seen": 192633840, "step": 8952, "time_per_iteration": 2.663555860519409 }, { "auxiliary_loss_clip": 0.01103413, "auxiliary_loss_mlp": 0.01034816, "balance_loss_clip": 1.04635072, "balance_loss_mlp": 1.02090442, "epoch": 0.5382834811363295, "flos": 25084501608960.0, "grad_norm": 1.486507819644587, "language_loss": 0.79733002, "learning_rate": 1.8492267758191228e-06, "loss": 0.81871235, "num_input_tokens_seen": 192655890, "step": 8953, "time_per_iteration": 2.7213597297668457 }, { "auxiliary_loss_clip": 0.01092412, "auxiliary_loss_mlp": 0.01036672, "balance_loss_clip": 1.04632258, "balance_loss_mlp": 1.02147865, "epoch": 0.5383436043889974, "flos": 13297901685120.0, "grad_norm": 1.8841614793520622, "language_loss": 0.80665779, "learning_rate": 1.8488384252584964e-06, "loss": 0.82794857, "num_input_tokens_seen": 192673025, "step": 8954, "time_per_iteration": 2.7119338512420654 }, { "auxiliary_loss_clip": 0.01124989, "auxiliary_loss_mlp": 0.0103348, "balance_loss_clip": 1.04552889, "balance_loss_mlp": 1.0192287, "epoch": 0.5384037276416654, "flos": 23039388852480.0, "grad_norm": 2.080642260770838, "language_loss": 0.76782274, "learning_rate": 1.8484500804299318e-06, "loss": 0.78940743, "num_input_tokens_seen": 192692190, "step": 8955, "time_per_iteration": 4.170248746871948 }, { "auxiliary_loss_clip": 0.01100368, "auxiliary_loss_mlp": 0.01043375, "balance_loss_clip": 1.04422796, "balance_loss_mlp": 1.02911186, "epoch": 0.5384638508943334, "flos": 20631147552000.0, "grad_norm": 1.64526518725267, "language_loss": 0.78446829, "learning_rate": 1.8480617413481557e-06, "loss": 0.8059057, "num_input_tokens_seen": 192710380, "step": 8956, "time_per_iteration": 4.346608638763428 }, { "auxiliary_loss_clip": 0.01014882, "auxiliary_loss_mlp": 0.01009567, "balance_loss_clip": 1.01641572, "balance_loss_mlp": 1.00802886, "epoch": 0.5385239741470014, "flos": 66737683491840.0, "grad_norm": 0.8632221777835867, "language_loss": 0.63366526, "learning_rate": 1.8476734080278932e-06, "loss": 0.6539098, "num_input_tokens_seen": 192768995, "step": 8957, "time_per_iteration": 4.689607381820679 }, { "auxiliary_loss_clip": 0.01003314, "auxiliary_loss_mlp": 0.00999601, "balance_loss_clip": 1.01686144, "balance_loss_mlp": 0.99808067, "epoch": 0.5385840973996693, "flos": 64716058229760.0, "grad_norm": 0.7163688318545376, "language_loss": 0.5155347, "learning_rate": 1.8472850804838705e-06, "loss": 0.53556383, "num_input_tokens_seen": 192825585, "step": 8958, "time_per_iteration": 3.263490676879883 }, { "auxiliary_loss_clip": 0.01118278, "auxiliary_loss_mlp": 0.01034558, "balance_loss_clip": 1.04870462, "balance_loss_mlp": 1.01945472, "epoch": 0.5386442206523373, "flos": 26141783460480.0, "grad_norm": 1.5599827476789179, "language_loss": 0.77335596, "learning_rate": 1.8468967587308128e-06, "loss": 0.79488432, "num_input_tokens_seen": 192847335, "step": 8959, "time_per_iteration": 2.6936423778533936 }, { "auxiliary_loss_clip": 0.01078149, "auxiliary_loss_mlp": 0.01035897, "balance_loss_clip": 1.04148221, "balance_loss_mlp": 1.02258778, "epoch": 0.5387043439050052, "flos": 18251849635200.0, "grad_norm": 2.554990268603387, "language_loss": 0.84077597, "learning_rate": 1.8465084427834455e-06, "loss": 0.86191648, "num_input_tokens_seen": 192862205, "step": 8960, "time_per_iteration": 4.281194686889648 }, { "auxiliary_loss_clip": 0.01114712, "auxiliary_loss_mlp": 0.01032916, "balance_loss_clip": 1.0460726, "balance_loss_mlp": 1.01955807, "epoch": 0.5387644671576732, "flos": 29788296266880.0, "grad_norm": 1.4386251393877574, "language_loss": 0.78275657, "learning_rate": 1.8461201326564933e-06, "loss": 0.80423284, "num_input_tokens_seen": 192883695, "step": 8961, "time_per_iteration": 2.7518913745880127 }, { "auxiliary_loss_clip": 0.01089107, "auxiliary_loss_mlp": 0.01035524, "balance_loss_clip": 1.041345, "balance_loss_mlp": 1.02189803, "epoch": 0.5388245904103413, "flos": 22374466237440.0, "grad_norm": 11.100507002897315, "language_loss": 0.84070158, "learning_rate": 1.845731828364681e-06, "loss": 0.86194789, "num_input_tokens_seen": 192900190, "step": 8962, "time_per_iteration": 2.745964288711548 }, { "auxiliary_loss_clip": 0.01020426, "auxiliary_loss_mlp": 0.01002497, "balance_loss_clip": 1.01872444, "balance_loss_mlp": 1.00114429, "epoch": 0.5388847136630092, "flos": 69807794751360.0, "grad_norm": 0.7287303599556714, "language_loss": 0.5418579, "learning_rate": 1.8453435299227333e-06, "loss": 0.56208712, "num_input_tokens_seen": 192958675, "step": 8963, "time_per_iteration": 3.0952982902526855 }, { "auxiliary_loss_clip": 0.01022568, "auxiliary_loss_mlp": 0.01009564, "balance_loss_clip": 1.01615238, "balance_loss_mlp": 1.00817513, "epoch": 0.5389448369156772, "flos": 69822303845760.0, "grad_norm": 1.4175775222807738, "language_loss": 0.63305563, "learning_rate": 1.8449552373453744e-06, "loss": 0.65337688, "num_input_tokens_seen": 193033135, "step": 8964, "time_per_iteration": 3.2670536041259766 }, { "auxiliary_loss_clip": 0.01065573, "auxiliary_loss_mlp": 0.01035006, "balance_loss_clip": 1.04052043, "balance_loss_mlp": 1.02049828, "epoch": 0.5390049601683451, "flos": 31722444933120.0, "grad_norm": 1.4839412969014603, "language_loss": 0.69941193, "learning_rate": 1.8445669506473287e-06, "loss": 0.72041768, "num_input_tokens_seen": 193055570, "step": 8965, "time_per_iteration": 2.8793537616729736 }, { "auxiliary_loss_clip": 0.01097921, "auxiliary_loss_mlp": 0.00772841, "balance_loss_clip": 1.04318738, "balance_loss_mlp": 1.00031877, "epoch": 0.5390650834210131, "flos": 18113486446080.0, "grad_norm": 3.9331383698311297, "language_loss": 0.82359982, "learning_rate": 1.8441786698433192e-06, "loss": 0.84230745, "num_input_tokens_seen": 193073120, "step": 8966, "time_per_iteration": 2.7008259296417236 }, { "auxiliary_loss_clip": 0.0112489, "auxiliary_loss_mlp": 0.01032097, "balance_loss_clip": 1.04688132, "balance_loss_mlp": 1.01831603, "epoch": 0.539125206673681, "flos": 17416711445760.0, "grad_norm": 1.8273360824105822, "language_loss": 0.72234643, "learning_rate": 1.8437903949480706e-06, "loss": 0.74391627, "num_input_tokens_seen": 193090105, "step": 8967, "time_per_iteration": 2.536813974380493 }, { "auxiliary_loss_clip": 0.01101272, "auxiliary_loss_mlp": 0.01034325, "balance_loss_clip": 1.04193211, "balance_loss_mlp": 1.02177858, "epoch": 0.539185329926349, "flos": 22198935450240.0, "grad_norm": 2.8461637045489394, "language_loss": 0.81760883, "learning_rate": 1.8434021259763065e-06, "loss": 0.83896482, "num_input_tokens_seen": 193109325, "step": 8968, "time_per_iteration": 2.6812336444854736 }, { "auxiliary_loss_clip": 0.01095464, "auxiliary_loss_mlp": 0.01039812, "balance_loss_clip": 1.04489422, "balance_loss_mlp": 1.0244931, "epoch": 0.539245453179017, "flos": 21434397442560.0, "grad_norm": 1.479768408322399, "language_loss": 0.74093103, "learning_rate": 1.8430138629427484e-06, "loss": 0.76228386, "num_input_tokens_seen": 193130595, "step": 8969, "time_per_iteration": 2.775066614151001 }, { "auxiliary_loss_clip": 0.01089398, "auxiliary_loss_mlp": 0.00772297, "balance_loss_clip": 1.03885353, "balance_loss_mlp": 1.00019646, "epoch": 0.539305576431685, "flos": 20735000749440.0, "grad_norm": 1.789523366494458, "language_loss": 0.82301641, "learning_rate": 1.8426256058621205e-06, "loss": 0.84163332, "num_input_tokens_seen": 193148930, "step": 8970, "time_per_iteration": 2.709660053253174 }, { "auxiliary_loss_clip": 0.0109962, "auxiliary_loss_mlp": 0.01036868, "balance_loss_clip": 1.04434752, "balance_loss_mlp": 1.02398705, "epoch": 0.5393656996843529, "flos": 30920452018560.0, "grad_norm": 1.3749735874734272, "language_loss": 0.75481087, "learning_rate": 1.842237354749146e-06, "loss": 0.77617574, "num_input_tokens_seen": 193170140, "step": 8971, "time_per_iteration": 2.759859800338745 }, { "auxiliary_loss_clip": 0.01031428, "auxiliary_loss_mlp": 0.01020808, "balance_loss_clip": 1.01404476, "balance_loss_mlp": 1.01906729, "epoch": 0.5394258229370209, "flos": 50317781351040.0, "grad_norm": 0.8852076637627846, "language_loss": 0.60268009, "learning_rate": 1.8418491096185465e-06, "loss": 0.62320244, "num_input_tokens_seen": 193227235, "step": 8972, "time_per_iteration": 3.1906497478485107 }, { "auxiliary_loss_clip": 0.01113524, "auxiliary_loss_mlp": 0.01042903, "balance_loss_clip": 1.0430851, "balance_loss_mlp": 1.02806175, "epoch": 0.5394859461896888, "flos": 25411935012480.0, "grad_norm": 1.3798913966673876, "language_loss": 0.78418267, "learning_rate": 1.841460870485045e-06, "loss": 0.80574697, "num_input_tokens_seen": 193248435, "step": 8973, "time_per_iteration": 2.67616868019104 }, { "auxiliary_loss_clip": 0.01119952, "auxiliary_loss_mlp": 0.01038926, "balance_loss_clip": 1.04402721, "balance_loss_mlp": 1.0234288, "epoch": 0.5395460694423568, "flos": 25478476957440.0, "grad_norm": 1.97267381364002, "language_loss": 0.73745018, "learning_rate": 1.8410726373633623e-06, "loss": 0.75903904, "num_input_tokens_seen": 193267490, "step": 8974, "time_per_iteration": 2.6896610260009766 }, { "auxiliary_loss_clip": 0.01038786, "auxiliary_loss_mlp": 0.01002204, "balance_loss_clip": 1.01252413, "balance_loss_mlp": 1.00089288, "epoch": 0.5396061926950249, "flos": 53249493507840.0, "grad_norm": 0.7368178577125409, "language_loss": 0.51070768, "learning_rate": 1.8406844102682215e-06, "loss": 0.53111756, "num_input_tokens_seen": 193326050, "step": 8975, "time_per_iteration": 3.1316938400268555 }, { "auxiliary_loss_clip": 0.01110433, "auxiliary_loss_mlp": 0.01042663, "balance_loss_clip": 1.04242885, "balance_loss_mlp": 1.02821445, "epoch": 0.5396663159476928, "flos": 26725080418560.0, "grad_norm": 2.630341512403146, "language_loss": 0.72291577, "learning_rate": 1.840296189214344e-06, "loss": 0.74444675, "num_input_tokens_seen": 193348785, "step": 8976, "time_per_iteration": 2.722482681274414 }, { "auxiliary_loss_clip": 0.01107068, "auxiliary_loss_mlp": 0.00771891, "balance_loss_clip": 1.0392096, "balance_loss_mlp": 1.00027895, "epoch": 0.5397264392003608, "flos": 23253380127360.0, "grad_norm": 1.6269165395400453, "language_loss": 0.69827849, "learning_rate": 1.8399079742164509e-06, "loss": 0.71706808, "num_input_tokens_seen": 193367080, "step": 8977, "time_per_iteration": 2.661503553390503 }, { "auxiliary_loss_clip": 0.0105269, "auxiliary_loss_mlp": 0.01038261, "balance_loss_clip": 1.03996563, "balance_loss_mlp": 1.02390814, "epoch": 0.5397865624530287, "flos": 18294188791680.0, "grad_norm": 1.662156020825611, "language_loss": 0.7259683, "learning_rate": 1.8395197652892636e-06, "loss": 0.74687779, "num_input_tokens_seen": 193383715, "step": 8978, "time_per_iteration": 2.7381365299224854 }, { "auxiliary_loss_clip": 0.01087228, "auxiliary_loss_mlp": 0.010397, "balance_loss_clip": 1.04297757, "balance_loss_mlp": 1.02373815, "epoch": 0.5398466857056967, "flos": 15297514888320.0, "grad_norm": 1.853626793115837, "language_loss": 0.74536407, "learning_rate": 1.8391315624475028e-06, "loss": 0.76663339, "num_input_tokens_seen": 193400560, "step": 8979, "time_per_iteration": 2.694063425064087 }, { "auxiliary_loss_clip": 0.01072362, "auxiliary_loss_mlp": 0.01049968, "balance_loss_clip": 1.04104912, "balance_loss_mlp": 1.03438091, "epoch": 0.5399068089583646, "flos": 17821748183040.0, "grad_norm": 1.8942057962212562, "language_loss": 0.76699525, "learning_rate": 1.8387433657058892e-06, "loss": 0.78821856, "num_input_tokens_seen": 193418680, "step": 8980, "time_per_iteration": 2.820065498352051 }, { "auxiliary_loss_clip": 0.01123296, "auxiliary_loss_mlp": 0.01035485, "balance_loss_clip": 1.04266453, "balance_loss_mlp": 1.02159715, "epoch": 0.5399669322110326, "flos": 27381635164800.0, "grad_norm": 1.799033275645953, "language_loss": 0.82047689, "learning_rate": 1.8383551750791431e-06, "loss": 0.84206468, "num_input_tokens_seen": 193439310, "step": 8981, "time_per_iteration": 2.6362786293029785 }, { "auxiliary_loss_clip": 0.01114328, "auxiliary_loss_mlp": 0.01033767, "balance_loss_clip": 1.0414052, "balance_loss_mlp": 1.01837707, "epoch": 0.5400270554637006, "flos": 20449116403200.0, "grad_norm": 1.8414706821019682, "language_loss": 0.66744691, "learning_rate": 1.8379669905819857e-06, "loss": 0.68892789, "num_input_tokens_seen": 193458115, "step": 8982, "time_per_iteration": 2.621446371078491 }, { "auxiliary_loss_clip": 0.01087174, "auxiliary_loss_mlp": 0.00771772, "balance_loss_clip": 1.04236412, "balance_loss_mlp": 1.00037217, "epoch": 0.5400871787163686, "flos": 21689578638720.0, "grad_norm": 1.585959219226275, "language_loss": 0.82838899, "learning_rate": 1.8375788122291358e-06, "loss": 0.84697849, "num_input_tokens_seen": 193477365, "step": 8983, "time_per_iteration": 2.725118637084961 }, { "auxiliary_loss_clip": 0.0107373, "auxiliary_loss_mlp": 0.01037262, "balance_loss_clip": 1.03868723, "balance_loss_mlp": 1.0226711, "epoch": 0.5401473019690365, "flos": 19204739585280.0, "grad_norm": 1.7940455633993566, "language_loss": 0.71052921, "learning_rate": 1.8371906400353138e-06, "loss": 0.73163915, "num_input_tokens_seen": 193495595, "step": 8984, "time_per_iteration": 2.7552812099456787 }, { "auxiliary_loss_clip": 0.01129583, "auxiliary_loss_mlp": 0.01039978, "balance_loss_clip": 1.04673409, "balance_loss_mlp": 1.02464724, "epoch": 0.5402074252217045, "flos": 20627376624000.0, "grad_norm": 1.7153215255445333, "language_loss": 0.80088288, "learning_rate": 1.8368024740152386e-06, "loss": 0.82257855, "num_input_tokens_seen": 193514035, "step": 8985, "time_per_iteration": 2.6251611709594727 }, { "auxiliary_loss_clip": 0.01076326, "auxiliary_loss_mlp": 0.01030482, "balance_loss_clip": 1.03776312, "balance_loss_mlp": 1.01603341, "epoch": 0.5402675484743724, "flos": 24973465691520.0, "grad_norm": 1.6597478268739005, "language_loss": 0.79092562, "learning_rate": 1.83641431418363e-06, "loss": 0.81199366, "num_input_tokens_seen": 193535445, "step": 8986, "time_per_iteration": 2.7512738704681396 }, { "auxiliary_loss_clip": 0.01105948, "auxiliary_loss_mlp": 0.01041249, "balance_loss_clip": 1.0403738, "balance_loss_mlp": 1.02647913, "epoch": 0.5403276717270404, "flos": 19459022941440.0, "grad_norm": 1.5813568652048575, "language_loss": 0.77027225, "learning_rate": 1.8360261605552075e-06, "loss": 0.79174423, "num_input_tokens_seen": 193554780, "step": 8987, "time_per_iteration": 2.678215265274048 }, { "auxiliary_loss_clip": 0.01094562, "auxiliary_loss_mlp": 0.01035835, "balance_loss_clip": 1.04025865, "balance_loss_mlp": 1.021613, "epoch": 0.5403877949797083, "flos": 18442140912000.0, "grad_norm": 3.169719409567684, "language_loss": 0.71186262, "learning_rate": 1.8356380131446887e-06, "loss": 0.73316658, "num_input_tokens_seen": 193573580, "step": 8988, "time_per_iteration": 2.779327869415283 }, { "auxiliary_loss_clip": 0.01073421, "auxiliary_loss_mlp": 0.01040131, "balance_loss_clip": 1.03765535, "balance_loss_mlp": 1.02508116, "epoch": 0.5404479182323764, "flos": 28292868316800.0, "grad_norm": 2.25930737507901, "language_loss": 0.67611122, "learning_rate": 1.8352498719667934e-06, "loss": 0.69724679, "num_input_tokens_seen": 193590490, "step": 8989, "time_per_iteration": 2.7891674041748047 }, { "auxiliary_loss_clip": 0.01111206, "auxiliary_loss_mlp": 0.01041114, "balance_loss_clip": 1.04164839, "balance_loss_mlp": 1.02667785, "epoch": 0.5405080414850444, "flos": 23367325046400.0, "grad_norm": 1.5585472280182338, "language_loss": 0.77394271, "learning_rate": 1.8348617370362399e-06, "loss": 0.79546589, "num_input_tokens_seen": 193609900, "step": 8990, "time_per_iteration": 2.6976635456085205 }, { "auxiliary_loss_clip": 0.01106061, "auxiliary_loss_mlp": 0.01026872, "balance_loss_clip": 1.03980994, "balance_loss_mlp": 1.01427758, "epoch": 0.5405681647377123, "flos": 21106425335040.0, "grad_norm": 1.9802166321118257, "language_loss": 0.69258702, "learning_rate": 1.834473608367745e-06, "loss": 0.71391636, "num_input_tokens_seen": 193629775, "step": 8991, "time_per_iteration": 2.6734046936035156 }, { "auxiliary_loss_clip": 0.01061373, "auxiliary_loss_mlp": 0.01034138, "balance_loss_clip": 1.03470838, "balance_loss_mlp": 1.01864719, "epoch": 0.5406282879903803, "flos": 20449188230400.0, "grad_norm": 1.8615919781627641, "language_loss": 0.75722122, "learning_rate": 1.8340854859760277e-06, "loss": 0.77817637, "num_input_tokens_seen": 193648070, "step": 8992, "time_per_iteration": 2.7986576557159424 }, { "auxiliary_loss_clip": 0.01094937, "auxiliary_loss_mlp": 0.01042345, "balance_loss_clip": 1.03807545, "balance_loss_mlp": 1.02672255, "epoch": 0.5406884112430482, "flos": 14209493973120.0, "grad_norm": 2.5485108966117704, "language_loss": 0.76453286, "learning_rate": 1.8336973698758056e-06, "loss": 0.78590572, "num_input_tokens_seen": 193665060, "step": 8993, "time_per_iteration": 2.7208335399627686 }, { "auxiliary_loss_clip": 0.01106981, "auxiliary_loss_mlp": 0.01031441, "balance_loss_clip": 1.03966081, "balance_loss_mlp": 1.01783895, "epoch": 0.5407485344957162, "flos": 23875568536320.0, "grad_norm": 1.7082267966393392, "language_loss": 0.70645487, "learning_rate": 1.8333092600817959e-06, "loss": 0.72783911, "num_input_tokens_seen": 193683620, "step": 8994, "time_per_iteration": 2.724794626235962 }, { "auxiliary_loss_clip": 0.01107598, "auxiliary_loss_mlp": 0.01031331, "balance_loss_clip": 1.03957391, "balance_loss_mlp": 1.01583362, "epoch": 0.5408086577483842, "flos": 23148485435520.0, "grad_norm": 3.058822592256831, "language_loss": 0.75407541, "learning_rate": 1.8329211566087157e-06, "loss": 0.77546465, "num_input_tokens_seen": 193702990, "step": 8995, "time_per_iteration": 5.971833229064941 }, { "auxiliary_loss_clip": 0.0110732, "auxiliary_loss_mlp": 0.01036119, "balance_loss_clip": 1.04115582, "balance_loss_mlp": 1.02335215, "epoch": 0.5408687810010522, "flos": 18771046773120.0, "grad_norm": 1.7630879917097735, "language_loss": 0.73701608, "learning_rate": 1.832533059471282e-06, "loss": 0.75845045, "num_input_tokens_seen": 193721785, "step": 8996, "time_per_iteration": 4.209546327590942 }, { "auxiliary_loss_clip": 0.0107249, "auxiliary_loss_mlp": 0.0103344, "balance_loss_clip": 1.03679025, "balance_loss_mlp": 1.02018428, "epoch": 0.5409289042537201, "flos": 13881557779200.0, "grad_norm": 2.7958611639566557, "language_loss": 0.73200142, "learning_rate": 1.8321449686842115e-06, "loss": 0.75306082, "num_input_tokens_seen": 193740315, "step": 8997, "time_per_iteration": 2.6815428733825684 }, { "auxiliary_loss_clip": 0.0112099, "auxiliary_loss_mlp": 0.01036623, "balance_loss_clip": 1.04214144, "balance_loss_mlp": 1.02241874, "epoch": 0.5409890275063881, "flos": 14465357527680.0, "grad_norm": 2.1382567541010706, "language_loss": 0.71990108, "learning_rate": 1.8317568842622207e-06, "loss": 0.74147719, "num_input_tokens_seen": 193757580, "step": 8998, "time_per_iteration": 2.516322374343872 }, { "auxiliary_loss_clip": 0.01084198, "auxiliary_loss_mlp": 0.01038336, "balance_loss_clip": 1.03824925, "balance_loss_mlp": 1.02481771, "epoch": 0.541049150759056, "flos": 48977449349760.0, "grad_norm": 1.4737906597538892, "language_loss": 0.7077291, "learning_rate": 1.8313688062200256e-06, "loss": 0.72895443, "num_input_tokens_seen": 193780965, "step": 8999, "time_per_iteration": 4.582181215286255 }, { "auxiliary_loss_clip": 0.01092675, "auxiliary_loss_mlp": 0.01037736, "balance_loss_clip": 1.04016924, "balance_loss_mlp": 1.02372253, "epoch": 0.541109274011724, "flos": 18147601388160.0, "grad_norm": 2.7892757576067972, "language_loss": 0.80210066, "learning_rate": 1.8309807345723422e-06, "loss": 0.82340479, "num_input_tokens_seen": 193797855, "step": 9000, "time_per_iteration": 2.6335151195526123 }, { "auxiliary_loss_clip": 0.01069713, "auxiliary_loss_mlp": 0.01033155, "balance_loss_clip": 1.03577805, "balance_loss_mlp": 1.01837265, "epoch": 0.541169397264392, "flos": 20522553759360.0, "grad_norm": 1.6231589706551275, "language_loss": 0.73037231, "learning_rate": 1.8305926693338863e-06, "loss": 0.75140095, "num_input_tokens_seen": 193817375, "step": 9001, "time_per_iteration": 2.854574680328369 }, { "auxiliary_loss_clip": 0.01088976, "auxiliary_loss_mlp": 0.01037285, "balance_loss_clip": 1.03875196, "balance_loss_mlp": 1.0225749, "epoch": 0.54122952051706, "flos": 20044043752320.0, "grad_norm": 2.3946252475459704, "language_loss": 0.85775471, "learning_rate": 1.8302046105193734e-06, "loss": 0.87901723, "num_input_tokens_seen": 193832205, "step": 9002, "time_per_iteration": 2.83799409866333 }, { "auxiliary_loss_clip": 0.01071827, "auxiliary_loss_mlp": 0.01036651, "balance_loss_clip": 1.03876507, "balance_loss_mlp": 1.0244441, "epoch": 0.541289643769728, "flos": 19062246332160.0, "grad_norm": 1.9022782971983632, "language_loss": 0.78010678, "learning_rate": 1.8298165581435183e-06, "loss": 0.80119157, "num_input_tokens_seen": 193849830, "step": 9003, "time_per_iteration": 2.8771512508392334 }, { "auxiliary_loss_clip": 0.01105804, "auxiliary_loss_mlp": 0.01031034, "balance_loss_clip": 1.03998888, "balance_loss_mlp": 1.01659191, "epoch": 0.5413497670223959, "flos": 22382295402240.0, "grad_norm": 2.4815464780266905, "language_loss": 0.69489288, "learning_rate": 1.8294285122210372e-06, "loss": 0.71626127, "num_input_tokens_seen": 193869945, "step": 9004, "time_per_iteration": 2.7296600341796875 }, { "auxiliary_loss_clip": 0.01027886, "auxiliary_loss_mlp": 0.01000864, "balance_loss_clip": 1.01221299, "balance_loss_mlp": 0.99943334, "epoch": 0.5414098902750639, "flos": 70031734093440.0, "grad_norm": 0.9691738453098017, "language_loss": 0.59067202, "learning_rate": 1.8290404727666434e-06, "loss": 0.61095953, "num_input_tokens_seen": 193930860, "step": 9005, "time_per_iteration": 3.2482104301452637 }, { "auxiliary_loss_clip": 0.011229, "auxiliary_loss_mlp": 0.00771475, "balance_loss_clip": 1.04402518, "balance_loss_mlp": 1.00026715, "epoch": 0.5414700135277318, "flos": 21798962530560.0, "grad_norm": 3.1081571461352357, "language_loss": 0.78251934, "learning_rate": 1.8286524397950517e-06, "loss": 0.80146307, "num_input_tokens_seen": 193949075, "step": 9006, "time_per_iteration": 2.646697521209717 }, { "auxiliary_loss_clip": 0.01099607, "auxiliary_loss_mlp": 0.01035785, "balance_loss_clip": 1.04162097, "balance_loss_mlp": 1.02380466, "epoch": 0.5415301367803999, "flos": 16907929251840.0, "grad_norm": 2.04905315291525, "language_loss": 0.82968152, "learning_rate": 1.8282644133209777e-06, "loss": 0.85103542, "num_input_tokens_seen": 193967630, "step": 9007, "time_per_iteration": 2.6906566619873047 }, { "auxiliary_loss_clip": 0.01105367, "auxiliary_loss_mlp": 0.01035166, "balance_loss_clip": 1.04186976, "balance_loss_mlp": 1.02084875, "epoch": 0.5415902600330678, "flos": 25704176065920.0, "grad_norm": 2.002533361325265, "language_loss": 0.67188275, "learning_rate": 1.8278763933591334e-06, "loss": 0.69328809, "num_input_tokens_seen": 193988730, "step": 9008, "time_per_iteration": 2.6538190841674805 }, { "auxiliary_loss_clip": 0.01126211, "auxiliary_loss_mlp": 0.01033213, "balance_loss_clip": 1.04396832, "balance_loss_mlp": 1.01836514, "epoch": 0.5416503832857358, "flos": 19208151377280.0, "grad_norm": 1.9615897276879948, "language_loss": 0.73713046, "learning_rate": 1.827488379924234e-06, "loss": 0.75872469, "num_input_tokens_seen": 194005160, "step": 9009, "time_per_iteration": 2.5716910362243652 }, { "auxiliary_loss_clip": 0.01072637, "auxiliary_loss_mlp": 0.01036076, "balance_loss_clip": 1.04184818, "balance_loss_mlp": 1.02171135, "epoch": 0.5417105065384037, "flos": 12713706887040.0, "grad_norm": 2.1963503735452417, "language_loss": 0.87984347, "learning_rate": 1.8271003730309923e-06, "loss": 0.90093064, "num_input_tokens_seen": 194021700, "step": 9010, "time_per_iteration": 2.725271701812744 }, { "auxiliary_loss_clip": 0.01120446, "auxiliary_loss_mlp": 0.01037388, "balance_loss_clip": 1.04260874, "balance_loss_mlp": 1.02448332, "epoch": 0.5417706297910717, "flos": 30335933998080.0, "grad_norm": 1.8667479755469423, "language_loss": 0.65187848, "learning_rate": 1.826712372694122e-06, "loss": 0.67345679, "num_input_tokens_seen": 194042620, "step": 9011, "time_per_iteration": 2.6546692848205566 }, { "auxiliary_loss_clip": 0.01111756, "auxiliary_loss_mlp": 0.01036661, "balance_loss_clip": 1.04458547, "balance_loss_mlp": 1.02324426, "epoch": 0.5418307530437396, "flos": 29020992912000.0, "grad_norm": 2.8570982701345797, "language_loss": 0.79252279, "learning_rate": 1.8263243789283362e-06, "loss": 0.81400692, "num_input_tokens_seen": 194061800, "step": 9012, "time_per_iteration": 2.6907572746276855 }, { "auxiliary_loss_clip": 0.01119813, "auxiliary_loss_mlp": 0.01033195, "balance_loss_clip": 1.04184949, "balance_loss_mlp": 1.01965845, "epoch": 0.5418908762964076, "flos": 16873455173760.0, "grad_norm": 2.191987247231765, "language_loss": 0.74450612, "learning_rate": 1.8259363917483466e-06, "loss": 0.76603615, "num_input_tokens_seen": 194079890, "step": 9013, "time_per_iteration": 2.6294262409210205 }, { "auxiliary_loss_clip": 0.01085863, "auxiliary_loss_mlp": 0.01030984, "balance_loss_clip": 1.04200959, "balance_loss_mlp": 1.01776361, "epoch": 0.5419509995490756, "flos": 18949702043520.0, "grad_norm": 2.094538198423721, "language_loss": 0.72306025, "learning_rate": 1.8255484111688667e-06, "loss": 0.74422872, "num_input_tokens_seen": 194097625, "step": 9014, "time_per_iteration": 2.653125524520874 }, { "auxiliary_loss_clip": 0.01099897, "auxiliary_loss_mlp": 0.01032361, "balance_loss_clip": 1.04301429, "balance_loss_mlp": 1.01888418, "epoch": 0.5420111228017436, "flos": 18077719478400.0, "grad_norm": 1.5497382301526352, "language_loss": 0.807073, "learning_rate": 1.8251604372046085e-06, "loss": 0.82839555, "num_input_tokens_seen": 194116055, "step": 9015, "time_per_iteration": 2.6197831630706787 }, { "auxiliary_loss_clip": 0.01117394, "auxiliary_loss_mlp": 0.01039918, "balance_loss_clip": 1.04648256, "balance_loss_mlp": 1.02635145, "epoch": 0.5420712460544116, "flos": 19061779455360.0, "grad_norm": 2.4637362060141053, "language_loss": 0.81252277, "learning_rate": 1.8247724698702843e-06, "loss": 0.83409584, "num_input_tokens_seen": 194130365, "step": 9016, "time_per_iteration": 2.617722988128662 }, { "auxiliary_loss_clip": 0.01121755, "auxiliary_loss_mlp": 0.01030314, "balance_loss_clip": 1.04375124, "balance_loss_mlp": 1.01745152, "epoch": 0.5421313693070795, "flos": 18187103370240.0, "grad_norm": 1.6999373176246328, "language_loss": 0.81182349, "learning_rate": 1.8243845091806053e-06, "loss": 0.83334422, "num_input_tokens_seen": 194148975, "step": 9017, "time_per_iteration": 2.629488706588745 }, { "auxiliary_loss_clip": 0.01119384, "auxiliary_loss_mlp": 0.01035787, "balance_loss_clip": 1.04308951, "balance_loss_mlp": 1.02270925, "epoch": 0.5421914925597475, "flos": 13005947940480.0, "grad_norm": 1.767329743248484, "language_loss": 0.77847707, "learning_rate": 1.8239965551502837e-06, "loss": 0.80002874, "num_input_tokens_seen": 194167185, "step": 9018, "time_per_iteration": 2.595520257949829 }, { "auxiliary_loss_clip": 0.01121333, "auxiliary_loss_mlp": 0.010389, "balance_loss_clip": 1.04014397, "balance_loss_mlp": 1.02462447, "epoch": 0.5422516158124154, "flos": 46758457831680.0, "grad_norm": 1.6302803515957, "language_loss": 0.66417134, "learning_rate": 1.8236086077940303e-06, "loss": 0.68577361, "num_input_tokens_seen": 194192840, "step": 9019, "time_per_iteration": 2.8572912216186523 }, { "auxiliary_loss_clip": 0.01101197, "auxiliary_loss_mlp": 0.01036576, "balance_loss_clip": 1.03910589, "balance_loss_mlp": 1.02315295, "epoch": 0.5423117390650835, "flos": 31758642864000.0, "grad_norm": 1.5350920710342792, "language_loss": 0.69515598, "learning_rate": 1.8232206671265555e-06, "loss": 0.71653378, "num_input_tokens_seen": 194213150, "step": 9020, "time_per_iteration": 2.710081100463867 }, { "auxiliary_loss_clip": 0.01082322, "auxiliary_loss_mlp": 0.01037191, "balance_loss_clip": 1.03962088, "balance_loss_mlp": 1.02462053, "epoch": 0.5423718623177514, "flos": 27201974313600.0, "grad_norm": 1.5706670053852172, "language_loss": 0.80494618, "learning_rate": 1.8228327331625717e-06, "loss": 0.82614136, "num_input_tokens_seen": 194234665, "step": 9021, "time_per_iteration": 2.760133743286133 }, { "auxiliary_loss_clip": 0.01069543, "auxiliary_loss_mlp": 0.01037659, "balance_loss_clip": 1.03820395, "balance_loss_mlp": 1.02405667, "epoch": 0.5424319855704194, "flos": 23546447193600.0, "grad_norm": 2.2946341773433496, "language_loss": 0.78887641, "learning_rate": 1.822444805916788e-06, "loss": 0.80994844, "num_input_tokens_seen": 194253790, "step": 9022, "time_per_iteration": 2.8245437145233154 }, { "auxiliary_loss_clip": 0.01085662, "auxiliary_loss_mlp": 0.00771451, "balance_loss_clip": 1.03742123, "balance_loss_mlp": 1.00025558, "epoch": 0.5424921088230873, "flos": 26615624699520.0, "grad_norm": 1.6811700220554942, "language_loss": 0.8234387, "learning_rate": 1.822056885403915e-06, "loss": 0.84200984, "num_input_tokens_seen": 194274950, "step": 9023, "time_per_iteration": 2.722637891769409 }, { "auxiliary_loss_clip": 0.01105066, "auxiliary_loss_mlp": 0.01031053, "balance_loss_clip": 1.04266286, "balance_loss_mlp": 1.01798785, "epoch": 0.5425522320757553, "flos": 23586811102080.0, "grad_norm": 1.7285453701222258, "language_loss": 0.71582222, "learning_rate": 1.8216689716386627e-06, "loss": 0.73718333, "num_input_tokens_seen": 194296155, "step": 9024, "time_per_iteration": 2.6643166542053223 }, { "auxiliary_loss_clip": 0.01109023, "auxiliary_loss_mlp": 0.01034832, "balance_loss_clip": 1.03978658, "balance_loss_mlp": 1.02231479, "epoch": 0.5426123553284232, "flos": 30592264429440.0, "grad_norm": 1.7605396052132907, "language_loss": 0.65074313, "learning_rate": 1.8212810646357405e-06, "loss": 0.67218173, "num_input_tokens_seen": 194318025, "step": 9025, "time_per_iteration": 2.6963577270507812 }, { "auxiliary_loss_clip": 0.0109579, "auxiliary_loss_mlp": 0.00769932, "balance_loss_clip": 1.04664063, "balance_loss_mlp": 1.00038803, "epoch": 0.5426724785810912, "flos": 12495118671360.0, "grad_norm": 2.055737651503127, "language_loss": 0.73914909, "learning_rate": 1.8208931644098591e-06, "loss": 0.7578063, "num_input_tokens_seen": 194336150, "step": 9026, "time_per_iteration": 2.6317172050476074 }, { "auxiliary_loss_clip": 0.01095155, "auxiliary_loss_mlp": 0.01040442, "balance_loss_clip": 1.03804421, "balance_loss_mlp": 1.02545154, "epoch": 0.5427326018337592, "flos": 26064611089920.0, "grad_norm": 2.1949475938623224, "language_loss": 0.7840718, "learning_rate": 1.8205052709757265e-06, "loss": 0.80542773, "num_input_tokens_seen": 194355980, "step": 9027, "time_per_iteration": 2.6076927185058594 }, { "auxiliary_loss_clip": 0.01004652, "auxiliary_loss_mlp": 0.01011362, "balance_loss_clip": 1.00918782, "balance_loss_mlp": 1.00950241, "epoch": 0.5427927250864272, "flos": 65984745576960.0, "grad_norm": 0.759944437260396, "language_loss": 0.56566465, "learning_rate": 1.8201173843480515e-06, "loss": 0.58582479, "num_input_tokens_seen": 194422660, "step": 9028, "time_per_iteration": 3.173718214035034 }, { "auxiliary_loss_clip": 0.01078653, "auxiliary_loss_mlp": 0.01029607, "balance_loss_clip": 1.0437665, "balance_loss_mlp": 1.01519442, "epoch": 0.5428528483390952, "flos": 19975382904960.0, "grad_norm": 2.1789279213341857, "language_loss": 0.7763471, "learning_rate": 1.8197295045415442e-06, "loss": 0.79742968, "num_input_tokens_seen": 194438545, "step": 9029, "time_per_iteration": 2.6010968685150146 }, { "auxiliary_loss_clip": 0.01080602, "auxiliary_loss_mlp": 0.01029952, "balance_loss_clip": 1.0426538, "balance_loss_mlp": 1.01611137, "epoch": 0.5429129715917631, "flos": 21832323287040.0, "grad_norm": 1.5227150839007966, "language_loss": 0.8289423, "learning_rate": 1.8193416315709112e-06, "loss": 0.85004783, "num_input_tokens_seen": 194458060, "step": 9030, "time_per_iteration": 2.673872232437134 }, { "auxiliary_loss_clip": 0.01119103, "auxiliary_loss_mlp": 0.0103115, "balance_loss_clip": 1.04308653, "balance_loss_mlp": 1.01801896, "epoch": 0.5429730948444311, "flos": 27782685492480.0, "grad_norm": 1.5242093045096456, "language_loss": 0.74554878, "learning_rate": 1.8189537654508623e-06, "loss": 0.76705134, "num_input_tokens_seen": 194477405, "step": 9031, "time_per_iteration": 2.6361796855926514 }, { "auxiliary_loss_clip": 0.01099875, "auxiliary_loss_mlp": 0.01039492, "balance_loss_clip": 1.03957534, "balance_loss_mlp": 1.02664721, "epoch": 0.543033218097099, "flos": 26760452336640.0, "grad_norm": 1.8557133497087115, "language_loss": 0.85526693, "learning_rate": 1.8185659061961045e-06, "loss": 0.87666059, "num_input_tokens_seen": 194497085, "step": 9032, "time_per_iteration": 2.633051872253418 }, { "auxiliary_loss_clip": 0.01101785, "auxiliary_loss_mlp": 0.01037126, "balance_loss_clip": 1.04154074, "balance_loss_mlp": 1.02405477, "epoch": 0.5430933413497671, "flos": 22675254727680.0, "grad_norm": 1.789713495487195, "language_loss": 0.74318242, "learning_rate": 1.8181780538213457e-06, "loss": 0.76457155, "num_input_tokens_seen": 194516785, "step": 9033, "time_per_iteration": 2.654573917388916 }, { "auxiliary_loss_clip": 0.01080113, "auxiliary_loss_mlp": 0.01040958, "balance_loss_clip": 1.03826129, "balance_loss_mlp": 1.0267365, "epoch": 0.543153464602435, "flos": 24607499973120.0, "grad_norm": 1.5302152204895145, "language_loss": 0.75507742, "learning_rate": 1.8177902083412935e-06, "loss": 0.77628815, "num_input_tokens_seen": 194536475, "step": 9034, "time_per_iteration": 6.07684326171875 }, { "auxiliary_loss_clip": 0.01080457, "auxiliary_loss_mlp": 0.01035889, "balance_loss_clip": 1.04235947, "balance_loss_mlp": 1.02360463, "epoch": 0.543213587855103, "flos": 19025725178880.0, "grad_norm": 1.697596865274133, "language_loss": 0.84559906, "learning_rate": 1.817402369770655e-06, "loss": 0.86676252, "num_input_tokens_seen": 194554495, "step": 9035, "time_per_iteration": 4.246930122375488 }, { "auxiliary_loss_clip": 0.01010369, "auxiliary_loss_mlp": 0.01004655, "balance_loss_clip": 1.01446867, "balance_loss_mlp": 1.00328398, "epoch": 0.5432737111077709, "flos": 65686435125120.0, "grad_norm": 0.7105133860132232, "language_loss": 0.55900681, "learning_rate": 1.8170145381241364e-06, "loss": 0.57915699, "num_input_tokens_seen": 194617620, "step": 9036, "time_per_iteration": 3.214927911758423 }, { "auxiliary_loss_clip": 0.0106374, "auxiliary_loss_mlp": 0.01035958, "balance_loss_clip": 1.04064369, "balance_loss_mlp": 1.02285123, "epoch": 0.5433338343604389, "flos": 22091670460800.0, "grad_norm": 1.4967561616212492, "language_loss": 0.75198317, "learning_rate": 1.8166267134164451e-06, "loss": 0.77298009, "num_input_tokens_seen": 194637690, "step": 9037, "time_per_iteration": 2.815127372741699 }, { "auxiliary_loss_clip": 0.01089499, "auxiliary_loss_mlp": 0.01036314, "balance_loss_clip": 1.039361, "balance_loss_mlp": 1.02274799, "epoch": 0.5433939576131068, "flos": 34672649616000.0, "grad_norm": 1.6562121389813547, "language_loss": 0.66519392, "learning_rate": 1.8162388956622875e-06, "loss": 0.68645203, "num_input_tokens_seen": 194659520, "step": 9038, "time_per_iteration": 2.788142681121826 }, { "auxiliary_loss_clip": 0.01105433, "auxiliary_loss_mlp": 0.01036988, "balance_loss_clip": 1.03904057, "balance_loss_mlp": 1.02456677, "epoch": 0.5434540808657748, "flos": 20303355012480.0, "grad_norm": 1.9500381910938636, "language_loss": 0.7809025, "learning_rate": 1.8158510848763692e-06, "loss": 0.80232668, "num_input_tokens_seen": 194677645, "step": 9039, "time_per_iteration": 4.200030326843262 }, { "auxiliary_loss_clip": 0.01076379, "auxiliary_loss_mlp": 0.01038367, "balance_loss_clip": 1.03707099, "balance_loss_mlp": 1.02523017, "epoch": 0.5435142041184428, "flos": 23112790295040.0, "grad_norm": 1.9066978344822971, "language_loss": 0.76675421, "learning_rate": 1.8154632810733962e-06, "loss": 0.7879017, "num_input_tokens_seen": 194697400, "step": 9040, "time_per_iteration": 2.752359628677368 }, { "auxiliary_loss_clip": 0.01021921, "auxiliary_loss_mlp": 0.0101024, "balance_loss_clip": 1.01599014, "balance_loss_mlp": 1.00891709, "epoch": 0.5435743273711108, "flos": 64012746954240.0, "grad_norm": 0.6657326543890927, "language_loss": 0.52456856, "learning_rate": 1.815075484268074e-06, "loss": 0.54489017, "num_input_tokens_seen": 194761205, "step": 9041, "time_per_iteration": 3.19743275642395 }, { "auxiliary_loss_clip": 0.01092893, "auxiliary_loss_mlp": 0.01043232, "balance_loss_clip": 1.04014623, "balance_loss_mlp": 1.0300709, "epoch": 0.5436344506237788, "flos": 25118903859840.0, "grad_norm": 1.6935261425615555, "language_loss": 0.76397556, "learning_rate": 1.8146876944751078e-06, "loss": 0.78533685, "num_input_tokens_seen": 194782445, "step": 9042, "time_per_iteration": 2.7176172733306885 }, { "auxiliary_loss_clip": 0.01082719, "auxiliary_loss_mlp": 0.01030979, "balance_loss_clip": 1.04040313, "balance_loss_mlp": 1.01886773, "epoch": 0.5436945738764467, "flos": 19572967860480.0, "grad_norm": 1.7014237411229687, "language_loss": 0.67346215, "learning_rate": 1.8142999117092033e-06, "loss": 0.69459915, "num_input_tokens_seen": 194800325, "step": 9043, "time_per_iteration": 2.7166213989257812 }, { "auxiliary_loss_clip": 0.0107861, "auxiliary_loss_mlp": 0.01032764, "balance_loss_clip": 1.03779316, "balance_loss_mlp": 1.01971054, "epoch": 0.5437546971291147, "flos": 21142515525120.0, "grad_norm": 1.5921714365650326, "language_loss": 0.84577447, "learning_rate": 1.8139121359850644e-06, "loss": 0.86688828, "num_input_tokens_seen": 194818675, "step": 9044, "time_per_iteration": 2.758593797683716 }, { "auxiliary_loss_clip": 0.01123207, "auxiliary_loss_mlp": 0.01031023, "balance_loss_clip": 1.04196227, "balance_loss_mlp": 1.01723039, "epoch": 0.5438148203817826, "flos": 25118688378240.0, "grad_norm": 1.5431059852471993, "language_loss": 0.62074721, "learning_rate": 1.8135243673173956e-06, "loss": 0.64228952, "num_input_tokens_seen": 194836595, "step": 9045, "time_per_iteration": 2.6207923889160156 }, { "auxiliary_loss_clip": 0.0112166, "auxiliary_loss_mlp": 0.01035257, "balance_loss_clip": 1.04318917, "balance_loss_mlp": 1.02179205, "epoch": 0.5438749436344507, "flos": 23002939526400.0, "grad_norm": 1.4293832885602564, "language_loss": 0.70140386, "learning_rate": 1.8131366057209023e-06, "loss": 0.72297299, "num_input_tokens_seen": 194857520, "step": 9046, "time_per_iteration": 2.6262285709381104 }, { "auxiliary_loss_clip": 0.01117279, "auxiliary_loss_mlp": 0.01029233, "balance_loss_clip": 1.04171467, "balance_loss_mlp": 1.01709127, "epoch": 0.5439350668871186, "flos": 15487016065920.0, "grad_norm": 1.95554521575616, "language_loss": 0.7724129, "learning_rate": 1.8127488512102868e-06, "loss": 0.79387808, "num_input_tokens_seen": 194876020, "step": 9047, "time_per_iteration": 2.592041492462158 }, { "auxiliary_loss_clip": 0.01094716, "auxiliary_loss_mlp": 0.01047772, "balance_loss_clip": 1.04039311, "balance_loss_mlp": 1.03321636, "epoch": 0.5439951901397866, "flos": 17238415311360.0, "grad_norm": 1.5854248061222735, "language_loss": 0.7262761, "learning_rate": 1.8123611038002547e-06, "loss": 0.74770093, "num_input_tokens_seen": 194894650, "step": 9048, "time_per_iteration": 2.667393684387207 }, { "auxiliary_loss_clip": 0.01069346, "auxiliary_loss_mlp": 0.01045305, "balance_loss_clip": 1.03664947, "balance_loss_mlp": 1.0298202, "epoch": 0.5440553133924545, "flos": 18661016436480.0, "grad_norm": 1.9805900660696516, "language_loss": 0.93650311, "learning_rate": 1.8119733635055076e-06, "loss": 0.95764971, "num_input_tokens_seen": 194911935, "step": 9049, "time_per_iteration": 2.7119088172912598 }, { "auxiliary_loss_clip": 0.0110651, "auxiliary_loss_mlp": 0.01032835, "balance_loss_clip": 1.03992295, "balance_loss_mlp": 1.02054429, "epoch": 0.5441154366451225, "flos": 27122934435840.0, "grad_norm": 1.7800719649484351, "language_loss": 0.73936987, "learning_rate": 1.8115856303407492e-06, "loss": 0.76076329, "num_input_tokens_seen": 194931620, "step": 9050, "time_per_iteration": 2.631661891937256 }, { "auxiliary_loss_clip": 0.01111441, "auxiliary_loss_mlp": 0.01030882, "balance_loss_clip": 1.0437777, "balance_loss_mlp": 1.01755428, "epoch": 0.5441755598977904, "flos": 25993867253760.0, "grad_norm": 1.737903905046117, "language_loss": 0.66990525, "learning_rate": 1.8111979043206832e-06, "loss": 0.69132841, "num_input_tokens_seen": 194952560, "step": 9051, "time_per_iteration": 2.648484230041504 }, { "auxiliary_loss_clip": 0.01080337, "auxiliary_loss_mlp": 0.01033354, "balance_loss_clip": 1.03722811, "balance_loss_mlp": 1.02039015, "epoch": 0.5442356831504584, "flos": 32380041173760.0, "grad_norm": 2.245844605247971, "language_loss": 0.67334735, "learning_rate": 1.810810185460011e-06, "loss": 0.69448429, "num_input_tokens_seen": 194973915, "step": 9052, "time_per_iteration": 2.778211832046509 }, { "auxiliary_loss_clip": 0.01121064, "auxiliary_loss_mlp": 0.01033478, "balance_loss_clip": 1.04266417, "balance_loss_mlp": 1.02010286, "epoch": 0.5442958064031264, "flos": 24164290056960.0, "grad_norm": 1.8200748140762566, "language_loss": 0.92835879, "learning_rate": 1.810422473773436e-06, "loss": 0.9499042, "num_input_tokens_seen": 194990170, "step": 9053, "time_per_iteration": 2.6110095977783203 }, { "auxiliary_loss_clip": 0.01093907, "auxiliary_loss_mlp": 0.01034949, "balance_loss_clip": 1.04024363, "balance_loss_mlp": 1.02203834, "epoch": 0.5443559296557944, "flos": 18764690065920.0, "grad_norm": 2.3950140888374687, "language_loss": 0.83948398, "learning_rate": 1.8100347692756595e-06, "loss": 0.86077261, "num_input_tokens_seen": 195006395, "step": 9054, "time_per_iteration": 2.6261367797851562 }, { "auxiliary_loss_clip": 0.01090647, "auxiliary_loss_mlp": 0.01034581, "balance_loss_clip": 1.03965771, "balance_loss_mlp": 1.02094352, "epoch": 0.5444160529084624, "flos": 22632556435200.0, "grad_norm": 2.6065175825707327, "language_loss": 0.68213475, "learning_rate": 1.8096470719813836e-06, "loss": 0.70338708, "num_input_tokens_seen": 195025080, "step": 9055, "time_per_iteration": 2.623518705368042 }, { "auxiliary_loss_clip": 0.01000083, "auxiliary_loss_mlp": 0.00999074, "balance_loss_clip": 1.01110244, "balance_loss_mlp": 0.99770337, "epoch": 0.5444761761611303, "flos": 69671909600640.0, "grad_norm": 0.7426728731430834, "language_loss": 0.57650024, "learning_rate": 1.80925938190531e-06, "loss": 0.59649181, "num_input_tokens_seen": 195085725, "step": 9056, "time_per_iteration": 3.2228453159332275 }, { "auxiliary_loss_clip": 0.01087409, "auxiliary_loss_mlp": 0.01036027, "balance_loss_clip": 1.04208684, "balance_loss_mlp": 1.02234185, "epoch": 0.5445362994137983, "flos": 14278442129280.0, "grad_norm": 1.75653480561415, "language_loss": 0.69749284, "learning_rate": 1.8088716990621395e-06, "loss": 0.71872711, "num_input_tokens_seen": 195102585, "step": 9057, "time_per_iteration": 2.7110843658447266 }, { "auxiliary_loss_clip": 0.01106044, "auxiliary_loss_mlp": 0.01038419, "balance_loss_clip": 1.04014075, "balance_loss_mlp": 1.02472818, "epoch": 0.5445964226664662, "flos": 28986195611520.0, "grad_norm": 2.0738816921888366, "language_loss": 0.75373238, "learning_rate": 1.8084840234665738e-06, "loss": 0.775177, "num_input_tokens_seen": 195120055, "step": 9058, "time_per_iteration": 2.7001023292541504 }, { "auxiliary_loss_clip": 0.01003793, "auxiliary_loss_mlp": 0.01003874, "balance_loss_clip": 1.01181531, "balance_loss_mlp": 1.00230026, "epoch": 0.5446565459191343, "flos": 68620230270720.0, "grad_norm": 0.7925901763726337, "language_loss": 0.6261481, "learning_rate": 1.808096355133312e-06, "loss": 0.6462248, "num_input_tokens_seen": 195181045, "step": 9059, "time_per_iteration": 3.355748414993286 }, { "auxiliary_loss_clip": 0.01107073, "auxiliary_loss_mlp": 0.0103287, "balance_loss_clip": 1.0414511, "balance_loss_mlp": 1.01922059, "epoch": 0.5447166691718022, "flos": 16216469464320.0, "grad_norm": 1.790354282478879, "language_loss": 0.79365647, "learning_rate": 1.8077086940770572e-06, "loss": 0.81505585, "num_input_tokens_seen": 195198840, "step": 9060, "time_per_iteration": 2.6523141860961914 }, { "auxiliary_loss_clip": 0.01111799, "auxiliary_loss_mlp": 0.01033132, "balance_loss_clip": 1.04219317, "balance_loss_mlp": 1.01976824, "epoch": 0.5447767924244702, "flos": 25849039616640.0, "grad_norm": 1.7487339019361072, "language_loss": 0.8006283, "learning_rate": 1.8073210403125072e-06, "loss": 0.82207763, "num_input_tokens_seen": 195218720, "step": 9061, "time_per_iteration": 2.660477876663208 }, { "auxiliary_loss_clip": 0.01107514, "auxiliary_loss_mlp": 0.01028575, "balance_loss_clip": 1.04152489, "balance_loss_mlp": 1.01595628, "epoch": 0.5448369156771381, "flos": 19677718897920.0, "grad_norm": 1.667542325640746, "language_loss": 0.8699556, "learning_rate": 1.8069333938543627e-06, "loss": 0.89131653, "num_input_tokens_seen": 195235770, "step": 9062, "time_per_iteration": 2.6527698040008545 }, { "auxiliary_loss_clip": 0.0109274, "auxiliary_loss_mlp": 0.01037371, "balance_loss_clip": 1.03916395, "balance_loss_mlp": 1.02188551, "epoch": 0.5448970389298061, "flos": 19281804215040.0, "grad_norm": 1.6766222611874342, "language_loss": 0.82069784, "learning_rate": 1.8065457547173233e-06, "loss": 0.84199893, "num_input_tokens_seen": 195254870, "step": 9063, "time_per_iteration": 2.651977062225342 }, { "auxiliary_loss_clip": 0.01118028, "auxiliary_loss_mlp": 0.01032916, "balance_loss_clip": 1.0406127, "balance_loss_mlp": 1.01958823, "epoch": 0.544957162182474, "flos": 20991690316800.0, "grad_norm": 1.769153488212037, "language_loss": 0.63484013, "learning_rate": 1.8061581229160878e-06, "loss": 0.65634954, "num_input_tokens_seen": 195273390, "step": 9064, "time_per_iteration": 2.595914602279663 }, { "auxiliary_loss_clip": 0.0112242, "auxiliary_loss_mlp": 0.01037085, "balance_loss_clip": 1.04264021, "balance_loss_mlp": 1.02337003, "epoch": 0.545017285435142, "flos": 25374587846400.0, "grad_norm": 1.6143269954810184, "language_loss": 0.79795569, "learning_rate": 1.8057704984653566e-06, "loss": 0.81955075, "num_input_tokens_seen": 195295635, "step": 9065, "time_per_iteration": 2.647632360458374 }, { "auxiliary_loss_clip": 0.01082455, "auxiliary_loss_mlp": 0.01032837, "balance_loss_clip": 1.04022825, "balance_loss_mlp": 1.0211482, "epoch": 0.54507740868781, "flos": 19134749934720.0, "grad_norm": 2.1024584454927626, "language_loss": 0.77589709, "learning_rate": 1.805382881379827e-06, "loss": 0.79705, "num_input_tokens_seen": 195312545, "step": 9066, "time_per_iteration": 2.750904083251953 }, { "auxiliary_loss_clip": 0.01106868, "auxiliary_loss_mlp": 0.0103139, "balance_loss_clip": 1.04005289, "balance_loss_mlp": 1.01794958, "epoch": 0.545137531940478, "flos": 26249802635520.0, "grad_norm": 2.0527073359497665, "language_loss": 0.75859725, "learning_rate": 1.8049952716741975e-06, "loss": 0.77997983, "num_input_tokens_seen": 195332955, "step": 9067, "time_per_iteration": 2.68332839012146 }, { "auxiliary_loss_clip": 0.0108798, "auxiliary_loss_mlp": 0.01038091, "balance_loss_clip": 1.04256892, "balance_loss_mlp": 1.02183652, "epoch": 0.545197655193146, "flos": 37555629995520.0, "grad_norm": 6.876378009840058, "language_loss": 0.63596183, "learning_rate": 1.8046076693631682e-06, "loss": 0.65722257, "num_input_tokens_seen": 195355930, "step": 9068, "time_per_iteration": 2.893052816390991 }, { "auxiliary_loss_clip": 0.01080095, "auxiliary_loss_mlp": 0.01041608, "balance_loss_clip": 1.0446372, "balance_loss_mlp": 1.02935874, "epoch": 0.5452577784458139, "flos": 26031250333440.0, "grad_norm": 1.5002235169223528, "language_loss": 0.7186054, "learning_rate": 1.8042200744614343e-06, "loss": 0.73982239, "num_input_tokens_seen": 195376445, "step": 9069, "time_per_iteration": 2.7437844276428223 }, { "auxiliary_loss_clip": 0.01118098, "auxiliary_loss_mlp": 0.01028881, "balance_loss_clip": 1.04397726, "balance_loss_mlp": 1.0169543, "epoch": 0.5453179016984819, "flos": 17639034675840.0, "grad_norm": 1.9248359915141238, "language_loss": 0.73836279, "learning_rate": 1.8038324869836957e-06, "loss": 0.75983256, "num_input_tokens_seen": 195393725, "step": 9070, "time_per_iteration": 2.629026174545288 }, { "auxiliary_loss_clip": 0.01104842, "auxiliary_loss_mlp": 0.01038375, "balance_loss_clip": 1.0405302, "balance_loss_mlp": 1.02508879, "epoch": 0.5453780249511498, "flos": 23216679406080.0, "grad_norm": 2.895965777257026, "language_loss": 0.60386193, "learning_rate": 1.8034449069446489e-06, "loss": 0.62529415, "num_input_tokens_seen": 195411380, "step": 9071, "time_per_iteration": 2.787898540496826 }, { "auxiliary_loss_clip": 0.0103628, "auxiliary_loss_mlp": 0.01019994, "balance_loss_clip": 1.01031959, "balance_loss_mlp": 1.01858091, "epoch": 0.5454381482038179, "flos": 68696504801280.0, "grad_norm": 0.701915733274622, "language_loss": 0.57096583, "learning_rate": 1.80305733435899e-06, "loss": 0.59152853, "num_input_tokens_seen": 195482015, "step": 9072, "time_per_iteration": 3.3096070289611816 }, { "auxiliary_loss_clip": 0.01088718, "auxiliary_loss_mlp": 0.0104092, "balance_loss_clip": 1.03829658, "balance_loss_mlp": 1.02696621, "epoch": 0.5454982714564858, "flos": 13260626346240.0, "grad_norm": 1.6985686628313852, "language_loss": 0.6941787, "learning_rate": 1.8026697692414174e-06, "loss": 0.71547508, "num_input_tokens_seen": 195500440, "step": 9073, "time_per_iteration": 5.942334413528442 }, { "auxiliary_loss_clip": 0.01094077, "auxiliary_loss_mlp": 0.01042156, "balance_loss_clip": 1.03799677, "balance_loss_mlp": 1.02981734, "epoch": 0.5455583947091538, "flos": 21835878733440.0, "grad_norm": 1.7477774368009211, "language_loss": 0.7124452, "learning_rate": 1.802282211606627e-06, "loss": 0.73380756, "num_input_tokens_seen": 195520860, "step": 9074, "time_per_iteration": 2.6760778427124023 }, { "auxiliary_loss_clip": 0.0110625, "auxiliary_loss_mlp": 0.01038683, "balance_loss_clip": 1.04050887, "balance_loss_mlp": 1.02611828, "epoch": 0.5456185179618217, "flos": 17817438551040.0, "grad_norm": 1.854490114521215, "language_loss": 0.68543398, "learning_rate": 1.8018946614693148e-06, "loss": 0.70688331, "num_input_tokens_seen": 195538615, "step": 9075, "time_per_iteration": 4.19740891456604 }, { "auxiliary_loss_clip": 0.01109026, "auxiliary_loss_mlp": 0.01034737, "balance_loss_clip": 1.04411292, "balance_loss_mlp": 1.02303696, "epoch": 0.5456786412144897, "flos": 21069401391360.0, "grad_norm": 1.8542702472429493, "language_loss": 0.80530715, "learning_rate": 1.8015071188441768e-06, "loss": 0.82674479, "num_input_tokens_seen": 195557460, "step": 9076, "time_per_iteration": 2.6821329593658447 }, { "auxiliary_loss_clip": 0.01109363, "auxiliary_loss_mlp": 0.01032135, "balance_loss_clip": 1.04109383, "balance_loss_mlp": 1.01970196, "epoch": 0.5457387644671576, "flos": 23294965098240.0, "grad_norm": 1.6176910715306643, "language_loss": 0.80137533, "learning_rate": 1.8011195837459089e-06, "loss": 0.82279032, "num_input_tokens_seen": 195577985, "step": 9077, "time_per_iteration": 2.6378607749938965 }, { "auxiliary_loss_clip": 0.01103737, "auxiliary_loss_mlp": 0.01035436, "balance_loss_clip": 1.04032636, "balance_loss_mlp": 1.02293682, "epoch": 0.5457988877198257, "flos": 21617039122560.0, "grad_norm": 2.2183628478116346, "language_loss": 0.67997038, "learning_rate": 1.8007320561892064e-06, "loss": 0.70136213, "num_input_tokens_seen": 195597620, "step": 9078, "time_per_iteration": 4.261017560958862 }, { "auxiliary_loss_clip": 0.01114465, "auxiliary_loss_mlp": 0.01039359, "balance_loss_clip": 1.04379976, "balance_loss_mlp": 1.02579284, "epoch": 0.5458590109724936, "flos": 23762485543680.0, "grad_norm": 1.8448340723101526, "language_loss": 0.80507636, "learning_rate": 1.800344536188764e-06, "loss": 0.82661462, "num_input_tokens_seen": 195615910, "step": 9079, "time_per_iteration": 2.6384685039520264 }, { "auxiliary_loss_clip": 0.01124513, "auxiliary_loss_mlp": 0.01034947, "balance_loss_clip": 1.04221058, "balance_loss_mlp": 1.02032018, "epoch": 0.5459191342251616, "flos": 24424283675520.0, "grad_norm": 1.6928746227882223, "language_loss": 0.75848919, "learning_rate": 1.799957023759277e-06, "loss": 0.78008378, "num_input_tokens_seen": 195635620, "step": 9080, "time_per_iteration": 2.6506381034851074 }, { "auxiliary_loss_clip": 0.01080273, "auxiliary_loss_mlp": 0.01037485, "balance_loss_clip": 1.03795743, "balance_loss_mlp": 1.0230484, "epoch": 0.5459792574778296, "flos": 23623009032960.0, "grad_norm": 2.0769433103494737, "language_loss": 0.83164978, "learning_rate": 1.7995695189154392e-06, "loss": 0.85282731, "num_input_tokens_seen": 195652495, "step": 9081, "time_per_iteration": 2.705381393432617 }, { "auxiliary_loss_clip": 0.0112596, "auxiliary_loss_mlp": 0.0103236, "balance_loss_clip": 1.04470921, "balance_loss_mlp": 1.01884151, "epoch": 0.5460393807304975, "flos": 19135540033920.0, "grad_norm": 1.688461125774873, "language_loss": 0.70063365, "learning_rate": 1.7991820216719461e-06, "loss": 0.72221684, "num_input_tokens_seen": 195671965, "step": 9082, "time_per_iteration": 2.6176023483276367 }, { "auxiliary_loss_clip": 0.01115168, "auxiliary_loss_mlp": 0.01030163, "balance_loss_clip": 1.03972983, "balance_loss_mlp": 1.01709151, "epoch": 0.5460995039831655, "flos": 35918534805120.0, "grad_norm": 1.559424348169526, "language_loss": 0.66653717, "learning_rate": 1.7987945320434906e-06, "loss": 0.68799043, "num_input_tokens_seen": 195694725, "step": 9083, "time_per_iteration": 2.710636854171753 }, { "auxiliary_loss_clip": 0.01091037, "auxiliary_loss_mlp": 0.01033037, "balance_loss_clip": 1.03879106, "balance_loss_mlp": 1.01998401, "epoch": 0.5461596272358334, "flos": 26759231274240.0, "grad_norm": 1.7271294710436846, "language_loss": 0.78584135, "learning_rate": 1.798407050044766e-06, "loss": 0.80708218, "num_input_tokens_seen": 195714090, "step": 9084, "time_per_iteration": 2.6876227855682373 }, { "auxiliary_loss_clip": 0.01111571, "auxiliary_loss_mlp": 0.01037411, "balance_loss_clip": 1.042117, "balance_loss_mlp": 1.02412558, "epoch": 0.5462197504885015, "flos": 20886580143360.0, "grad_norm": 2.0534049917888852, "language_loss": 0.75331509, "learning_rate": 1.7980195756904675e-06, "loss": 0.77480489, "num_input_tokens_seen": 195733585, "step": 9085, "time_per_iteration": 2.710315704345703 }, { "auxiliary_loss_clip": 0.01098293, "auxiliary_loss_mlp": 0.01035765, "balance_loss_clip": 1.0397166, "balance_loss_mlp": 1.02216959, "epoch": 0.5462798737411694, "flos": 25804976607360.0, "grad_norm": 2.0038443585531174, "language_loss": 0.75082123, "learning_rate": 1.7976321089952857e-06, "loss": 0.7721619, "num_input_tokens_seen": 195752820, "step": 9086, "time_per_iteration": 2.7101428508758545 }, { "auxiliary_loss_clip": 0.01102837, "auxiliary_loss_mlp": 0.01035671, "balance_loss_clip": 1.03951812, "balance_loss_mlp": 1.02227759, "epoch": 0.5463399969938374, "flos": 25775027642880.0, "grad_norm": 1.6829711206227542, "language_loss": 0.77097058, "learning_rate": 1.7972446499739155e-06, "loss": 0.79235566, "num_input_tokens_seen": 195773740, "step": 9087, "time_per_iteration": 2.6439003944396973 }, { "auxiliary_loss_clip": 0.01114018, "auxiliary_loss_mlp": 0.01042361, "balance_loss_clip": 1.04376245, "balance_loss_mlp": 1.02707863, "epoch": 0.5464001202465053, "flos": 18843298980480.0, "grad_norm": 1.9617582228039958, "language_loss": 0.77464199, "learning_rate": 1.7968571986410484e-06, "loss": 0.79620576, "num_input_tokens_seen": 195792125, "step": 9088, "time_per_iteration": 2.62850022315979 }, { "auxiliary_loss_clip": 0.00993547, "auxiliary_loss_mlp": 0.00999929, "balance_loss_clip": 1.02517176, "balance_loss_mlp": 0.99852258, "epoch": 0.5464602434991733, "flos": 69049541623680.0, "grad_norm": 0.7268281858475805, "language_loss": 0.57717931, "learning_rate": 1.7964697550113758e-06, "loss": 0.59711409, "num_input_tokens_seen": 195854935, "step": 9089, "time_per_iteration": 3.532050371170044 }, { "auxiliary_loss_clip": 0.01085451, "auxiliary_loss_mlp": 0.01038489, "balance_loss_clip": 1.03805399, "balance_loss_mlp": 1.02422571, "epoch": 0.5465203667518412, "flos": 27560039040000.0, "grad_norm": 1.7593878993297172, "language_loss": 0.76682436, "learning_rate": 1.7960823190995918e-06, "loss": 0.78806376, "num_input_tokens_seen": 195874715, "step": 9090, "time_per_iteration": 3.0779287815093994 }, { "auxiliary_loss_clip": 0.01106384, "auxiliary_loss_mlp": 0.01039408, "balance_loss_clip": 1.03928399, "balance_loss_mlp": 1.0233984, "epoch": 0.5465804900045093, "flos": 21210206705280.0, "grad_norm": 1.8843979676244431, "language_loss": 0.74037111, "learning_rate": 1.7956948909203855e-06, "loss": 0.76182902, "num_input_tokens_seen": 195892610, "step": 9091, "time_per_iteration": 2.6843886375427246 }, { "auxiliary_loss_clip": 0.01103772, "auxiliary_loss_mlp": 0.01037785, "balance_loss_clip": 1.04514658, "balance_loss_mlp": 1.02397454, "epoch": 0.5466406132571772, "flos": 22488949860480.0, "grad_norm": 1.8168674877061988, "language_loss": 0.78466463, "learning_rate": 1.7953074704884498e-06, "loss": 0.80608022, "num_input_tokens_seen": 195911085, "step": 9092, "time_per_iteration": 2.6951024532318115 }, { "auxiliary_loss_clip": 0.01125215, "auxiliary_loss_mlp": 0.01034303, "balance_loss_clip": 1.04363537, "balance_loss_mlp": 1.01997435, "epoch": 0.5467007365098452, "flos": 17675843137920.0, "grad_norm": 2.188123152779193, "language_loss": 0.74691254, "learning_rate": 1.794920057818476e-06, "loss": 0.76850772, "num_input_tokens_seen": 195929845, "step": 9093, "time_per_iteration": 2.596165657043457 }, { "auxiliary_loss_clip": 0.01112494, "auxiliary_loss_mlp": 0.01040653, "balance_loss_clip": 1.04044032, "balance_loss_mlp": 1.02444029, "epoch": 0.5467608597625132, "flos": 15698852524800.0, "grad_norm": 2.4498750676664414, "language_loss": 0.6874221, "learning_rate": 1.7945326529251533e-06, "loss": 0.70895356, "num_input_tokens_seen": 195946350, "step": 9094, "time_per_iteration": 2.617203712463379 }, { "auxiliary_loss_clip": 0.01100239, "auxiliary_loss_mlp": 0.0103544, "balance_loss_clip": 1.04255402, "balance_loss_mlp": 1.02238083, "epoch": 0.5468209830151811, "flos": 24312816794880.0, "grad_norm": 3.189829826251606, "language_loss": 0.67888498, "learning_rate": 1.7941452558231731e-06, "loss": 0.70024174, "num_input_tokens_seen": 195959840, "step": 9095, "time_per_iteration": 2.709214687347412 }, { "auxiliary_loss_clip": 0.01085979, "auxiliary_loss_mlp": 0.01036228, "balance_loss_clip": 1.0412364, "balance_loss_mlp": 1.0228703, "epoch": 0.5468811062678491, "flos": 29166323339520.0, "grad_norm": 1.772487886139895, "language_loss": 0.66687673, "learning_rate": 1.7937578665272256e-06, "loss": 0.68809879, "num_input_tokens_seen": 195981125, "step": 9096, "time_per_iteration": 2.768289804458618 }, { "auxiliary_loss_clip": 0.01013718, "auxiliary_loss_mlp": 0.01003083, "balance_loss_clip": 1.01639581, "balance_loss_mlp": 1.00179529, "epoch": 0.546941229520517, "flos": 67867037982720.0, "grad_norm": 0.7380745619271847, "language_loss": 0.57528484, "learning_rate": 1.7933704850520007e-06, "loss": 0.59545285, "num_input_tokens_seen": 196038880, "step": 9097, "time_per_iteration": 3.353034496307373 }, { "auxiliary_loss_clip": 0.01023908, "auxiliary_loss_mlp": 0.00999165, "balance_loss_clip": 1.01245689, "balance_loss_mlp": 0.99754351, "epoch": 0.5470013527731851, "flos": 58270306625280.0, "grad_norm": 0.9199423088856966, "language_loss": 0.64710629, "learning_rate": 1.7929831114121868e-06, "loss": 0.66733694, "num_input_tokens_seen": 196099215, "step": 9098, "time_per_iteration": 3.1356828212738037 }, { "auxiliary_loss_clip": 0.01114825, "auxiliary_loss_mlp": 0.01037808, "balance_loss_clip": 1.04415989, "balance_loss_mlp": 1.02378869, "epoch": 0.547061476025853, "flos": 22965915582720.0, "grad_norm": 2.132166365058938, "language_loss": 0.73123235, "learning_rate": 1.7925957456224753e-06, "loss": 0.75275862, "num_input_tokens_seen": 196120370, "step": 9099, "time_per_iteration": 2.662252426147461 }, { "auxiliary_loss_clip": 0.01097751, "auxiliary_loss_mlp": 0.01035706, "balance_loss_clip": 1.04278708, "balance_loss_mlp": 1.02327275, "epoch": 0.547121599278521, "flos": 29968244426880.0, "grad_norm": 1.880355780986747, "language_loss": 0.72515011, "learning_rate": 1.7922083876975537e-06, "loss": 0.74648476, "num_input_tokens_seen": 196139075, "step": 9100, "time_per_iteration": 2.859636068344116 }, { "auxiliary_loss_clip": 0.01106059, "auxiliary_loss_mlp": 0.00770753, "balance_loss_clip": 1.04162157, "balance_loss_mlp": 1.00017691, "epoch": 0.5471817225311889, "flos": 36535443914880.0, "grad_norm": 1.8314110929237357, "language_loss": 0.68211091, "learning_rate": 1.7918210376521102e-06, "loss": 0.70087898, "num_input_tokens_seen": 196159990, "step": 9101, "time_per_iteration": 2.747811794281006 }, { "auxiliary_loss_clip": 0.01123228, "auxiliary_loss_mlp": 0.01034884, "balance_loss_clip": 1.04393971, "balance_loss_mlp": 1.02121687, "epoch": 0.5472418457838569, "flos": 25775243124480.0, "grad_norm": 1.907951209204745, "language_loss": 0.77796781, "learning_rate": 1.7914336955008343e-06, "loss": 0.79954892, "num_input_tokens_seen": 196180570, "step": 9102, "time_per_iteration": 2.6425788402557373 }, { "auxiliary_loss_clip": 0.01087581, "auxiliary_loss_mlp": 0.01039397, "balance_loss_clip": 1.04114008, "balance_loss_mlp": 1.02447212, "epoch": 0.5473019690365248, "flos": 27887687925120.0, "grad_norm": 1.553646996990172, "language_loss": 0.72080058, "learning_rate": 1.791046361258413e-06, "loss": 0.74207032, "num_input_tokens_seen": 196200300, "step": 9103, "time_per_iteration": 2.7307486534118652 }, { "auxiliary_loss_clip": 0.01088884, "auxiliary_loss_mlp": 0.01031551, "balance_loss_clip": 1.0425241, "balance_loss_mlp": 1.01806211, "epoch": 0.5473620922891929, "flos": 57631490219520.0, "grad_norm": 1.4283303897304696, "language_loss": 0.65195155, "learning_rate": 1.7906590349395356e-06, "loss": 0.67315584, "num_input_tokens_seen": 196228525, "step": 9104, "time_per_iteration": 3.0792930126190186 }, { "auxiliary_loss_clip": 0.01109949, "auxiliary_loss_mlp": 0.0103298, "balance_loss_clip": 1.04480743, "balance_loss_mlp": 1.01883578, "epoch": 0.5474222155418608, "flos": 19354056422400.0, "grad_norm": 1.90483998435302, "language_loss": 0.82428771, "learning_rate": 1.790271716558888e-06, "loss": 0.84571701, "num_input_tokens_seen": 196247690, "step": 9105, "time_per_iteration": 3.3235061168670654 }, { "auxiliary_loss_clip": 0.01119165, "auxiliary_loss_mlp": 0.01030088, "balance_loss_clip": 1.04210079, "balance_loss_mlp": 1.01735604, "epoch": 0.5474823387945288, "flos": 25120448144640.0, "grad_norm": 1.6592382133296117, "language_loss": 0.80052161, "learning_rate": 1.7898844061311575e-06, "loss": 0.82201409, "num_input_tokens_seen": 196268555, "step": 9106, "time_per_iteration": 2.7082676887512207 }, { "auxiliary_loss_clip": 0.01115376, "auxiliary_loss_mlp": 0.01036861, "balance_loss_clip": 1.04689944, "balance_loss_mlp": 1.02419519, "epoch": 0.5475424620471967, "flos": 18004174381440.0, "grad_norm": 1.7933883779040884, "language_loss": 0.69402343, "learning_rate": 1.7894971036710322e-06, "loss": 0.71554577, "num_input_tokens_seen": 196285585, "step": 9107, "time_per_iteration": 2.626214027404785 }, { "auxiliary_loss_clip": 0.01115289, "auxiliary_loss_mlp": 0.01035057, "balance_loss_clip": 1.04319263, "balance_loss_mlp": 1.02166939, "epoch": 0.5476025852998647, "flos": 22309324922880.0, "grad_norm": 2.6929722220667824, "language_loss": 0.63537276, "learning_rate": 1.789109809193197e-06, "loss": 0.65687621, "num_input_tokens_seen": 196305085, "step": 9108, "time_per_iteration": 2.6056766510009766 }, { "auxiliary_loss_clip": 0.01122102, "auxiliary_loss_mlp": 0.01029913, "balance_loss_clip": 1.0446291, "balance_loss_mlp": 1.01750922, "epoch": 0.5476627085525327, "flos": 20120497850880.0, "grad_norm": 1.7311986454715018, "language_loss": 0.75234431, "learning_rate": 1.7887225227123396e-06, "loss": 0.77386445, "num_input_tokens_seen": 196323945, "step": 9109, "time_per_iteration": 2.562833786010742 }, { "auxiliary_loss_clip": 0.01093609, "auxiliary_loss_mlp": 0.01035669, "balance_loss_clip": 1.04307365, "balance_loss_mlp": 1.02143562, "epoch": 0.5477228318052006, "flos": 17712579772800.0, "grad_norm": 1.7887859684809904, "language_loss": 0.77939326, "learning_rate": 1.7883352442431457e-06, "loss": 0.800686, "num_input_tokens_seen": 196342200, "step": 9110, "time_per_iteration": 2.62839674949646 }, { "auxiliary_loss_clip": 0.01106302, "auxiliary_loss_mlp": 0.01032426, "balance_loss_clip": 1.04262304, "balance_loss_mlp": 1.01997423, "epoch": 0.5477829550578687, "flos": 25848895962240.0, "grad_norm": 1.525983194059855, "language_loss": 0.71175343, "learning_rate": 1.7879479738002993e-06, "loss": 0.73314071, "num_input_tokens_seen": 196362940, "step": 9111, "time_per_iteration": 2.664486885070801 }, { "auxiliary_loss_clip": 0.01111586, "auxiliary_loss_mlp": 0.01044961, "balance_loss_clip": 1.0436976, "balance_loss_mlp": 1.0317409, "epoch": 0.5478430783105366, "flos": 23039676161280.0, "grad_norm": 1.5197619181850293, "language_loss": 0.71096945, "learning_rate": 1.7875607113984876e-06, "loss": 0.73253489, "num_input_tokens_seen": 196383070, "step": 9112, "time_per_iteration": 2.7334086894989014 }, { "auxiliary_loss_clip": 0.01067523, "auxiliary_loss_mlp": 0.01034936, "balance_loss_clip": 1.03873658, "balance_loss_mlp": 1.02179968, "epoch": 0.5479032015632046, "flos": 16071210864000.0, "grad_norm": 2.172543516099556, "language_loss": 0.87877554, "learning_rate": 1.7871734570523953e-06, "loss": 0.89980012, "num_input_tokens_seen": 196398485, "step": 9113, "time_per_iteration": 5.9666571617126465 }, { "auxiliary_loss_clip": 0.01070074, "auxiliary_loss_mlp": 0.01032166, "balance_loss_clip": 1.04229951, "balance_loss_mlp": 1.01853991, "epoch": 0.5479633248158725, "flos": 24278701852800.0, "grad_norm": 1.4694487805740626, "language_loss": 0.73041236, "learning_rate": 1.7867862107767067e-06, "loss": 0.7514348, "num_input_tokens_seen": 196417725, "step": 9114, "time_per_iteration": 4.333765745162964 }, { "auxiliary_loss_clip": 0.01093195, "auxiliary_loss_mlp": 0.00770887, "balance_loss_clip": 1.03821266, "balance_loss_mlp": 1.00027823, "epoch": 0.5480234480685405, "flos": 26358216860160.0, "grad_norm": 1.6145561495164014, "language_loss": 0.72155976, "learning_rate": 1.7863989725861066e-06, "loss": 0.74020058, "num_input_tokens_seen": 196437840, "step": 9115, "time_per_iteration": 2.6793766021728516 }, { "auxiliary_loss_clip": 0.01084634, "auxiliary_loss_mlp": 0.00774539, "balance_loss_clip": 1.03983831, "balance_loss_mlp": 1.00038791, "epoch": 0.5480835713212084, "flos": 22055077480320.0, "grad_norm": 1.7266092862770852, "language_loss": 0.72229278, "learning_rate": 1.7860117424952781e-06, "loss": 0.74088448, "num_input_tokens_seen": 196457300, "step": 9116, "time_per_iteration": 2.738142490386963 }, { "auxiliary_loss_clip": 0.01095127, "auxiliary_loss_mlp": 0.01039685, "balance_loss_clip": 1.04102373, "balance_loss_mlp": 1.0259639, "epoch": 0.5481436945738765, "flos": 25301042749440.0, "grad_norm": 4.413930764564679, "language_loss": 0.76158273, "learning_rate": 1.7856245205189063e-06, "loss": 0.78293079, "num_input_tokens_seen": 196476720, "step": 9117, "time_per_iteration": 2.693359613418579 }, { "auxiliary_loss_clip": 0.01070482, "auxiliary_loss_mlp": 0.01035701, "balance_loss_clip": 1.03514457, "balance_loss_mlp": 1.02292752, "epoch": 0.5482038178265444, "flos": 33580857772800.0, "grad_norm": 1.575829874902699, "language_loss": 0.62537289, "learning_rate": 1.785237306671674e-06, "loss": 0.64643478, "num_input_tokens_seen": 196496765, "step": 9118, "time_per_iteration": 4.42430305480957 }, { "auxiliary_loss_clip": 0.01124628, "auxiliary_loss_mlp": 0.01036624, "balance_loss_clip": 1.04479444, "balance_loss_mlp": 1.02259278, "epoch": 0.5482639410792124, "flos": 19026192055680.0, "grad_norm": 2.694246810130355, "language_loss": 0.79018009, "learning_rate": 1.7848501009682646e-06, "loss": 0.81179261, "num_input_tokens_seen": 196516220, "step": 9119, "time_per_iteration": 2.606593608856201 }, { "auxiliary_loss_clip": 0.01092726, "auxiliary_loss_mlp": 0.00769453, "balance_loss_clip": 1.04150975, "balance_loss_mlp": 1.00022948, "epoch": 0.5483240643318803, "flos": 25410318900480.0, "grad_norm": 1.8682271604905119, "language_loss": 0.82534289, "learning_rate": 1.7844629034233604e-06, "loss": 0.8439647, "num_input_tokens_seen": 196533860, "step": 9120, "time_per_iteration": 2.694546699523926 }, { "auxiliary_loss_clip": 0.01089359, "auxiliary_loss_mlp": 0.01039031, "balance_loss_clip": 1.04395008, "balance_loss_mlp": 1.02531016, "epoch": 0.5483841875845483, "flos": 21466896272640.0, "grad_norm": 1.8000226938726367, "language_loss": 0.80031526, "learning_rate": 1.7840757140516455e-06, "loss": 0.82159919, "num_input_tokens_seen": 196551305, "step": 9121, "time_per_iteration": 2.7422945499420166 }, { "auxiliary_loss_clip": 0.01076146, "auxiliary_loss_mlp": 0.01038978, "balance_loss_clip": 1.03803313, "balance_loss_mlp": 1.02408934, "epoch": 0.5484443108372163, "flos": 24747263792640.0, "grad_norm": 1.9827939120507885, "language_loss": 0.60996848, "learning_rate": 1.7836885328678008e-06, "loss": 0.63111973, "num_input_tokens_seen": 196569420, "step": 9122, "time_per_iteration": 2.782677412033081 }, { "auxiliary_loss_clip": 0.01106377, "auxiliary_loss_mlp": 0.01038556, "balance_loss_clip": 1.04853153, "balance_loss_mlp": 1.0268079, "epoch": 0.5485044340898843, "flos": 25375377945600.0, "grad_norm": 1.5587852273808862, "language_loss": 0.71594763, "learning_rate": 1.7833013598865084e-06, "loss": 0.73739696, "num_input_tokens_seen": 196590610, "step": 9123, "time_per_iteration": 2.756350517272949 }, { "auxiliary_loss_clip": 0.01121133, "auxiliary_loss_mlp": 0.01033494, "balance_loss_clip": 1.04210067, "balance_loss_mlp": 1.0208813, "epoch": 0.5485645573425523, "flos": 12641167370880.0, "grad_norm": 2.3735658261361845, "language_loss": 0.83559448, "learning_rate": 1.7829141951224505e-06, "loss": 0.85714072, "num_input_tokens_seen": 196606495, "step": 9124, "time_per_iteration": 2.61197829246521 }, { "auxiliary_loss_clip": 0.01094486, "auxiliary_loss_mlp": 0.01033029, "balance_loss_clip": 1.04321349, "balance_loss_mlp": 1.01992834, "epoch": 0.5486246805952202, "flos": 28329425383680.0, "grad_norm": 1.5486509111854319, "language_loss": 0.80518043, "learning_rate": 1.7825270385903075e-06, "loss": 0.82645559, "num_input_tokens_seen": 196626365, "step": 9125, "time_per_iteration": 2.773972749710083 }, { "auxiliary_loss_clip": 0.01111849, "auxiliary_loss_mlp": 0.01032276, "balance_loss_clip": 1.04336679, "balance_loss_mlp": 1.01903141, "epoch": 0.5486848038478882, "flos": 16800017817600.0, "grad_norm": 4.333134351852335, "language_loss": 0.74312758, "learning_rate": 1.7821398903047617e-06, "loss": 0.76456887, "num_input_tokens_seen": 196644465, "step": 9126, "time_per_iteration": 2.654529333114624 }, { "auxiliary_loss_clip": 0.01107646, "auxiliary_loss_mlp": 0.01037249, "balance_loss_clip": 1.03968537, "balance_loss_mlp": 1.02193701, "epoch": 0.5487449271005561, "flos": 17236224581760.0, "grad_norm": 2.710645426319007, "language_loss": 0.66802239, "learning_rate": 1.7817527502804928e-06, "loss": 0.6894713, "num_input_tokens_seen": 196659160, "step": 9127, "time_per_iteration": 2.615807294845581 }, { "auxiliary_loss_clip": 0.01078683, "auxiliary_loss_mlp": 0.01039383, "balance_loss_clip": 1.03928149, "balance_loss_mlp": 1.0249052, "epoch": 0.5488050503532241, "flos": 17340867878400.0, "grad_norm": 2.0894273864631225, "language_loss": 0.82909453, "learning_rate": 1.781365618532181e-06, "loss": 0.85027516, "num_input_tokens_seen": 196677410, "step": 9128, "time_per_iteration": 2.681060791015625 }, { "auxiliary_loss_clip": 0.01074302, "auxiliary_loss_mlp": 0.01037438, "balance_loss_clip": 1.03565645, "balance_loss_mlp": 1.02254319, "epoch": 0.548865173605892, "flos": 17239169496960.0, "grad_norm": 1.9025486027385248, "language_loss": 0.74247289, "learning_rate": 1.7809784950745078e-06, "loss": 0.76359022, "num_input_tokens_seen": 196696765, "step": 9129, "time_per_iteration": 2.681459426879883 }, { "auxiliary_loss_clip": 0.01077104, "auxiliary_loss_mlp": 0.01037347, "balance_loss_clip": 1.03771412, "balance_loss_mlp": 1.02210581, "epoch": 0.5489252968585601, "flos": 17456716218240.0, "grad_norm": 3.0707794644461854, "language_loss": 0.63489515, "learning_rate": 1.7805913799221511e-06, "loss": 0.65603966, "num_input_tokens_seen": 196714895, "step": 9130, "time_per_iteration": 2.743734359741211 }, { "auxiliary_loss_clip": 0.01124543, "auxiliary_loss_mlp": 0.00771634, "balance_loss_clip": 1.04329586, "balance_loss_mlp": 1.00023222, "epoch": 0.548985420111228, "flos": 26323383646080.0, "grad_norm": 1.7961020275949398, "language_loss": 0.62998879, "learning_rate": 1.7802042730897915e-06, "loss": 0.64895058, "num_input_tokens_seen": 196735510, "step": 9131, "time_per_iteration": 2.7136600017547607 }, { "auxiliary_loss_clip": 0.01109321, "auxiliary_loss_mlp": 0.01039388, "balance_loss_clip": 1.04004657, "balance_loss_mlp": 1.02416492, "epoch": 0.549045543363896, "flos": 18693730748160.0, "grad_norm": 1.6718560353245449, "language_loss": 0.7504952, "learning_rate": 1.7798171745921084e-06, "loss": 0.77198231, "num_input_tokens_seen": 196752855, "step": 9132, "time_per_iteration": 2.686460494995117 }, { "auxiliary_loss_clip": 0.01107553, "auxiliary_loss_mlp": 0.01033276, "balance_loss_clip": 1.03815818, "balance_loss_mlp": 1.02046108, "epoch": 0.5491056666165639, "flos": 24717386655360.0, "grad_norm": 1.5443073078358045, "language_loss": 0.81107825, "learning_rate": 1.7794300844437795e-06, "loss": 0.83248657, "num_input_tokens_seen": 196772230, "step": 9133, "time_per_iteration": 2.607304811477661 }, { "auxiliary_loss_clip": 0.0109676, "auxiliary_loss_mlp": 0.00770878, "balance_loss_clip": 1.04211152, "balance_loss_mlp": 1.00023055, "epoch": 0.5491657898692319, "flos": 21576926609280.0, "grad_norm": 2.2143971437865275, "language_loss": 0.69978988, "learning_rate": 1.7790430026594841e-06, "loss": 0.71846628, "num_input_tokens_seen": 196790405, "step": 9134, "time_per_iteration": 2.655400037765503 }, { "auxiliary_loss_clip": 0.01085592, "auxiliary_loss_mlp": 0.0104003, "balance_loss_clip": 1.03952289, "balance_loss_mlp": 1.0263567, "epoch": 0.5492259131219, "flos": 50476432746240.0, "grad_norm": 2.156005038881863, "language_loss": 0.61240542, "learning_rate": 1.7786559292539004e-06, "loss": 0.63366163, "num_input_tokens_seen": 196813785, "step": 9135, "time_per_iteration": 2.911567449569702 }, { "auxiliary_loss_clip": 0.01112825, "auxiliary_loss_mlp": 0.01036574, "balance_loss_clip": 1.042696, "balance_loss_mlp": 1.02169049, "epoch": 0.5492860363745679, "flos": 25119262995840.0, "grad_norm": 1.746391133416305, "language_loss": 0.72368252, "learning_rate": 1.7782688642417058e-06, "loss": 0.74517649, "num_input_tokens_seen": 196834390, "step": 9136, "time_per_iteration": 2.6732101440429688 }, { "auxiliary_loss_clip": 0.01060281, "auxiliary_loss_mlp": 0.0104408, "balance_loss_clip": 1.03961897, "balance_loss_mlp": 1.02839267, "epoch": 0.5493461596272359, "flos": 22633777497600.0, "grad_norm": 2.424259272269788, "language_loss": 0.68256485, "learning_rate": 1.7778818076375781e-06, "loss": 0.70360851, "num_input_tokens_seen": 196853290, "step": 9137, "time_per_iteration": 2.7947540283203125 }, { "auxiliary_loss_clip": 0.01030828, "auxiliary_loss_mlp": 0.01011299, "balance_loss_clip": 1.01489806, "balance_loss_mlp": 1.00992203, "epoch": 0.5494062828799038, "flos": 66151800754560.0, "grad_norm": 0.7420439748923869, "language_loss": 0.65270352, "learning_rate": 1.7774947594561947e-06, "loss": 0.67312479, "num_input_tokens_seen": 196913120, "step": 9138, "time_per_iteration": 3.2256250381469727 }, { "auxiliary_loss_clip": 0.0111256, "auxiliary_loss_mlp": 0.01032689, "balance_loss_clip": 1.04488194, "balance_loss_mlp": 1.01902211, "epoch": 0.5494664061325718, "flos": 21105958458240.0, "grad_norm": 1.8659950166851553, "language_loss": 0.75243253, "learning_rate": 1.7771077197122321e-06, "loss": 0.77388501, "num_input_tokens_seen": 196931530, "step": 9139, "time_per_iteration": 2.7239251136779785 }, { "auxiliary_loss_clip": 0.01110681, "auxiliary_loss_mlp": 0.01033207, "balance_loss_clip": 1.04175556, "balance_loss_mlp": 1.01932561, "epoch": 0.5495265293852397, "flos": 14392566616320.0, "grad_norm": 1.6260992267363037, "language_loss": 0.70765269, "learning_rate": 1.7767206884203672e-06, "loss": 0.72909158, "num_input_tokens_seen": 196949430, "step": 9140, "time_per_iteration": 2.647174119949341 }, { "auxiliary_loss_clip": 0.01090583, "auxiliary_loss_mlp": 0.01036785, "balance_loss_clip": 1.03731537, "balance_loss_mlp": 1.02207434, "epoch": 0.5495866526379077, "flos": 25549148966400.0, "grad_norm": 1.8985191424105816, "language_loss": 0.7687242, "learning_rate": 1.7763336655952762e-06, "loss": 0.78999794, "num_input_tokens_seen": 196968265, "step": 9141, "time_per_iteration": 2.65411639213562 }, { "auxiliary_loss_clip": 0.01084812, "auxiliary_loss_mlp": 0.01036963, "balance_loss_clip": 1.0427072, "balance_loss_mlp": 1.02342081, "epoch": 0.5496467758905756, "flos": 21317256213120.0, "grad_norm": 2.1277262842794697, "language_loss": 0.7463578, "learning_rate": 1.7759466512516346e-06, "loss": 0.7675755, "num_input_tokens_seen": 196984930, "step": 9142, "time_per_iteration": 2.7200329303741455 }, { "auxiliary_loss_clip": 0.01098795, "auxiliary_loss_mlp": 0.01036884, "balance_loss_clip": 1.04416585, "balance_loss_mlp": 1.02186954, "epoch": 0.5497068991432437, "flos": 22233086305920.0, "grad_norm": 5.155975597587774, "language_loss": 0.7661894, "learning_rate": 1.7755596454041192e-06, "loss": 0.78754616, "num_input_tokens_seen": 197002320, "step": 9143, "time_per_iteration": 2.6951520442962646 }, { "auxiliary_loss_clip": 0.01091779, "auxiliary_loss_mlp": 0.01037521, "balance_loss_clip": 1.03912258, "balance_loss_mlp": 1.02332926, "epoch": 0.5497670223959116, "flos": 18479093028480.0, "grad_norm": 2.8186227807908466, "language_loss": 0.79572552, "learning_rate": 1.7751726480674044e-06, "loss": 0.81701857, "num_input_tokens_seen": 197020825, "step": 9144, "time_per_iteration": 2.661098003387451 }, { "auxiliary_loss_clip": 0.01112844, "auxiliary_loss_mlp": 0.01034684, "balance_loss_clip": 1.04339552, "balance_loss_mlp": 1.02086163, "epoch": 0.5498271456485796, "flos": 29205107049600.0, "grad_norm": 1.6865855857111283, "language_loss": 0.70998669, "learning_rate": 1.7747856592561645e-06, "loss": 0.731462, "num_input_tokens_seen": 197040450, "step": 9145, "time_per_iteration": 2.6857175827026367 }, { "auxiliary_loss_clip": 0.01109884, "auxiliary_loss_mlp": 0.01033378, "balance_loss_clip": 1.04158354, "balance_loss_mlp": 1.02063489, "epoch": 0.5498872689012475, "flos": 34824372664320.0, "grad_norm": 1.7292068512536125, "language_loss": 0.70875257, "learning_rate": 1.774398678985076e-06, "loss": 0.73018515, "num_input_tokens_seen": 197063930, "step": 9146, "time_per_iteration": 2.7719805240631104 }, { "auxiliary_loss_clip": 0.01096176, "auxiliary_loss_mlp": 0.01029792, "balance_loss_clip": 1.04054928, "balance_loss_mlp": 1.01708448, "epoch": 0.5499473921539155, "flos": 25921938268800.0, "grad_norm": 1.7336366982972622, "language_loss": 0.63770372, "learning_rate": 1.7740117072688113e-06, "loss": 0.65896338, "num_input_tokens_seen": 197082660, "step": 9147, "time_per_iteration": 2.6603379249572754 }, { "auxiliary_loss_clip": 0.01125139, "auxiliary_loss_mlp": 0.01033, "balance_loss_clip": 1.04582083, "balance_loss_mlp": 1.01920164, "epoch": 0.5500075154065835, "flos": 22273701609600.0, "grad_norm": 2.1607061922348088, "language_loss": 0.81009579, "learning_rate": 1.7736247441220458e-06, "loss": 0.8316772, "num_input_tokens_seen": 197100675, "step": 9148, "time_per_iteration": 2.620183229446411 }, { "auxiliary_loss_clip": 0.01101315, "auxiliary_loss_mlp": 0.01039357, "balance_loss_clip": 1.04367983, "balance_loss_mlp": 1.02550507, "epoch": 0.5500676386592515, "flos": 28037507552640.0, "grad_norm": 1.7340881050910257, "language_loss": 0.79154336, "learning_rate": 1.773237789559453e-06, "loss": 0.81295007, "num_input_tokens_seen": 197121320, "step": 9149, "time_per_iteration": 2.734495162963867 }, { "auxiliary_loss_clip": 0.01082615, "auxiliary_loss_mlp": 0.0102795, "balance_loss_clip": 1.0412097, "balance_loss_mlp": 1.01476002, "epoch": 0.5501277619119195, "flos": 23914819123200.0, "grad_norm": 4.0693062888880185, "language_loss": 0.72006851, "learning_rate": 1.7728508435957052e-06, "loss": 0.74117416, "num_input_tokens_seen": 197138965, "step": 9150, "time_per_iteration": 2.66481876373291 }, { "auxiliary_loss_clip": 0.01099742, "auxiliary_loss_mlp": 0.01033518, "balance_loss_clip": 1.03804266, "balance_loss_mlp": 1.0189085, "epoch": 0.5501878851645874, "flos": 20923783655040.0, "grad_norm": 3.1249847499070014, "language_loss": 0.75043446, "learning_rate": 1.772463906245477e-06, "loss": 0.77176708, "num_input_tokens_seen": 197156460, "step": 9151, "time_per_iteration": 2.704946517944336 }, { "auxiliary_loss_clip": 0.0109205, "auxiliary_loss_mlp": 0.01033656, "balance_loss_clip": 1.03899741, "balance_loss_mlp": 1.01981556, "epoch": 0.5502480084172554, "flos": 20665298407680.0, "grad_norm": 2.3903222148465035, "language_loss": 0.76302028, "learning_rate": 1.7720769775234394e-06, "loss": 0.78427732, "num_input_tokens_seen": 197175140, "step": 9152, "time_per_iteration": 5.871058464050293 }, { "auxiliary_loss_clip": 0.01098821, "auxiliary_loss_mlp": 0.01033709, "balance_loss_clip": 1.04291546, "balance_loss_mlp": 1.02058983, "epoch": 0.5503081316699233, "flos": 26432552056320.0, "grad_norm": 1.865148989318078, "language_loss": 0.82033801, "learning_rate": 1.7716900574442662e-06, "loss": 0.84166336, "num_input_tokens_seen": 197194345, "step": 9153, "time_per_iteration": 2.741382598876953 }, { "auxiliary_loss_clip": 0.01110131, "auxiliary_loss_mlp": 0.01029037, "balance_loss_clip": 1.04423809, "balance_loss_mlp": 1.01572764, "epoch": 0.5503682549225913, "flos": 30629144718720.0, "grad_norm": 1.7497509025726563, "language_loss": 0.74392802, "learning_rate": 1.7713031460226294e-06, "loss": 0.76531971, "num_input_tokens_seen": 197215535, "step": 9154, "time_per_iteration": 4.345115900039673 }, { "auxiliary_loss_clip": 0.01104154, "auxiliary_loss_mlp": 0.01039546, "balance_loss_clip": 1.04041803, "balance_loss_mlp": 1.02451348, "epoch": 0.5504283781752592, "flos": 22565439872640.0, "grad_norm": 1.5994441828682415, "language_loss": 0.73138744, "learning_rate": 1.770916243273199e-06, "loss": 0.75282443, "num_input_tokens_seen": 197234945, "step": 9155, "time_per_iteration": 2.6851611137390137 }, { "auxiliary_loss_clip": 0.01021957, "auxiliary_loss_mlp": 0.01001594, "balance_loss_clip": 1.01543474, "balance_loss_mlp": 1.00016963, "epoch": 0.5504885014279273, "flos": 67901009270400.0, "grad_norm": 0.7575867212346565, "language_loss": 0.55399221, "learning_rate": 1.7705293492106483e-06, "loss": 0.57422775, "num_input_tokens_seen": 197302285, "step": 9156, "time_per_iteration": 3.300373077392578 }, { "auxiliary_loss_clip": 0.0110824, "auxiliary_loss_mlp": 0.01037205, "balance_loss_clip": 1.03954601, "balance_loss_mlp": 1.02354383, "epoch": 0.5505486246805952, "flos": 22450058409600.0, "grad_norm": 1.7338338818713679, "language_loss": 0.82676858, "learning_rate": 1.7701424638496475e-06, "loss": 0.84822297, "num_input_tokens_seen": 197321575, "step": 9157, "time_per_iteration": 4.260001182556152 }, { "auxiliary_loss_clip": 0.01128779, "auxiliary_loss_mlp": 0.01036261, "balance_loss_clip": 1.04512608, "balance_loss_mlp": 1.02101421, "epoch": 0.5506087479332632, "flos": 26906896085760.0, "grad_norm": 2.1665568405651916, "language_loss": 0.7574966, "learning_rate": 1.7697555872048677e-06, "loss": 0.77914703, "num_input_tokens_seen": 197340255, "step": 9158, "time_per_iteration": 2.634035587310791 }, { "auxiliary_loss_clip": 0.01079995, "auxiliary_loss_mlp": 0.01032346, "balance_loss_clip": 1.04036868, "balance_loss_mlp": 1.01919723, "epoch": 0.5506688711859311, "flos": 22930256355840.0, "grad_norm": 1.7765349720842452, "language_loss": 0.7011236, "learning_rate": 1.769368719290979e-06, "loss": 0.72224694, "num_input_tokens_seen": 197360360, "step": 9159, "time_per_iteration": 2.765982151031494 }, { "auxiliary_loss_clip": 0.01074937, "auxiliary_loss_mlp": 0.00772606, "balance_loss_clip": 1.03859997, "balance_loss_mlp": 1.00024915, "epoch": 0.5507289944385991, "flos": 29606408772480.0, "grad_norm": 1.5184177470515237, "language_loss": 0.6844312, "learning_rate": 1.7689818601226516e-06, "loss": 0.70290661, "num_input_tokens_seen": 197381905, "step": 9160, "time_per_iteration": 2.7715611457824707 }, { "auxiliary_loss_clip": 0.01121201, "auxiliary_loss_mlp": 0.01036642, "balance_loss_clip": 1.04361653, "balance_loss_mlp": 1.02297473, "epoch": 0.5507891176912671, "flos": 15334431091200.0, "grad_norm": 2.346039254378587, "language_loss": 0.71789527, "learning_rate": 1.7685950097145552e-06, "loss": 0.7394737, "num_input_tokens_seen": 197398555, "step": 9161, "time_per_iteration": 2.641042470932007 }, { "auxiliary_loss_clip": 0.01112875, "auxiliary_loss_mlp": 0.01042589, "balance_loss_clip": 1.04357731, "balance_loss_mlp": 1.02879643, "epoch": 0.5508492409439351, "flos": 26578313447040.0, "grad_norm": 1.6233913779896265, "language_loss": 0.69443804, "learning_rate": 1.768208168081359e-06, "loss": 0.71599269, "num_input_tokens_seen": 197419630, "step": 9162, "time_per_iteration": 2.693645715713501 }, { "auxiliary_loss_clip": 0.01122811, "auxiliary_loss_mlp": 0.01038789, "balance_loss_clip": 1.04462349, "balance_loss_mlp": 1.02506185, "epoch": 0.5509093641966031, "flos": 25443428261760.0, "grad_norm": 1.863003505887403, "language_loss": 0.85338551, "learning_rate": 1.767821335237733e-06, "loss": 0.87500155, "num_input_tokens_seen": 197438480, "step": 9163, "time_per_iteration": 2.6538877487182617 }, { "auxiliary_loss_clip": 0.01088872, "auxiliary_loss_mlp": 0.01032282, "balance_loss_clip": 1.04132617, "balance_loss_mlp": 1.01908576, "epoch": 0.550969487449271, "flos": 18698543170560.0, "grad_norm": 1.8611061255519936, "language_loss": 0.80892253, "learning_rate": 1.7674345111983441e-06, "loss": 0.83013415, "num_input_tokens_seen": 197456755, "step": 9164, "time_per_iteration": 2.813016891479492 }, { "auxiliary_loss_clip": 0.0110727, "auxiliary_loss_mlp": 0.01033027, "balance_loss_clip": 1.04617882, "balance_loss_mlp": 1.01856649, "epoch": 0.551029610701939, "flos": 22708723224960.0, "grad_norm": 1.8149479270660511, "language_loss": 0.73350954, "learning_rate": 1.767047695977863e-06, "loss": 0.75491256, "num_input_tokens_seen": 197475530, "step": 9165, "time_per_iteration": 2.6487855911254883 }, { "auxiliary_loss_clip": 0.01103747, "auxiliary_loss_mlp": 0.01041326, "balance_loss_clip": 1.04083133, "balance_loss_mlp": 1.02677011, "epoch": 0.5510897339546069, "flos": 12420496166400.0, "grad_norm": 1.9553906281347788, "language_loss": 0.78998721, "learning_rate": 1.7666608895909563e-06, "loss": 0.8114379, "num_input_tokens_seen": 197490835, "step": 9166, "time_per_iteration": 2.578125 }, { "auxiliary_loss_clip": 0.01089384, "auxiliary_loss_mlp": 0.01032199, "balance_loss_clip": 1.03881669, "balance_loss_mlp": 1.01822138, "epoch": 0.5511498572072749, "flos": 18770579896320.0, "grad_norm": 2.156469581369372, "language_loss": 0.76529676, "learning_rate": 1.7662740920522913e-06, "loss": 0.78651255, "num_input_tokens_seen": 197508770, "step": 9167, "time_per_iteration": 2.7045888900756836 }, { "auxiliary_loss_clip": 0.01112145, "auxiliary_loss_mlp": 0.01032029, "balance_loss_clip": 1.04281187, "balance_loss_mlp": 1.01811707, "epoch": 0.5512099804599428, "flos": 19573326996480.0, "grad_norm": 2.0156954118398227, "language_loss": 0.79765004, "learning_rate": 1.7658873033765374e-06, "loss": 0.81909174, "num_input_tokens_seen": 197527340, "step": 9168, "time_per_iteration": 2.669908046722412 }, { "auxiliary_loss_clip": 0.0111534, "auxiliary_loss_mlp": 0.0104239, "balance_loss_clip": 1.04542589, "balance_loss_mlp": 1.02830565, "epoch": 0.5512701037126109, "flos": 26245600744320.0, "grad_norm": 1.6113858397633185, "language_loss": 0.69293267, "learning_rate": 1.7655005235783591e-06, "loss": 0.71450996, "num_input_tokens_seen": 197547280, "step": 9169, "time_per_iteration": 2.70609450340271 }, { "auxiliary_loss_clip": 0.01106964, "auxiliary_loss_mlp": 0.01029287, "balance_loss_clip": 1.04113257, "balance_loss_mlp": 1.01710367, "epoch": 0.5513302269652788, "flos": 21945406279680.0, "grad_norm": 1.9890616519308366, "language_loss": 0.85510826, "learning_rate": 1.7651137526724251e-06, "loss": 0.87647074, "num_input_tokens_seen": 197565045, "step": 9170, "time_per_iteration": 2.670785427093506 }, { "auxiliary_loss_clip": 0.01022762, "auxiliary_loss_mlp": 0.01003909, "balance_loss_clip": 1.02287233, "balance_loss_mlp": 1.00240731, "epoch": 0.5513903502179468, "flos": 68235948616320.0, "grad_norm": 0.7781167580815929, "language_loss": 0.59840322, "learning_rate": 1.7647269906734017e-06, "loss": 0.61866993, "num_input_tokens_seen": 197625005, "step": 9171, "time_per_iteration": 3.2524025440216064 }, { "auxiliary_loss_clip": 0.01085077, "auxiliary_loss_mlp": 0.01041997, "balance_loss_clip": 1.03855562, "balance_loss_mlp": 1.02763844, "epoch": 0.5514504734706147, "flos": 18734238311040.0, "grad_norm": 1.556060427891405, "language_loss": 0.70670319, "learning_rate": 1.7643402375959533e-06, "loss": 0.72797394, "num_input_tokens_seen": 197645050, "step": 9172, "time_per_iteration": 2.708811044692993 }, { "auxiliary_loss_clip": 0.01120195, "auxiliary_loss_mlp": 0.01038202, "balance_loss_clip": 1.04229403, "balance_loss_mlp": 1.02470756, "epoch": 0.5515105967232827, "flos": 22270972176000.0, "grad_norm": 1.7490660409709138, "language_loss": 0.75727642, "learning_rate": 1.7639534934547474e-06, "loss": 0.77886033, "num_input_tokens_seen": 197663910, "step": 9173, "time_per_iteration": 2.6022469997406006 }, { "auxiliary_loss_clip": 0.01083041, "auxiliary_loss_mlp": 0.01033938, "balance_loss_clip": 1.04071558, "balance_loss_mlp": 1.02043712, "epoch": 0.5515707199759508, "flos": 22557682535040.0, "grad_norm": 1.9060639151270278, "language_loss": 0.75156957, "learning_rate": 1.7635667582644484e-06, "loss": 0.77273941, "num_input_tokens_seen": 197681580, "step": 9174, "time_per_iteration": 2.758668899536133 }, { "auxiliary_loss_clip": 0.01102936, "auxiliary_loss_mlp": 0.01034738, "balance_loss_clip": 1.0414834, "balance_loss_mlp": 1.02056456, "epoch": 0.5516308432286187, "flos": 28291072636800.0, "grad_norm": 2.209520073538634, "language_loss": 0.72830188, "learning_rate": 1.7631800320397217e-06, "loss": 0.74967873, "num_input_tokens_seen": 197702095, "step": 9175, "time_per_iteration": 2.6674885749816895 }, { "auxiliary_loss_clip": 0.01112767, "auxiliary_loss_mlp": 0.010363, "balance_loss_clip": 1.04439914, "balance_loss_mlp": 1.02324057, "epoch": 0.5516909664812867, "flos": 18764474584320.0, "grad_norm": 1.7828415192194789, "language_loss": 0.69321132, "learning_rate": 1.7627933147952318e-06, "loss": 0.71470201, "num_input_tokens_seen": 197720720, "step": 9176, "time_per_iteration": 2.721855878829956 }, { "auxiliary_loss_clip": 0.01112205, "auxiliary_loss_mlp": 0.01032754, "balance_loss_clip": 1.04404604, "balance_loss_mlp": 1.02004051, "epoch": 0.5517510897339546, "flos": 27740346336000.0, "grad_norm": 1.6320384621008008, "language_loss": 0.70890021, "learning_rate": 1.7624066065456435e-06, "loss": 0.73034984, "num_input_tokens_seen": 197741820, "step": 9177, "time_per_iteration": 2.6951122283935547 }, { "auxiliary_loss_clip": 0.01111799, "auxiliary_loss_mlp": 0.01031495, "balance_loss_clip": 1.0442878, "balance_loss_mlp": 1.01811981, "epoch": 0.5518112129866226, "flos": 18404470523520.0, "grad_norm": 1.5626252071778102, "language_loss": 0.80647016, "learning_rate": 1.7620199073056204e-06, "loss": 0.82790309, "num_input_tokens_seen": 197759160, "step": 9178, "time_per_iteration": 2.6048829555511475 }, { "auxiliary_loss_clip": 0.01063405, "auxiliary_loss_mlp": 0.01046955, "balance_loss_clip": 1.04167509, "balance_loss_mlp": 1.03129053, "epoch": 0.5518713362392905, "flos": 25082670015360.0, "grad_norm": 2.211793529411812, "language_loss": 0.7505163, "learning_rate": 1.761633217089826e-06, "loss": 0.77161986, "num_input_tokens_seen": 197779760, "step": 9179, "time_per_iteration": 2.808234453201294 }, { "auxiliary_loss_clip": 0.01114825, "auxiliary_loss_mlp": 0.0104374, "balance_loss_clip": 1.04556203, "balance_loss_mlp": 1.02984655, "epoch": 0.5519314594919585, "flos": 36538999361280.0, "grad_norm": 1.9934221112233521, "language_loss": 0.7009306, "learning_rate": 1.761246535912924e-06, "loss": 0.7225163, "num_input_tokens_seen": 197801545, "step": 9180, "time_per_iteration": 2.788222551345825 }, { "auxiliary_loss_clip": 0.01106377, "auxiliary_loss_mlp": 0.01041353, "balance_loss_clip": 1.0398531, "balance_loss_mlp": 1.02672613, "epoch": 0.5519915827446265, "flos": 20448613612800.0, "grad_norm": 1.9005454733047327, "language_loss": 0.67093515, "learning_rate": 1.7608598637895776e-06, "loss": 0.69241244, "num_input_tokens_seen": 197820760, "step": 9181, "time_per_iteration": 2.7013533115386963 }, { "auxiliary_loss_clip": 0.01126813, "auxiliary_loss_mlp": 0.0103403, "balance_loss_clip": 1.0449146, "balance_loss_mlp": 1.02041602, "epoch": 0.5520517059972945, "flos": 23768052151680.0, "grad_norm": 2.0355295280850347, "language_loss": 0.79382825, "learning_rate": 1.7604732007344486e-06, "loss": 0.8154366, "num_input_tokens_seen": 197840195, "step": 9182, "time_per_iteration": 2.6580309867858887 }, { "auxiliary_loss_clip": 0.0108505, "auxiliary_loss_mlp": 0.01029722, "balance_loss_clip": 1.0405935, "balance_loss_mlp": 1.01576233, "epoch": 0.5521118292499624, "flos": 22196457411840.0, "grad_norm": 2.3123904881057524, "language_loss": 0.83006704, "learning_rate": 1.7600865467622003e-06, "loss": 0.85121477, "num_input_tokens_seen": 197859475, "step": 9183, "time_per_iteration": 2.744466543197632 }, { "auxiliary_loss_clip": 0.01100335, "auxiliary_loss_mlp": 0.01028792, "balance_loss_clip": 1.0419153, "balance_loss_mlp": 1.01544046, "epoch": 0.5521719525026304, "flos": 23583291569280.0, "grad_norm": 1.2881660479793424, "language_loss": 0.67605364, "learning_rate": 1.7596999018874936e-06, "loss": 0.6973449, "num_input_tokens_seen": 197879395, "step": 9184, "time_per_iteration": 2.6846580505371094 }, { "auxiliary_loss_clip": 0.01110729, "auxiliary_loss_mlp": 0.01028759, "balance_loss_clip": 1.04261684, "balance_loss_mlp": 1.01442409, "epoch": 0.5522320757552983, "flos": 26137617482880.0, "grad_norm": 1.486667996359971, "language_loss": 0.76359147, "learning_rate": 1.7593132661249917e-06, "loss": 0.78498632, "num_input_tokens_seen": 197900815, "step": 9185, "time_per_iteration": 2.6278598308563232 }, { "auxiliary_loss_clip": 0.01084681, "auxiliary_loss_mlp": 0.01041899, "balance_loss_clip": 1.04073203, "balance_loss_mlp": 1.02742732, "epoch": 0.5522921990079663, "flos": 24676160820480.0, "grad_norm": 1.6270174778631188, "language_loss": 0.74294305, "learning_rate": 1.7589266394893536e-06, "loss": 0.76420891, "num_input_tokens_seen": 197918985, "step": 9186, "time_per_iteration": 2.7178421020507812 }, { "auxiliary_loss_clip": 0.01094897, "auxiliary_loss_mlp": 0.0103984, "balance_loss_clip": 1.04445529, "balance_loss_mlp": 1.02626204, "epoch": 0.5523523222606344, "flos": 22748153379840.0, "grad_norm": 2.1270117067296725, "language_loss": 0.66701925, "learning_rate": 1.7585400219952421e-06, "loss": 0.68836665, "num_input_tokens_seen": 197937725, "step": 9187, "time_per_iteration": 2.7278029918670654 }, { "auxiliary_loss_clip": 0.01101824, "auxiliary_loss_mlp": 0.01034427, "balance_loss_clip": 1.04459238, "balance_loss_mlp": 1.02054477, "epoch": 0.5524124455133023, "flos": 19755825022080.0, "grad_norm": 1.575939713951601, "language_loss": 0.7774123, "learning_rate": 1.758153413657318e-06, "loss": 0.79877484, "num_input_tokens_seen": 197955635, "step": 9188, "time_per_iteration": 2.753506660461426 }, { "auxiliary_loss_clip": 0.01095705, "auxiliary_loss_mlp": 0.01031864, "balance_loss_clip": 1.04053175, "balance_loss_mlp": 1.01806509, "epoch": 0.5524725687659703, "flos": 23294821443840.0, "grad_norm": 1.82344252580878, "language_loss": 0.81139189, "learning_rate": 1.7577668144902394e-06, "loss": 0.83266759, "num_input_tokens_seen": 197974490, "step": 9189, "time_per_iteration": 2.7089128494262695 }, { "auxiliary_loss_clip": 0.01104025, "auxiliary_loss_mlp": 0.00770543, "balance_loss_clip": 1.04259682, "balance_loss_mlp": 1.00024211, "epoch": 0.5525326920186382, "flos": 24862178378880.0, "grad_norm": 1.4850448399521246, "language_loss": 0.76478475, "learning_rate": 1.7573802245086684e-06, "loss": 0.78353041, "num_input_tokens_seen": 197995735, "step": 9190, "time_per_iteration": 2.611971855163574 }, { "auxiliary_loss_clip": 0.01125599, "auxiliary_loss_mlp": 0.01041501, "balance_loss_clip": 1.04273391, "balance_loss_mlp": 1.02648067, "epoch": 0.5525928152713062, "flos": 13735580906880.0, "grad_norm": 2.4141637541410508, "language_loss": 0.78987861, "learning_rate": 1.7569936437272627e-06, "loss": 0.81154966, "num_input_tokens_seen": 198009685, "step": 9191, "time_per_iteration": 2.545794725418091 }, { "auxiliary_loss_clip": 0.01050104, "auxiliary_loss_mlp": 0.01035439, "balance_loss_clip": 1.03439641, "balance_loss_mlp": 1.02133703, "epoch": 0.5526529385239741, "flos": 13071592045440.0, "grad_norm": 2.484462687188894, "language_loss": 0.68966973, "learning_rate": 1.7566070721606829e-06, "loss": 0.71052521, "num_input_tokens_seen": 198026845, "step": 9192, "time_per_iteration": 6.08718204498291 }, { "auxiliary_loss_clip": 0.01110861, "auxiliary_loss_mlp": 0.01035933, "balance_loss_clip": 1.04424548, "balance_loss_mlp": 1.02356553, "epoch": 0.5527130617766421, "flos": 23148377694720.0, "grad_norm": 1.4810056841060688, "language_loss": 0.77680272, "learning_rate": 1.756220509823588e-06, "loss": 0.7982707, "num_input_tokens_seen": 198045275, "step": 9193, "time_per_iteration": 4.1960039138793945 }, { "auxiliary_loss_clip": 0.01083568, "auxiliary_loss_mlp": 0.01034731, "balance_loss_clip": 1.03722787, "balance_loss_mlp": 1.02139795, "epoch": 0.55277318502931, "flos": 21285547482240.0, "grad_norm": 1.4323494490195217, "language_loss": 0.78473246, "learning_rate": 1.7558339567306344e-06, "loss": 0.80591547, "num_input_tokens_seen": 198065760, "step": 9194, "time_per_iteration": 2.730219841003418 }, { "auxiliary_loss_clip": 0.01089289, "auxiliary_loss_mlp": 0.01036551, "balance_loss_clip": 1.04286909, "balance_loss_mlp": 1.02309823, "epoch": 0.5528333082819781, "flos": 38324549462400.0, "grad_norm": 2.5114324389353224, "language_loss": 0.69563878, "learning_rate": 1.7554474128964825e-06, "loss": 0.71689719, "num_input_tokens_seen": 198087595, "step": 9195, "time_per_iteration": 2.898447275161743 }, { "auxiliary_loss_clip": 0.01107137, "auxiliary_loss_mlp": 0.01036404, "balance_loss_clip": 1.04293728, "balance_loss_mlp": 1.02215791, "epoch": 0.552893431534646, "flos": 13553621585280.0, "grad_norm": 1.952206040801574, "language_loss": 0.74276292, "learning_rate": 1.7550608783357887e-06, "loss": 0.76419842, "num_input_tokens_seen": 198104620, "step": 9196, "time_per_iteration": 2.775261878967285 }, { "auxiliary_loss_clip": 0.01105394, "auxiliary_loss_mlp": 0.01038639, "balance_loss_clip": 1.04212689, "balance_loss_mlp": 1.02461457, "epoch": 0.552953554787314, "flos": 21939408708480.0, "grad_norm": 2.1600616911977384, "language_loss": 0.76948142, "learning_rate": 1.7546743530632115e-06, "loss": 0.79092181, "num_input_tokens_seen": 198123565, "step": 9197, "time_per_iteration": 4.16440224647522 }, { "auxiliary_loss_clip": 0.01097995, "auxiliary_loss_mlp": 0.01032629, "balance_loss_clip": 1.03984201, "balance_loss_mlp": 1.01995707, "epoch": 0.5530136780399819, "flos": 43658002558080.0, "grad_norm": 1.6850679441105894, "language_loss": 0.76054031, "learning_rate": 1.754287837093407e-06, "loss": 0.78184652, "num_input_tokens_seen": 198148270, "step": 9198, "time_per_iteration": 2.950439453125 }, { "auxiliary_loss_clip": 0.01119177, "auxiliary_loss_mlp": 0.01029802, "balance_loss_clip": 1.04138994, "balance_loss_mlp": 1.01700497, "epoch": 0.5530738012926499, "flos": 25045502417280.0, "grad_norm": 1.499755291272354, "language_loss": 0.79495585, "learning_rate": 1.7539013304410327e-06, "loss": 0.81644565, "num_input_tokens_seen": 198168810, "step": 9199, "time_per_iteration": 2.619361162185669 }, { "auxiliary_loss_clip": 0.01078304, "auxiliary_loss_mlp": 0.01039784, "balance_loss_clip": 1.03867352, "balance_loss_mlp": 1.02552032, "epoch": 0.553133924545318, "flos": 16472081623680.0, "grad_norm": 1.9832278976810611, "language_loss": 0.63797927, "learning_rate": 1.7535148331207443e-06, "loss": 0.65916014, "num_input_tokens_seen": 198186200, "step": 9200, "time_per_iteration": 2.6335854530334473 }, { "auxiliary_loss_clip": 0.01102034, "auxiliary_loss_mlp": 0.01033407, "balance_loss_clip": 1.04273176, "balance_loss_mlp": 1.01869619, "epoch": 0.5531940477979859, "flos": 24606207083520.0, "grad_norm": 1.4982382349332672, "language_loss": 0.66065866, "learning_rate": 1.7531283451471978e-06, "loss": 0.68201303, "num_input_tokens_seen": 198207050, "step": 9201, "time_per_iteration": 2.7522671222686768 }, { "auxiliary_loss_clip": 0.01108187, "auxiliary_loss_mlp": 0.01034798, "balance_loss_clip": 1.04183888, "balance_loss_mlp": 1.02056432, "epoch": 0.5532541710506539, "flos": 22159577122560.0, "grad_norm": 1.9333851468305103, "language_loss": 0.61028016, "learning_rate": 1.7527418665350502e-06, "loss": 0.63171005, "num_input_tokens_seen": 198224565, "step": 9202, "time_per_iteration": 2.6281580924987793 }, { "auxiliary_loss_clip": 0.0110847, "auxiliary_loss_mlp": 0.00770781, "balance_loss_clip": 1.0422498, "balance_loss_mlp": 1.00029778, "epoch": 0.5533142943033218, "flos": 21397265758080.0, "grad_norm": 1.7184873612817428, "language_loss": 0.64222115, "learning_rate": 1.7523553972989548e-06, "loss": 0.66101366, "num_input_tokens_seen": 198244790, "step": 9203, "time_per_iteration": 2.6509506702423096 }, { "auxiliary_loss_clip": 0.01108951, "auxiliary_loss_mlp": 0.0103371, "balance_loss_clip": 1.04175293, "balance_loss_mlp": 1.02028739, "epoch": 0.5533744175559898, "flos": 23550541344000.0, "grad_norm": 1.4819756399271273, "language_loss": 0.63615203, "learning_rate": 1.7519689374535683e-06, "loss": 0.65757859, "num_input_tokens_seen": 198264375, "step": 9204, "time_per_iteration": 2.7008473873138428 }, { "auxiliary_loss_clip": 0.01106611, "auxiliary_loss_mlp": 0.01030715, "balance_loss_clip": 1.04070532, "balance_loss_mlp": 1.0184958, "epoch": 0.5534345408086577, "flos": 24061514267520.0, "grad_norm": 1.5985992235632864, "language_loss": 0.77158082, "learning_rate": 1.7515824870135445e-06, "loss": 0.79295409, "num_input_tokens_seen": 198283895, "step": 9205, "time_per_iteration": 2.6544225215911865 }, { "auxiliary_loss_clip": 0.01059768, "auxiliary_loss_mlp": 0.01039078, "balance_loss_clip": 1.03511405, "balance_loss_mlp": 1.02576268, "epoch": 0.5534946640613257, "flos": 33771831408000.0, "grad_norm": 1.4391383519913163, "language_loss": 0.72826385, "learning_rate": 1.751196045993537e-06, "loss": 0.74925232, "num_input_tokens_seen": 198310035, "step": 9206, "time_per_iteration": 2.832268476486206 }, { "auxiliary_loss_clip": 0.01073531, "auxiliary_loss_mlp": 0.01034139, "balance_loss_clip": 1.03840923, "balance_loss_mlp": 1.0208354, "epoch": 0.5535547873139937, "flos": 15159223526400.0, "grad_norm": 2.230271879861814, "language_loss": 0.75639313, "learning_rate": 1.7508096144082012e-06, "loss": 0.77746987, "num_input_tokens_seen": 198327810, "step": 9207, "time_per_iteration": 2.7088775634765625 }, { "auxiliary_loss_clip": 0.01088202, "auxiliary_loss_mlp": 0.01033804, "balance_loss_clip": 1.0419991, "balance_loss_mlp": 1.02010703, "epoch": 0.5536149105666617, "flos": 16980863817600.0, "grad_norm": 71.24671792095333, "language_loss": 0.61898887, "learning_rate": 1.750423192272189e-06, "loss": 0.6402089, "num_input_tokens_seen": 198343150, "step": 9208, "time_per_iteration": 2.749739646911621 }, { "auxiliary_loss_clip": 0.01123136, "auxiliary_loss_mlp": 0.0103585, "balance_loss_clip": 1.04367232, "balance_loss_mlp": 1.02285004, "epoch": 0.5536750338193296, "flos": 18149935772160.0, "grad_norm": 2.006267106077657, "language_loss": 0.64258868, "learning_rate": 1.7500367796001547e-06, "loss": 0.66417855, "num_input_tokens_seen": 198360925, "step": 9209, "time_per_iteration": 2.6854724884033203 }, { "auxiliary_loss_clip": 0.01084442, "auxiliary_loss_mlp": 0.0104196, "balance_loss_clip": 1.03969955, "balance_loss_mlp": 1.02729774, "epoch": 0.5537351570719976, "flos": 22747794243840.0, "grad_norm": 1.8841222831412607, "language_loss": 0.82470959, "learning_rate": 1.7496503764067513e-06, "loss": 0.84597361, "num_input_tokens_seen": 198379265, "step": 9210, "time_per_iteration": 2.746532917022705 }, { "auxiliary_loss_clip": 0.01098481, "auxiliary_loss_mlp": 0.01029278, "balance_loss_clip": 1.04068804, "balance_loss_mlp": 1.016523, "epoch": 0.5537952803246655, "flos": 26356026130560.0, "grad_norm": 1.6369703268884894, "language_loss": 0.72731483, "learning_rate": 1.74926398270663e-06, "loss": 0.74859238, "num_input_tokens_seen": 198399490, "step": 9211, "time_per_iteration": 2.767152786254883 }, { "auxiliary_loss_clip": 0.01089972, "auxiliary_loss_mlp": 0.01037477, "balance_loss_clip": 1.03941226, "balance_loss_mlp": 1.02259946, "epoch": 0.5538554035773335, "flos": 18037427397120.0, "grad_norm": 1.965979716525238, "language_loss": 0.6684767, "learning_rate": 1.7488775985144437e-06, "loss": 0.68975115, "num_input_tokens_seen": 198419110, "step": 9212, "time_per_iteration": 2.6946139335632324 }, { "auxiliary_loss_clip": 0.01092654, "auxiliary_loss_mlp": 0.01030144, "balance_loss_clip": 1.04305434, "balance_loss_mlp": 1.01557696, "epoch": 0.5539155268300014, "flos": 31686247002240.0, "grad_norm": 1.403594998374367, "language_loss": 0.51636183, "learning_rate": 1.7484912238448443e-06, "loss": 0.53758979, "num_input_tokens_seen": 198441360, "step": 9213, "time_per_iteration": 2.7821476459503174 }, { "auxiliary_loss_clip": 0.01092111, "auxiliary_loss_mlp": 0.01030874, "balance_loss_clip": 1.04350758, "balance_loss_mlp": 1.01752245, "epoch": 0.5539756500826695, "flos": 15193769431680.0, "grad_norm": 3.6308307245288214, "language_loss": 0.86044586, "learning_rate": 1.7481048587124827e-06, "loss": 0.88167566, "num_input_tokens_seen": 198459835, "step": 9214, "time_per_iteration": 2.7264554500579834 }, { "auxiliary_loss_clip": 0.01110148, "auxiliary_loss_mlp": 0.01032811, "balance_loss_clip": 1.04324055, "balance_loss_mlp": 1.02003813, "epoch": 0.5540357733353375, "flos": 26353117128960.0, "grad_norm": 2.235553679927881, "language_loss": 0.70002753, "learning_rate": 1.7477185031320108e-06, "loss": 0.72145712, "num_input_tokens_seen": 198478955, "step": 9215, "time_per_iteration": 2.684901714324951 }, { "auxiliary_loss_clip": 0.01093255, "auxiliary_loss_mlp": 0.0103064, "balance_loss_clip": 1.03972387, "balance_loss_mlp": 1.01641822, "epoch": 0.5540958965880054, "flos": 21323684747520.0, "grad_norm": 1.5213166138329088, "language_loss": 0.73443544, "learning_rate": 1.7473321571180773e-06, "loss": 0.75567436, "num_input_tokens_seen": 198499030, "step": 9216, "time_per_iteration": 2.6930174827575684 }, { "auxiliary_loss_clip": 0.01095704, "auxiliary_loss_mlp": 0.0103884, "balance_loss_clip": 1.04206526, "balance_loss_mlp": 1.02541757, "epoch": 0.5541560198406734, "flos": 25666828899840.0, "grad_norm": 1.8909182551573178, "language_loss": 0.71728694, "learning_rate": 1.7469458206853345e-06, "loss": 0.73863238, "num_input_tokens_seen": 198520265, "step": 9217, "time_per_iteration": 2.705566644668579 }, { "auxiliary_loss_clip": 0.01102416, "auxiliary_loss_mlp": 0.01027899, "balance_loss_clip": 1.04219627, "balance_loss_mlp": 1.01496446, "epoch": 0.5542161430933413, "flos": 21939624190080.0, "grad_norm": 1.8150794810366015, "language_loss": 0.78261054, "learning_rate": 1.7465594938484315e-06, "loss": 0.80391365, "num_input_tokens_seen": 198539645, "step": 9218, "time_per_iteration": 2.6569690704345703 }, { "auxiliary_loss_clip": 0.01077956, "auxiliary_loss_mlp": 0.01037085, "balance_loss_clip": 1.03790164, "balance_loss_mlp": 1.02161169, "epoch": 0.5542762663460093, "flos": 19571459489280.0, "grad_norm": 1.6224660724744044, "language_loss": 0.72173905, "learning_rate": 1.7461731766220176e-06, "loss": 0.74288952, "num_input_tokens_seen": 198558710, "step": 9219, "time_per_iteration": 2.685511350631714 }, { "auxiliary_loss_clip": 0.01108862, "auxiliary_loss_mlp": 0.01039965, "balance_loss_clip": 1.04482341, "balance_loss_mlp": 1.0262028, "epoch": 0.5543363895986773, "flos": 19499063627520.0, "grad_norm": 1.5105706382424104, "language_loss": 0.71297967, "learning_rate": 1.7457868690207426e-06, "loss": 0.73446798, "num_input_tokens_seen": 198577050, "step": 9220, "time_per_iteration": 2.6306073665618896 }, { "auxiliary_loss_clip": 0.01120811, "auxiliary_loss_mlp": 0.01026848, "balance_loss_clip": 1.04381871, "balance_loss_mlp": 1.01429546, "epoch": 0.5543965128513453, "flos": 22635609091200.0, "grad_norm": 1.6307293256223026, "language_loss": 0.79449409, "learning_rate": 1.7454005710592547e-06, "loss": 0.81597066, "num_input_tokens_seen": 198595290, "step": 9221, "time_per_iteration": 2.664358139038086 }, { "auxiliary_loss_clip": 0.01090389, "auxiliary_loss_mlp": 0.01034475, "balance_loss_clip": 1.04653525, "balance_loss_mlp": 1.02108812, "epoch": 0.5544566361040132, "flos": 25989952671360.0, "grad_norm": 1.9685503329730023, "language_loss": 0.83722961, "learning_rate": 1.7450142827522027e-06, "loss": 0.85847831, "num_input_tokens_seen": 198614110, "step": 9222, "time_per_iteration": 2.770050048828125 }, { "auxiliary_loss_clip": 0.01100221, "auxiliary_loss_mlp": 0.00771629, "balance_loss_clip": 1.04789209, "balance_loss_mlp": 1.00036037, "epoch": 0.5545167593566812, "flos": 28257568225920.0, "grad_norm": 1.9185335813275248, "language_loss": 0.75431746, "learning_rate": 1.7446280041142344e-06, "loss": 0.773036, "num_input_tokens_seen": 198633880, "step": 9223, "time_per_iteration": 2.794182062149048 }, { "auxiliary_loss_clip": 0.01091289, "auxiliary_loss_mlp": 0.0103417, "balance_loss_clip": 1.04017019, "balance_loss_mlp": 1.0201149, "epoch": 0.5545768826093491, "flos": 28476551491200.0, "grad_norm": 1.614917501509061, "language_loss": 0.82090491, "learning_rate": 1.7442417351599986e-06, "loss": 0.84215945, "num_input_tokens_seen": 198653505, "step": 9224, "time_per_iteration": 2.7137935161590576 }, { "auxiliary_loss_clip": 0.01108448, "auxiliary_loss_mlp": 0.01043106, "balance_loss_clip": 1.04417324, "balance_loss_mlp": 1.02924204, "epoch": 0.5546370058620171, "flos": 18478051534080.0, "grad_norm": 1.7607532408743478, "language_loss": 0.57043874, "learning_rate": 1.743855475904141e-06, "loss": 0.59195429, "num_input_tokens_seen": 198671890, "step": 9225, "time_per_iteration": 2.616447687149048 }, { "auxiliary_loss_clip": 0.01112997, "auxiliary_loss_mlp": 0.01038411, "balance_loss_clip": 1.04317498, "balance_loss_mlp": 1.02444005, "epoch": 0.554697129114685, "flos": 22930507751040.0, "grad_norm": 1.6222452828903178, "language_loss": 0.67458808, "learning_rate": 1.7434692263613098e-06, "loss": 0.69610214, "num_input_tokens_seen": 198691995, "step": 9226, "time_per_iteration": 2.663339138031006 }, { "auxiliary_loss_clip": 0.0108551, "auxiliary_loss_mlp": 0.0103467, "balance_loss_clip": 1.03901601, "balance_loss_mlp": 1.02121162, "epoch": 0.5547572523673531, "flos": 21797166850560.0, "grad_norm": 1.6061917148762987, "language_loss": 0.74387592, "learning_rate": 1.7430829865461518e-06, "loss": 0.76507771, "num_input_tokens_seen": 198712440, "step": 9227, "time_per_iteration": 2.762258529663086 }, { "auxiliary_loss_clip": 0.01087938, "auxiliary_loss_mlp": 0.01034743, "balance_loss_clip": 1.04223549, "balance_loss_mlp": 1.02071249, "epoch": 0.5548173756200211, "flos": 22342829333760.0, "grad_norm": 1.8589261758591291, "language_loss": 0.73263627, "learning_rate": 1.7426967564733118e-06, "loss": 0.7538631, "num_input_tokens_seen": 198731515, "step": 9228, "time_per_iteration": 2.762092113494873 }, { "auxiliary_loss_clip": 0.01122414, "auxiliary_loss_mlp": 0.01031991, "balance_loss_clip": 1.04351175, "balance_loss_mlp": 1.01886559, "epoch": 0.554877498872689, "flos": 17858736213120.0, "grad_norm": 1.672332446894358, "language_loss": 0.75519872, "learning_rate": 1.7423105361574373e-06, "loss": 0.77674282, "num_input_tokens_seen": 198749750, "step": 9229, "time_per_iteration": 2.6003267765045166 }, { "auxiliary_loss_clip": 0.01110807, "auxiliary_loss_mlp": 0.00772253, "balance_loss_clip": 1.0439682, "balance_loss_mlp": 1.00026536, "epoch": 0.554937622125357, "flos": 17238343484160.0, "grad_norm": 1.7587828151966396, "language_loss": 0.68663722, "learning_rate": 1.741924325613172e-06, "loss": 0.70546782, "num_input_tokens_seen": 198768320, "step": 9230, "time_per_iteration": 2.6502435207366943 }, { "auxiliary_loss_clip": 0.01078746, "auxiliary_loss_mlp": 0.01039366, "balance_loss_clip": 1.04407859, "balance_loss_mlp": 1.02506709, "epoch": 0.5549977453780249, "flos": 25368087484800.0, "grad_norm": 2.162588573655947, "language_loss": 0.6800701, "learning_rate": 1.741538124855163e-06, "loss": 0.70125121, "num_input_tokens_seen": 198787230, "step": 9231, "time_per_iteration": 4.46450400352478 }, { "auxiliary_loss_clip": 0.01125233, "auxiliary_loss_mlp": 0.01040313, "balance_loss_clip": 1.04339528, "balance_loss_mlp": 1.02537608, "epoch": 0.555057868630693, "flos": 25079114568960.0, "grad_norm": 1.7058695185820383, "language_loss": 0.78623915, "learning_rate": 1.7411519338980548e-06, "loss": 0.80789459, "num_input_tokens_seen": 198806720, "step": 9232, "time_per_iteration": 4.17819356918335 }, { "auxiliary_loss_clip": 0.01077674, "auxiliary_loss_mlp": 0.01038155, "balance_loss_clip": 1.03794336, "balance_loss_mlp": 1.02523899, "epoch": 0.5551179918833609, "flos": 26104220812800.0, "grad_norm": 1.530027860156435, "language_loss": 0.82512534, "learning_rate": 1.7407657527564898e-06, "loss": 0.84628367, "num_input_tokens_seen": 198826235, "step": 9233, "time_per_iteration": 2.7746078968048096 }, { "auxiliary_loss_clip": 0.01108881, "auxiliary_loss_mlp": 0.01040385, "balance_loss_clip": 1.04062366, "balance_loss_mlp": 1.02632475, "epoch": 0.5551781151360289, "flos": 19384759572480.0, "grad_norm": 8.113354085779601, "language_loss": 0.74638891, "learning_rate": 1.7403795814451142e-06, "loss": 0.76788163, "num_input_tokens_seen": 198842655, "step": 9234, "time_per_iteration": 2.6174590587615967 }, { "auxiliary_loss_clip": 0.01094953, "auxiliary_loss_mlp": 0.01029345, "balance_loss_clip": 1.03896558, "balance_loss_mlp": 1.01647031, "epoch": 0.5552382383886968, "flos": 21725956137600.0, "grad_norm": 4.639125305136136, "language_loss": 0.64988184, "learning_rate": 1.7399934199785706e-06, "loss": 0.67112482, "num_input_tokens_seen": 198861210, "step": 9235, "time_per_iteration": 2.6820857524871826 }, { "auxiliary_loss_clip": 0.0106692, "auxiliary_loss_mlp": 0.01042767, "balance_loss_clip": 1.03562975, "balance_loss_mlp": 1.02793705, "epoch": 0.5552983616413648, "flos": 14356189117440.0, "grad_norm": 1.66240052317675, "language_loss": 0.67842531, "learning_rate": 1.7396072683715029e-06, "loss": 0.69952214, "num_input_tokens_seen": 198880045, "step": 9236, "time_per_iteration": 4.265462160110474 }, { "auxiliary_loss_clip": 0.01116825, "auxiliary_loss_mlp": 0.01028362, "balance_loss_clip": 1.04261172, "balance_loss_mlp": 1.01549888, "epoch": 0.5553584848940327, "flos": 25478548784640.0, "grad_norm": 1.8489707966449562, "language_loss": 0.86189765, "learning_rate": 1.7392211266385536e-06, "loss": 0.88334954, "num_input_tokens_seen": 198900210, "step": 9237, "time_per_iteration": 2.662736654281616 }, { "auxiliary_loss_clip": 0.01108193, "auxiliary_loss_mlp": 0.01037757, "balance_loss_clip": 1.04178131, "balance_loss_mlp": 1.02388716, "epoch": 0.5554186081467007, "flos": 22163850840960.0, "grad_norm": 2.008755703666539, "language_loss": 0.73663169, "learning_rate": 1.7388349947943652e-06, "loss": 0.75809121, "num_input_tokens_seen": 198919055, "step": 9238, "time_per_iteration": 2.6842122077941895 }, { "auxiliary_loss_clip": 0.01105716, "auxiliary_loss_mlp": 0.01031608, "balance_loss_clip": 1.0387727, "balance_loss_mlp": 1.01777411, "epoch": 0.5554787313993687, "flos": 49746656125440.0, "grad_norm": 1.8187915692087442, "language_loss": 0.78551757, "learning_rate": 1.73844887285358e-06, "loss": 0.80689085, "num_input_tokens_seen": 198943505, "step": 9239, "time_per_iteration": 2.887911558151245 }, { "auxiliary_loss_clip": 0.01106485, "auxiliary_loss_mlp": 0.01030728, "balance_loss_clip": 1.04819751, "balance_loss_mlp": 1.01699483, "epoch": 0.5555388546520367, "flos": 22127365601280.0, "grad_norm": 1.7617963791060023, "language_loss": 0.8016845, "learning_rate": 1.7380627608308393e-06, "loss": 0.82305664, "num_input_tokens_seen": 198963590, "step": 9240, "time_per_iteration": 2.759277582168579 }, { "auxiliary_loss_clip": 0.0109666, "auxiliary_loss_mlp": 0.01034491, "balance_loss_clip": 1.04089236, "balance_loss_mlp": 1.02099013, "epoch": 0.5555989779047047, "flos": 24682122478080.0, "grad_norm": 2.168471057936508, "language_loss": 0.65255535, "learning_rate": 1.737676658740786e-06, "loss": 0.67386687, "num_input_tokens_seen": 198982680, "step": 9241, "time_per_iteration": 2.7321317195892334 }, { "auxiliary_loss_clip": 0.01110689, "auxiliary_loss_mlp": 0.0077113, "balance_loss_clip": 1.04320502, "balance_loss_mlp": 1.00029731, "epoch": 0.5556591011573726, "flos": 16106510954880.0, "grad_norm": 1.885035131778914, "language_loss": 0.72406638, "learning_rate": 1.7372905665980594e-06, "loss": 0.74288458, "num_input_tokens_seen": 199000185, "step": 9242, "time_per_iteration": 2.6891591548919678 }, { "auxiliary_loss_clip": 0.01106836, "auxiliary_loss_mlp": 0.01034566, "balance_loss_clip": 1.04584861, "balance_loss_mlp": 1.02024293, "epoch": 0.5557192244100406, "flos": 12933695733120.0, "grad_norm": 1.6675932055368092, "language_loss": 0.64065903, "learning_rate": 1.7369044844173012e-06, "loss": 0.66207308, "num_input_tokens_seen": 199018380, "step": 9243, "time_per_iteration": 3.1710290908813477 }, { "auxiliary_loss_clip": 0.01094198, "auxiliary_loss_mlp": 0.00771105, "balance_loss_clip": 1.04436445, "balance_loss_mlp": 1.00027966, "epoch": 0.5557793476627085, "flos": 23111712887040.0, "grad_norm": 2.6865994829235333, "language_loss": 0.75548631, "learning_rate": 1.7365184122131509e-06, "loss": 0.77413929, "num_input_tokens_seen": 199037115, "step": 9244, "time_per_iteration": 2.686121940612793 }, { "auxiliary_loss_clip": 0.01091692, "auxiliary_loss_mlp": 0.01036173, "balance_loss_clip": 1.03900838, "balance_loss_mlp": 1.02352512, "epoch": 0.5558394709153766, "flos": 21428040735360.0, "grad_norm": 2.0505810415857506, "language_loss": 0.75051856, "learning_rate": 1.7361323500002486e-06, "loss": 0.77179724, "num_input_tokens_seen": 199053375, "step": 9245, "time_per_iteration": 2.6561057567596436 }, { "auxiliary_loss_clip": 0.01099057, "auxiliary_loss_mlp": 0.01034743, "balance_loss_clip": 1.04262114, "balance_loss_mlp": 1.02087283, "epoch": 0.5558995941680445, "flos": 25078324469760.0, "grad_norm": 2.0581034442408055, "language_loss": 0.79967058, "learning_rate": 1.7357462977932348e-06, "loss": 0.82100856, "num_input_tokens_seen": 199070930, "step": 9246, "time_per_iteration": 2.6968653202056885 }, { "auxiliary_loss_clip": 0.01120892, "auxiliary_loss_mlp": 0.01037931, "balance_loss_clip": 1.0435034, "balance_loss_mlp": 1.0241977, "epoch": 0.5559597174207125, "flos": 20011149872640.0, "grad_norm": 1.8340386723611697, "language_loss": 0.73825908, "learning_rate": 1.7353602556067471e-06, "loss": 0.75984728, "num_input_tokens_seen": 199088675, "step": 9247, "time_per_iteration": 2.5861082077026367 }, { "auxiliary_loss_clip": 0.01091731, "auxiliary_loss_mlp": 0.01035279, "balance_loss_clip": 1.04089963, "balance_loss_mlp": 1.0214448, "epoch": 0.5560198406733804, "flos": 16835677044480.0, "grad_norm": 2.6765383510534324, "language_loss": 0.74975288, "learning_rate": 1.7349742234554254e-06, "loss": 0.77102304, "num_input_tokens_seen": 199103075, "step": 9248, "time_per_iteration": 2.634092092514038 }, { "auxiliary_loss_clip": 0.00999886, "auxiliary_loss_mlp": 0.01011469, "balance_loss_clip": 1.01177704, "balance_loss_mlp": 1.00989556, "epoch": 0.5560799639260484, "flos": 70697051758080.0, "grad_norm": 0.8462101410465201, "language_loss": 0.59490269, "learning_rate": 1.7345882013539081e-06, "loss": 0.61501622, "num_input_tokens_seen": 199160325, "step": 9249, "time_per_iteration": 3.389267683029175 }, { "auxiliary_loss_clip": 0.01118078, "auxiliary_loss_mlp": 0.01029785, "balance_loss_clip": 1.04007614, "balance_loss_mlp": 1.01592088, "epoch": 0.5561400871787163, "flos": 23148593176320.0, "grad_norm": 2.8767161081984427, "language_loss": 0.79950154, "learning_rate": 1.734202189316832e-06, "loss": 0.82098025, "num_input_tokens_seen": 199179760, "step": 9250, "time_per_iteration": 2.578690528869629 }, { "auxiliary_loss_clip": 0.01098469, "auxiliary_loss_mlp": 0.01034798, "balance_loss_clip": 1.04169929, "balance_loss_mlp": 1.02075529, "epoch": 0.5562002104313843, "flos": 17566423332480.0, "grad_norm": 3.104352444179477, "language_loss": 0.68685251, "learning_rate": 1.733816187358836e-06, "loss": 0.7081852, "num_input_tokens_seen": 199196695, "step": 9251, "time_per_iteration": 2.7810349464416504 }, { "auxiliary_loss_clip": 0.01109089, "auxiliary_loss_mlp": 0.01033405, "balance_loss_clip": 1.04200792, "balance_loss_mlp": 1.02018476, "epoch": 0.5562603336840523, "flos": 25045430590080.0, "grad_norm": 1.5038625186154766, "language_loss": 0.75750792, "learning_rate": 1.7334301954945569e-06, "loss": 0.77893281, "num_input_tokens_seen": 199217845, "step": 9252, "time_per_iteration": 2.663238286972046 }, { "auxiliary_loss_clip": 0.01107916, "auxiliary_loss_mlp": 0.01039535, "balance_loss_clip": 1.04108679, "balance_loss_mlp": 1.02441943, "epoch": 0.5563204569367203, "flos": 29059022436480.0, "grad_norm": 1.5228616100256118, "language_loss": 0.72854966, "learning_rate": 1.7330442137386313e-06, "loss": 0.7500242, "num_input_tokens_seen": 199239250, "step": 9253, "time_per_iteration": 2.6020450592041016 }, { "auxiliary_loss_clip": 0.01093689, "auxiliary_loss_mlp": 0.01032948, "balance_loss_clip": 1.04451489, "balance_loss_mlp": 1.02043748, "epoch": 0.5563805801893883, "flos": 22090449398400.0, "grad_norm": 1.6703038143704756, "language_loss": 0.83143723, "learning_rate": 1.7326582421056965e-06, "loss": 0.85270357, "num_input_tokens_seen": 199258320, "step": 9254, "time_per_iteration": 2.701199531555176 }, { "auxiliary_loss_clip": 0.01012318, "auxiliary_loss_mlp": 0.01004464, "balance_loss_clip": 1.01460981, "balance_loss_mlp": 1.0030154, "epoch": 0.5564407034420562, "flos": 58636128689280.0, "grad_norm": 0.8693463823650434, "language_loss": 0.64875168, "learning_rate": 1.732272280610387e-06, "loss": 0.6689195, "num_input_tokens_seen": 199314840, "step": 9255, "time_per_iteration": 3.1222445964813232 }, { "auxiliary_loss_clip": 0.01111592, "auxiliary_loss_mlp": 0.01033344, "balance_loss_clip": 1.04527521, "balance_loss_mlp": 1.02035666, "epoch": 0.5565008266947242, "flos": 23112323418240.0, "grad_norm": 2.147539486852423, "language_loss": 0.69487607, "learning_rate": 1.7318863292673399e-06, "loss": 0.7163254, "num_input_tokens_seen": 199335405, "step": 9256, "time_per_iteration": 2.642542600631714 }, { "auxiliary_loss_clip": 0.01085774, "auxiliary_loss_mlp": 0.01031728, "balance_loss_clip": 1.04269767, "balance_loss_mlp": 1.01939559, "epoch": 0.5565609499473921, "flos": 21578399066880.0, "grad_norm": 1.6171582584602333, "language_loss": 0.75981283, "learning_rate": 1.73150038809119e-06, "loss": 0.78098786, "num_input_tokens_seen": 199354345, "step": 9257, "time_per_iteration": 2.712520122528076 }, { "auxiliary_loss_clip": 0.01074562, "auxiliary_loss_mlp": 0.01036038, "balance_loss_clip": 1.04019046, "balance_loss_mlp": 1.0233897, "epoch": 0.5566210732000602, "flos": 18369637309440.0, "grad_norm": 3.6499733263034746, "language_loss": 0.60697454, "learning_rate": 1.7311144570965724e-06, "loss": 0.62808049, "num_input_tokens_seen": 199372250, "step": 9258, "time_per_iteration": 2.751559257507324 }, { "auxiliary_loss_clip": 0.01084702, "auxiliary_loss_mlp": 0.01035032, "balance_loss_clip": 1.03922486, "balance_loss_mlp": 1.02042937, "epoch": 0.5566811964527281, "flos": 25703350053120.0, "grad_norm": 1.5966024354647115, "language_loss": 0.79111505, "learning_rate": 1.7307285362981215e-06, "loss": 0.81231236, "num_input_tokens_seen": 199392815, "step": 9259, "time_per_iteration": 2.7664895057678223 }, { "auxiliary_loss_clip": 0.01088989, "auxiliary_loss_mlp": 0.0103733, "balance_loss_clip": 1.04242945, "balance_loss_mlp": 1.02328086, "epoch": 0.5567413197053961, "flos": 26943991856640.0, "grad_norm": 1.7833081696281723, "language_loss": 0.81253225, "learning_rate": 1.7303426257104712e-06, "loss": 0.83379543, "num_input_tokens_seen": 199412375, "step": 9260, "time_per_iteration": 2.79059100151062 }, { "auxiliary_loss_clip": 0.01120889, "auxiliary_loss_mlp": 0.01039805, "balance_loss_clip": 1.04265976, "balance_loss_mlp": 1.02585721, "epoch": 0.556801442958064, "flos": 20850597694080.0, "grad_norm": 1.513133023380305, "language_loss": 0.69277883, "learning_rate": 1.729956725348256e-06, "loss": 0.71438575, "num_input_tokens_seen": 199431490, "step": 9261, "time_per_iteration": 2.5942957401275635 }, { "auxiliary_loss_clip": 0.01009344, "auxiliary_loss_mlp": 0.01005985, "balance_loss_clip": 1.01376081, "balance_loss_mlp": 1.00455499, "epoch": 0.556861566210732, "flos": 70498213044480.0, "grad_norm": 0.7654306967564637, "language_loss": 0.61116695, "learning_rate": 1.729570835226108e-06, "loss": 0.63132024, "num_input_tokens_seen": 199495855, "step": 9262, "time_per_iteration": 3.2477405071258545 }, { "auxiliary_loss_clip": 0.01109024, "auxiliary_loss_mlp": 0.0103923, "balance_loss_clip": 1.03991163, "balance_loss_mlp": 1.02594411, "epoch": 0.5569216894633999, "flos": 25337276593920.0, "grad_norm": 1.6344264149627976, "language_loss": 0.64423072, "learning_rate": 1.7291849553586622e-06, "loss": 0.66571325, "num_input_tokens_seen": 199515870, "step": 9263, "time_per_iteration": 2.658576488494873 }, { "auxiliary_loss_clip": 0.01095378, "auxiliary_loss_mlp": 0.010346, "balance_loss_clip": 1.03873014, "balance_loss_mlp": 1.02134418, "epoch": 0.556981812716068, "flos": 22638733574400.0, "grad_norm": 1.867976542015905, "language_loss": 0.73368537, "learning_rate": 1.7287990857605497e-06, "loss": 0.75498509, "num_input_tokens_seen": 199535745, "step": 9264, "time_per_iteration": 2.7003254890441895 }, { "auxiliary_loss_clip": 0.01095238, "auxiliary_loss_mlp": 0.01029532, "balance_loss_clip": 1.04636014, "balance_loss_mlp": 1.01672268, "epoch": 0.5570419359687359, "flos": 11035852738560.0, "grad_norm": 2.2771016341265526, "language_loss": 0.76178783, "learning_rate": 1.7284132264464022e-06, "loss": 0.78303552, "num_input_tokens_seen": 199554035, "step": 9265, "time_per_iteration": 2.7386014461517334 }, { "auxiliary_loss_clip": 0.01090389, "auxiliary_loss_mlp": 0.01034586, "balance_loss_clip": 1.04179025, "balance_loss_mlp": 1.02249825, "epoch": 0.5571020592214039, "flos": 22823135020800.0, "grad_norm": 1.339030652191656, "language_loss": 0.70789158, "learning_rate": 1.7280273774308536e-06, "loss": 0.72914135, "num_input_tokens_seen": 199576120, "step": 9266, "time_per_iteration": 2.741800546646118 }, { "auxiliary_loss_clip": 0.01094155, "auxiliary_loss_mlp": 0.0103873, "balance_loss_clip": 1.03911209, "balance_loss_mlp": 1.0255034, "epoch": 0.5571621824740719, "flos": 22927778317440.0, "grad_norm": 2.0031056980063506, "language_loss": 0.68157613, "learning_rate": 1.727641538728533e-06, "loss": 0.70290494, "num_input_tokens_seen": 199593780, "step": 9267, "time_per_iteration": 2.7874062061309814 }, { "auxiliary_loss_clip": 0.01104037, "auxiliary_loss_mlp": 0.01038856, "balance_loss_clip": 1.03991306, "balance_loss_mlp": 1.02653575, "epoch": 0.5572223057267398, "flos": 22966705681920.0, "grad_norm": 1.918660534651482, "language_loss": 0.74570519, "learning_rate": 1.7272557103540736e-06, "loss": 0.76713407, "num_input_tokens_seen": 199613220, "step": 9268, "time_per_iteration": 2.7008538246154785 }, { "auxiliary_loss_clip": 0.01103292, "auxiliary_loss_mlp": 0.00770403, "balance_loss_clip": 1.04299617, "balance_loss_mlp": 1.00017905, "epoch": 0.5572824289794078, "flos": 20960053413120.0, "grad_norm": 1.8745085493520866, "language_loss": 0.75087655, "learning_rate": 1.726869892322104e-06, "loss": 0.76961344, "num_input_tokens_seen": 199632085, "step": 9269, "time_per_iteration": 2.653756856918335 }, { "auxiliary_loss_clip": 0.01081519, "auxiliary_loss_mlp": 0.01046232, "balance_loss_clip": 1.03722787, "balance_loss_mlp": 1.03201032, "epoch": 0.5573425522320757, "flos": 25042413847680.0, "grad_norm": 1.688879717720704, "language_loss": 0.82588089, "learning_rate": 1.726484084647256e-06, "loss": 0.84715831, "num_input_tokens_seen": 199649295, "step": 9270, "time_per_iteration": 4.278396844863892 }, { "auxiliary_loss_clip": 0.01079257, "auxiliary_loss_mlp": 0.01039234, "balance_loss_clip": 1.04120445, "balance_loss_mlp": 1.02594197, "epoch": 0.5574026754847438, "flos": 23659637927040.0, "grad_norm": 2.0078243728297167, "language_loss": 0.79825968, "learning_rate": 1.7260982873441591e-06, "loss": 0.81944454, "num_input_tokens_seen": 199668870, "step": 9271, "time_per_iteration": 6.1330788135528564 }, { "auxiliary_loss_clip": 0.01099668, "auxiliary_loss_mlp": 0.01031508, "balance_loss_clip": 1.04303491, "balance_loss_mlp": 1.01848447, "epoch": 0.5574627987374117, "flos": 24782240661120.0, "grad_norm": 2.2903855544483394, "language_loss": 0.90515852, "learning_rate": 1.725712500427442e-06, "loss": 0.92647034, "num_input_tokens_seen": 199684870, "step": 9272, "time_per_iteration": 2.6802456378936768 }, { "auxiliary_loss_clip": 0.01086004, "auxiliary_loss_mlp": 0.01032973, "balance_loss_clip": 1.04199028, "balance_loss_mlp": 1.02049148, "epoch": 0.5575229219900797, "flos": 21834944979840.0, "grad_norm": 2.009692341926254, "language_loss": 0.83817393, "learning_rate": 1.7253267239117347e-06, "loss": 0.85936373, "num_input_tokens_seen": 199701975, "step": 9273, "time_per_iteration": 2.714702606201172 }, { "auxiliary_loss_clip": 0.01111871, "auxiliary_loss_mlp": 0.01043586, "balance_loss_clip": 1.0435437, "balance_loss_mlp": 1.0286727, "epoch": 0.5575830452427476, "flos": 27815148408960.0, "grad_norm": 2.029727061879287, "language_loss": 0.74000418, "learning_rate": 1.7249409578116655e-06, "loss": 0.76155877, "num_input_tokens_seen": 199721865, "step": 9274, "time_per_iteration": 2.6897573471069336 }, { "auxiliary_loss_clip": 0.01102598, "auxiliary_loss_mlp": 0.01036471, "balance_loss_clip": 1.04597545, "balance_loss_mlp": 1.02202296, "epoch": 0.5576431684954156, "flos": 17812805696640.0, "grad_norm": 2.7929550344218885, "language_loss": 0.7749905, "learning_rate": 1.7245552021418629e-06, "loss": 0.79638124, "num_input_tokens_seen": 199736455, "step": 9275, "time_per_iteration": 2.6423583030700684 }, { "auxiliary_loss_clip": 0.01093646, "auxiliary_loss_mlp": 0.01035097, "balance_loss_clip": 1.04310751, "balance_loss_mlp": 1.02178109, "epoch": 0.5577032917480835, "flos": 15486872411520.0, "grad_norm": 1.5365384810156146, "language_loss": 0.75059974, "learning_rate": 1.7241694569169546e-06, "loss": 0.77188718, "num_input_tokens_seen": 199753125, "step": 9276, "time_per_iteration": 4.227986812591553 }, { "auxiliary_loss_clip": 0.01098066, "auxiliary_loss_mlp": 0.01035646, "balance_loss_clip": 1.04026711, "balance_loss_mlp": 1.02219296, "epoch": 0.5577634150007516, "flos": 21579763783680.0, "grad_norm": 1.8156811956405543, "language_loss": 0.75730252, "learning_rate": 1.7237837221515678e-06, "loss": 0.77863955, "num_input_tokens_seen": 199771365, "step": 9277, "time_per_iteration": 2.651348114013672 }, { "auxiliary_loss_clip": 0.01117192, "auxiliary_loss_mlp": 0.01033742, "balance_loss_clip": 1.04269838, "balance_loss_mlp": 1.02087963, "epoch": 0.5578235382534195, "flos": 21139750177920.0, "grad_norm": 1.871466977383403, "language_loss": 0.71828836, "learning_rate": 1.7233979978603304e-06, "loss": 0.73979771, "num_input_tokens_seen": 199790035, "step": 9278, "time_per_iteration": 2.657386302947998 }, { "auxiliary_loss_clip": 0.0108587, "auxiliary_loss_mlp": 0.01037056, "balance_loss_clip": 1.04430723, "balance_loss_mlp": 1.02232218, "epoch": 0.5578836615060875, "flos": 26505199313280.0, "grad_norm": 1.586228481919935, "language_loss": 0.75729156, "learning_rate": 1.723012284057868e-06, "loss": 0.77852082, "num_input_tokens_seen": 199811125, "step": 9279, "time_per_iteration": 2.751840353012085 }, { "auxiliary_loss_clip": 0.01093934, "auxiliary_loss_mlp": 0.01037128, "balance_loss_clip": 1.03794658, "balance_loss_mlp": 1.02376509, "epoch": 0.5579437847587555, "flos": 20153786780160.0, "grad_norm": 1.6097529730476008, "language_loss": 0.67559254, "learning_rate": 1.7226265807588082e-06, "loss": 0.69690311, "num_input_tokens_seen": 199829915, "step": 9280, "time_per_iteration": 2.6563684940338135 }, { "auxiliary_loss_clip": 0.01106752, "auxiliary_loss_mlp": 0.01041709, "balance_loss_clip": 1.0392946, "balance_loss_mlp": 1.02810693, "epoch": 0.5580039080114234, "flos": 26102281478400.0, "grad_norm": 1.6056594505621422, "language_loss": 0.73215401, "learning_rate": 1.7222408879777763e-06, "loss": 0.75363857, "num_input_tokens_seen": 199850670, "step": 9281, "time_per_iteration": 2.6871986389160156 }, { "auxiliary_loss_clip": 0.01086628, "auxiliary_loss_mlp": 0.00770991, "balance_loss_clip": 1.04039741, "balance_loss_mlp": 1.0002861, "epoch": 0.5580640312640914, "flos": 13771671096960.0, "grad_norm": 3.0582981113882317, "language_loss": 0.75378543, "learning_rate": 1.7218552057293974e-06, "loss": 0.77236158, "num_input_tokens_seen": 199867645, "step": 9282, "time_per_iteration": 2.680744171142578 }, { "auxiliary_loss_clip": 0.01055422, "auxiliary_loss_mlp": 0.01036854, "balance_loss_clip": 1.03532624, "balance_loss_mlp": 1.02328229, "epoch": 0.5581241545167593, "flos": 17675986792320.0, "grad_norm": 2.212590462669887, "language_loss": 0.6592958, "learning_rate": 1.721469534028297e-06, "loss": 0.68021852, "num_input_tokens_seen": 199886320, "step": 9283, "time_per_iteration": 2.7523255348205566 }, { "auxiliary_loss_clip": 0.01087506, "auxiliary_loss_mlp": 0.01030166, "balance_loss_clip": 1.04440904, "balance_loss_mlp": 1.01841235, "epoch": 0.5581842777694274, "flos": 19569161018880.0, "grad_norm": 1.7248818916670352, "language_loss": 0.82969356, "learning_rate": 1.7210838728890994e-06, "loss": 0.85087025, "num_input_tokens_seen": 199904895, "step": 9284, "time_per_iteration": 2.6912968158721924 }, { "auxiliary_loss_clip": 0.01097795, "auxiliary_loss_mlp": 0.0103561, "balance_loss_clip": 1.04244661, "balance_loss_mlp": 1.02261066, "epoch": 0.5582444010220953, "flos": 20595165102720.0, "grad_norm": 2.3068151709488736, "language_loss": 0.85949606, "learning_rate": 1.7206982223264304e-06, "loss": 0.88083011, "num_input_tokens_seen": 199921090, "step": 9285, "time_per_iteration": 2.6835310459136963 }, { "auxiliary_loss_clip": 0.01095995, "auxiliary_loss_mlp": 0.01037997, "balance_loss_clip": 1.0437417, "balance_loss_mlp": 1.02543855, "epoch": 0.5583045242747633, "flos": 19135504120320.0, "grad_norm": 2.6758058324476024, "language_loss": 0.73497176, "learning_rate": 1.720312582354912e-06, "loss": 0.75631171, "num_input_tokens_seen": 199939925, "step": 9286, "time_per_iteration": 2.7510128021240234 }, { "auxiliary_loss_clip": 0.01119969, "auxiliary_loss_mlp": 0.01032279, "balance_loss_clip": 1.04193521, "balance_loss_mlp": 1.01924896, "epoch": 0.5583646475274312, "flos": 27454569730560.0, "grad_norm": 2.5542622351497104, "language_loss": 0.7366401, "learning_rate": 1.7199269529891684e-06, "loss": 0.7581625, "num_input_tokens_seen": 199960015, "step": 9287, "time_per_iteration": 2.7764368057250977 }, { "auxiliary_loss_clip": 0.01087822, "auxiliary_loss_mlp": 0.01038543, "balance_loss_clip": 1.04215682, "balance_loss_mlp": 1.0240171, "epoch": 0.5584247707800992, "flos": 23653784010240.0, "grad_norm": 1.5995445525462566, "language_loss": 0.75250727, "learning_rate": 1.7195413342438233e-06, "loss": 0.77377093, "num_input_tokens_seen": 199980505, "step": 9288, "time_per_iteration": 2.711667060852051 }, { "auxiliary_loss_clip": 0.01101347, "auxiliary_loss_mlp": 0.01045442, "balance_loss_clip": 1.04461765, "balance_loss_mlp": 1.03062999, "epoch": 0.5584848940327671, "flos": 13698880185600.0, "grad_norm": 2.3847574468541075, "language_loss": 0.77486145, "learning_rate": 1.7191557261334984e-06, "loss": 0.79632932, "num_input_tokens_seen": 199999020, "step": 9289, "time_per_iteration": 2.726365566253662 }, { "auxiliary_loss_clip": 0.01092544, "auxiliary_loss_mlp": 0.01034807, "balance_loss_clip": 1.04270971, "balance_loss_mlp": 1.02084172, "epoch": 0.5585450172854352, "flos": 27016208150400.0, "grad_norm": 1.8546991944448898, "language_loss": 0.61392409, "learning_rate": 1.718770128672817e-06, "loss": 0.63519758, "num_input_tokens_seen": 200019020, "step": 9290, "time_per_iteration": 2.7546441555023193 }, { "auxiliary_loss_clip": 0.01071377, "auxiliary_loss_mlp": 0.01032544, "balance_loss_clip": 1.03871763, "balance_loss_mlp": 1.01945531, "epoch": 0.5586051405381031, "flos": 23185653033600.0, "grad_norm": 2.64639974160875, "language_loss": 0.68249333, "learning_rate": 1.7183845418764e-06, "loss": 0.70353258, "num_input_tokens_seen": 200038110, "step": 9291, "time_per_iteration": 3.030916452407837 }, { "auxiliary_loss_clip": 0.01091279, "auxiliary_loss_mlp": 0.01045913, "balance_loss_clip": 1.04114079, "balance_loss_mlp": 1.03218007, "epoch": 0.5586652637907711, "flos": 20775544225920.0, "grad_norm": 1.7635760067758424, "language_loss": 0.84269536, "learning_rate": 1.7179989657588698e-06, "loss": 0.86406732, "num_input_tokens_seen": 200056210, "step": 9292, "time_per_iteration": 2.6990363597869873 }, { "auxiliary_loss_clip": 0.01090195, "auxiliary_loss_mlp": 0.01046206, "balance_loss_clip": 1.03904271, "balance_loss_mlp": 1.03265166, "epoch": 0.5587253870434391, "flos": 28219897837440.0, "grad_norm": 2.3637237833932687, "language_loss": 0.73976684, "learning_rate": 1.7176134003348476e-06, "loss": 0.76113087, "num_input_tokens_seen": 200075620, "step": 9293, "time_per_iteration": 2.7066195011138916 }, { "auxiliary_loss_clip": 0.0108672, "auxiliary_loss_mlp": 0.01044291, "balance_loss_clip": 1.04188502, "balance_loss_mlp": 1.03185785, "epoch": 0.558785510296107, "flos": 26615732440320.0, "grad_norm": 1.7291294273759894, "language_loss": 0.72083485, "learning_rate": 1.7172278456189523e-06, "loss": 0.74214494, "num_input_tokens_seen": 200095945, "step": 9294, "time_per_iteration": 2.7188310623168945 }, { "auxiliary_loss_clip": 0.01098814, "auxiliary_loss_mlp": 0.00770939, "balance_loss_clip": 1.04345989, "balance_loss_mlp": 1.0002197, "epoch": 0.558845633548775, "flos": 20156767608960.0, "grad_norm": 2.0034844848738995, "language_loss": 0.68573147, "learning_rate": 1.716842301625806e-06, "loss": 0.70442897, "num_input_tokens_seen": 200114185, "step": 9295, "time_per_iteration": 2.645157814025879 }, { "auxiliary_loss_clip": 0.01120796, "auxiliary_loss_mlp": 0.01037699, "balance_loss_clip": 1.04437232, "balance_loss_mlp": 1.02404976, "epoch": 0.5589057568014429, "flos": 24350774492160.0, "grad_norm": 1.451861251832641, "language_loss": 0.81153715, "learning_rate": 1.7164567683700281e-06, "loss": 0.83312207, "num_input_tokens_seen": 200135030, "step": 9296, "time_per_iteration": 2.638831853866577 }, { "auxiliary_loss_clip": 0.01109007, "auxiliary_loss_mlp": 0.01036287, "balance_loss_clip": 1.0433023, "balance_loss_mlp": 1.02302504, "epoch": 0.558965880054111, "flos": 21105168359040.0, "grad_norm": 2.39482931377815, "language_loss": 0.65407717, "learning_rate": 1.7160712458662379e-06, "loss": 0.67553014, "num_input_tokens_seen": 200154290, "step": 9297, "time_per_iteration": 2.6714565753936768 }, { "auxiliary_loss_clip": 0.01088452, "auxiliary_loss_mlp": 0.01039165, "balance_loss_clip": 1.04224098, "balance_loss_mlp": 1.024997, "epoch": 0.5590260033067789, "flos": 18436071513600.0, "grad_norm": 1.768502931317098, "language_loss": 0.75242859, "learning_rate": 1.7156857341290544e-06, "loss": 0.77370477, "num_input_tokens_seen": 200171555, "step": 9298, "time_per_iteration": 2.7061312198638916 }, { "auxiliary_loss_clip": 0.01019627, "auxiliary_loss_mlp": 0.01016507, "balance_loss_clip": 1.01274395, "balance_loss_mlp": 1.01488543, "epoch": 0.5590861265594469, "flos": 70577432490240.0, "grad_norm": 0.6867151105979278, "language_loss": 0.52393436, "learning_rate": 1.7153002331730967e-06, "loss": 0.54429573, "num_input_tokens_seen": 200237010, "step": 9299, "time_per_iteration": 3.2783946990966797 }, { "auxiliary_loss_clip": 0.01104521, "auxiliary_loss_mlp": 0.01037017, "balance_loss_clip": 1.04119837, "balance_loss_mlp": 1.02390957, "epoch": 0.5591462498121148, "flos": 30664408896000.0, "grad_norm": 1.9265460961114051, "language_loss": 0.69143355, "learning_rate": 1.7149147430129824e-06, "loss": 0.7128489, "num_input_tokens_seen": 200260820, "step": 9300, "time_per_iteration": 2.716351270675659 }, { "auxiliary_loss_clip": 0.01065458, "auxiliary_loss_mlp": 0.01057284, "balance_loss_clip": 1.03432143, "balance_loss_mlp": 1.04067802, "epoch": 0.5592063730647828, "flos": 18150438562560.0, "grad_norm": 2.0948179426753164, "language_loss": 0.81994128, "learning_rate": 1.7145292636633293e-06, "loss": 0.84116876, "num_input_tokens_seen": 200278035, "step": 9301, "time_per_iteration": 2.6983389854431152 }, { "auxiliary_loss_clip": 0.01117535, "auxiliary_loss_mlp": 0.01032183, "balance_loss_clip": 1.04067254, "balance_loss_mlp": 1.0186348, "epoch": 0.5592664963174507, "flos": 24060400945920.0, "grad_norm": 3.1722185850775553, "language_loss": 0.68140459, "learning_rate": 1.714143795138756e-06, "loss": 0.70290172, "num_input_tokens_seen": 200297255, "step": 9302, "time_per_iteration": 2.5997016429901123 }, { "auxiliary_loss_clip": 0.01088292, "auxiliary_loss_mlp": 0.01028765, "balance_loss_clip": 1.04123783, "balance_loss_mlp": 1.01426911, "epoch": 0.5593266195701188, "flos": 19827897661440.0, "grad_norm": 1.7171276141981482, "language_loss": 0.70894414, "learning_rate": 1.713758337453878e-06, "loss": 0.7301147, "num_input_tokens_seen": 200317505, "step": 9303, "time_per_iteration": 2.720726728439331 }, { "auxiliary_loss_clip": 0.01045978, "auxiliary_loss_mlp": 0.01043666, "balance_loss_clip": 1.03466618, "balance_loss_mlp": 1.02934885, "epoch": 0.5593867428227867, "flos": 25300755440640.0, "grad_norm": 3.8871936508431606, "language_loss": 0.72614998, "learning_rate": 1.7133728906233124e-06, "loss": 0.74704641, "num_input_tokens_seen": 200338350, "step": 9304, "time_per_iteration": 2.7727861404418945 }, { "auxiliary_loss_clip": 0.01107464, "auxiliary_loss_mlp": 0.01030237, "balance_loss_clip": 1.04120493, "balance_loss_mlp": 1.0174104, "epoch": 0.5594468660754547, "flos": 12933013374720.0, "grad_norm": 2.306388303475261, "language_loss": 0.77981883, "learning_rate": 1.7129874546616763e-06, "loss": 0.80119586, "num_input_tokens_seen": 200353965, "step": 9305, "time_per_iteration": 2.5945067405700684 }, { "auxiliary_loss_clip": 0.01069392, "auxiliary_loss_mlp": 0.01030263, "balance_loss_clip": 1.04184294, "balance_loss_mlp": 1.01778793, "epoch": 0.5595069893281227, "flos": 19062713208960.0, "grad_norm": 1.7491845938042618, "language_loss": 0.69805098, "learning_rate": 1.7126020295835836e-06, "loss": 0.71904755, "num_input_tokens_seen": 200373595, "step": 9306, "time_per_iteration": 2.8083784580230713 }, { "auxiliary_loss_clip": 0.01018297, "auxiliary_loss_mlp": 0.01002442, "balance_loss_clip": 1.015836, "balance_loss_mlp": 1.00099397, "epoch": 0.5595671125807906, "flos": 70273375862400.0, "grad_norm": 0.9194279331367995, "language_loss": 0.60304606, "learning_rate": 1.7122166154036518e-06, "loss": 0.62325346, "num_input_tokens_seen": 200429155, "step": 9307, "time_per_iteration": 3.301408052444458 }, { "auxiliary_loss_clip": 0.01104522, "auxiliary_loss_mlp": 0.01035417, "balance_loss_clip": 1.0423522, "balance_loss_mlp": 1.02234626, "epoch": 0.5596272358334586, "flos": 20665513889280.0, "grad_norm": 1.8556565203900444, "language_loss": 0.73943615, "learning_rate": 1.7118312121364943e-06, "loss": 0.76083553, "num_input_tokens_seen": 200448290, "step": 9308, "time_per_iteration": 2.6449387073516846 }, { "auxiliary_loss_clip": 0.01051886, "auxiliary_loss_mlp": 0.01038908, "balance_loss_clip": 1.03424501, "balance_loss_mlp": 1.02397084, "epoch": 0.5596873590861265, "flos": 25041013217280.0, "grad_norm": 2.1877402567653808, "language_loss": 0.69691569, "learning_rate": 1.7114458197967257e-06, "loss": 0.71782362, "num_input_tokens_seen": 200466555, "step": 9309, "time_per_iteration": 4.464626312255859 }, { "auxiliary_loss_clip": 0.01093684, "auxiliary_loss_mlp": 0.01037862, "balance_loss_clip": 1.04161119, "balance_loss_mlp": 1.02288949, "epoch": 0.5597474823387946, "flos": 25958387594880.0, "grad_norm": 1.9102617963629012, "language_loss": 0.75523353, "learning_rate": 1.7110604383989613e-06, "loss": 0.77654898, "num_input_tokens_seen": 200485980, "step": 9310, "time_per_iteration": 4.4445412158966064 }, { "auxiliary_loss_clip": 0.01112006, "auxiliary_loss_mlp": 0.01037268, "balance_loss_clip": 1.04378152, "balance_loss_mlp": 1.02286768, "epoch": 0.5598076055914625, "flos": 26177442687360.0, "grad_norm": 2.0703892527912813, "language_loss": 0.69657761, "learning_rate": 1.7106750679578133e-06, "loss": 0.71807039, "num_input_tokens_seen": 200504555, "step": 9311, "time_per_iteration": 4.303341865539551 }, { "auxiliary_loss_clip": 0.01105172, "auxiliary_loss_mlp": 0.01034066, "balance_loss_clip": 1.04042637, "balance_loss_mlp": 1.02103674, "epoch": 0.5598677288441305, "flos": 11655778590720.0, "grad_norm": 1.8932120118757645, "language_loss": 0.71856189, "learning_rate": 1.7102897084878962e-06, "loss": 0.73995423, "num_input_tokens_seen": 200522700, "step": 9312, "time_per_iteration": 2.610438823699951 }, { "auxiliary_loss_clip": 0.01080705, "auxiliary_loss_mlp": 0.01033643, "balance_loss_clip": 1.04290187, "balance_loss_mlp": 1.02023816, "epoch": 0.5599278520967984, "flos": 22966597941120.0, "grad_norm": 2.1557841469459746, "language_loss": 0.89152771, "learning_rate": 1.709904360003822e-06, "loss": 0.91267115, "num_input_tokens_seen": 200541910, "step": 9313, "time_per_iteration": 2.6854610443115234 }, { "auxiliary_loss_clip": 0.01081962, "auxiliary_loss_mlp": 0.01044977, "balance_loss_clip": 1.0415206, "balance_loss_mlp": 1.03109467, "epoch": 0.5599879753494664, "flos": 21215557831680.0, "grad_norm": 1.521477055933408, "language_loss": 0.77815449, "learning_rate": 1.709519022520204e-06, "loss": 0.79942387, "num_input_tokens_seen": 200562600, "step": 9314, "time_per_iteration": 4.262527942657471 }, { "auxiliary_loss_clip": 0.01082652, "auxiliary_loss_mlp": 0.01031612, "balance_loss_clip": 1.0416466, "balance_loss_mlp": 1.01851654, "epoch": 0.5600480986021343, "flos": 31903219105920.0, "grad_norm": 1.6753660628338782, "language_loss": 0.70509619, "learning_rate": 1.7091336960516537e-06, "loss": 0.72623885, "num_input_tokens_seen": 200584795, "step": 9315, "time_per_iteration": 2.7611892223358154 }, { "auxiliary_loss_clip": 0.0110321, "auxiliary_loss_mlp": 0.01041043, "balance_loss_clip": 1.04375148, "balance_loss_mlp": 1.02726793, "epoch": 0.5601082218548024, "flos": 28476048700800.0, "grad_norm": 1.7587170023253702, "language_loss": 0.66601861, "learning_rate": 1.7087483806127824e-06, "loss": 0.68746114, "num_input_tokens_seen": 200606945, "step": 9316, "time_per_iteration": 2.675050973892212 }, { "auxiliary_loss_clip": 0.0108131, "auxiliary_loss_mlp": 0.01037022, "balance_loss_clip": 1.037871, "balance_loss_mlp": 1.0214529, "epoch": 0.5601683451074703, "flos": 24097173494400.0, "grad_norm": 2.414777902457845, "language_loss": 0.87209964, "learning_rate": 1.7083630762182022e-06, "loss": 0.89328289, "num_input_tokens_seen": 200626340, "step": 9317, "time_per_iteration": 2.7405858039855957 }, { "auxiliary_loss_clip": 0.01115616, "auxiliary_loss_mlp": 0.01038233, "balance_loss_clip": 1.04544759, "balance_loss_mlp": 1.02290869, "epoch": 0.5602284683601383, "flos": 26356205698560.0, "grad_norm": 1.8555836482261492, "language_loss": 0.76961493, "learning_rate": 1.7079777828825233e-06, "loss": 0.79115343, "num_input_tokens_seen": 200644520, "step": 9318, "time_per_iteration": 2.683375597000122 }, { "auxiliary_loss_clip": 0.0110569, "auxiliary_loss_mlp": 0.01040718, "balance_loss_clip": 1.04080641, "balance_loss_mlp": 1.02822459, "epoch": 0.5602885916128063, "flos": 24496392228480.0, "grad_norm": 1.6342768124534643, "language_loss": 0.76235765, "learning_rate": 1.7075925006203558e-06, "loss": 0.7838217, "num_input_tokens_seen": 200664845, "step": 9319, "time_per_iteration": 2.6256465911865234 }, { "auxiliary_loss_clip": 0.01107325, "auxiliary_loss_mlp": 0.01036412, "balance_loss_clip": 1.04242063, "balance_loss_mlp": 1.02393723, "epoch": 0.5603487148654742, "flos": 27345006270720.0, "grad_norm": 1.4761895802927258, "language_loss": 0.85648036, "learning_rate": 1.7072072294463101e-06, "loss": 0.87791771, "num_input_tokens_seen": 200686535, "step": 9320, "time_per_iteration": 2.7295455932617188 }, { "auxiliary_loss_clip": 0.0103543, "auxiliary_loss_mlp": 0.01003142, "balance_loss_clip": 1.01980209, "balance_loss_mlp": 1.00181246, "epoch": 0.5604088381181422, "flos": 54087756180480.0, "grad_norm": 0.7528149861495326, "language_loss": 0.52530909, "learning_rate": 1.706821969374996e-06, "loss": 0.54569471, "num_input_tokens_seen": 200736965, "step": 9321, "time_per_iteration": 3.0199856758117676 }, { "auxiliary_loss_clip": 0.01097468, "auxiliary_loss_mlp": 0.01035636, "balance_loss_clip": 1.04187417, "balance_loss_mlp": 1.02274311, "epoch": 0.5604689613708101, "flos": 22236390357120.0, "grad_norm": 1.366292846882571, "language_loss": 0.74232858, "learning_rate": 1.7064367204210216e-06, "loss": 0.7636596, "num_input_tokens_seen": 200757420, "step": 9322, "time_per_iteration": 2.7239301204681396 }, { "auxiliary_loss_clip": 0.01120105, "auxiliary_loss_mlp": 0.01033893, "balance_loss_clip": 1.04226124, "balance_loss_mlp": 1.01925397, "epoch": 0.5605290846234782, "flos": 35297782940160.0, "grad_norm": 1.6268223998146492, "language_loss": 0.74119061, "learning_rate": 1.7060514825989963e-06, "loss": 0.7627306, "num_input_tokens_seen": 200779520, "step": 9323, "time_per_iteration": 2.7277660369873047 }, { "auxiliary_loss_clip": 0.01097354, "auxiliary_loss_mlp": 0.01034103, "balance_loss_clip": 1.04408789, "balance_loss_mlp": 1.01961303, "epoch": 0.5605892078761461, "flos": 20263314326400.0, "grad_norm": 2.353968750169446, "language_loss": 0.61679977, "learning_rate": 1.7056662559235286e-06, "loss": 0.63811433, "num_input_tokens_seen": 200799485, "step": 9324, "time_per_iteration": 2.681330442428589 }, { "auxiliary_loss_clip": 0.01068442, "auxiliary_loss_mlp": 0.0103778, "balance_loss_clip": 1.03685164, "balance_loss_mlp": 1.02353454, "epoch": 0.5606493311288141, "flos": 17308333134720.0, "grad_norm": 1.7599111661375368, "language_loss": 0.87798876, "learning_rate": 1.705281040409226e-06, "loss": 0.89905095, "num_input_tokens_seen": 200817540, "step": 9325, "time_per_iteration": 2.73244571685791 }, { "auxiliary_loss_clip": 0.01098073, "auxiliary_loss_mlp": 0.01034138, "balance_loss_clip": 1.04064608, "balance_loss_mlp": 1.01970756, "epoch": 0.560709454381482, "flos": 21652985658240.0, "grad_norm": 1.5582793995716135, "language_loss": 0.7359941, "learning_rate": 1.7048958360706952e-06, "loss": 0.75731623, "num_input_tokens_seen": 200838380, "step": 9326, "time_per_iteration": 2.685098886489868 }, { "auxiliary_loss_clip": 0.01099795, "auxiliary_loss_mlp": 0.01027968, "balance_loss_clip": 1.04008412, "balance_loss_mlp": 1.01316798, "epoch": 0.56076957763415, "flos": 20303355012480.0, "grad_norm": 1.8644433543241015, "language_loss": 0.78216934, "learning_rate": 1.7045106429225447e-06, "loss": 0.80344701, "num_input_tokens_seen": 200855640, "step": 9327, "time_per_iteration": 2.7206430435180664 }, { "auxiliary_loss_clip": 0.01106989, "auxiliary_loss_mlp": 0.01034784, "balance_loss_clip": 1.04609513, "balance_loss_mlp": 1.02029371, "epoch": 0.5608297008868179, "flos": 25045897466880.0, "grad_norm": 1.6309153070460434, "language_loss": 0.78084052, "learning_rate": 1.7041254609793795e-06, "loss": 0.80225813, "num_input_tokens_seen": 200876585, "step": 9328, "time_per_iteration": 2.6724750995635986 }, { "auxiliary_loss_clip": 0.01119639, "auxiliary_loss_mlp": 0.01031594, "balance_loss_clip": 1.04266322, "balance_loss_mlp": 1.01832008, "epoch": 0.560889824139486, "flos": 19866825025920.0, "grad_norm": 1.4710158195252034, "language_loss": 0.73393631, "learning_rate": 1.7037402902558066e-06, "loss": 0.75544858, "num_input_tokens_seen": 200898175, "step": 9329, "time_per_iteration": 2.610711097717285 }, { "auxiliary_loss_clip": 0.01100007, "auxiliary_loss_mlp": 0.00773419, "balance_loss_clip": 1.04148126, "balance_loss_mlp": 1.00026274, "epoch": 0.5609499473921539, "flos": 22929394429440.0, "grad_norm": 1.5539142345159989, "language_loss": 0.83609939, "learning_rate": 1.7033551307664324e-06, "loss": 0.85483366, "num_input_tokens_seen": 200917515, "step": 9330, "time_per_iteration": 2.7287333011627197 }, { "auxiliary_loss_clip": 0.01042257, "auxiliary_loss_mlp": 0.01001028, "balance_loss_clip": 1.01692343, "balance_loss_mlp": 0.99974674, "epoch": 0.5610100706448219, "flos": 53035825455360.0, "grad_norm": 0.7095685041475404, "language_loss": 0.57797414, "learning_rate": 1.7029699825258603e-06, "loss": 0.59840697, "num_input_tokens_seen": 200978615, "step": 9331, "time_per_iteration": 3.197101354598999 }, { "auxiliary_loss_clip": 0.01082146, "auxiliary_loss_mlp": 0.01038466, "balance_loss_clip": 1.0445832, "balance_loss_mlp": 1.02405381, "epoch": 0.5610701938974898, "flos": 21834944979840.0, "grad_norm": 1.957386899067858, "language_loss": 0.82066166, "learning_rate": 1.7025848455486971e-06, "loss": 0.8418678, "num_input_tokens_seen": 200997745, "step": 9332, "time_per_iteration": 2.706125497817993 }, { "auxiliary_loss_clip": 0.01106958, "auxiliary_loss_mlp": 0.01043073, "balance_loss_clip": 1.04060066, "balance_loss_mlp": 1.02800488, "epoch": 0.5611303171501578, "flos": 17457183095040.0, "grad_norm": 1.7807099110593088, "language_loss": 0.81912845, "learning_rate": 1.7021997198495454e-06, "loss": 0.8406288, "num_input_tokens_seen": 201016370, "step": 9333, "time_per_iteration": 2.6288132667541504 }, { "auxiliary_loss_clip": 0.01119893, "auxiliary_loss_mlp": 0.01030061, "balance_loss_clip": 1.04119062, "balance_loss_mlp": 1.01676321, "epoch": 0.5611904404028258, "flos": 22637799820800.0, "grad_norm": 1.6112092331225492, "language_loss": 0.72989404, "learning_rate": 1.7018146054430108e-06, "loss": 0.75139362, "num_input_tokens_seen": 201034310, "step": 9334, "time_per_iteration": 2.6088995933532715 }, { "auxiliary_loss_clip": 0.01098453, "auxiliary_loss_mlp": 0.01040678, "balance_loss_clip": 1.0453335, "balance_loss_mlp": 1.02690315, "epoch": 0.5612505636554938, "flos": 14316327999360.0, "grad_norm": 2.5253764454191416, "language_loss": 0.71248639, "learning_rate": 1.7014295023436961e-06, "loss": 0.73387766, "num_input_tokens_seen": 201052030, "step": 9335, "time_per_iteration": 2.633389949798584 }, { "auxiliary_loss_clip": 0.0109857, "auxiliary_loss_mlp": 0.01034463, "balance_loss_clip": 1.03983665, "balance_loss_mlp": 1.02066469, "epoch": 0.5613106869081618, "flos": 16508279554560.0, "grad_norm": 1.8386426637696407, "language_loss": 0.77176088, "learning_rate": 1.701044410566205e-06, "loss": 0.79309118, "num_input_tokens_seen": 201068445, "step": 9336, "time_per_iteration": 2.681753158569336 }, { "auxiliary_loss_clip": 0.01108773, "auxiliary_loss_mlp": 0.01033965, "balance_loss_clip": 1.0423466, "balance_loss_mlp": 1.02086353, "epoch": 0.5613708101608297, "flos": 24058569352320.0, "grad_norm": 2.6196694346701817, "language_loss": 0.64508319, "learning_rate": 1.7006593301251393e-06, "loss": 0.66651058, "num_input_tokens_seen": 201082140, "step": 9337, "time_per_iteration": 2.629194498062134 }, { "auxiliary_loss_clip": 0.01025154, "auxiliary_loss_mlp": 0.01003147, "balance_loss_clip": 1.01963842, "balance_loss_mlp": 1.00190687, "epoch": 0.5614309334134977, "flos": 64905735997440.0, "grad_norm": 0.8917713489246797, "language_loss": 0.62551695, "learning_rate": 1.700274261035102e-06, "loss": 0.64579999, "num_input_tokens_seen": 201137245, "step": 9338, "time_per_iteration": 3.1740610599517822 }, { "auxiliary_loss_clip": 0.01091363, "auxiliary_loss_mlp": 0.01035931, "balance_loss_clip": 1.04291368, "balance_loss_mlp": 1.02275264, "epoch": 0.5614910566661656, "flos": 32919849740160.0, "grad_norm": 1.9155240319962232, "language_loss": 0.65588379, "learning_rate": 1.6998892033106946e-06, "loss": 0.67715669, "num_input_tokens_seen": 201157270, "step": 9339, "time_per_iteration": 2.795539617538452 }, { "auxiliary_loss_clip": 0.0110324, "auxiliary_loss_mlp": 0.01043787, "balance_loss_clip": 1.04000616, "balance_loss_mlp": 1.0283432, "epoch": 0.5615511799188336, "flos": 18588871969920.0, "grad_norm": 1.9415000376687095, "language_loss": 0.69498181, "learning_rate": 1.6995041569665184e-06, "loss": 0.716452, "num_input_tokens_seen": 201174530, "step": 9340, "time_per_iteration": 2.6073222160339355 }, { "auxiliary_loss_clip": 0.01076412, "auxiliary_loss_mlp": 0.0103814, "balance_loss_clip": 1.04082394, "balance_loss_mlp": 1.02536726, "epoch": 0.5616113031715015, "flos": 22820010537600.0, "grad_norm": 1.461608284307224, "language_loss": 0.77235413, "learning_rate": 1.6991191220171756e-06, "loss": 0.79349971, "num_input_tokens_seen": 201194905, "step": 9341, "time_per_iteration": 2.712812662124634 }, { "auxiliary_loss_clip": 0.01069621, "auxiliary_loss_mlp": 0.01037705, "balance_loss_clip": 1.03758025, "balance_loss_mlp": 1.0230068, "epoch": 0.5616714264241696, "flos": 22345702421760.0, "grad_norm": 1.556156421929591, "language_loss": 0.79645002, "learning_rate": 1.6987340984772653e-06, "loss": 0.81752324, "num_input_tokens_seen": 201213715, "step": 9342, "time_per_iteration": 2.774918556213379 }, { "auxiliary_loss_clip": 0.01091015, "auxiliary_loss_mlp": 0.01035282, "balance_loss_clip": 1.03911448, "balance_loss_mlp": 1.02109551, "epoch": 0.5617315496768375, "flos": 18807783408000.0, "grad_norm": 2.3711889370259907, "language_loss": 0.76042008, "learning_rate": 1.6983490863613882e-06, "loss": 0.78168309, "num_input_tokens_seen": 201231415, "step": 9343, "time_per_iteration": 2.7124969959259033 }, { "auxiliary_loss_clip": 0.01080837, "auxiliary_loss_mlp": 0.01044577, "balance_loss_clip": 1.04475522, "balance_loss_mlp": 1.03011727, "epoch": 0.5617916729295055, "flos": 18369314087040.0, "grad_norm": 2.196794276196035, "language_loss": 0.69644189, "learning_rate": 1.6979640856841442e-06, "loss": 0.71769607, "num_input_tokens_seen": 201249625, "step": 9344, "time_per_iteration": 2.7265472412109375 }, { "auxiliary_loss_clip": 0.01121229, "auxiliary_loss_mlp": 0.01038625, "balance_loss_clip": 1.04347157, "balance_loss_mlp": 1.02447486, "epoch": 0.5618517961821734, "flos": 28179964892160.0, "grad_norm": 3.2350770637683106, "language_loss": 0.6636014, "learning_rate": 1.6975790964601318e-06, "loss": 0.68519998, "num_input_tokens_seen": 201271205, "step": 9345, "time_per_iteration": 2.686527729034424 }, { "auxiliary_loss_clip": 0.01098571, "auxiliary_loss_mlp": 0.01032052, "balance_loss_clip": 1.04279995, "balance_loss_mlp": 1.0190227, "epoch": 0.5619119194348414, "flos": 15486872411520.0, "grad_norm": 1.9772946469645978, "language_loss": 0.87311339, "learning_rate": 1.6971941187039512e-06, "loss": 0.89441955, "num_input_tokens_seen": 201287700, "step": 9346, "time_per_iteration": 2.6551971435546875 }, { "auxiliary_loss_clip": 0.0109764, "auxiliary_loss_mlp": 0.01036969, "balance_loss_clip": 1.04373372, "balance_loss_mlp": 1.02243173, "epoch": 0.5619720426875094, "flos": 29128652951040.0, "grad_norm": 2.320939151148892, "language_loss": 0.59135818, "learning_rate": 1.6968091524301993e-06, "loss": 0.61270428, "num_input_tokens_seen": 201307530, "step": 9347, "time_per_iteration": 2.701704263687134 }, { "auxiliary_loss_clip": 0.01113798, "auxiliary_loss_mlp": 0.01039809, "balance_loss_clip": 1.0449301, "balance_loss_mlp": 1.02461553, "epoch": 0.5620321659401774, "flos": 18003743418240.0, "grad_norm": 3.390094180858037, "language_loss": 0.69345069, "learning_rate": 1.6964241976534745e-06, "loss": 0.7149868, "num_input_tokens_seen": 201326210, "step": 9348, "time_per_iteration": 2.6152281761169434 }, { "auxiliary_loss_clip": 0.01072866, "auxiliary_loss_mlp": 0.01035332, "balance_loss_clip": 1.03694952, "balance_loss_mlp": 1.02000761, "epoch": 0.5620922891928454, "flos": 20594518657920.0, "grad_norm": 12.292181580280033, "language_loss": 0.79008943, "learning_rate": 1.6960392543883754e-06, "loss": 0.81117141, "num_input_tokens_seen": 201346120, "step": 9349, "time_per_iteration": 5.937277793884277 }, { "auxiliary_loss_clip": 0.01068645, "auxiliary_loss_mlp": 0.0103743, "balance_loss_clip": 1.04074883, "balance_loss_mlp": 1.02314854, "epoch": 0.5621524124455133, "flos": 26287006147200.0, "grad_norm": 2.217082199318971, "language_loss": 0.67245173, "learning_rate": 1.6956543226494975e-06, "loss": 0.6935125, "num_input_tokens_seen": 201365700, "step": 9350, "time_per_iteration": 4.385211229324341 }, { "auxiliary_loss_clip": 0.01069908, "auxiliary_loss_mlp": 0.01039418, "balance_loss_clip": 1.03964508, "balance_loss_mlp": 1.02451682, "epoch": 0.5622125356981813, "flos": 12750299867520.0, "grad_norm": 2.668539433171336, "language_loss": 0.78305924, "learning_rate": 1.6952694024514381e-06, "loss": 0.80415249, "num_input_tokens_seen": 201382795, "step": 9351, "time_per_iteration": 2.6691691875457764 }, { "auxiliary_loss_clip": 0.01099605, "auxiliary_loss_mlp": 0.00772893, "balance_loss_clip": 1.03920138, "balance_loss_mlp": 1.00020838, "epoch": 0.5622726589508492, "flos": 23805327490560.0, "grad_norm": 1.4861648044093183, "language_loss": 0.59128547, "learning_rate": 1.6948844938087945e-06, "loss": 0.61001039, "num_input_tokens_seen": 201402780, "step": 9352, "time_per_iteration": 2.753941297531128 }, { "auxiliary_loss_clip": 0.01105703, "auxiliary_loss_mlp": 0.0103746, "balance_loss_clip": 1.0406158, "balance_loss_mlp": 1.02476466, "epoch": 0.5623327822035172, "flos": 24718212668160.0, "grad_norm": 1.334754568183942, "language_loss": 0.71630079, "learning_rate": 1.6944995967361604e-06, "loss": 0.73773241, "num_input_tokens_seen": 201424140, "step": 9353, "time_per_iteration": 4.249570369720459 }, { "auxiliary_loss_clip": 0.01098184, "auxiliary_loss_mlp": 0.01032581, "balance_loss_clip": 1.04213238, "balance_loss_mlp": 1.01918769, "epoch": 0.5623929054561851, "flos": 14019274523520.0, "grad_norm": 2.376274628807619, "language_loss": 0.7593621, "learning_rate": 1.6941147112481327e-06, "loss": 0.78066975, "num_input_tokens_seen": 201439645, "step": 9354, "time_per_iteration": 2.689899206161499 }, { "auxiliary_loss_clip": 0.01089457, "auxiliary_loss_mlp": 0.01035605, "balance_loss_clip": 1.04167855, "balance_loss_mlp": 1.02183056, "epoch": 0.5624530287088532, "flos": 20704405340160.0, "grad_norm": 1.8223711210662343, "language_loss": 0.72909653, "learning_rate": 1.6937298373593056e-06, "loss": 0.75034714, "num_input_tokens_seen": 201459970, "step": 9355, "time_per_iteration": 2.755100965499878 }, { "auxiliary_loss_clip": 0.01104288, "auxiliary_loss_mlp": 0.01032084, "balance_loss_clip": 1.04146492, "balance_loss_mlp": 1.01845825, "epoch": 0.5625131519615211, "flos": 21470918595840.0, "grad_norm": 1.4719507883232867, "language_loss": 0.7346037, "learning_rate": 1.693344975084274e-06, "loss": 0.75596744, "num_input_tokens_seen": 201480055, "step": 9356, "time_per_iteration": 2.641638994216919 }, { "auxiliary_loss_clip": 0.01119375, "auxiliary_loss_mlp": 0.0103593, "balance_loss_clip": 1.04301476, "balance_loss_mlp": 1.02204823, "epoch": 0.5625732752141891, "flos": 18698004466560.0, "grad_norm": 2.3002614331876687, "language_loss": 0.83191347, "learning_rate": 1.6929601244376318e-06, "loss": 0.85346651, "num_input_tokens_seen": 201497645, "step": 9357, "time_per_iteration": 2.6374433040618896 }, { "auxiliary_loss_clip": 0.01108702, "auxiliary_loss_mlp": 0.01033262, "balance_loss_clip": 1.04158151, "balance_loss_mlp": 1.02019668, "epoch": 0.562633398466857, "flos": 16216900427520.0, "grad_norm": 2.42238754199954, "language_loss": 0.72483993, "learning_rate": 1.6925752854339722e-06, "loss": 0.74625957, "num_input_tokens_seen": 201515455, "step": 9358, "time_per_iteration": 2.6288702487945557 }, { "auxiliary_loss_clip": 0.01118085, "auxiliary_loss_mlp": 0.01042212, "balance_loss_clip": 1.04183221, "balance_loss_mlp": 1.02859807, "epoch": 0.562693521719525, "flos": 22491930689280.0, "grad_norm": 2.2438292834488838, "language_loss": 0.7763263, "learning_rate": 1.6921904580878885e-06, "loss": 0.79792929, "num_input_tokens_seen": 201534500, "step": 9359, "time_per_iteration": 2.6272196769714355 }, { "auxiliary_loss_clip": 0.0109706, "auxiliary_loss_mlp": 0.01033721, "balance_loss_clip": 1.04087317, "balance_loss_mlp": 1.0212934, "epoch": 0.562753644972193, "flos": 25331171281920.0, "grad_norm": 1.8703344042445116, "language_loss": 0.70466304, "learning_rate": 1.6918056424139736e-06, "loss": 0.72597086, "num_input_tokens_seen": 201553280, "step": 9360, "time_per_iteration": 2.6694719791412354 }, { "auxiliary_loss_clip": 0.00993761, "auxiliary_loss_mlp": 0.00999248, "balance_loss_clip": 1.01494741, "balance_loss_mlp": 0.99799061, "epoch": 0.562813768224861, "flos": 67392622126080.0, "grad_norm": 0.7735600550199924, "language_loss": 0.5555625, "learning_rate": 1.6914208384268197e-06, "loss": 0.57549262, "num_input_tokens_seen": 201610030, "step": 9361, "time_per_iteration": 3.2061593532562256 }, { "auxiliary_loss_clip": 0.01093709, "auxiliary_loss_mlp": 0.01035172, "balance_loss_clip": 1.04106104, "balance_loss_mlp": 1.02236927, "epoch": 0.562873891477529, "flos": 23331163029120.0, "grad_norm": 1.4272041180912485, "language_loss": 0.8169086, "learning_rate": 1.691036046141018e-06, "loss": 0.83819747, "num_input_tokens_seen": 201628370, "step": 9362, "time_per_iteration": 2.648585319519043 }, { "auxiliary_loss_clip": 0.01084349, "auxiliary_loss_mlp": 0.00771085, "balance_loss_clip": 1.03982627, "balance_loss_mlp": 1.00021708, "epoch": 0.5629340147301969, "flos": 38472824805120.0, "grad_norm": 1.5810217639510977, "language_loss": 0.7460767, "learning_rate": 1.6906512655711614e-06, "loss": 0.76463103, "num_input_tokens_seen": 201649790, "step": 9363, "time_per_iteration": 2.8376948833465576 }, { "auxiliary_loss_clip": 0.01114455, "auxiliary_loss_mlp": 0.01034672, "balance_loss_clip": 1.04345608, "balance_loss_mlp": 1.02068281, "epoch": 0.5629941379828649, "flos": 29242023252480.0, "grad_norm": 1.625625465741998, "language_loss": 0.82640725, "learning_rate": 1.690266496731839e-06, "loss": 0.84789848, "num_input_tokens_seen": 201669175, "step": 9364, "time_per_iteration": 2.6790480613708496 }, { "auxiliary_loss_clip": 0.0107898, "auxiliary_loss_mlp": 0.0103866, "balance_loss_clip": 1.03860497, "balance_loss_mlp": 1.02573752, "epoch": 0.5630542612355328, "flos": 19420885676160.0, "grad_norm": 2.0942443962927513, "language_loss": 0.65238589, "learning_rate": 1.689881739637642e-06, "loss": 0.67356229, "num_input_tokens_seen": 201687000, "step": 9365, "time_per_iteration": 2.6504223346710205 }, { "auxiliary_loss_clip": 0.01099908, "auxiliary_loss_mlp": 0.01040371, "balance_loss_clip": 1.0423665, "balance_loss_mlp": 1.0259583, "epoch": 0.5631143844882008, "flos": 22266303408000.0, "grad_norm": 5.761173374312871, "language_loss": 0.8185727, "learning_rate": 1.6894969943031611e-06, "loss": 0.83997548, "num_input_tokens_seen": 201703335, "step": 9366, "time_per_iteration": 2.6865267753601074 }, { "auxiliary_loss_clip": 0.01118809, "auxiliary_loss_mlp": 0.01033751, "balance_loss_clip": 1.04305601, "balance_loss_mlp": 1.02106667, "epoch": 0.5631745077408687, "flos": 22965305051520.0, "grad_norm": 1.4687745386206819, "language_loss": 0.73388821, "learning_rate": 1.6891122607429845e-06, "loss": 0.75541377, "num_input_tokens_seen": 201723495, "step": 9367, "time_per_iteration": 2.6309821605682373 }, { "auxiliary_loss_clip": 0.01020057, "auxiliary_loss_mlp": 0.01004541, "balance_loss_clip": 1.01475585, "balance_loss_mlp": 1.0032177, "epoch": 0.5632346309935368, "flos": 65080515576960.0, "grad_norm": 0.6203732228424765, "language_loss": 0.53471267, "learning_rate": 1.6887275389717028e-06, "loss": 0.5549587, "num_input_tokens_seen": 201792615, "step": 9368, "time_per_iteration": 3.285132884979248 }, { "auxiliary_loss_clip": 0.01119712, "auxiliary_loss_mlp": 0.01038636, "balance_loss_clip": 1.04367208, "balance_loss_mlp": 1.02514756, "epoch": 0.5632947542462047, "flos": 23002903612800.0, "grad_norm": 1.6032046035258145, "language_loss": 0.69323123, "learning_rate": 1.6883428290039046e-06, "loss": 0.71481466, "num_input_tokens_seen": 201812520, "step": 9369, "time_per_iteration": 2.5828912258148193 }, { "auxiliary_loss_clip": 0.01081861, "auxiliary_loss_mlp": 0.01036769, "balance_loss_clip": 1.03560948, "balance_loss_mlp": 1.02258897, "epoch": 0.5633548774988727, "flos": 30482593228800.0, "grad_norm": 1.8644770946275213, "language_loss": 0.75840139, "learning_rate": 1.6879581308541763e-06, "loss": 0.77958775, "num_input_tokens_seen": 201834185, "step": 9370, "time_per_iteration": 2.7649481296539307 }, { "auxiliary_loss_clip": 0.01095504, "auxiliary_loss_mlp": 0.01038896, "balance_loss_clip": 1.04126322, "balance_loss_mlp": 1.02440023, "epoch": 0.5634150007515406, "flos": 18515039564160.0, "grad_norm": 2.2895815027179864, "language_loss": 0.755108, "learning_rate": 1.687573444537108e-06, "loss": 0.776452, "num_input_tokens_seen": 201851305, "step": 9371, "time_per_iteration": 2.591031312942505 }, { "auxiliary_loss_clip": 0.01106226, "auxiliary_loss_mlp": 0.01040784, "balance_loss_clip": 1.04110384, "balance_loss_mlp": 1.02787304, "epoch": 0.5634751240042086, "flos": 19244672530560.0, "grad_norm": 1.7615457998604214, "language_loss": 0.76489764, "learning_rate": 1.687188770067285e-06, "loss": 0.78636777, "num_input_tokens_seen": 201870350, "step": 9372, "time_per_iteration": 2.619053840637207 }, { "auxiliary_loss_clip": 0.01090528, "auxiliary_loss_mlp": 0.01030605, "balance_loss_clip": 1.03906, "balance_loss_mlp": 1.01705718, "epoch": 0.5635352472568766, "flos": 12020630987520.0, "grad_norm": 2.266062441891877, "language_loss": 0.71336401, "learning_rate": 1.6868041074592956e-06, "loss": 0.73457533, "num_input_tokens_seen": 201886800, "step": 9373, "time_per_iteration": 2.624600887298584 }, { "auxiliary_loss_clip": 0.01090554, "auxiliary_loss_mlp": 0.01031384, "balance_loss_clip": 1.04418933, "balance_loss_mlp": 1.0168401, "epoch": 0.5635953705095446, "flos": 21871645701120.0, "grad_norm": 2.1043627154333797, "language_loss": 0.82543874, "learning_rate": 1.6864194567277264e-06, "loss": 0.84665811, "num_input_tokens_seen": 201904730, "step": 9374, "time_per_iteration": 2.644887924194336 }, { "auxiliary_loss_clip": 0.01104117, "auxiliary_loss_mlp": 0.01030499, "balance_loss_clip": 1.03739262, "balance_loss_mlp": 1.01734459, "epoch": 0.5636554937622126, "flos": 27126166659840.0, "grad_norm": 1.7268514389800265, "language_loss": 0.66357785, "learning_rate": 1.6860348178871618e-06, "loss": 0.68492401, "num_input_tokens_seen": 201924850, "step": 9375, "time_per_iteration": 2.65166974067688 }, { "auxiliary_loss_clip": 0.01084894, "auxiliary_loss_mlp": 0.00770652, "balance_loss_clip": 1.04238153, "balance_loss_mlp": 1.00019169, "epoch": 0.5637156170148805, "flos": 12926405272320.0, "grad_norm": 2.3049359861127696, "language_loss": 0.81049269, "learning_rate": 1.6856501909521889e-06, "loss": 0.82904816, "num_input_tokens_seen": 201939500, "step": 9376, "time_per_iteration": 2.766364336013794 }, { "auxiliary_loss_clip": 0.01101359, "auxiliary_loss_mlp": 0.01034779, "balance_loss_clip": 1.04133999, "balance_loss_mlp": 1.02115881, "epoch": 0.5637757402675485, "flos": 45551033130240.0, "grad_norm": 1.6449694311006493, "language_loss": 0.6926713, "learning_rate": 1.6852655759373925e-06, "loss": 0.71403265, "num_input_tokens_seen": 201963000, "step": 9377, "time_per_iteration": 2.870060443878174 }, { "auxiliary_loss_clip": 0.01074381, "auxiliary_loss_mlp": 0.01032969, "balance_loss_clip": 1.03875685, "balance_loss_mlp": 1.01979017, "epoch": 0.5638358635202164, "flos": 20886041439360.0, "grad_norm": 1.3919625147372467, "language_loss": 0.74771237, "learning_rate": 1.6848809728573565e-06, "loss": 0.76878589, "num_input_tokens_seen": 201983145, "step": 9378, "time_per_iteration": 2.749613046646118 }, { "auxiliary_loss_clip": 0.01122728, "auxiliary_loss_mlp": 0.01035934, "balance_loss_clip": 1.04050553, "balance_loss_mlp": 1.02154493, "epoch": 0.5638959867728844, "flos": 18806562345600.0, "grad_norm": 2.63873718495401, "language_loss": 0.81853002, "learning_rate": 1.6844963817266656e-06, "loss": 0.84011662, "num_input_tokens_seen": 202000335, "step": 9379, "time_per_iteration": 2.625277280807495 }, { "auxiliary_loss_clip": 0.01093031, "auxiliary_loss_mlp": 0.01036774, "balance_loss_clip": 1.03674948, "balance_loss_mlp": 1.02336287, "epoch": 0.5639561100255523, "flos": 27490336698240.0, "grad_norm": 2.218934810530396, "language_loss": 0.7167027, "learning_rate": 1.6841118025599042e-06, "loss": 0.73800081, "num_input_tokens_seen": 202018275, "step": 9380, "time_per_iteration": 2.715791940689087 }, { "auxiliary_loss_clip": 0.01086194, "auxiliary_loss_mlp": 0.01039984, "balance_loss_clip": 1.0455358, "balance_loss_mlp": 1.02485633, "epoch": 0.5640162332782204, "flos": 18076570243200.0, "grad_norm": 2.0069687855649234, "language_loss": 0.74178547, "learning_rate": 1.6837272353716542e-06, "loss": 0.76304728, "num_input_tokens_seen": 202034330, "step": 9381, "time_per_iteration": 2.8091652393341064 }, { "auxiliary_loss_clip": 0.01068257, "auxiliary_loss_mlp": 0.01041209, "balance_loss_clip": 1.03590226, "balance_loss_mlp": 1.02741027, "epoch": 0.5640763565308883, "flos": 20884856290560.0, "grad_norm": 2.008212488841835, "language_loss": 0.72358, "learning_rate": 1.683342680176499e-06, "loss": 0.74467456, "num_input_tokens_seen": 202053100, "step": 9382, "time_per_iteration": 2.750049114227295 }, { "auxiliary_loss_clip": 0.0103983, "auxiliary_loss_mlp": 0.01012073, "balance_loss_clip": 1.01468074, "balance_loss_mlp": 1.01088643, "epoch": 0.5641364797835563, "flos": 64447912224000.0, "grad_norm": 0.7132903418918451, "language_loss": 0.54439944, "learning_rate": 1.682958136989022e-06, "loss": 0.56491846, "num_input_tokens_seen": 202120125, "step": 9383, "time_per_iteration": 3.308600425720215 }, { "auxiliary_loss_clip": 0.01106116, "auxiliary_loss_mlp": 0.01030643, "balance_loss_clip": 1.04080617, "balance_loss_mlp": 1.01664162, "epoch": 0.5641966030362242, "flos": 18660944609280.0, "grad_norm": 1.7587549687902173, "language_loss": 0.71036148, "learning_rate": 1.6825736058238033e-06, "loss": 0.73172909, "num_input_tokens_seen": 202138030, "step": 9384, "time_per_iteration": 2.705378532409668 }, { "auxiliary_loss_clip": 0.01098378, "auxiliary_loss_mlp": 0.01035226, "balance_loss_clip": 1.04193604, "balance_loss_mlp": 1.02113533, "epoch": 0.5642567262888922, "flos": 22492325738880.0, "grad_norm": 2.5060474723218724, "language_loss": 0.75891483, "learning_rate": 1.6821890866954263e-06, "loss": 0.78025091, "num_input_tokens_seen": 202155580, "step": 9385, "time_per_iteration": 2.648486375808716 }, { "auxiliary_loss_clip": 0.01102679, "auxiliary_loss_mlp": 0.01034721, "balance_loss_clip": 1.03705001, "balance_loss_mlp": 1.02121449, "epoch": 0.5643168495415603, "flos": 13003972692480.0, "grad_norm": 1.9370694733196534, "language_loss": 0.82360542, "learning_rate": 1.6818045796184703e-06, "loss": 0.84497941, "num_input_tokens_seen": 202170365, "step": 9386, "time_per_iteration": 2.6014211177825928 }, { "auxiliary_loss_clip": 0.01108433, "auxiliary_loss_mlp": 0.01035233, "balance_loss_clip": 1.04246962, "balance_loss_mlp": 1.02117205, "epoch": 0.5643769727942282, "flos": 18588297352320.0, "grad_norm": 2.256739627854675, "language_loss": 0.69928676, "learning_rate": 1.681420084607516e-06, "loss": 0.72072339, "num_input_tokens_seen": 202189095, "step": 9387, "time_per_iteration": 2.6225178241729736 }, { "auxiliary_loss_clip": 0.01110032, "auxiliary_loss_mlp": 0.01036058, "balance_loss_clip": 1.04169261, "balance_loss_mlp": 1.02292085, "epoch": 0.5644370960468962, "flos": 33806269572480.0, "grad_norm": 1.4294069994917775, "language_loss": 0.74616826, "learning_rate": 1.6810356016771452e-06, "loss": 0.76762915, "num_input_tokens_seen": 202213500, "step": 9388, "time_per_iteration": 4.3489909172058105 }, { "auxiliary_loss_clip": 0.01103005, "auxiliary_loss_mlp": 0.01033351, "balance_loss_clip": 1.04041004, "balance_loss_mlp": 1.02143562, "epoch": 0.5644972192995641, "flos": 21214911386880.0, "grad_norm": 1.5515532198665989, "language_loss": 0.81965339, "learning_rate": 1.6806511308419353e-06, "loss": 0.84101695, "num_input_tokens_seen": 202231920, "step": 9389, "time_per_iteration": 5.713036060333252 }, { "auxiliary_loss_clip": 0.01083726, "auxiliary_loss_mlp": 0.01035772, "balance_loss_clip": 1.03770804, "balance_loss_mlp": 1.02090037, "epoch": 0.5645573425522321, "flos": 18587722734720.0, "grad_norm": 2.017294292301613, "language_loss": 0.63844502, "learning_rate": 1.680266672116467e-06, "loss": 0.65964001, "num_input_tokens_seen": 202247600, "step": 9390, "time_per_iteration": 2.718738079071045 }, { "auxiliary_loss_clip": 0.01096947, "auxiliary_loss_mlp": 0.01030588, "balance_loss_clip": 1.04229331, "balance_loss_mlp": 1.01875103, "epoch": 0.5646174658049, "flos": 18113809668480.0, "grad_norm": 1.8385345725956297, "language_loss": 0.92190915, "learning_rate": 1.6798822255153192e-06, "loss": 0.94318449, "num_input_tokens_seen": 202265350, "step": 9391, "time_per_iteration": 2.6871705055236816 }, { "auxiliary_loss_clip": 0.01118295, "auxiliary_loss_mlp": 0.01037212, "balance_loss_clip": 1.04650784, "balance_loss_mlp": 1.02288282, "epoch": 0.564677589057568, "flos": 28329964087680.0, "grad_norm": 2.30014312113224, "language_loss": 0.60238105, "learning_rate": 1.6794977910530684e-06, "loss": 0.62393618, "num_input_tokens_seen": 202284285, "step": 9392, "time_per_iteration": 2.6965878009796143 }, { "auxiliary_loss_clip": 0.01068376, "auxiliary_loss_mlp": 0.01027584, "balance_loss_clip": 1.03531122, "balance_loss_mlp": 1.01367223, "epoch": 0.564737712310236, "flos": 22163743100160.0, "grad_norm": 2.2381091213593924, "language_loss": 0.81505215, "learning_rate": 1.6791133687442937e-06, "loss": 0.83601177, "num_input_tokens_seen": 202303450, "step": 9393, "time_per_iteration": 4.253687620162964 }, { "auxiliary_loss_clip": 0.01095131, "auxiliary_loss_mlp": 0.01031195, "balance_loss_clip": 1.03995085, "balance_loss_mlp": 1.01804614, "epoch": 0.564797835562904, "flos": 20959011918720.0, "grad_norm": 1.6857006339700658, "language_loss": 0.87381589, "learning_rate": 1.6787289586035725e-06, "loss": 0.89507914, "num_input_tokens_seen": 202322315, "step": 9394, "time_per_iteration": 2.6733334064483643 }, { "auxiliary_loss_clip": 0.0110875, "auxiliary_loss_mlp": 0.01033757, "balance_loss_clip": 1.04296374, "balance_loss_mlp": 1.02065587, "epoch": 0.5648579588155719, "flos": 17420302805760.0, "grad_norm": 1.9505278392416294, "language_loss": 0.84685338, "learning_rate": 1.6783445606454814e-06, "loss": 0.86827838, "num_input_tokens_seen": 202339905, "step": 9395, "time_per_iteration": 2.6754062175750732 }, { "auxiliary_loss_clip": 0.0102964, "auxiliary_loss_mlp": 0.01000117, "balance_loss_clip": 1.01416993, "balance_loss_mlp": 0.99888915, "epoch": 0.5649180820682399, "flos": 69929568835200.0, "grad_norm": 0.7966393150311729, "language_loss": 0.58260763, "learning_rate": 1.677960174884597e-06, "loss": 0.60290521, "num_input_tokens_seen": 202397320, "step": 9396, "time_per_iteration": 3.176486015319824 }, { "auxiliary_loss_clip": 0.01099184, "auxiliary_loss_mlp": 0.01030849, "balance_loss_clip": 1.04099381, "balance_loss_mlp": 1.01762295, "epoch": 0.5649782053209078, "flos": 24973070641920.0, "grad_norm": 1.8659420980935195, "language_loss": 0.70408708, "learning_rate": 1.6775758013354943e-06, "loss": 0.72538739, "num_input_tokens_seen": 202416865, "step": 9397, "time_per_iteration": 2.76436710357666 }, { "auxiliary_loss_clip": 0.01087737, "auxiliary_loss_mlp": 0.01036875, "balance_loss_clip": 1.0412184, "balance_loss_mlp": 1.02305877, "epoch": 0.5650383285735758, "flos": 21726602582400.0, "grad_norm": 1.7242630837852022, "language_loss": 0.66510224, "learning_rate": 1.67719144001275e-06, "loss": 0.68634838, "num_input_tokens_seen": 202436210, "step": 9398, "time_per_iteration": 2.8452060222625732 }, { "auxiliary_loss_clip": 0.0102199, "auxiliary_loss_mlp": 0.01002651, "balance_loss_clip": 1.01533413, "balance_loss_mlp": 1.00157201, "epoch": 0.5650984518262439, "flos": 65904484636800.0, "grad_norm": 0.7636877487193632, "language_loss": 0.58165693, "learning_rate": 1.6768070909309386e-06, "loss": 0.60190332, "num_input_tokens_seen": 202492925, "step": 9399, "time_per_iteration": 3.1523597240448 }, { "auxiliary_loss_clip": 0.01076045, "auxiliary_loss_mlp": 0.01036845, "balance_loss_clip": 1.03608418, "balance_loss_mlp": 1.02109778, "epoch": 0.5651585750789118, "flos": 21032592929280.0, "grad_norm": 2.707299355352823, "language_loss": 0.7311101, "learning_rate": 1.6764227541046347e-06, "loss": 0.75223899, "num_input_tokens_seen": 202511905, "step": 9400, "time_per_iteration": 2.778313636779785 }, { "auxiliary_loss_clip": 0.01093566, "auxiliary_loss_mlp": 0.01038541, "balance_loss_clip": 1.04261565, "balance_loss_mlp": 1.02349663, "epoch": 0.5652186983315798, "flos": 18551919853440.0, "grad_norm": 1.7896331589473868, "language_loss": 0.6111843, "learning_rate": 1.676038429548412e-06, "loss": 0.63250542, "num_input_tokens_seen": 202529815, "step": 9401, "time_per_iteration": 2.7110683917999268 }, { "auxiliary_loss_clip": 0.01077473, "auxiliary_loss_mlp": 0.01030698, "balance_loss_clip": 1.03607464, "balance_loss_mlp": 1.01735282, "epoch": 0.5652788215842477, "flos": 18478662065280.0, "grad_norm": 3.6521869515488405, "language_loss": 0.81323993, "learning_rate": 1.6756541172768453e-06, "loss": 0.83432162, "num_input_tokens_seen": 202547710, "step": 9402, "time_per_iteration": 2.8134961128234863 }, { "auxiliary_loss_clip": 0.0106172, "auxiliary_loss_mlp": 0.01043189, "balance_loss_clip": 1.03186333, "balance_loss_mlp": 1.02785897, "epoch": 0.5653389448369157, "flos": 30044052080640.0, "grad_norm": 1.434807389128129, "language_loss": 0.77711642, "learning_rate": 1.6752698173045068e-06, "loss": 0.79816544, "num_input_tokens_seen": 202568835, "step": 9403, "time_per_iteration": 2.9176833629608154 }, { "auxiliary_loss_clip": 0.01064861, "auxiliary_loss_mlp": 0.01036876, "balance_loss_clip": 1.03543758, "balance_loss_mlp": 1.02137828, "epoch": 0.5653990680895836, "flos": 16727550128640.0, "grad_norm": 1.6891349615397695, "language_loss": 0.69381618, "learning_rate": 1.6748855296459685e-06, "loss": 0.71483362, "num_input_tokens_seen": 202587385, "step": 9404, "time_per_iteration": 2.8122291564941406 }, { "auxiliary_loss_clip": 0.01081972, "auxiliary_loss_mlp": 0.01035091, "balance_loss_clip": 1.03926969, "balance_loss_mlp": 1.02245533, "epoch": 0.5654591913422516, "flos": 14538256179840.0, "grad_norm": 1.8707097320787585, "language_loss": 0.66802347, "learning_rate": 1.6745012543158045e-06, "loss": 0.68919408, "num_input_tokens_seen": 202604815, "step": 9405, "time_per_iteration": 2.6256675720214844 }, { "auxiliary_loss_clip": 0.01087827, "auxiliary_loss_mlp": 0.01038368, "balance_loss_clip": 1.03976154, "balance_loss_mlp": 1.02543378, "epoch": 0.5655193145949196, "flos": 26209905603840.0, "grad_norm": 1.7731068900459501, "language_loss": 0.74520212, "learning_rate": 1.6741169913285852e-06, "loss": 0.76646411, "num_input_tokens_seen": 202623775, "step": 9406, "time_per_iteration": 2.7220685482025146 }, { "auxiliary_loss_clip": 0.01061139, "auxiliary_loss_mlp": 0.01043351, "balance_loss_clip": 1.03829598, "balance_loss_mlp": 1.02655435, "epoch": 0.5655794378475876, "flos": 25046579825280.0, "grad_norm": 1.7152353741974506, "language_loss": 0.7952764, "learning_rate": 1.673732740698882e-06, "loss": 0.81632137, "num_input_tokens_seen": 202643375, "step": 9407, "time_per_iteration": 2.785325765609741 }, { "auxiliary_loss_clip": 0.01077703, "auxiliary_loss_mlp": 0.01039246, "balance_loss_clip": 1.03728688, "balance_loss_mlp": 1.02510178, "epoch": 0.5656395611002555, "flos": 31032852652800.0, "grad_norm": 1.3619251628826352, "language_loss": 0.71023029, "learning_rate": 1.6733485024412666e-06, "loss": 0.73139971, "num_input_tokens_seen": 202668400, "step": 9408, "time_per_iteration": 2.8171489238739014 }, { "auxiliary_loss_clip": 0.01061658, "auxiliary_loss_mlp": 0.01035867, "balance_loss_clip": 1.03865576, "balance_loss_mlp": 1.02198541, "epoch": 0.5656996843529235, "flos": 20229522606720.0, "grad_norm": 1.9952093590252573, "language_loss": 0.81203496, "learning_rate": 1.672964276570308e-06, "loss": 0.8330102, "num_input_tokens_seen": 202685125, "step": 9409, "time_per_iteration": 2.770899772644043 }, { "auxiliary_loss_clip": 0.01076156, "auxiliary_loss_mlp": 0.01030595, "balance_loss_clip": 1.03786421, "balance_loss_mlp": 1.01730919, "epoch": 0.5657598076055914, "flos": 20996251344000.0, "grad_norm": 1.8859201816541107, "language_loss": 0.78039193, "learning_rate": 1.6725800631005776e-06, "loss": 0.80145949, "num_input_tokens_seen": 202703830, "step": 9410, "time_per_iteration": 2.6944680213928223 }, { "auxiliary_loss_clip": 0.01121778, "auxiliary_loss_mlp": 0.01042462, "balance_loss_clip": 1.04339719, "balance_loss_mlp": 1.02865767, "epoch": 0.5658199308582594, "flos": 11545999649280.0, "grad_norm": 2.199230863577756, "language_loss": 0.83460367, "learning_rate": 1.6721958620466432e-06, "loss": 0.85624611, "num_input_tokens_seen": 202719835, "step": 9411, "time_per_iteration": 2.576122760772705 }, { "auxiliary_loss_clip": 0.01112938, "auxiliary_loss_mlp": 0.01033542, "balance_loss_clip": 1.04195237, "balance_loss_mlp": 1.01830769, "epoch": 0.5658800541109275, "flos": 14172146807040.0, "grad_norm": 3.221148840875553, "language_loss": 0.67855954, "learning_rate": 1.6718116734230749e-06, "loss": 0.70002437, "num_input_tokens_seen": 202736795, "step": 9412, "time_per_iteration": 2.6416120529174805 }, { "auxiliary_loss_clip": 0.01104164, "auxiliary_loss_mlp": 0.01032428, "balance_loss_clip": 1.04040003, "balance_loss_mlp": 1.02026224, "epoch": 0.5659401773635954, "flos": 27305073325440.0, "grad_norm": 1.6585263288332466, "language_loss": 0.58582389, "learning_rate": 1.6714274972444413e-06, "loss": 0.60718977, "num_input_tokens_seen": 202756900, "step": 9413, "time_per_iteration": 2.678048610687256 }, { "auxiliary_loss_clip": 0.01039217, "auxiliary_loss_mlp": 0.01044241, "balance_loss_clip": 1.03433728, "balance_loss_mlp": 1.02943516, "epoch": 0.5660003006162634, "flos": 16728196573440.0, "grad_norm": 1.5449777270978375, "language_loss": 0.69369984, "learning_rate": 1.6710433335253092e-06, "loss": 0.71453446, "num_input_tokens_seen": 202775145, "step": 9414, "time_per_iteration": 2.7721176147460938 }, { "auxiliary_loss_clip": 0.01048825, "auxiliary_loss_mlp": 0.01033596, "balance_loss_clip": 1.04257154, "balance_loss_mlp": 1.02139449, "epoch": 0.5660604238689313, "flos": 21653452535040.0, "grad_norm": 1.812121190686056, "language_loss": 0.78028589, "learning_rate": 1.670659182280247e-06, "loss": 0.80111009, "num_input_tokens_seen": 202794505, "step": 9415, "time_per_iteration": 3.0027029514312744 }, { "auxiliary_loss_clip": 0.01020707, "auxiliary_loss_mlp": 0.01005189, "balance_loss_clip": 1.01482093, "balance_loss_mlp": 1.00411057, "epoch": 0.5661205471215993, "flos": 68824022083200.0, "grad_norm": 0.6894107195855314, "language_loss": 0.4917945, "learning_rate": 1.670275043523822e-06, "loss": 0.51205349, "num_input_tokens_seen": 202858580, "step": 9416, "time_per_iteration": 3.564145565032959 }, { "auxiliary_loss_clip": 0.01107627, "auxiliary_loss_mlp": 0.00770936, "balance_loss_clip": 1.04195189, "balance_loss_mlp": 1.00020862, "epoch": 0.5661806703742672, "flos": 28621774177920.0, "grad_norm": 1.657672708695628, "language_loss": 0.62541103, "learning_rate": 1.6698909172706e-06, "loss": 0.64419663, "num_input_tokens_seen": 202878565, "step": 9417, "time_per_iteration": 2.6624128818511963 }, { "auxiliary_loss_clip": 0.01098355, "auxiliary_loss_mlp": 0.01033838, "balance_loss_clip": 1.03992152, "balance_loss_mlp": 1.02003968, "epoch": 0.5662407936269352, "flos": 21397948116480.0, "grad_norm": 1.9219049023075434, "language_loss": 0.68760461, "learning_rate": 1.6695068035351479e-06, "loss": 0.7089265, "num_input_tokens_seen": 202897350, "step": 9418, "time_per_iteration": 2.686701774597168 }, { "auxiliary_loss_clip": 0.0110608, "auxiliary_loss_mlp": 0.01034957, "balance_loss_clip": 1.03848708, "balance_loss_mlp": 1.01997232, "epoch": 0.5663009168796032, "flos": 25660005315840.0, "grad_norm": 1.8426385136450754, "language_loss": 0.65225303, "learning_rate": 1.6691227023320304e-06, "loss": 0.67366338, "num_input_tokens_seen": 202916745, "step": 9419, "time_per_iteration": 2.7483572959899902 }, { "auxiliary_loss_clip": 0.00978175, "auxiliary_loss_mlp": 0.01018666, "balance_loss_clip": 1.01932096, "balance_loss_mlp": 1.01722336, "epoch": 0.5663610401322712, "flos": 67930458422400.0, "grad_norm": 0.7448874820638522, "language_loss": 0.59677726, "learning_rate": 1.6687386136758135e-06, "loss": 0.61674571, "num_input_tokens_seen": 202982375, "step": 9420, "time_per_iteration": 3.422990083694458 }, { "auxiliary_loss_clip": 0.01098663, "auxiliary_loss_mlp": 0.00770427, "balance_loss_clip": 1.0412631, "balance_loss_mlp": 1.00017929, "epoch": 0.5664211633849391, "flos": 24609367480320.0, "grad_norm": 1.5681535851968893, "language_loss": 0.74130625, "learning_rate": 1.6683545375810618e-06, "loss": 0.75999713, "num_input_tokens_seen": 203002430, "step": 9421, "time_per_iteration": 2.8006680011749268 }, { "auxiliary_loss_clip": 0.0108426, "auxiliary_loss_mlp": 0.01035979, "balance_loss_clip": 1.03777134, "balance_loss_mlp": 1.02212119, "epoch": 0.5664812866376071, "flos": 11648811352320.0, "grad_norm": 2.1577016458252567, "language_loss": 0.72988069, "learning_rate": 1.6679704740623389e-06, "loss": 0.75108308, "num_input_tokens_seen": 203019425, "step": 9422, "time_per_iteration": 2.6400234699249268 }, { "auxiliary_loss_clip": 0.01105093, "auxiliary_loss_mlp": 0.01037861, "balance_loss_clip": 1.04141676, "balance_loss_mlp": 1.02530825, "epoch": 0.566541409890275, "flos": 24643985212800.0, "grad_norm": 1.7654112494568213, "language_loss": 0.81893075, "learning_rate": 1.6675864231342085e-06, "loss": 0.84036028, "num_input_tokens_seen": 203039035, "step": 9423, "time_per_iteration": 2.673105239868164 }, { "auxiliary_loss_clip": 0.01090689, "auxiliary_loss_mlp": 0.01037493, "balance_loss_clip": 1.03944159, "balance_loss_mlp": 1.02356339, "epoch": 0.566601533142943, "flos": 22270577126400.0, "grad_norm": 1.4934148877619189, "language_loss": 0.8075555, "learning_rate": 1.6672023848112353e-06, "loss": 0.82883728, "num_input_tokens_seen": 203059320, "step": 9424, "time_per_iteration": 2.6597039699554443 }, { "auxiliary_loss_clip": 0.01124321, "auxiliary_loss_mlp": 0.00771519, "balance_loss_clip": 1.04382432, "balance_loss_mlp": 1.00018978, "epoch": 0.5666616563956111, "flos": 29971656218880.0, "grad_norm": 2.0092362269175297, "language_loss": 0.78882873, "learning_rate": 1.6668183591079805e-06, "loss": 0.80778712, "num_input_tokens_seen": 203078490, "step": 9425, "time_per_iteration": 2.6688153743743896 }, { "auxiliary_loss_clip": 0.01090837, "auxiliary_loss_mlp": 0.01034858, "balance_loss_clip": 1.0417583, "balance_loss_mlp": 1.02170324, "epoch": 0.566721779648279, "flos": 17781456101760.0, "grad_norm": 1.976091068193849, "language_loss": 0.5920769, "learning_rate": 1.6664343460390064e-06, "loss": 0.61333382, "num_input_tokens_seen": 203096065, "step": 9426, "time_per_iteration": 2.6646664142608643 }, { "auxiliary_loss_clip": 0.01110034, "auxiliary_loss_mlp": 0.01032331, "balance_loss_clip": 1.04102027, "balance_loss_mlp": 1.01922381, "epoch": 0.566781902900947, "flos": 21033490769280.0, "grad_norm": 2.110311025280775, "language_loss": 0.81678975, "learning_rate": 1.6660503456188764e-06, "loss": 0.83821344, "num_input_tokens_seen": 203115270, "step": 9427, "time_per_iteration": 5.8222620487213135 }, { "auxiliary_loss_clip": 0.01117064, "auxiliary_loss_mlp": 0.01038278, "balance_loss_clip": 1.04323864, "balance_loss_mlp": 1.02506411, "epoch": 0.5668420261536149, "flos": 23148593176320.0, "grad_norm": 1.814267468057716, "language_loss": 0.86105633, "learning_rate": 1.6656663578621498e-06, "loss": 0.88260972, "num_input_tokens_seen": 203134290, "step": 9428, "time_per_iteration": 4.0940985679626465 }, { "auxiliary_loss_clip": 0.01102233, "auxiliary_loss_mlp": 0.01034099, "balance_loss_clip": 1.04397511, "balance_loss_mlp": 1.01996648, "epoch": 0.5669021494062829, "flos": 22601601889920.0, "grad_norm": 2.604927880391597, "language_loss": 0.73541754, "learning_rate": 1.6652823827833886e-06, "loss": 0.75678086, "num_input_tokens_seen": 203152935, "step": 9429, "time_per_iteration": 2.711982011795044 }, { "auxiliary_loss_clip": 0.01100688, "auxiliary_loss_mlp": 0.00772268, "balance_loss_clip": 1.04164147, "balance_loss_mlp": 1.00020123, "epoch": 0.5669622726589508, "flos": 17381231786880.0, "grad_norm": 3.499205688936759, "language_loss": 0.75380534, "learning_rate": 1.6648984203971538e-06, "loss": 0.77253491, "num_input_tokens_seen": 203170110, "step": 9430, "time_per_iteration": 2.775536060333252 }, { "auxiliary_loss_clip": 0.0111876, "auxiliary_loss_mlp": 0.01036284, "balance_loss_clip": 1.04125142, "balance_loss_mlp": 1.02263451, "epoch": 0.5670223959116188, "flos": 18763253521920.0, "grad_norm": 1.7932678929965582, "language_loss": 0.72862244, "learning_rate": 1.6645144707180032e-06, "loss": 0.75017291, "num_input_tokens_seen": 203188825, "step": 9431, "time_per_iteration": 2.7299160957336426 }, { "auxiliary_loss_clip": 0.01068382, "auxiliary_loss_mlp": 0.01037407, "balance_loss_clip": 1.03856969, "balance_loss_mlp": 1.02459264, "epoch": 0.5670825191642868, "flos": 13553334276480.0, "grad_norm": 1.899230938499918, "language_loss": 0.73544705, "learning_rate": 1.6641305337604984e-06, "loss": 0.75650489, "num_input_tokens_seen": 203206860, "step": 9432, "time_per_iteration": 2.68713641166687 }, { "auxiliary_loss_clip": 0.01066627, "auxiliary_loss_mlp": 0.01032044, "balance_loss_clip": 1.03716183, "balance_loss_mlp": 1.01875782, "epoch": 0.5671426424169548, "flos": 22054035985920.0, "grad_norm": 1.4657818599236931, "language_loss": 0.78099382, "learning_rate": 1.663746609539197e-06, "loss": 0.80198044, "num_input_tokens_seen": 203225625, "step": 9433, "time_per_iteration": 4.3982954025268555 }, { "auxiliary_loss_clip": 0.01123451, "auxiliary_loss_mlp": 0.01038623, "balance_loss_clip": 1.04226542, "balance_loss_mlp": 1.02239299, "epoch": 0.5672027656696227, "flos": 21323972056320.0, "grad_norm": 1.9415050552486373, "language_loss": 0.6311425, "learning_rate": 1.6633626980686582e-06, "loss": 0.65276325, "num_input_tokens_seen": 203242920, "step": 9434, "time_per_iteration": 2.6829726696014404 }, { "auxiliary_loss_clip": 0.01106985, "auxiliary_loss_mlp": 0.01029655, "balance_loss_clip": 1.04066229, "balance_loss_mlp": 1.01654196, "epoch": 0.5672628889222907, "flos": 23514056104320.0, "grad_norm": 2.0456781967901025, "language_loss": 0.66337132, "learning_rate": 1.6629787993634399e-06, "loss": 0.68473774, "num_input_tokens_seen": 203261995, "step": 9435, "time_per_iteration": 2.7055511474609375 }, { "auxiliary_loss_clip": 0.01092568, "auxiliary_loss_mlp": 0.00770808, "balance_loss_clip": 1.03747869, "balance_loss_mlp": 1.00008333, "epoch": 0.5673230121749586, "flos": 27121928855040.0, "grad_norm": 1.9714061310868114, "language_loss": 0.71574509, "learning_rate": 1.6625949134380984e-06, "loss": 0.73437893, "num_input_tokens_seen": 203280670, "step": 9436, "time_per_iteration": 2.7314302921295166 }, { "auxiliary_loss_clip": 0.01119804, "auxiliary_loss_mlp": 0.01034867, "balance_loss_clip": 1.041466, "balance_loss_mlp": 1.02099752, "epoch": 0.5673831354276266, "flos": 31141985149440.0, "grad_norm": 1.474374193730658, "language_loss": 0.7411499, "learning_rate": 1.6622110403071921e-06, "loss": 0.76269662, "num_input_tokens_seen": 203304800, "step": 9437, "time_per_iteration": 2.6829545497894287 }, { "auxiliary_loss_clip": 0.01115825, "auxiliary_loss_mlp": 0.01036618, "balance_loss_clip": 1.04766893, "balance_loss_mlp": 1.02231264, "epoch": 0.5674432586802945, "flos": 27673193859840.0, "grad_norm": 2.0226289672132096, "language_loss": 0.6118415, "learning_rate": 1.661827179985277e-06, "loss": 0.63336593, "num_input_tokens_seen": 203324060, "step": 9438, "time_per_iteration": 2.6840946674346924 }, { "auxiliary_loss_clip": 0.01097885, "auxiliary_loss_mlp": 0.01032312, "balance_loss_clip": 1.03924835, "balance_loss_mlp": 1.0185318, "epoch": 0.5675033819329626, "flos": 26615157822720.0, "grad_norm": 1.5530482991602657, "language_loss": 0.75020033, "learning_rate": 1.661443332486909e-06, "loss": 0.77150226, "num_input_tokens_seen": 203344360, "step": 9439, "time_per_iteration": 2.6898789405822754 }, { "auxiliary_loss_clip": 0.01092055, "auxiliary_loss_mlp": 0.01036149, "balance_loss_clip": 1.04008341, "balance_loss_mlp": 1.02168322, "epoch": 0.5675635051856306, "flos": 19098372435840.0, "grad_norm": 1.924986803502997, "language_loss": 0.83848387, "learning_rate": 1.6610594978266438e-06, "loss": 0.85976589, "num_input_tokens_seen": 203362115, "step": 9440, "time_per_iteration": 2.7438228130340576 }, { "auxiliary_loss_clip": 0.01087383, "auxiliary_loss_mlp": 0.01036961, "balance_loss_clip": 1.0389899, "balance_loss_mlp": 1.02264404, "epoch": 0.5676236284382985, "flos": 17566315591680.0, "grad_norm": 3.3538120018942843, "language_loss": 0.75190175, "learning_rate": 1.6606756760190365e-06, "loss": 0.7731452, "num_input_tokens_seen": 203380550, "step": 9441, "time_per_iteration": 2.6487948894500732 }, { "auxiliary_loss_clip": 0.01066366, "auxiliary_loss_mlp": 0.01037451, "balance_loss_clip": 1.03523147, "balance_loss_mlp": 1.02376556, "epoch": 0.5676837516909665, "flos": 15954069634560.0, "grad_norm": 1.8078445069287523, "language_loss": 0.83109975, "learning_rate": 1.6602918670786413e-06, "loss": 0.85213792, "num_input_tokens_seen": 203396590, "step": 9442, "time_per_iteration": 2.692474842071533 }, { "auxiliary_loss_clip": 0.01083606, "auxiliary_loss_mlp": 0.0103585, "balance_loss_clip": 1.04210138, "balance_loss_mlp": 1.02311242, "epoch": 0.5677438749436344, "flos": 18295912644480.0, "grad_norm": 2.0214699890453414, "language_loss": 0.74567246, "learning_rate": 1.6599080710200126e-06, "loss": 0.76686704, "num_input_tokens_seen": 203414280, "step": 9443, "time_per_iteration": 2.742173433303833 }, { "auxiliary_loss_clip": 0.01093942, "auxiliary_loss_mlp": 0.01036542, "balance_loss_clip": 1.04245853, "balance_loss_mlp": 1.02310669, "epoch": 0.5678039981963025, "flos": 17931311642880.0, "grad_norm": 2.2236359492875817, "language_loss": 0.77068752, "learning_rate": 1.6595242878577046e-06, "loss": 0.79199237, "num_input_tokens_seen": 203433280, "step": 9444, "time_per_iteration": 2.65165376663208 }, { "auxiliary_loss_clip": 0.01083168, "auxiliary_loss_mlp": 0.01042977, "balance_loss_clip": 1.04132152, "balance_loss_mlp": 1.02910697, "epoch": 0.5678641214489704, "flos": 19316350120320.0, "grad_norm": 1.9769562357276376, "language_loss": 0.80988097, "learning_rate": 1.6591405176062687e-06, "loss": 0.83114243, "num_input_tokens_seen": 203449935, "step": 9445, "time_per_iteration": 2.692103147506714 }, { "auxiliary_loss_clip": 0.01115981, "auxiliary_loss_mlp": 0.01030041, "balance_loss_clip": 1.03910589, "balance_loss_mlp": 1.01635599, "epoch": 0.5679242447016384, "flos": 27751084502400.0, "grad_norm": 1.8145653139656197, "language_loss": 0.71126974, "learning_rate": 1.658756760280259e-06, "loss": 0.73272997, "num_input_tokens_seen": 203473025, "step": 9446, "time_per_iteration": 2.6656479835510254 }, { "auxiliary_loss_clip": 0.01084809, "auxiliary_loss_mlp": 0.01029841, "balance_loss_clip": 1.03896558, "balance_loss_mlp": 1.01640046, "epoch": 0.5679843679543063, "flos": 23769093646080.0, "grad_norm": 1.9173533022587075, "language_loss": 0.73434311, "learning_rate": 1.6583730158942276e-06, "loss": 0.75548959, "num_input_tokens_seen": 203492895, "step": 9447, "time_per_iteration": 2.7948012351989746 }, { "auxiliary_loss_clip": 0.01099661, "auxiliary_loss_mlp": 0.01034648, "balance_loss_clip": 1.04186499, "balance_loss_mlp": 1.02139819, "epoch": 0.5680444912069743, "flos": 25591883172480.0, "grad_norm": 3.5475375147623294, "language_loss": 0.7504915, "learning_rate": 1.657989284462725e-06, "loss": 0.77183461, "num_input_tokens_seen": 203513710, "step": 9448, "time_per_iteration": 2.700333595275879 }, { "auxiliary_loss_clip": 0.01079167, "auxiliary_loss_mlp": 0.01049109, "balance_loss_clip": 1.04264426, "balance_loss_mlp": 1.0336951, "epoch": 0.5681046144596422, "flos": 23695799944320.0, "grad_norm": 2.3399913967333865, "language_loss": 0.76352537, "learning_rate": 1.6576055660003038e-06, "loss": 0.78480804, "num_input_tokens_seen": 203531630, "step": 9449, "time_per_iteration": 2.7736854553222656 }, { "auxiliary_loss_clip": 0.01096359, "auxiliary_loss_mlp": 0.01042326, "balance_loss_clip": 1.04059768, "balance_loss_mlp": 1.02729404, "epoch": 0.5681647377123102, "flos": 28000770917760.0, "grad_norm": 1.7507923980752478, "language_loss": 0.74660265, "learning_rate": 1.6572218605215128e-06, "loss": 0.76798952, "num_input_tokens_seen": 203551885, "step": 9450, "time_per_iteration": 2.749420642852783 }, { "auxiliary_loss_clip": 0.01102012, "auxiliary_loss_mlp": 0.01039617, "balance_loss_clip": 1.04193068, "balance_loss_mlp": 1.02674794, "epoch": 0.5682248609649782, "flos": 22747758330240.0, "grad_norm": 2.689223250754005, "language_loss": 0.66906244, "learning_rate": 1.6568381680409038e-06, "loss": 0.69047868, "num_input_tokens_seen": 203572250, "step": 9451, "time_per_iteration": 2.753199338912964 }, { "auxiliary_loss_clip": 0.01096067, "auxiliary_loss_mlp": 0.01038718, "balance_loss_clip": 1.03942561, "balance_loss_mlp": 1.02265501, "epoch": 0.5682849842176462, "flos": 21288600138240.0, "grad_norm": 3.0838986562683557, "language_loss": 0.71882987, "learning_rate": 1.656454488573026e-06, "loss": 0.74017769, "num_input_tokens_seen": 203590605, "step": 9452, "time_per_iteration": 2.6950924396514893 }, { "auxiliary_loss_clip": 0.01076417, "auxiliary_loss_mlp": 0.01030065, "balance_loss_clip": 1.03938448, "balance_loss_mlp": 1.01734543, "epoch": 0.5683451074703142, "flos": 21141689512320.0, "grad_norm": 1.8642874843773423, "language_loss": 0.70013601, "learning_rate": 1.656070822132428e-06, "loss": 0.72120082, "num_input_tokens_seen": 203610080, "step": 9453, "time_per_iteration": 2.7006165981292725 }, { "auxiliary_loss_clip": 0.01076829, "auxiliary_loss_mlp": 0.00769854, "balance_loss_clip": 1.04066825, "balance_loss_mlp": 1.00014949, "epoch": 0.5684052307229821, "flos": 22344481359360.0, "grad_norm": 2.037972918051024, "language_loss": 0.70139372, "learning_rate": 1.6556871687336592e-06, "loss": 0.71986055, "num_input_tokens_seen": 203630060, "step": 9454, "time_per_iteration": 2.759376287460327 }, { "auxiliary_loss_clip": 0.01095428, "auxiliary_loss_mlp": 0.01031911, "balance_loss_clip": 1.03987896, "balance_loss_mlp": 1.01938248, "epoch": 0.5684653539756501, "flos": 21798639308160.0, "grad_norm": 1.989743078970872, "language_loss": 0.6078186, "learning_rate": 1.6553035283912671e-06, "loss": 0.62909198, "num_input_tokens_seen": 203649065, "step": 9455, "time_per_iteration": 2.678152322769165 }, { "auxiliary_loss_clip": 0.01082741, "auxiliary_loss_mlp": 0.0103652, "balance_loss_clip": 1.0447154, "balance_loss_mlp": 1.02253652, "epoch": 0.568525477228318, "flos": 22999635475200.0, "grad_norm": 4.296474832454859, "language_loss": 0.73108375, "learning_rate": 1.6549199011198e-06, "loss": 0.75227636, "num_input_tokens_seen": 203667545, "step": 9456, "time_per_iteration": 2.7307004928588867 }, { "auxiliary_loss_clip": 0.01099598, "auxiliary_loss_mlp": 0.01031688, "balance_loss_clip": 1.04188192, "balance_loss_mlp": 1.01902199, "epoch": 0.568585600480986, "flos": 21392489249280.0, "grad_norm": 1.662795047431792, "language_loss": 0.77013254, "learning_rate": 1.6545362869338048e-06, "loss": 0.79144537, "num_input_tokens_seen": 203686025, "step": 9457, "time_per_iteration": 2.665708303451538 }, { "auxiliary_loss_clip": 0.01111194, "auxiliary_loss_mlp": 0.01036842, "balance_loss_clip": 1.0429163, "balance_loss_mlp": 1.02280521, "epoch": 0.568645723733654, "flos": 30007351359360.0, "grad_norm": 2.0672888051412817, "language_loss": 0.66191971, "learning_rate": 1.6541526858478285e-06, "loss": 0.68340003, "num_input_tokens_seen": 203705540, "step": 9458, "time_per_iteration": 2.780771017074585 }, { "auxiliary_loss_clip": 0.01110997, "auxiliary_loss_mlp": 0.01031454, "balance_loss_clip": 1.04201722, "balance_loss_mlp": 1.01742291, "epoch": 0.568705846986322, "flos": 20412667077120.0, "grad_norm": 2.504426538314312, "language_loss": 0.68920743, "learning_rate": 1.6537690978764167e-06, "loss": 0.71063197, "num_input_tokens_seen": 203723670, "step": 9459, "time_per_iteration": 2.637176513671875 }, { "auxiliary_loss_clip": 0.01095236, "auxiliary_loss_mlp": 0.01032887, "balance_loss_clip": 1.0442152, "balance_loss_mlp": 1.01929152, "epoch": 0.5687659702389899, "flos": 17456752131840.0, "grad_norm": 2.127788828908428, "language_loss": 0.76758575, "learning_rate": 1.6533855230341155e-06, "loss": 0.788867, "num_input_tokens_seen": 203739705, "step": 9460, "time_per_iteration": 2.7338075637817383 }, { "auxiliary_loss_clip": 0.01066336, "auxiliary_loss_mlp": 0.0103936, "balance_loss_clip": 1.04204893, "balance_loss_mlp": 1.02563262, "epoch": 0.5688260934916579, "flos": 25406081095680.0, "grad_norm": 1.8378075196350074, "language_loss": 0.71994978, "learning_rate": 1.65300196133547e-06, "loss": 0.74100673, "num_input_tokens_seen": 203759000, "step": 9461, "time_per_iteration": 2.9295692443847656 }, { "auxiliary_loss_clip": 0.01110974, "auxiliary_loss_mlp": 0.01036974, "balance_loss_clip": 1.04267561, "balance_loss_mlp": 1.02314544, "epoch": 0.5688862167443258, "flos": 21608024808960.0, "grad_norm": 2.3363777583338794, "language_loss": 0.73092425, "learning_rate": 1.6526184127950249e-06, "loss": 0.75240374, "num_input_tokens_seen": 203774295, "step": 9462, "time_per_iteration": 2.639132022857666 }, { "auxiliary_loss_clip": 0.01105415, "auxiliary_loss_mlp": 0.01026496, "balance_loss_clip": 1.03986573, "balance_loss_mlp": 1.01507592, "epoch": 0.5689463399969938, "flos": 22418996123520.0, "grad_norm": 1.9966058203681178, "language_loss": 0.72878397, "learning_rate": 1.6522348774273246e-06, "loss": 0.75010306, "num_input_tokens_seen": 203792710, "step": 9463, "time_per_iteration": 2.687623977661133 }, { "auxiliary_loss_clip": 0.01108157, "auxiliary_loss_mlp": 0.01032686, "balance_loss_clip": 1.04214895, "balance_loss_mlp": 1.02012718, "epoch": 0.5690064632496618, "flos": 18296810484480.0, "grad_norm": 2.136514167684146, "language_loss": 0.73800778, "learning_rate": 1.6518513552469123e-06, "loss": 0.75941622, "num_input_tokens_seen": 203811645, "step": 9464, "time_per_iteration": 2.6446449756622314 }, { "auxiliary_loss_clip": 0.01110623, "auxiliary_loss_mlp": 0.0077176, "balance_loss_clip": 1.04163098, "balance_loss_mlp": 1.00012827, "epoch": 0.5690665865023298, "flos": 21579260993280.0, "grad_norm": 2.0135063282733108, "language_loss": 0.84068149, "learning_rate": 1.6514678462683312e-06, "loss": 0.85950536, "num_input_tokens_seen": 203830040, "step": 9465, "time_per_iteration": 2.6243364810943604 }, { "auxiliary_loss_clip": 0.01092541, "auxiliary_loss_mlp": 0.01032276, "balance_loss_clip": 1.03678536, "balance_loss_mlp": 1.0195086, "epoch": 0.5691267097549978, "flos": 24421446501120.0, "grad_norm": 1.6434295280058835, "language_loss": 0.72125626, "learning_rate": 1.651084350506125e-06, "loss": 0.74250448, "num_input_tokens_seen": 203851245, "step": 9466, "time_per_iteration": 5.837533712387085 }, { "auxiliary_loss_clip": 0.01016007, "auxiliary_loss_mlp": 0.01001581, "balance_loss_clip": 1.01873374, "balance_loss_mlp": 1.00037718, "epoch": 0.5691868330076657, "flos": 61657906199040.0, "grad_norm": 0.7155703714304625, "language_loss": 0.55334294, "learning_rate": 1.6507008679748343e-06, "loss": 0.57351875, "num_input_tokens_seen": 203916400, "step": 9467, "time_per_iteration": 4.8396992683410645 }, { "auxiliary_loss_clip": 0.01107605, "auxiliary_loss_mlp": 0.0103869, "balance_loss_clip": 1.04263473, "balance_loss_mlp": 1.02364564, "epoch": 0.5692469562603337, "flos": 21325193118720.0, "grad_norm": 16.186384536861027, "language_loss": 0.6343258, "learning_rate": 1.6503173986890023e-06, "loss": 0.65578872, "num_input_tokens_seen": 203935870, "step": 9468, "time_per_iteration": 2.6212332248687744 }, { "auxiliary_loss_clip": 0.01066902, "auxiliary_loss_mlp": 0.01038069, "balance_loss_clip": 1.03614831, "balance_loss_mlp": 1.02334094, "epoch": 0.5693070795130016, "flos": 23367899664000.0, "grad_norm": 2.927691999708818, "language_loss": 0.78902012, "learning_rate": 1.64993394266317e-06, "loss": 0.81006986, "num_input_tokens_seen": 203954950, "step": 9469, "time_per_iteration": 2.745016098022461 }, { "auxiliary_loss_clip": 0.01085393, "auxiliary_loss_mlp": 0.01053274, "balance_loss_clip": 1.04159784, "balance_loss_mlp": 1.03830147, "epoch": 0.5693672027656697, "flos": 18697250280960.0, "grad_norm": 2.217720738619104, "language_loss": 0.69655335, "learning_rate": 1.6495504999118769e-06, "loss": 0.71793997, "num_input_tokens_seen": 203972715, "step": 9470, "time_per_iteration": 2.6895534992218018 }, { "auxiliary_loss_clip": 0.01097198, "auxiliary_loss_mlp": 0.01036868, "balance_loss_clip": 1.04529762, "balance_loss_mlp": 1.02352285, "epoch": 0.5694273260183376, "flos": 20449188230400.0, "grad_norm": 1.6026966116267123, "language_loss": 0.74473977, "learning_rate": 1.6491670704496644e-06, "loss": 0.76608038, "num_input_tokens_seen": 203990775, "step": 9471, "time_per_iteration": 2.6734213829040527 }, { "auxiliary_loss_clip": 0.01077759, "auxiliary_loss_mlp": 0.01040388, "balance_loss_clip": 1.0421195, "balance_loss_mlp": 1.02579701, "epoch": 0.5694874492710056, "flos": 17603195880960.0, "grad_norm": 1.75714793559233, "language_loss": 0.57588744, "learning_rate": 1.6487836542910716e-06, "loss": 0.59706891, "num_input_tokens_seen": 204008845, "step": 9472, "time_per_iteration": 4.335491180419922 }, { "auxiliary_loss_clip": 0.01082559, "auxiliary_loss_mlp": 0.01032344, "balance_loss_clip": 1.03902221, "balance_loss_mlp": 1.01946378, "epoch": 0.5695475725236735, "flos": 13370836250880.0, "grad_norm": 1.9281443896441626, "language_loss": 0.73845899, "learning_rate": 1.648400251450638e-06, "loss": 0.75960797, "num_input_tokens_seen": 204023755, "step": 9473, "time_per_iteration": 2.706148147583008 }, { "auxiliary_loss_clip": 0.01017729, "auxiliary_loss_mlp": 0.01007582, "balance_loss_clip": 1.02078795, "balance_loss_mlp": 1.00631857, "epoch": 0.5696076957763415, "flos": 68174398661760.0, "grad_norm": 0.6469732305814715, "language_loss": 0.57547617, "learning_rate": 1.6480168619429023e-06, "loss": 0.59572935, "num_input_tokens_seen": 204091255, "step": 9474, "time_per_iteration": 3.2811825275421143 }, { "auxiliary_loss_clip": 0.01106855, "auxiliary_loss_mlp": 0.01038889, "balance_loss_clip": 1.04254341, "balance_loss_mlp": 1.02532923, "epoch": 0.5696678190290094, "flos": 33838301525760.0, "grad_norm": 2.207374996280549, "language_loss": 0.53488398, "learning_rate": 1.6476334857824017e-06, "loss": 0.55634141, "num_input_tokens_seen": 204113285, "step": 9475, "time_per_iteration": 2.701791524887085 }, { "auxiliary_loss_clip": 0.01122912, "auxiliary_loss_mlp": 0.01039618, "balance_loss_clip": 1.04524517, "balance_loss_mlp": 1.0262965, "epoch": 0.5697279422816774, "flos": 26356600748160.0, "grad_norm": 1.6070261580589493, "language_loss": 0.79622197, "learning_rate": 1.647250122983675e-06, "loss": 0.81784725, "num_input_tokens_seen": 204133045, "step": 9476, "time_per_iteration": 2.695966958999634 }, { "auxiliary_loss_clip": 0.01101607, "auxiliary_loss_mlp": 0.01038712, "balance_loss_clip": 1.04603529, "balance_loss_mlp": 1.0258019, "epoch": 0.5697880655343454, "flos": 22930507751040.0, "grad_norm": 1.9576279407758228, "language_loss": 0.66811013, "learning_rate": 1.6468667735612592e-06, "loss": 0.68951333, "num_input_tokens_seen": 204152590, "step": 9477, "time_per_iteration": 2.6981940269470215 }, { "auxiliary_loss_clip": 0.0108821, "auxiliary_loss_mlp": 0.01037709, "balance_loss_clip": 1.04286826, "balance_loss_mlp": 1.02403569, "epoch": 0.5698481887870134, "flos": 26761314263040.0, "grad_norm": 1.587062911340377, "language_loss": 0.70738614, "learning_rate": 1.6464834375296906e-06, "loss": 0.72864532, "num_input_tokens_seen": 204171815, "step": 9478, "time_per_iteration": 2.779813766479492 }, { "auxiliary_loss_clip": 0.01084042, "auxiliary_loss_mlp": 0.01031832, "balance_loss_clip": 1.03916287, "balance_loss_mlp": 1.0200479, "epoch": 0.5699083120396814, "flos": 15742269089280.0, "grad_norm": 4.484039953055517, "language_loss": 0.6938777, "learning_rate": 1.6461001149035055e-06, "loss": 0.71503651, "num_input_tokens_seen": 204188535, "step": 9479, "time_per_iteration": 2.712655782699585 }, { "auxiliary_loss_clip": 0.01078443, "auxiliary_loss_mlp": 0.01033369, "balance_loss_clip": 1.04121661, "balance_loss_mlp": 1.02166843, "epoch": 0.5699684352923493, "flos": 19537272720000.0, "grad_norm": 2.2062311419155205, "language_loss": 0.71329868, "learning_rate": 1.6457168056972392e-06, "loss": 0.73441678, "num_input_tokens_seen": 204208365, "step": 9480, "time_per_iteration": 2.727628469467163 }, { "auxiliary_loss_clip": 0.01089043, "auxiliary_loss_mlp": 0.00769268, "balance_loss_clip": 1.04188204, "balance_loss_mlp": 1.00015211, "epoch": 0.5700285585450173, "flos": 16253349753600.0, "grad_norm": 2.49302312393396, "language_loss": 0.7201618, "learning_rate": 1.6453335099254276e-06, "loss": 0.73874491, "num_input_tokens_seen": 204226560, "step": 9481, "time_per_iteration": 2.6870779991149902 }, { "auxiliary_loss_clip": 0.01111632, "auxiliary_loss_mlp": 0.01037308, "balance_loss_clip": 1.04494166, "balance_loss_mlp": 1.02441525, "epoch": 0.5700886817976852, "flos": 19864993432320.0, "grad_norm": 2.3265371075794046, "language_loss": 0.78086042, "learning_rate": 1.6449502276026041e-06, "loss": 0.80234993, "num_input_tokens_seen": 204245410, "step": 9482, "time_per_iteration": 2.648545742034912 }, { "auxiliary_loss_clip": 0.01099058, "auxiliary_loss_mlp": 0.01031061, "balance_loss_clip": 1.04446602, "balance_loss_mlp": 1.01918221, "epoch": 0.5701488050503533, "flos": 23841704989440.0, "grad_norm": 1.4982420423731841, "language_loss": 0.77999502, "learning_rate": 1.6445669587433043e-06, "loss": 0.80129617, "num_input_tokens_seen": 204264840, "step": 9483, "time_per_iteration": 2.716085910797119 }, { "auxiliary_loss_clip": 0.01098634, "auxiliary_loss_mlp": 0.01043773, "balance_loss_clip": 1.04435062, "balance_loss_mlp": 1.03189337, "epoch": 0.5702089283030212, "flos": 23659673840640.0, "grad_norm": 1.773078274148673, "language_loss": 0.81291378, "learning_rate": 1.6441837033620612e-06, "loss": 0.83433783, "num_input_tokens_seen": 204284335, "step": 9484, "time_per_iteration": 2.7283802032470703 }, { "auxiliary_loss_clip": 0.01120809, "auxiliary_loss_mlp": 0.0077026, "balance_loss_clip": 1.04378128, "balance_loss_mlp": 1.00009394, "epoch": 0.5702690515556892, "flos": 27891171544320.0, "grad_norm": 294.9687469035841, "language_loss": 0.60670495, "learning_rate": 1.6438004614734073e-06, "loss": 0.6256156, "num_input_tokens_seen": 204302590, "step": 9485, "time_per_iteration": 2.7182137966156006 }, { "auxiliary_loss_clip": 0.01107456, "auxiliary_loss_mlp": 0.01033766, "balance_loss_clip": 1.04291701, "balance_loss_mlp": 1.02048063, "epoch": 0.5703291748083571, "flos": 24023951619840.0, "grad_norm": 2.0199937842049676, "language_loss": 0.65740418, "learning_rate": 1.6434172330918757e-06, "loss": 0.67881644, "num_input_tokens_seen": 204323055, "step": 9486, "time_per_iteration": 2.7076590061187744 }, { "auxiliary_loss_clip": 0.01026531, "auxiliary_loss_mlp": 0.01001416, "balance_loss_clip": 1.02014589, "balance_loss_mlp": 1.00029588, "epoch": 0.5703892980610251, "flos": 57023382919680.0, "grad_norm": 0.6682653451732087, "language_loss": 0.47990364, "learning_rate": 1.6430340182319978e-06, "loss": 0.50018317, "num_input_tokens_seen": 204386160, "step": 9487, "time_per_iteration": 3.3227086067199707 }, { "auxiliary_loss_clip": 0.0108502, "auxiliary_loss_mlp": 0.00770885, "balance_loss_clip": 1.04171848, "balance_loss_mlp": 1.00012314, "epoch": 0.570449421313693, "flos": 24351025887360.0, "grad_norm": 1.5998860502141972, "language_loss": 0.85676056, "learning_rate": 1.6426508169083067e-06, "loss": 0.87531954, "num_input_tokens_seen": 204406315, "step": 9488, "time_per_iteration": 2.7443041801452637 }, { "auxiliary_loss_clip": 0.01084932, "auxiliary_loss_mlp": 0.01036169, "balance_loss_clip": 1.04087424, "balance_loss_mlp": 1.02245951, "epoch": 0.570509544566361, "flos": 24828566227200.0, "grad_norm": 1.4382001019160457, "language_loss": 0.78847331, "learning_rate": 1.6422676291353314e-06, "loss": 0.80968434, "num_input_tokens_seen": 204427645, "step": 9489, "time_per_iteration": 2.7456719875335693 }, { "auxiliary_loss_clip": 0.01099206, "auxiliary_loss_mlp": 0.01028445, "balance_loss_clip": 1.04345155, "balance_loss_mlp": 1.01655364, "epoch": 0.570569667819029, "flos": 21397301671680.0, "grad_norm": 1.7750907148912565, "language_loss": 0.70044166, "learning_rate": 1.641884454927604e-06, "loss": 0.72171819, "num_input_tokens_seen": 204445910, "step": 9490, "time_per_iteration": 2.646172046661377 }, { "auxiliary_loss_clip": 0.01085076, "auxiliary_loss_mlp": 0.0103304, "balance_loss_clip": 1.04102945, "balance_loss_mlp": 1.02055264, "epoch": 0.570629791071697, "flos": 23216751233280.0, "grad_norm": 1.5662629922292932, "language_loss": 0.76374, "learning_rate": 1.6415012942996548e-06, "loss": 0.78492117, "num_input_tokens_seen": 204464680, "step": 9491, "time_per_iteration": 2.686228036880493 }, { "auxiliary_loss_clip": 0.01010704, "auxiliary_loss_mlp": 0.0075136, "balance_loss_clip": 1.01657176, "balance_loss_mlp": 0.99964297, "epoch": 0.570689914324365, "flos": 65284666525440.0, "grad_norm": 0.7940313966382696, "language_loss": 0.57365447, "learning_rate": 1.641118147266011e-06, "loss": 0.5912751, "num_input_tokens_seen": 204525580, "step": 9492, "time_per_iteration": 3.275951623916626 }, { "auxiliary_loss_clip": 0.01091927, "auxiliary_loss_mlp": 0.00770164, "balance_loss_clip": 1.0425539, "balance_loss_mlp": 1.00009966, "epoch": 0.5707500375770329, "flos": 21141904993920.0, "grad_norm": 1.811585397599456, "language_loss": 0.71563506, "learning_rate": 1.6407350138412035e-06, "loss": 0.73425597, "num_input_tokens_seen": 204541320, "step": 9493, "time_per_iteration": 2.6741974353790283 }, { "auxiliary_loss_clip": 0.0112282, "auxiliary_loss_mlp": 0.01032391, "balance_loss_clip": 1.0450213, "balance_loss_mlp": 1.01957655, "epoch": 0.5708101608297009, "flos": 20812747737600.0, "grad_norm": 1.647557383472974, "language_loss": 0.7782768, "learning_rate": 1.6403518940397606e-06, "loss": 0.79982895, "num_input_tokens_seen": 204560275, "step": 9494, "time_per_iteration": 2.6302967071533203 }, { "auxiliary_loss_clip": 0.01124725, "auxiliary_loss_mlp": 0.01031331, "balance_loss_clip": 1.04463601, "balance_loss_mlp": 1.01786041, "epoch": 0.5708702840823688, "flos": 25812338895360.0, "grad_norm": 2.0991801198395166, "language_loss": 0.80634642, "learning_rate": 1.6399687878762096e-06, "loss": 0.82790697, "num_input_tokens_seen": 204579430, "step": 9495, "time_per_iteration": 2.628124237060547 }, { "auxiliary_loss_clip": 0.01077213, "auxiliary_loss_mlp": 0.01041189, "balance_loss_clip": 1.03985035, "balance_loss_mlp": 1.02567959, "epoch": 0.5709304073350369, "flos": 23651916503040.0, "grad_norm": 2.1559343585674067, "language_loss": 0.66669941, "learning_rate": 1.6395856953650784e-06, "loss": 0.68788344, "num_input_tokens_seen": 204597710, "step": 9496, "time_per_iteration": 2.7877724170684814 }, { "auxiliary_loss_clip": 0.01125369, "auxiliary_loss_mlp": 0.01038193, "balance_loss_clip": 1.04586279, "balance_loss_mlp": 1.02479351, "epoch": 0.5709905305877048, "flos": 16107552449280.0, "grad_norm": 2.6392695697640387, "language_loss": 0.69406897, "learning_rate": 1.6392026165208938e-06, "loss": 0.71570456, "num_input_tokens_seen": 204616140, "step": 9497, "time_per_iteration": 2.5715434551239014 }, { "auxiliary_loss_clip": 0.01107343, "auxiliary_loss_mlp": 0.00770833, "balance_loss_clip": 1.04470205, "balance_loss_mlp": 1.00010204, "epoch": 0.5710506538403728, "flos": 24750819239040.0, "grad_norm": 2.381002532737965, "language_loss": 0.81296104, "learning_rate": 1.638819551358182e-06, "loss": 0.83174282, "num_input_tokens_seen": 204636470, "step": 9498, "time_per_iteration": 2.7146875858306885 }, { "auxiliary_loss_clip": 0.01122241, "auxiliary_loss_mlp": 0.01039082, "balance_loss_clip": 1.04371977, "balance_loss_mlp": 1.02453244, "epoch": 0.5711107770930407, "flos": 21982250655360.0, "grad_norm": 1.8640767096069095, "language_loss": 0.66366005, "learning_rate": 1.638436499891469e-06, "loss": 0.68527335, "num_input_tokens_seen": 204656640, "step": 9499, "time_per_iteration": 2.59460711479187 }, { "auxiliary_loss_clip": 0.01090983, "auxiliary_loss_mlp": 0.01034376, "balance_loss_clip": 1.04218864, "balance_loss_mlp": 1.02126861, "epoch": 0.5711709003457087, "flos": 19574009354880.0, "grad_norm": 1.5439081268362653, "language_loss": 0.71755552, "learning_rate": 1.6380534621352805e-06, "loss": 0.73880911, "num_input_tokens_seen": 204675475, "step": 9500, "time_per_iteration": 2.6723949909210205 }, { "auxiliary_loss_clip": 0.01092856, "auxiliary_loss_mlp": 0.01032614, "balance_loss_clip": 1.04149878, "balance_loss_mlp": 1.01973963, "epoch": 0.5712310235983766, "flos": 24242683489920.0, "grad_norm": 1.9336466751975971, "language_loss": 0.76224887, "learning_rate": 1.6376704381041407e-06, "loss": 0.78350353, "num_input_tokens_seen": 204695385, "step": 9501, "time_per_iteration": 2.7653119564056396 }, { "auxiliary_loss_clip": 0.01101056, "auxiliary_loss_mlp": 0.01035695, "balance_loss_clip": 1.04289281, "balance_loss_mlp": 1.02269506, "epoch": 0.5712911468510447, "flos": 20996143603200.0, "grad_norm": 1.6146609274124086, "language_loss": 0.75141633, "learning_rate": 1.6372874278125742e-06, "loss": 0.77278382, "num_input_tokens_seen": 204714730, "step": 9502, "time_per_iteration": 2.6820828914642334 }, { "auxiliary_loss_clip": 0.01088314, "auxiliary_loss_mlp": 0.01027948, "balance_loss_clip": 1.04387522, "balance_loss_mlp": 1.01492405, "epoch": 0.5713512701037126, "flos": 18916987731840.0, "grad_norm": 1.5621825440350152, "language_loss": 0.82271576, "learning_rate": 1.636904431275105e-06, "loss": 0.84387839, "num_input_tokens_seen": 204735025, "step": 9503, "time_per_iteration": 2.663109302520752 }, { "auxiliary_loss_clip": 0.01085944, "auxiliary_loss_mlp": 0.01033945, "balance_loss_clip": 1.04204583, "balance_loss_mlp": 1.02192843, "epoch": 0.5714113933563806, "flos": 17413443308160.0, "grad_norm": 2.684901451113001, "language_loss": 0.86263931, "learning_rate": 1.6365214485062553e-06, "loss": 0.88383818, "num_input_tokens_seen": 204751365, "step": 9504, "time_per_iteration": 2.763122320175171 }, { "auxiliary_loss_clip": 0.01075538, "auxiliary_loss_mlp": 0.01028568, "balance_loss_clip": 1.04011607, "balance_loss_mlp": 1.01565766, "epoch": 0.5714715166090486, "flos": 20193360589440.0, "grad_norm": 1.7486163539852246, "language_loss": 0.75459665, "learning_rate": 1.6361384795205496e-06, "loss": 0.77563769, "num_input_tokens_seen": 204768980, "step": 9505, "time_per_iteration": 4.519685506820679 }, { "auxiliary_loss_clip": 0.0111822, "auxiliary_loss_mlp": 0.0103209, "balance_loss_clip": 1.04235733, "balance_loss_mlp": 1.02002621, "epoch": 0.5715316398617165, "flos": 18551668458240.0, "grad_norm": 1.4826686830874622, "language_loss": 0.81888402, "learning_rate": 1.635755524332509e-06, "loss": 0.84038711, "num_input_tokens_seen": 204788110, "step": 9506, "time_per_iteration": 5.6948935985565186 }, { "auxiliary_loss_clip": 0.01080083, "auxiliary_loss_mlp": 0.00770857, "balance_loss_clip": 1.03905082, "balance_loss_mlp": 1.00010204, "epoch": 0.5715917631143845, "flos": 18478195188480.0, "grad_norm": 1.7330193393772828, "language_loss": 0.77595812, "learning_rate": 1.6353725829566552e-06, "loss": 0.79446745, "num_input_tokens_seen": 204807240, "step": 9507, "time_per_iteration": 2.7299420833587646 }, { "auxiliary_loss_clip": 0.01098783, "auxiliary_loss_mlp": 0.01037694, "balance_loss_clip": 1.04040074, "balance_loss_mlp": 1.02350807, "epoch": 0.5716518863670524, "flos": 24020037037440.0, "grad_norm": 1.9478835056583133, "language_loss": 0.6852861, "learning_rate": 1.63498965540751e-06, "loss": 0.70665085, "num_input_tokens_seen": 204826415, "step": 9508, "time_per_iteration": 2.7023262977600098 }, { "auxiliary_loss_clip": 0.01121987, "auxiliary_loss_mlp": 0.01031189, "balance_loss_clip": 1.04333735, "balance_loss_mlp": 1.01777184, "epoch": 0.5717120096197205, "flos": 17819485626240.0, "grad_norm": 2.087333212498838, "language_loss": 0.80104595, "learning_rate": 1.634606741699593e-06, "loss": 0.82257771, "num_input_tokens_seen": 204844305, "step": 9509, "time_per_iteration": 2.6331591606140137 }, { "auxiliary_loss_clip": 0.01104906, "auxiliary_loss_mlp": 0.01033683, "balance_loss_clip": 1.04276729, "balance_loss_mlp": 1.02071953, "epoch": 0.5717721328723884, "flos": 21866043179520.0, "grad_norm": 1.9468766397229225, "language_loss": 0.71857727, "learning_rate": 1.6342238418474255e-06, "loss": 0.73996317, "num_input_tokens_seen": 204861765, "step": 9510, "time_per_iteration": 2.6763837337493896 }, { "auxiliary_loss_clip": 0.01096671, "auxiliary_loss_mlp": 0.01031456, "balance_loss_clip": 1.04109251, "balance_loss_mlp": 1.01920152, "epoch": 0.5718322561250564, "flos": 28437624126720.0, "grad_norm": 1.5755083758344817, "language_loss": 0.69395983, "learning_rate": 1.6338409558655264e-06, "loss": 0.71524119, "num_input_tokens_seen": 204882505, "step": 9511, "time_per_iteration": 4.320638418197632 }, { "auxiliary_loss_clip": 0.01097735, "auxiliary_loss_mlp": 0.01035503, "balance_loss_clip": 1.04172611, "balance_loss_mlp": 1.02338552, "epoch": 0.5718923793777243, "flos": 13551825905280.0, "grad_norm": 2.0067389560068047, "language_loss": 0.6147874, "learning_rate": 1.6334580837684152e-06, "loss": 0.63611984, "num_input_tokens_seen": 204899830, "step": 9512, "time_per_iteration": 2.759669065475464 }, { "auxiliary_loss_clip": 0.01095927, "auxiliary_loss_mlp": 0.01029716, "balance_loss_clip": 1.04188657, "balance_loss_mlp": 1.01700234, "epoch": 0.5719525026303923, "flos": 17822035491840.0, "grad_norm": 2.401258082797128, "language_loss": 0.76018667, "learning_rate": 1.6330752255706104e-06, "loss": 0.78144312, "num_input_tokens_seen": 204918100, "step": 9513, "time_per_iteration": 2.7117698192596436 }, { "auxiliary_loss_clip": 0.01030995, "auxiliary_loss_mlp": 0.00999994, "balance_loss_clip": 1.01519012, "balance_loss_mlp": 0.99881381, "epoch": 0.5720126258830602, "flos": 61298042814720.0, "grad_norm": 0.8987559536316853, "language_loss": 0.66807652, "learning_rate": 1.6326923812866288e-06, "loss": 0.68838638, "num_input_tokens_seen": 204972925, "step": 9514, "time_per_iteration": 3.1701343059539795 }, { "auxiliary_loss_clip": 0.01114643, "auxiliary_loss_mlp": 0.01042868, "balance_loss_clip": 1.0446943, "balance_loss_mlp": 1.02930832, "epoch": 0.5720727491357283, "flos": 23988040997760.0, "grad_norm": 2.0869347470902704, "language_loss": 0.81355566, "learning_rate": 1.63230955093099e-06, "loss": 0.83513075, "num_input_tokens_seen": 204990910, "step": 9515, "time_per_iteration": 2.668982744216919 }, { "auxiliary_loss_clip": 0.01098965, "auxiliary_loss_mlp": 0.01032673, "balance_loss_clip": 1.04036427, "balance_loss_mlp": 1.01993597, "epoch": 0.5721328723883962, "flos": 23405426398080.0, "grad_norm": 3.1746972716468664, "language_loss": 0.85928082, "learning_rate": 1.6319267345182092e-06, "loss": 0.88059723, "num_input_tokens_seen": 205010500, "step": 9516, "time_per_iteration": 2.6741178035736084 }, { "auxiliary_loss_clip": 0.01083742, "auxiliary_loss_mlp": 0.01031013, "balance_loss_clip": 1.04019785, "balance_loss_mlp": 1.01784658, "epoch": 0.5721929956410642, "flos": 18804910320000.0, "grad_norm": 1.8608727945257042, "language_loss": 0.87884629, "learning_rate": 1.6315439320628038e-06, "loss": 0.8999939, "num_input_tokens_seen": 205028560, "step": 9517, "time_per_iteration": 2.699981451034546 }, { "auxiliary_loss_clip": 0.01066403, "auxiliary_loss_mlp": 0.01031636, "balance_loss_clip": 1.03665698, "balance_loss_mlp": 1.01866579, "epoch": 0.5722531188937322, "flos": 27196659100800.0, "grad_norm": 1.632945668541975, "language_loss": 0.85146403, "learning_rate": 1.6311611435792893e-06, "loss": 0.87244439, "num_input_tokens_seen": 205048650, "step": 9518, "time_per_iteration": 2.8667659759521484 }, { "auxiliary_loss_clip": 0.01104733, "auxiliary_loss_mlp": 0.01033736, "balance_loss_clip": 1.04255366, "balance_loss_mlp": 1.02131414, "epoch": 0.5723132421464001, "flos": 15195672852480.0, "grad_norm": 1.838676422571758, "language_loss": 0.7901606, "learning_rate": 1.6307783690821812e-06, "loss": 0.81154531, "num_input_tokens_seen": 205066480, "step": 9519, "time_per_iteration": 2.593822479248047 }, { "auxiliary_loss_clip": 0.01117664, "auxiliary_loss_mlp": 0.01029991, "balance_loss_clip": 1.04276991, "balance_loss_mlp": 1.01755762, "epoch": 0.5723733653990681, "flos": 27599433281280.0, "grad_norm": 1.4978137038182386, "language_loss": 0.83191645, "learning_rate": 1.6303956085859944e-06, "loss": 0.85339302, "num_input_tokens_seen": 205087475, "step": 9520, "time_per_iteration": 2.664851427078247 }, { "auxiliary_loss_clip": 0.01098568, "auxiliary_loss_mlp": 0.01044625, "balance_loss_clip": 1.04248536, "balance_loss_mlp": 1.03115487, "epoch": 0.572433488651736, "flos": 18222870337920.0, "grad_norm": 2.1952309591015267, "language_loss": 0.72542965, "learning_rate": 1.630012862105243e-06, "loss": 0.74686158, "num_input_tokens_seen": 205106495, "step": 9521, "time_per_iteration": 2.7253611087799072 }, { "auxiliary_loss_clip": 0.011175, "auxiliary_loss_mlp": 0.00769564, "balance_loss_clip": 1.04164016, "balance_loss_mlp": 1.00010264, "epoch": 0.5724936119044041, "flos": 31249106484480.0, "grad_norm": 2.153094973040902, "language_loss": 0.78315163, "learning_rate": 1.6296301296544415e-06, "loss": 0.80202222, "num_input_tokens_seen": 205128285, "step": 9522, "time_per_iteration": 2.6890037059783936 }, { "auxiliary_loss_clip": 0.01088616, "auxiliary_loss_mlp": 0.01034098, "balance_loss_clip": 1.04117084, "balance_loss_mlp": 1.02251649, "epoch": 0.572553735157072, "flos": 19202189719680.0, "grad_norm": 1.511112661891623, "language_loss": 0.71476662, "learning_rate": 1.629247411248102e-06, "loss": 0.73599374, "num_input_tokens_seen": 205146595, "step": 9523, "time_per_iteration": 2.6567182540893555 }, { "auxiliary_loss_clip": 0.01092274, "auxiliary_loss_mlp": 0.01033734, "balance_loss_clip": 1.03921247, "balance_loss_mlp": 1.02187228, "epoch": 0.57261385840974, "flos": 21214911386880.0, "grad_norm": 2.2130630300856207, "language_loss": 0.70017171, "learning_rate": 1.628864706900738e-06, "loss": 0.72143173, "num_input_tokens_seen": 205164295, "step": 9524, "time_per_iteration": 2.700518846511841 }, { "auxiliary_loss_clip": 0.01107505, "auxiliary_loss_mlp": 0.01031795, "balance_loss_clip": 1.04225564, "balance_loss_mlp": 1.01971316, "epoch": 0.5726739816624079, "flos": 33984529793280.0, "grad_norm": 1.461112152817653, "language_loss": 0.65126455, "learning_rate": 1.6284820166268615e-06, "loss": 0.67265761, "num_input_tokens_seen": 205185380, "step": 9525, "time_per_iteration": 2.7389535903930664 }, { "auxiliary_loss_clip": 0.01091158, "auxiliary_loss_mlp": 0.01035018, "balance_loss_clip": 1.03928351, "balance_loss_mlp": 1.023139, "epoch": 0.5727341049150759, "flos": 24275972419200.0, "grad_norm": 1.930578654391071, "language_loss": 0.72484279, "learning_rate": 1.628099340440984e-06, "loss": 0.7461046, "num_input_tokens_seen": 205204895, "step": 9526, "time_per_iteration": 2.702472448348999 }, { "auxiliary_loss_clip": 0.01103623, "auxiliary_loss_mlp": 0.01038123, "balance_loss_clip": 1.03998101, "balance_loss_mlp": 1.02604759, "epoch": 0.5727942281677438, "flos": 28400564269440.0, "grad_norm": 2.0565235980515206, "language_loss": 0.8007257, "learning_rate": 1.6277166783576176e-06, "loss": 0.8221432, "num_input_tokens_seen": 205223440, "step": 9527, "time_per_iteration": 2.7238149642944336 }, { "auxiliary_loss_clip": 0.01101882, "auxiliary_loss_mlp": 0.01036542, "balance_loss_clip": 1.03860235, "balance_loss_mlp": 1.02360809, "epoch": 0.5728543514204119, "flos": 19536769929600.0, "grad_norm": 1.770832454252008, "language_loss": 0.72136271, "learning_rate": 1.6273340303912713e-06, "loss": 0.74274695, "num_input_tokens_seen": 205242800, "step": 9528, "time_per_iteration": 2.593954086303711 }, { "auxiliary_loss_clip": 0.01117957, "auxiliary_loss_mlp": 0.01036459, "balance_loss_clip": 1.04303622, "balance_loss_mlp": 1.02363753, "epoch": 0.5729144746730798, "flos": 21506757390720.0, "grad_norm": 2.0200513223103846, "language_loss": 0.86137569, "learning_rate": 1.6269513965564557e-06, "loss": 0.88291985, "num_input_tokens_seen": 205259465, "step": 9529, "time_per_iteration": 2.6399447917938232 }, { "auxiliary_loss_clip": 0.01022279, "auxiliary_loss_mlp": 0.00999796, "balance_loss_clip": 1.01659954, "balance_loss_mlp": 0.99862826, "epoch": 0.5729745979257478, "flos": 58681628242560.0, "grad_norm": 0.7634342678167043, "language_loss": 0.56170225, "learning_rate": 1.6265687768676813e-06, "loss": 0.58192301, "num_input_tokens_seen": 205314100, "step": 9530, "time_per_iteration": 3.081955671310425 }, { "auxiliary_loss_clip": 0.01096881, "auxiliary_loss_mlp": 0.01030649, "balance_loss_clip": 1.04126835, "balance_loss_mlp": 1.01860929, "epoch": 0.5730347211784158, "flos": 18552099421440.0, "grad_norm": 1.8014631294656338, "language_loss": 0.66785836, "learning_rate": 1.6261861713394553e-06, "loss": 0.6891337, "num_input_tokens_seen": 205333420, "step": 9531, "time_per_iteration": 2.650801658630371 }, { "auxiliary_loss_clip": 0.01102348, "auxiliary_loss_mlp": 0.01042246, "balance_loss_clip": 1.03970659, "balance_loss_mlp": 1.02834046, "epoch": 0.5730948444310837, "flos": 38031482396160.0, "grad_norm": 2.1479743871986314, "language_loss": 0.75923574, "learning_rate": 1.6258035799862876e-06, "loss": 0.78068173, "num_input_tokens_seen": 205350995, "step": 9532, "time_per_iteration": 2.7268972396850586 }, { "auxiliary_loss_clip": 0.01117449, "auxiliary_loss_mlp": 0.01031067, "balance_loss_clip": 1.0426352, "balance_loss_mlp": 1.01828206, "epoch": 0.5731549676837517, "flos": 25227066689280.0, "grad_norm": 1.3324145118640112, "language_loss": 0.78908527, "learning_rate": 1.625421002822686e-06, "loss": 0.81057048, "num_input_tokens_seen": 205372675, "step": 9533, "time_per_iteration": 2.6636223793029785 }, { "auxiliary_loss_clip": 0.01105019, "auxiliary_loss_mlp": 0.01029773, "balance_loss_clip": 1.04237115, "balance_loss_mlp": 1.01806676, "epoch": 0.5732150909364196, "flos": 23368222886400.0, "grad_norm": 1.7921135162563215, "language_loss": 0.85584033, "learning_rate": 1.6250384398631574e-06, "loss": 0.87718827, "num_input_tokens_seen": 205392590, "step": 9534, "time_per_iteration": 2.6173202991485596 }, { "auxiliary_loss_clip": 0.01098044, "auxiliary_loss_mlp": 0.01038668, "balance_loss_clip": 1.0421629, "balance_loss_mlp": 1.02537584, "epoch": 0.5732752141890877, "flos": 23079357711360.0, "grad_norm": 1.8285457434330181, "language_loss": 0.7536543, "learning_rate": 1.6246558911222085e-06, "loss": 0.77502143, "num_input_tokens_seen": 205414885, "step": 9535, "time_per_iteration": 2.6797807216644287 }, { "auxiliary_loss_clip": 0.0110163, "auxiliary_loss_mlp": 0.01032829, "balance_loss_clip": 1.04250264, "balance_loss_mlp": 1.01984715, "epoch": 0.5733353374417556, "flos": 24352282863360.0, "grad_norm": 1.4660219442049842, "language_loss": 0.71041429, "learning_rate": 1.624273356614346e-06, "loss": 0.73175883, "num_input_tokens_seen": 205434440, "step": 9536, "time_per_iteration": 2.6927666664123535 }, { "auxiliary_loss_clip": 0.0107587, "auxiliary_loss_mlp": 0.01034692, "balance_loss_clip": 1.03728056, "balance_loss_mlp": 1.02034533, "epoch": 0.5733954606944236, "flos": 27198849830400.0, "grad_norm": 1.9779932456354445, "language_loss": 0.69794559, "learning_rate": 1.6238908363540755e-06, "loss": 0.71905118, "num_input_tokens_seen": 205454225, "step": 9537, "time_per_iteration": 2.758420944213867 }, { "auxiliary_loss_clip": 0.01119262, "auxiliary_loss_mlp": 0.01036385, "balance_loss_clip": 1.04359508, "balance_loss_mlp": 1.02364206, "epoch": 0.5734555839470915, "flos": 28765129357440.0, "grad_norm": 1.8277858348507134, "language_loss": 0.62517941, "learning_rate": 1.623508330355902e-06, "loss": 0.64673591, "num_input_tokens_seen": 205474750, "step": 9538, "time_per_iteration": 2.6978628635406494 }, { "auxiliary_loss_clip": 0.01105121, "auxiliary_loss_mlp": 0.0103457, "balance_loss_clip": 1.04219174, "balance_loss_mlp": 1.02135563, "epoch": 0.5735157071997595, "flos": 22966813422720.0, "grad_norm": 1.6582870130678489, "language_loss": 0.83564949, "learning_rate": 1.6231258386343306e-06, "loss": 0.85704643, "num_input_tokens_seen": 205495495, "step": 9539, "time_per_iteration": 2.7695393562316895 }, { "auxiliary_loss_clip": 0.01086088, "auxiliary_loss_mlp": 0.01038955, "balance_loss_clip": 1.04798675, "balance_loss_mlp": 1.02566326, "epoch": 0.5735758304524274, "flos": 18989455420800.0, "grad_norm": 2.207302017109072, "language_loss": 0.73048598, "learning_rate": 1.6227433612038647e-06, "loss": 0.75173634, "num_input_tokens_seen": 205510070, "step": 9540, "time_per_iteration": 2.760653018951416 }, { "auxiliary_loss_clip": 0.01101303, "auxiliary_loss_mlp": 0.00769854, "balance_loss_clip": 1.03920221, "balance_loss_mlp": 1.00004601, "epoch": 0.5736359537050955, "flos": 28397942576640.0, "grad_norm": 2.4125489920069074, "language_loss": 0.79765099, "learning_rate": 1.6223608980790089e-06, "loss": 0.81636256, "num_input_tokens_seen": 205530190, "step": 9541, "time_per_iteration": 2.789978504180908 }, { "auxiliary_loss_clip": 0.01096764, "auxiliary_loss_mlp": 0.01033683, "balance_loss_clip": 1.040447, "balance_loss_mlp": 1.02054572, "epoch": 0.5736960769577634, "flos": 15627210848640.0, "grad_norm": 2.579963788523863, "language_loss": 0.6497947, "learning_rate": 1.6219784492742654e-06, "loss": 0.67109919, "num_input_tokens_seen": 205547380, "step": 9542, "time_per_iteration": 2.684465169906616 }, { "auxiliary_loss_clip": 0.01094703, "auxiliary_loss_mlp": 0.01032355, "balance_loss_clip": 1.03985989, "balance_loss_mlp": 1.01992106, "epoch": 0.5737562002104314, "flos": 18003994813440.0, "grad_norm": 2.1591412151518625, "language_loss": 0.82844281, "learning_rate": 1.6215960148041365e-06, "loss": 0.84971344, "num_input_tokens_seen": 205566540, "step": 9543, "time_per_iteration": 2.724700450897217 }, { "auxiliary_loss_clip": 0.01078135, "auxiliary_loss_mlp": 0.01034179, "balance_loss_clip": 1.03842759, "balance_loss_mlp": 1.01990938, "epoch": 0.5738163234630994, "flos": 20698192287360.0, "grad_norm": 2.0892075264702616, "language_loss": 0.73500836, "learning_rate": 1.6212135946831257e-06, "loss": 0.75613153, "num_input_tokens_seen": 205584200, "step": 9544, "time_per_iteration": 2.7072341442108154 }, { "auxiliary_loss_clip": 0.01063343, "auxiliary_loss_mlp": 0.01034841, "balance_loss_clip": 1.03527069, "balance_loss_mlp": 1.02173972, "epoch": 0.5738764467157673, "flos": 23149311448320.0, "grad_norm": 1.791719724630014, "language_loss": 0.76021409, "learning_rate": 1.620831188925733e-06, "loss": 0.78119594, "num_input_tokens_seen": 205604675, "step": 9545, "time_per_iteration": 4.402756690979004 }, { "auxiliary_loss_clip": 0.0109842, "auxiliary_loss_mlp": 0.0103679, "balance_loss_clip": 1.04495752, "balance_loss_mlp": 1.02345061, "epoch": 0.5739365699684353, "flos": 29492930730240.0, "grad_norm": 1.94712066693327, "language_loss": 0.56656086, "learning_rate": 1.620448797546459e-06, "loss": 0.58791304, "num_input_tokens_seen": 205624680, "step": 9546, "time_per_iteration": 6.025787115097046 }, { "auxiliary_loss_clip": 0.01091236, "auxiliary_loss_mlp": 0.01033391, "balance_loss_clip": 1.03923881, "balance_loss_mlp": 1.02023625, "epoch": 0.5739966932211032, "flos": 14027247342720.0, "grad_norm": 2.369322585416499, "language_loss": 0.7595309, "learning_rate": 1.6200664205598055e-06, "loss": 0.78077716, "num_input_tokens_seen": 205641950, "step": 9547, "time_per_iteration": 2.71240496635437 }, { "auxiliary_loss_clip": 0.01104111, "auxiliary_loss_mlp": 0.01030548, "balance_loss_clip": 1.03877449, "balance_loss_mlp": 1.01709485, "epoch": 0.5740568164737713, "flos": 19062030850560.0, "grad_norm": 5.307379698295213, "language_loss": 0.74525601, "learning_rate": 1.6196840579802704e-06, "loss": 0.76660264, "num_input_tokens_seen": 205660130, "step": 9548, "time_per_iteration": 2.651829957962036 }, { "auxiliary_loss_clip": 0.01085909, "auxiliary_loss_mlp": 0.0103587, "balance_loss_clip": 1.03760338, "balance_loss_mlp": 1.02268577, "epoch": 0.5741169397264392, "flos": 22127832478080.0, "grad_norm": 4.02154100378115, "language_loss": 0.69476151, "learning_rate": 1.619301709822355e-06, "loss": 0.71597928, "num_input_tokens_seen": 205678895, "step": 9549, "time_per_iteration": 2.7304623126983643 }, { "auxiliary_loss_clip": 0.01068231, "auxiliary_loss_mlp": 0.01031011, "balance_loss_clip": 1.04319942, "balance_loss_mlp": 1.01907182, "epoch": 0.5741770629791072, "flos": 24936836797440.0, "grad_norm": 1.4366767261825364, "language_loss": 0.79742229, "learning_rate": 1.6189193761005564e-06, "loss": 0.81841469, "num_input_tokens_seen": 205698450, "step": 9550, "time_per_iteration": 2.759152889251709 }, { "auxiliary_loss_clip": 0.01091678, "auxiliary_loss_mlp": 0.01036065, "balance_loss_clip": 1.04081261, "balance_loss_mlp": 1.0213902, "epoch": 0.5742371862317751, "flos": 18801462614400.0, "grad_norm": 1.889418417446442, "language_loss": 0.67791235, "learning_rate": 1.6185370568293727e-06, "loss": 0.69918978, "num_input_tokens_seen": 205714870, "step": 9551, "time_per_iteration": 4.226199150085449 }, { "auxiliary_loss_clip": 0.01082087, "auxiliary_loss_mlp": 0.0103572, "balance_loss_clip": 1.04173434, "balance_loss_mlp": 1.02287543, "epoch": 0.5742973094844431, "flos": 24460661174400.0, "grad_norm": 2.3194402923297157, "language_loss": 0.7223655, "learning_rate": 1.6181547520233031e-06, "loss": 0.74354362, "num_input_tokens_seen": 205736045, "step": 9552, "time_per_iteration": 2.736600160598755 }, { "auxiliary_loss_clip": 0.01103832, "auxiliary_loss_mlp": 0.01033342, "balance_loss_clip": 1.04454732, "balance_loss_mlp": 1.02040219, "epoch": 0.574357432737111, "flos": 21652770176640.0, "grad_norm": 2.128940953023755, "language_loss": 0.79823256, "learning_rate": 1.617772461696843e-06, "loss": 0.81960428, "num_input_tokens_seen": 205754445, "step": 9553, "time_per_iteration": 2.6895127296447754 }, { "auxiliary_loss_clip": 0.01111471, "auxiliary_loss_mlp": 0.01032858, "balance_loss_clip": 1.04313147, "balance_loss_mlp": 1.02050185, "epoch": 0.5744175559897791, "flos": 16544728880640.0, "grad_norm": 1.880148698667659, "language_loss": 0.8353495, "learning_rate": 1.6173901858644895e-06, "loss": 0.85679281, "num_input_tokens_seen": 205770595, "step": 9554, "time_per_iteration": 2.615577220916748 }, { "auxiliary_loss_clip": 0.01115074, "auxiliary_loss_mlp": 0.0077091, "balance_loss_clip": 1.04545319, "balance_loss_mlp": 1.0001241, "epoch": 0.574477679242447, "flos": 24207598880640.0, "grad_norm": 1.4793540146055872, "language_loss": 0.71076667, "learning_rate": 1.6170079245407385e-06, "loss": 0.72962654, "num_input_tokens_seen": 205791935, "step": 9555, "time_per_iteration": 2.7411417961120605 }, { "auxiliary_loss_clip": 0.01093974, "auxiliary_loss_mlp": 0.0103121, "balance_loss_clip": 1.04077876, "balance_loss_mlp": 1.01763785, "epoch": 0.574537802495115, "flos": 14903000835840.0, "grad_norm": 2.2805548015379755, "language_loss": 0.72663784, "learning_rate": 1.6166256777400853e-06, "loss": 0.7478897, "num_input_tokens_seen": 205807260, "step": 9556, "time_per_iteration": 2.6720690727233887 }, { "auxiliary_loss_clip": 0.01111378, "auxiliary_loss_mlp": 0.01033136, "balance_loss_clip": 1.04576373, "balance_loss_mlp": 1.02015448, "epoch": 0.5745979257477829, "flos": 24934969290240.0, "grad_norm": 1.744837604754053, "language_loss": 0.74087226, "learning_rate": 1.6162434454770248e-06, "loss": 0.76231742, "num_input_tokens_seen": 205826885, "step": 9557, "time_per_iteration": 2.7899231910705566 }, { "auxiliary_loss_clip": 0.01108542, "auxiliary_loss_mlp": 0.01034037, "balance_loss_clip": 1.04274464, "balance_loss_mlp": 1.02157927, "epoch": 0.5746580490004509, "flos": 17235757704960.0, "grad_norm": 1.5016834383596844, "language_loss": 0.67902005, "learning_rate": 1.6158612277660514e-06, "loss": 0.70044577, "num_input_tokens_seen": 205844630, "step": 9558, "time_per_iteration": 2.762430429458618 }, { "auxiliary_loss_clip": 0.01094279, "auxiliary_loss_mlp": 0.01052047, "balance_loss_clip": 1.04277229, "balance_loss_mlp": 1.03471398, "epoch": 0.5747181722531189, "flos": 13187871348480.0, "grad_norm": 2.4192829019987148, "language_loss": 0.72013688, "learning_rate": 1.615479024621659e-06, "loss": 0.74160016, "num_input_tokens_seen": 205860960, "step": 9559, "time_per_iteration": 2.757319688796997 }, { "auxiliary_loss_clip": 0.01097547, "auxiliary_loss_mlp": 0.00769026, "balance_loss_clip": 1.04342794, "balance_loss_mlp": 1.00012159, "epoch": 0.5747782955057869, "flos": 22963006581120.0, "grad_norm": 1.6274858947785595, "language_loss": 0.78883743, "learning_rate": 1.6150968360583398e-06, "loss": 0.8075031, "num_input_tokens_seen": 205880675, "step": 9560, "time_per_iteration": 2.746260166168213 }, { "auxiliary_loss_clip": 0.01052934, "auxiliary_loss_mlp": 0.01029841, "balance_loss_clip": 1.03918111, "balance_loss_mlp": 1.0164957, "epoch": 0.5748384187584549, "flos": 23403235668480.0, "grad_norm": 2.1977539095196903, "language_loss": 0.64321613, "learning_rate": 1.614714662090588e-06, "loss": 0.6640439, "num_input_tokens_seen": 205900050, "step": 9561, "time_per_iteration": 2.8124732971191406 }, { "auxiliary_loss_clip": 0.01116845, "auxiliary_loss_mlp": 0.01039625, "balance_loss_clip": 1.04539895, "balance_loss_mlp": 1.02567124, "epoch": 0.5748985420111228, "flos": 17785514338560.0, "grad_norm": 2.0210299953328414, "language_loss": 0.7193495, "learning_rate": 1.6143325027328945e-06, "loss": 0.74091417, "num_input_tokens_seen": 205918855, "step": 9562, "time_per_iteration": 2.7868704795837402 }, { "auxiliary_loss_clip": 0.01067199, "auxiliary_loss_mlp": 0.01032486, "balance_loss_clip": 1.03979492, "balance_loss_mlp": 1.02039778, "epoch": 0.5749586652637908, "flos": 19866250408320.0, "grad_norm": 1.4806264841650407, "language_loss": 0.84100068, "learning_rate": 1.613950357999751e-06, "loss": 0.86199754, "num_input_tokens_seen": 205936970, "step": 9563, "time_per_iteration": 2.7772703170776367 }, { "auxiliary_loss_clip": 0.01073481, "auxiliary_loss_mlp": 0.01039774, "balance_loss_clip": 1.0434773, "balance_loss_mlp": 1.02635074, "epoch": 0.5750187885164587, "flos": 21287235421440.0, "grad_norm": 2.0689431633426802, "language_loss": 0.5717746, "learning_rate": 1.6135682279056488e-06, "loss": 0.59290713, "num_input_tokens_seen": 205954630, "step": 9564, "time_per_iteration": 2.8411808013916016 }, { "auxiliary_loss_clip": 0.01092301, "auxiliary_loss_mlp": 0.01036175, "balance_loss_clip": 1.04144359, "balance_loss_mlp": 1.0226326, "epoch": 0.5750789117691267, "flos": 18804658924800.0, "grad_norm": 1.7191674250507119, "language_loss": 0.76114881, "learning_rate": 1.613186112465078e-06, "loss": 0.78243363, "num_input_tokens_seen": 205971510, "step": 9565, "time_per_iteration": 2.822044610977173 }, { "auxiliary_loss_clip": 0.01002918, "auxiliary_loss_mlp": 0.01012299, "balance_loss_clip": 1.01532471, "balance_loss_mlp": 1.01098824, "epoch": 0.5751390350217946, "flos": 70663224124800.0, "grad_norm": 0.74248986424084, "language_loss": 0.60725588, "learning_rate": 1.6128040116925287e-06, "loss": 0.62740809, "num_input_tokens_seen": 206035125, "step": 9566, "time_per_iteration": 3.427154064178467 }, { "auxiliary_loss_clip": 0.01093716, "auxiliary_loss_mlp": 0.0103477, "balance_loss_clip": 1.04347396, "balance_loss_mlp": 1.02224672, "epoch": 0.5751991582744627, "flos": 14246338348800.0, "grad_norm": 2.3384715191144214, "language_loss": 0.75378191, "learning_rate": 1.6124219256024901e-06, "loss": 0.77506685, "num_input_tokens_seen": 206052075, "step": 9567, "time_per_iteration": 2.8895022869110107 }, { "auxiliary_loss_clip": 0.0110852, "auxiliary_loss_mlp": 0.0103381, "balance_loss_clip": 1.04461062, "balance_loss_mlp": 1.02136469, "epoch": 0.5752592815271306, "flos": 18328160079360.0, "grad_norm": 1.398692478003959, "language_loss": 0.74487442, "learning_rate": 1.6120398542094504e-06, "loss": 0.7662977, "num_input_tokens_seen": 206069970, "step": 9568, "time_per_iteration": 2.745008945465088 }, { "auxiliary_loss_clip": 0.01122376, "auxiliary_loss_mlp": 0.01031079, "balance_loss_clip": 1.04557085, "balance_loss_mlp": 1.01852036, "epoch": 0.5753194047797986, "flos": 20922742160640.0, "grad_norm": 1.8288224744161317, "language_loss": 0.71572077, "learning_rate": 1.6116577975278994e-06, "loss": 0.73725533, "num_input_tokens_seen": 206088950, "step": 9569, "time_per_iteration": 2.9613218307495117 }, { "auxiliary_loss_clip": 0.01113684, "auxiliary_loss_mlp": 0.01037553, "balance_loss_clip": 1.04693925, "balance_loss_mlp": 1.02399325, "epoch": 0.5753795280324665, "flos": 19281804215040.0, "grad_norm": 2.1991270484780916, "language_loss": 0.55975366, "learning_rate": 1.6112757555723223e-06, "loss": 0.58126599, "num_input_tokens_seen": 206107780, "step": 9570, "time_per_iteration": 2.6928811073303223 }, { "auxiliary_loss_clip": 0.01118829, "auxiliary_loss_mlp": 0.01034724, "balance_loss_clip": 1.04458117, "balance_loss_mlp": 1.02252328, "epoch": 0.5754396512851345, "flos": 21652877917440.0, "grad_norm": 1.4030574698632734, "language_loss": 0.64338309, "learning_rate": 1.6108937283572082e-06, "loss": 0.66491854, "num_input_tokens_seen": 206127445, "step": 9571, "time_per_iteration": 2.635603427886963 }, { "auxiliary_loss_clip": 0.01111717, "auxiliary_loss_mlp": 0.01031618, "balance_loss_clip": 1.04484558, "balance_loss_mlp": 1.01890385, "epoch": 0.5754997745378025, "flos": 51021700179840.0, "grad_norm": 1.5230879857727748, "language_loss": 0.67137802, "learning_rate": 1.6105117158970434e-06, "loss": 0.69281137, "num_input_tokens_seen": 206152005, "step": 9572, "time_per_iteration": 2.9080519676208496 }, { "auxiliary_loss_clip": 0.01101219, "auxiliary_loss_mlp": 0.01032315, "balance_loss_clip": 1.04746473, "balance_loss_mlp": 1.01870155, "epoch": 0.5755598977904705, "flos": 22856890826880.0, "grad_norm": 1.7883651828614429, "language_loss": 0.72390687, "learning_rate": 1.6101297182063123e-06, "loss": 0.74524224, "num_input_tokens_seen": 206169875, "step": 9573, "time_per_iteration": 2.815703868865967 }, { "auxiliary_loss_clip": 0.01118198, "auxiliary_loss_mlp": 0.01031966, "balance_loss_clip": 1.04730046, "balance_loss_mlp": 1.0202539, "epoch": 0.5756200210431385, "flos": 38472824805120.0, "grad_norm": 1.8637575754568128, "language_loss": 0.76394922, "learning_rate": 1.6097477352995022e-06, "loss": 0.78545088, "num_input_tokens_seen": 206192635, "step": 9574, "time_per_iteration": 2.778196096420288 }, { "auxiliary_loss_clip": 0.01068081, "auxiliary_loss_mlp": 0.01036908, "balance_loss_clip": 1.03836775, "balance_loss_mlp": 1.02201867, "epoch": 0.5756801442958064, "flos": 23910006700800.0, "grad_norm": 2.572143968399992, "language_loss": 0.66373074, "learning_rate": 1.6093657671910968e-06, "loss": 0.68478066, "num_input_tokens_seen": 206211485, "step": 9575, "time_per_iteration": 2.780195951461792 }, { "auxiliary_loss_clip": 0.01097887, "auxiliary_loss_mlp": 0.01031317, "balance_loss_clip": 1.04497039, "balance_loss_mlp": 1.01917517, "epoch": 0.5757402675484744, "flos": 21105276099840.0, "grad_norm": 1.5189421087528554, "language_loss": 0.79787755, "learning_rate": 1.6089838138955804e-06, "loss": 0.81916952, "num_input_tokens_seen": 206231740, "step": 9576, "time_per_iteration": 2.7809135913848877 }, { "auxiliary_loss_clip": 0.01096091, "auxiliary_loss_mlp": 0.0102674, "balance_loss_clip": 1.0435828, "balance_loss_mlp": 1.01512265, "epoch": 0.5758003908011423, "flos": 20559110826240.0, "grad_norm": 1.7619408585744085, "language_loss": 0.69726396, "learning_rate": 1.6086018754274372e-06, "loss": 0.71849227, "num_input_tokens_seen": 206250975, "step": 9577, "time_per_iteration": 2.732150077819824 }, { "auxiliary_loss_clip": 0.01111358, "auxiliary_loss_mlp": 0.01035186, "balance_loss_clip": 1.04446626, "balance_loss_mlp": 1.02306843, "epoch": 0.5758605140538103, "flos": 16473015377280.0, "grad_norm": 2.216832845639703, "language_loss": 0.66558278, "learning_rate": 1.6082199518011504e-06, "loss": 0.6870482, "num_input_tokens_seen": 206268800, "step": 9578, "time_per_iteration": 2.639571189880371 }, { "auxiliary_loss_clip": 0.01091288, "auxiliary_loss_mlp": 0.01032209, "balance_loss_clip": 1.04414392, "balance_loss_mlp": 1.01997256, "epoch": 0.5759206373064782, "flos": 21287558643840.0, "grad_norm": 1.7735647320590846, "language_loss": 0.72313404, "learning_rate": 1.6078380430312016e-06, "loss": 0.74436903, "num_input_tokens_seen": 206287190, "step": 9579, "time_per_iteration": 2.6910343170166016 }, { "auxiliary_loss_clip": 0.0110168, "auxiliary_loss_mlp": 0.01035785, "balance_loss_clip": 1.04436874, "balance_loss_mlp": 1.02170634, "epoch": 0.5759807605591463, "flos": 26067879227520.0, "grad_norm": 4.803146579630836, "language_loss": 0.65395081, "learning_rate": 1.6074561491320742e-06, "loss": 0.67532551, "num_input_tokens_seen": 206307020, "step": 9580, "time_per_iteration": 2.7227509021759033 }, { "auxiliary_loss_clip": 0.01092842, "auxiliary_loss_mlp": 0.01034767, "balance_loss_clip": 1.04106581, "balance_loss_mlp": 1.0212729, "epoch": 0.5760408838118142, "flos": 18873068376960.0, "grad_norm": 1.9154940218320493, "language_loss": 0.85214174, "learning_rate": 1.6070742701182486e-06, "loss": 0.87341785, "num_input_tokens_seen": 206324095, "step": 9581, "time_per_iteration": 2.699432849884033 }, { "auxiliary_loss_clip": 0.0113104, "auxiliary_loss_mlp": 0.01036775, "balance_loss_clip": 1.05060983, "balance_loss_mlp": 1.02360821, "epoch": 0.5761010070644822, "flos": 15378134964480.0, "grad_norm": 2.109676550381332, "language_loss": 0.67354548, "learning_rate": 1.6066924060042057e-06, "loss": 0.69522369, "num_input_tokens_seen": 206343210, "step": 9582, "time_per_iteration": 2.6381587982177734 }, { "auxiliary_loss_clip": 0.01026383, "auxiliary_loss_mlp": 0.01001724, "balance_loss_clip": 1.01951599, "balance_loss_mlp": 1.00040722, "epoch": 0.5761611303171501, "flos": 71471932882560.0, "grad_norm": 0.6463341323488921, "language_loss": 0.57134479, "learning_rate": 1.6063105568044271e-06, "loss": 0.59162581, "num_input_tokens_seen": 206415935, "step": 9583, "time_per_iteration": 3.52109694480896 }, { "auxiliary_loss_clip": 0.01090801, "auxiliary_loss_mlp": 0.01030991, "balance_loss_clip": 1.04208195, "balance_loss_mlp": 1.01818216, "epoch": 0.5762212535698181, "flos": 16246167033600.0, "grad_norm": 1.791358766979404, "language_loss": 0.82729411, "learning_rate": 1.6059287225333912e-06, "loss": 0.84851205, "num_input_tokens_seen": 206431900, "step": 9584, "time_per_iteration": 2.7258176803588867 }, { "auxiliary_loss_clip": 0.0104221, "auxiliary_loss_mlp": 0.01002028, "balance_loss_clip": 1.0174526, "balance_loss_mlp": 1.00080013, "epoch": 0.5762813768224861, "flos": 70185504216960.0, "grad_norm": 0.623568426409687, "language_loss": 0.49559212, "learning_rate": 1.6055469032055773e-06, "loss": 0.51603448, "num_input_tokens_seen": 206501200, "step": 9585, "time_per_iteration": 7.823396682739258 }, { "auxiliary_loss_clip": 0.01092491, "auxiliary_loss_mlp": 0.01027016, "balance_loss_clip": 1.04217815, "balance_loss_mlp": 1.01516044, "epoch": 0.5763415000751541, "flos": 20518028645760.0, "grad_norm": 1.574762209284147, "language_loss": 0.85150623, "learning_rate": 1.605165098835465e-06, "loss": 0.87270141, "num_input_tokens_seen": 206520575, "step": 9586, "time_per_iteration": 2.6869027614593506 }, { "auxiliary_loss_clip": 0.0110803, "auxiliary_loss_mlp": 0.01034855, "balance_loss_clip": 1.04531455, "balance_loss_mlp": 1.02091956, "epoch": 0.5764016233278221, "flos": 15815526877440.0, "grad_norm": 2.1680790738732796, "language_loss": 0.80101568, "learning_rate": 1.6047833094375308e-06, "loss": 0.8224445, "num_input_tokens_seen": 206538060, "step": 9587, "time_per_iteration": 2.664121627807617 }, { "auxiliary_loss_clip": 0.01091421, "auxiliary_loss_mlp": 0.01037732, "balance_loss_clip": 1.04280019, "balance_loss_mlp": 1.02400517, "epoch": 0.57646174658049, "flos": 20772312001920.0, "grad_norm": 1.6197519148440016, "language_loss": 0.66023791, "learning_rate": 1.6044015350262542e-06, "loss": 0.68152946, "num_input_tokens_seen": 206557320, "step": 9588, "time_per_iteration": 2.6596546173095703 }, { "auxiliary_loss_clip": 0.01095166, "auxiliary_loss_mlp": 0.01039726, "balance_loss_clip": 1.04326534, "balance_loss_mlp": 1.02583766, "epoch": 0.576521869833158, "flos": 23549930812800.0, "grad_norm": 2.4954533064787383, "language_loss": 0.78688884, "learning_rate": 1.6040197756161104e-06, "loss": 0.80823773, "num_input_tokens_seen": 206575780, "step": 9589, "time_per_iteration": 2.799503803253174 }, { "auxiliary_loss_clip": 0.01114482, "auxiliary_loss_mlp": 0.01025254, "balance_loss_clip": 1.041682, "balance_loss_mlp": 1.01353538, "epoch": 0.5765819930858259, "flos": 20266582464000.0, "grad_norm": 2.2193599120856304, "language_loss": 0.79450285, "learning_rate": 1.6036380312215762e-06, "loss": 0.81590021, "num_input_tokens_seen": 206594100, "step": 9590, "time_per_iteration": 4.355879545211792 }, { "auxiliary_loss_clip": 0.01052935, "auxiliary_loss_mlp": 0.00769289, "balance_loss_clip": 1.03650951, "balance_loss_mlp": 1.00013447, "epoch": 0.5766421163384939, "flos": 23148772744320.0, "grad_norm": 1.8083193654510727, "language_loss": 0.63346255, "learning_rate": 1.6032563018571283e-06, "loss": 0.65168482, "num_input_tokens_seen": 206613325, "step": 9591, "time_per_iteration": 2.8449039459228516 }, { "auxiliary_loss_clip": 0.01122211, "auxiliary_loss_mlp": 0.00769941, "balance_loss_clip": 1.04640627, "balance_loss_mlp": 1.00013709, "epoch": 0.5767022395911618, "flos": 25848895962240.0, "grad_norm": 2.331025746602298, "language_loss": 0.78112143, "learning_rate": 1.6028745875372406e-06, "loss": 0.80004299, "num_input_tokens_seen": 206634265, "step": 9592, "time_per_iteration": 2.7304346561431885 }, { "auxiliary_loss_clip": 0.01004052, "auxiliary_loss_mlp": 0.01021446, "balance_loss_clip": 1.02547979, "balance_loss_mlp": 1.01965749, "epoch": 0.5767623628438299, "flos": 68293299657600.0, "grad_norm": 0.7436002967471621, "language_loss": 0.59609032, "learning_rate": 1.6024928882763885e-06, "loss": 0.61634529, "num_input_tokens_seen": 206696990, "step": 9593, "time_per_iteration": 3.461658477783203 }, { "auxiliary_loss_clip": 0.01110844, "auxiliary_loss_mlp": 0.01041399, "balance_loss_clip": 1.042449, "balance_loss_mlp": 1.02810097, "epoch": 0.5768224860964978, "flos": 30188448754560.0, "grad_norm": 1.9449888897854992, "language_loss": 0.71144432, "learning_rate": 1.6021112040890463e-06, "loss": 0.73296678, "num_input_tokens_seen": 206717815, "step": 9594, "time_per_iteration": 2.8465657234191895 }, { "auxiliary_loss_clip": 0.01085879, "auxiliary_loss_mlp": 0.01033309, "balance_loss_clip": 1.04293251, "balance_loss_mlp": 1.02196598, "epoch": 0.5768826093491658, "flos": 17895041884800.0, "grad_norm": 2.485745999068748, "language_loss": 0.70693135, "learning_rate": 1.6017295349896863e-06, "loss": 0.72812331, "num_input_tokens_seen": 206735985, "step": 9595, "time_per_iteration": 2.724013566970825 }, { "auxiliary_loss_clip": 0.01120342, "auxiliary_loss_mlp": 0.01030885, "balance_loss_clip": 1.04522467, "balance_loss_mlp": 1.01821947, "epoch": 0.5769427326018337, "flos": 17457183095040.0, "grad_norm": 2.28937358102888, "language_loss": 0.69969249, "learning_rate": 1.6013478809927828e-06, "loss": 0.72120476, "num_input_tokens_seen": 206753370, "step": 9596, "time_per_iteration": 2.602410316467285 }, { "auxiliary_loss_clip": 0.01097835, "auxiliary_loss_mlp": 0.01033862, "balance_loss_clip": 1.04560232, "balance_loss_mlp": 1.01944959, "epoch": 0.5770028558545017, "flos": 39421728345600.0, "grad_norm": 1.7463690567151626, "language_loss": 0.67612261, "learning_rate": 1.6009662421128074e-06, "loss": 0.69743955, "num_input_tokens_seen": 206777645, "step": 9597, "time_per_iteration": 2.9427249431610107 }, { "auxiliary_loss_clip": 0.01096299, "auxiliary_loss_mlp": 0.01033961, "balance_loss_clip": 1.04274464, "balance_loss_mlp": 1.02137804, "epoch": 0.5770629791071697, "flos": 21536383132800.0, "grad_norm": 1.8692422611288704, "language_loss": 0.81584179, "learning_rate": 1.6005846183642323e-06, "loss": 0.83714437, "num_input_tokens_seen": 206794865, "step": 9598, "time_per_iteration": 2.748018503189087 }, { "auxiliary_loss_clip": 0.01073806, "auxiliary_loss_mlp": 0.01042323, "balance_loss_clip": 1.03563309, "balance_loss_mlp": 1.0270164, "epoch": 0.5771231023598377, "flos": 20886795624960.0, "grad_norm": 1.6175391320992503, "language_loss": 0.7306143, "learning_rate": 1.6002030097615277e-06, "loss": 0.7517755, "num_input_tokens_seen": 206814095, "step": 9599, "time_per_iteration": 2.7712650299072266 }, { "auxiliary_loss_clip": 0.01115679, "auxiliary_loss_mlp": 0.01033218, "balance_loss_clip": 1.04342914, "balance_loss_mlp": 1.0211184, "epoch": 0.5771832256125057, "flos": 18077216688000.0, "grad_norm": 3.919070780783451, "language_loss": 0.78193593, "learning_rate": 1.5998214163191663e-06, "loss": 0.80342484, "num_input_tokens_seen": 206832245, "step": 9600, "time_per_iteration": 2.6597604751586914 }, { "auxiliary_loss_clip": 0.01113425, "auxiliary_loss_mlp": 0.0077084, "balance_loss_clip": 1.04604816, "balance_loss_mlp": 1.00016284, "epoch": 0.5772433488651736, "flos": 26359078786560.0, "grad_norm": 1.665079650983798, "language_loss": 0.72689855, "learning_rate": 1.5994398380516163e-06, "loss": 0.74574125, "num_input_tokens_seen": 206851535, "step": 9601, "time_per_iteration": 2.7263121604919434 }, { "auxiliary_loss_clip": 0.01064473, "auxiliary_loss_mlp": 0.01036032, "balance_loss_clip": 1.04480124, "balance_loss_mlp": 1.02311611, "epoch": 0.5773034721178416, "flos": 19680987035520.0, "grad_norm": 2.0948856363437534, "language_loss": 0.68606448, "learning_rate": 1.599058274973348e-06, "loss": 0.70706952, "num_input_tokens_seen": 206870595, "step": 9602, "time_per_iteration": 2.8572375774383545 }, { "auxiliary_loss_clip": 0.01088049, "auxiliary_loss_mlp": 0.01035522, "balance_loss_clip": 1.03997481, "balance_loss_mlp": 1.02274275, "epoch": 0.5773635953705095, "flos": 25082885496960.0, "grad_norm": 1.4139424352201144, "language_loss": 0.73376763, "learning_rate": 1.5986767270988297e-06, "loss": 0.75500333, "num_input_tokens_seen": 206892320, "step": 9603, "time_per_iteration": 2.816098928451538 }, { "auxiliary_loss_clip": 0.01108536, "auxiliary_loss_mlp": 0.01029532, "balance_loss_clip": 1.0450983, "balance_loss_mlp": 1.01732492, "epoch": 0.5774237186231775, "flos": 21032987978880.0, "grad_norm": 1.7349679186761677, "language_loss": 0.76407522, "learning_rate": 1.5982951944425298e-06, "loss": 0.78545588, "num_input_tokens_seen": 206912485, "step": 9604, "time_per_iteration": 2.718163013458252 }, { "auxiliary_loss_clip": 0.01086662, "auxiliary_loss_mlp": 0.0103562, "balance_loss_clip": 1.04304457, "balance_loss_mlp": 1.02200651, "epoch": 0.5774838418758454, "flos": 15231727128960.0, "grad_norm": 2.5247859182247026, "language_loss": 0.83387136, "learning_rate": 1.5979136770189174e-06, "loss": 0.85509419, "num_input_tokens_seen": 206929100, "step": 9605, "time_per_iteration": 2.8076066970825195 }, { "auxiliary_loss_clip": 0.01096142, "auxiliary_loss_mlp": 0.01031724, "balance_loss_clip": 1.04626584, "balance_loss_mlp": 1.01667333, "epoch": 0.5775439651285135, "flos": 23582609210880.0, "grad_norm": 1.8595500746131972, "language_loss": 0.77926147, "learning_rate": 1.5975321748424581e-06, "loss": 0.80054009, "num_input_tokens_seen": 206947020, "step": 9606, "time_per_iteration": 2.7766621112823486 }, { "auxiliary_loss_clip": 0.01117345, "auxiliary_loss_mlp": 0.01035757, "balance_loss_clip": 1.04331446, "balance_loss_mlp": 1.02362752, "epoch": 0.5776040883811814, "flos": 18040515966720.0, "grad_norm": 1.672602422897938, "language_loss": 0.73896575, "learning_rate": 1.597150687927619e-06, "loss": 0.76049674, "num_input_tokens_seen": 206964065, "step": 9607, "time_per_iteration": 2.6057968139648438 }, { "auxiliary_loss_clip": 0.01076534, "auxiliary_loss_mlp": 0.01034666, "balance_loss_clip": 1.04220486, "balance_loss_mlp": 1.02155876, "epoch": 0.5776642116338494, "flos": 18624638937600.0, "grad_norm": 1.6326461875987317, "language_loss": 0.69385672, "learning_rate": 1.5967692162888664e-06, "loss": 0.71496868, "num_input_tokens_seen": 206981940, "step": 9608, "time_per_iteration": 2.784708023071289 }, { "auxiliary_loss_clip": 0.01084539, "auxiliary_loss_mlp": 0.01033053, "balance_loss_clip": 1.03977787, "balance_loss_mlp": 1.01979709, "epoch": 0.5777243348865173, "flos": 28402539517440.0, "grad_norm": 1.6850838728782904, "language_loss": 0.76766187, "learning_rate": 1.596387759940665e-06, "loss": 0.78883779, "num_input_tokens_seen": 207002365, "step": 9609, "time_per_iteration": 2.7439122200012207 }, { "auxiliary_loss_clip": 0.01090565, "auxiliary_loss_mlp": 0.01033653, "balance_loss_clip": 1.04297495, "balance_loss_mlp": 1.02154744, "epoch": 0.5777844581391853, "flos": 24024705805440.0, "grad_norm": 1.7626877282975804, "language_loss": 0.76948774, "learning_rate": 1.5960063188974808e-06, "loss": 0.79072988, "num_input_tokens_seen": 207021195, "step": 9610, "time_per_iteration": 2.748898506164551 }, { "auxiliary_loss_clip": 0.0108266, "auxiliary_loss_mlp": 0.01029905, "balance_loss_clip": 1.03885353, "balance_loss_mlp": 1.01625562, "epoch": 0.5778445813918534, "flos": 17777361951360.0, "grad_norm": 2.997373910278609, "language_loss": 0.68867594, "learning_rate": 1.5956248931737777e-06, "loss": 0.70980155, "num_input_tokens_seen": 207037465, "step": 9611, "time_per_iteration": 2.7037806510925293 }, { "auxiliary_loss_clip": 0.01103482, "auxiliary_loss_mlp": 0.01028915, "balance_loss_clip": 1.03957248, "balance_loss_mlp": 1.01607609, "epoch": 0.5779047046445213, "flos": 22233194046720.0, "grad_norm": 1.7435127822918648, "language_loss": 0.83207917, "learning_rate": 1.5952434827840185e-06, "loss": 0.85340309, "num_input_tokens_seen": 207054230, "step": 9612, "time_per_iteration": 2.6507790088653564 }, { "auxiliary_loss_clip": 0.01119736, "auxiliary_loss_mlp": 0.01030573, "balance_loss_clip": 1.04522681, "balance_loss_mlp": 1.01779914, "epoch": 0.5779648278971893, "flos": 21434361528960.0, "grad_norm": 1.6430153650030166, "language_loss": 0.79567391, "learning_rate": 1.594862087742667e-06, "loss": 0.81717706, "num_input_tokens_seen": 207073150, "step": 9613, "time_per_iteration": 2.679202079772949 }, { "auxiliary_loss_clip": 0.01107, "auxiliary_loss_mlp": 0.01032074, "balance_loss_clip": 1.04167032, "balance_loss_mlp": 1.02013552, "epoch": 0.5780249511498572, "flos": 19026120228480.0, "grad_norm": 1.7764623177151277, "language_loss": 0.77572, "learning_rate": 1.5944807080641863e-06, "loss": 0.7971108, "num_input_tokens_seen": 207090375, "step": 9614, "time_per_iteration": 2.6978790760040283 }, { "auxiliary_loss_clip": 0.01086413, "auxiliary_loss_mlp": 0.01033429, "balance_loss_clip": 1.04169321, "balance_loss_mlp": 1.020715, "epoch": 0.5780850744025252, "flos": 12124663752960.0, "grad_norm": 2.2008207091737093, "language_loss": 0.81598818, "learning_rate": 1.5940993437630375e-06, "loss": 0.83718669, "num_input_tokens_seen": 207106030, "step": 9615, "time_per_iteration": 2.7248473167419434 }, { "auxiliary_loss_clip": 0.01104516, "auxiliary_loss_mlp": 0.01032639, "balance_loss_clip": 1.03926682, "balance_loss_mlp": 1.01978278, "epoch": 0.5781451976551931, "flos": 25044425009280.0, "grad_norm": 1.4596798757523364, "language_loss": 0.67086244, "learning_rate": 1.5937179948536825e-06, "loss": 0.69223398, "num_input_tokens_seen": 207125435, "step": 9616, "time_per_iteration": 2.7597362995147705 }, { "auxiliary_loss_clip": 0.01106834, "auxiliary_loss_mlp": 0.01032261, "balance_loss_clip": 1.04345763, "balance_loss_mlp": 1.01935697, "epoch": 0.5782053209078611, "flos": 19245606284160.0, "grad_norm": 1.6175721800228267, "language_loss": 0.77521074, "learning_rate": 1.5933366613505812e-06, "loss": 0.79660165, "num_input_tokens_seen": 207145095, "step": 9617, "time_per_iteration": 2.8377323150634766 }, { "auxiliary_loss_clip": 0.01094943, "auxiliary_loss_mlp": 0.01035216, "balance_loss_clip": 1.04236281, "balance_loss_mlp": 1.02231812, "epoch": 0.578265444160529, "flos": 25993831340160.0, "grad_norm": 1.5155731004031996, "language_loss": 0.75113726, "learning_rate": 1.5929553432681947e-06, "loss": 0.77243888, "num_input_tokens_seen": 207166045, "step": 9618, "time_per_iteration": 2.665472984313965 }, { "auxiliary_loss_clip": 0.0111694, "auxiliary_loss_mlp": 0.01028064, "balance_loss_clip": 1.04336691, "balance_loss_mlp": 1.01594067, "epoch": 0.5783255674131971, "flos": 21798603394560.0, "grad_norm": 2.8083861615500445, "language_loss": 0.81775922, "learning_rate": 1.5925740406209826e-06, "loss": 0.83920932, "num_input_tokens_seen": 207185290, "step": 9619, "time_per_iteration": 2.6156482696533203 }, { "auxiliary_loss_clip": 0.01099184, "auxiliary_loss_mlp": 0.01032562, "balance_loss_clip": 1.04264188, "balance_loss_mlp": 1.0207603, "epoch": 0.578385690665865, "flos": 24789746603520.0, "grad_norm": 1.7083869874707343, "language_loss": 0.72963226, "learning_rate": 1.5921927534234039e-06, "loss": 0.75094968, "num_input_tokens_seen": 207205505, "step": 9620, "time_per_iteration": 2.7066376209259033 }, { "auxiliary_loss_clip": 0.01096891, "auxiliary_loss_mlp": 0.01030675, "balance_loss_clip": 1.04079533, "balance_loss_mlp": 1.01831877, "epoch": 0.578445813918533, "flos": 21212864311680.0, "grad_norm": 8.221069459540734, "language_loss": 0.76836628, "learning_rate": 1.591811481689916e-06, "loss": 0.78964192, "num_input_tokens_seen": 207225315, "step": 9621, "time_per_iteration": 2.746229887008667 }, { "auxiliary_loss_clip": 0.01054178, "auxiliary_loss_mlp": 0.0104303, "balance_loss_clip": 1.03465438, "balance_loss_mlp": 1.02871835, "epoch": 0.5785059371712009, "flos": 25046795306880.0, "grad_norm": 1.8397649270084009, "language_loss": 0.70646143, "learning_rate": 1.5914302254349787e-06, "loss": 0.72743344, "num_input_tokens_seen": 207247690, "step": 9622, "time_per_iteration": 2.7708969116210938 }, { "auxiliary_loss_clip": 0.01024027, "auxiliary_loss_mlp": 0.01003845, "balance_loss_clip": 1.01965523, "balance_loss_mlp": 1.00259304, "epoch": 0.5785660604238689, "flos": 70843172284800.0, "grad_norm": 0.7693139889423115, "language_loss": 0.55946988, "learning_rate": 1.5910489846730476e-06, "loss": 0.57974857, "num_input_tokens_seen": 207301735, "step": 9623, "time_per_iteration": 3.2743892669677734 }, { "auxiliary_loss_clip": 0.01084844, "auxiliary_loss_mlp": 0.01037987, "balance_loss_clip": 1.04244125, "balance_loss_mlp": 1.02392614, "epoch": 0.578626183676537, "flos": 31649977244160.0, "grad_norm": 2.0494784145389677, "language_loss": 0.71381462, "learning_rate": 1.5906677594185799e-06, "loss": 0.73504293, "num_input_tokens_seen": 207321240, "step": 9624, "time_per_iteration": 2.761348247528076 }, { "auxiliary_loss_clip": 0.01084192, "auxiliary_loss_mlp": 0.01039308, "balance_loss_clip": 1.03928137, "balance_loss_mlp": 1.02572453, "epoch": 0.5786863069292049, "flos": 21865181253120.0, "grad_norm": 2.0143803075104687, "language_loss": 0.82421607, "learning_rate": 1.5902865496860322e-06, "loss": 0.845451, "num_input_tokens_seen": 207339540, "step": 9625, "time_per_iteration": 4.566919326782227 }, { "auxiliary_loss_clip": 0.01116336, "auxiliary_loss_mlp": 0.01033709, "balance_loss_clip": 1.042328, "balance_loss_mlp": 1.02037549, "epoch": 0.5787464301818729, "flos": 23364954748800.0, "grad_norm": 1.438878240234706, "language_loss": 0.70356315, "learning_rate": 1.5899053554898591e-06, "loss": 0.72506356, "num_input_tokens_seen": 207360470, "step": 9626, "time_per_iteration": 2.6495361328125 }, { "auxiliary_loss_clip": 0.01095761, "auxiliary_loss_mlp": 0.01036775, "balance_loss_clip": 1.0427779, "balance_loss_mlp": 1.02442503, "epoch": 0.5788065534345408, "flos": 30004011394560.0, "grad_norm": 1.470476031522724, "language_loss": 0.72111934, "learning_rate": 1.5895241768445166e-06, "loss": 0.74244475, "num_input_tokens_seen": 207383080, "step": 9627, "time_per_iteration": 2.8884880542755127 }, { "auxiliary_loss_clip": 0.01104923, "auxiliary_loss_mlp": 0.0103066, "balance_loss_clip": 1.04045546, "balance_loss_mlp": 1.01872754, "epoch": 0.5788666766872088, "flos": 24527849564160.0, "grad_norm": 5.936898137308074, "language_loss": 0.83902895, "learning_rate": 1.589143013764458e-06, "loss": 0.8603847, "num_input_tokens_seen": 207401000, "step": 9628, "time_per_iteration": 2.746950626373291 }, { "auxiliary_loss_clip": 0.01093971, "auxiliary_loss_mlp": 0.01031789, "balance_loss_clip": 1.03782499, "balance_loss_mlp": 1.01856256, "epoch": 0.5789267999398767, "flos": 23732823888000.0, "grad_norm": 1.5735702827765405, "language_loss": 0.72260225, "learning_rate": 1.5887618662641376e-06, "loss": 0.74385989, "num_input_tokens_seen": 207419230, "step": 9629, "time_per_iteration": 4.194722652435303 }, { "auxiliary_loss_clip": 0.01096902, "auxiliary_loss_mlp": 0.0103477, "balance_loss_clip": 1.043715, "balance_loss_mlp": 1.02154994, "epoch": 0.5789869231925447, "flos": 21135045496320.0, "grad_norm": 2.2622526010485062, "language_loss": 0.74250948, "learning_rate": 1.5883807343580087e-06, "loss": 0.76382619, "num_input_tokens_seen": 207437615, "step": 9630, "time_per_iteration": 2.754213571548462 }, { "auxiliary_loss_clip": 0.01083141, "auxiliary_loss_mlp": 0.00770695, "balance_loss_clip": 1.0400362, "balance_loss_mlp": 1.00009274, "epoch": 0.5790470464452127, "flos": 21209632087680.0, "grad_norm": 1.6843723839781237, "language_loss": 0.78927267, "learning_rate": 1.587999618060523e-06, "loss": 0.8078109, "num_input_tokens_seen": 207457270, "step": 9631, "time_per_iteration": 2.757955551147461 }, { "auxiliary_loss_clip": 0.01116603, "auxiliary_loss_mlp": 0.01029207, "balance_loss_clip": 1.04169166, "balance_loss_mlp": 1.01674962, "epoch": 0.5791071696978807, "flos": 23404384903680.0, "grad_norm": 1.5220400196762927, "language_loss": 0.75543463, "learning_rate": 1.5876185173861333e-06, "loss": 0.77689266, "num_input_tokens_seen": 207477890, "step": 9632, "time_per_iteration": 2.5955679416656494 }, { "auxiliary_loss_clip": 0.01090291, "auxiliary_loss_mlp": 0.01030616, "balance_loss_clip": 1.04132521, "balance_loss_mlp": 1.01704419, "epoch": 0.5791672929505486, "flos": 24206521472640.0, "grad_norm": 2.166079097569446, "language_loss": 0.79483461, "learning_rate": 1.5872374323492915e-06, "loss": 0.81604362, "num_input_tokens_seen": 207497670, "step": 9633, "time_per_iteration": 3.0309832096099854 }, { "auxiliary_loss_clip": 0.01090489, "auxiliary_loss_mlp": 0.0104029, "balance_loss_clip": 1.04247785, "balance_loss_mlp": 1.02621174, "epoch": 0.5792274162032166, "flos": 24348871071360.0, "grad_norm": 1.6628345099755575, "language_loss": 0.77489352, "learning_rate": 1.5868563629644464e-06, "loss": 0.79620135, "num_input_tokens_seen": 207516105, "step": 9634, "time_per_iteration": 2.742804765701294 }, { "auxiliary_loss_clip": 0.01103303, "auxiliary_loss_mlp": 0.01039877, "balance_loss_clip": 1.04325557, "balance_loss_mlp": 1.0265131, "epoch": 0.5792875394558845, "flos": 20449403712000.0, "grad_norm": 2.0206641079359695, "language_loss": 0.63376474, "learning_rate": 1.5864753092460502e-06, "loss": 0.65519655, "num_input_tokens_seen": 207533685, "step": 9635, "time_per_iteration": 2.758554220199585 }, { "auxiliary_loss_clip": 0.01090702, "auxiliary_loss_mlp": 0.01040857, "balance_loss_clip": 1.0402782, "balance_loss_mlp": 1.02797055, "epoch": 0.5793476627085525, "flos": 24060329118720.0, "grad_norm": 1.4022803042470642, "language_loss": 0.77229643, "learning_rate": 1.5860942712085516e-06, "loss": 0.793612, "num_input_tokens_seen": 207552840, "step": 9636, "time_per_iteration": 2.6893904209136963 }, { "auxiliary_loss_clip": 0.01087778, "auxiliary_loss_mlp": 0.01033423, "balance_loss_clip": 1.03770018, "balance_loss_mlp": 1.02124608, "epoch": 0.5794077859612206, "flos": 22054287381120.0, "grad_norm": 1.6516741793622702, "language_loss": 0.68164212, "learning_rate": 1.5857132488663998e-06, "loss": 0.70285416, "num_input_tokens_seen": 207572095, "step": 9637, "time_per_iteration": 2.7232043743133545 }, { "auxiliary_loss_clip": 0.01076767, "auxiliary_loss_mlp": 0.01035713, "balance_loss_clip": 1.04049063, "balance_loss_mlp": 1.02214098, "epoch": 0.5794679092138885, "flos": 11434855991040.0, "grad_norm": 2.739438707467598, "language_loss": 0.72531378, "learning_rate": 1.585332242234043e-06, "loss": 0.74643862, "num_input_tokens_seen": 207587495, "step": 9638, "time_per_iteration": 2.819202423095703 }, { "auxiliary_loss_clip": 0.01107966, "auxiliary_loss_mlp": 0.0103288, "balance_loss_clip": 1.04470587, "balance_loss_mlp": 1.02056587, "epoch": 0.5795280324665565, "flos": 18880215183360.0, "grad_norm": 1.716063507275685, "language_loss": 0.72309893, "learning_rate": 1.5849512513259291e-06, "loss": 0.74450737, "num_input_tokens_seen": 207606795, "step": 9639, "time_per_iteration": 2.683488130569458 }, { "auxiliary_loss_clip": 0.01094721, "auxiliary_loss_mlp": 0.01039725, "balance_loss_clip": 1.0399698, "balance_loss_mlp": 1.02682686, "epoch": 0.5795881557192244, "flos": 13005947940480.0, "grad_norm": 1.8567608995858262, "language_loss": 0.70044529, "learning_rate": 1.5845702761565054e-06, "loss": 0.72178972, "num_input_tokens_seen": 207623620, "step": 9640, "time_per_iteration": 2.672945737838745 }, { "auxiliary_loss_clip": 0.01096614, "auxiliary_loss_mlp": 0.01042841, "balance_loss_clip": 1.0413754, "balance_loss_mlp": 1.02858996, "epoch": 0.5796482789718924, "flos": 19932397303680.0, "grad_norm": 2.4123450370287958, "language_loss": 0.7753675, "learning_rate": 1.5841893167402183e-06, "loss": 0.79676205, "num_input_tokens_seen": 207639380, "step": 9641, "time_per_iteration": 2.688164472579956 }, { "auxiliary_loss_clip": 0.01119399, "auxiliary_loss_mlp": 0.01036698, "balance_loss_clip": 1.04407382, "balance_loss_mlp": 1.02385926, "epoch": 0.5797084022245603, "flos": 21650794928640.0, "grad_norm": 1.8311937480298248, "language_loss": 0.73798597, "learning_rate": 1.5838083730915143e-06, "loss": 0.75954694, "num_input_tokens_seen": 207657915, "step": 9642, "time_per_iteration": 2.624521017074585 }, { "auxiliary_loss_clip": 0.01102536, "auxiliary_loss_mlp": 0.01038535, "balance_loss_clip": 1.04526544, "balance_loss_mlp": 1.02577972, "epoch": 0.5797685254772283, "flos": 26031573555840.0, "grad_norm": 5.942363913556237, "language_loss": 0.73259425, "learning_rate": 1.5834274452248378e-06, "loss": 0.75400496, "num_input_tokens_seen": 207678620, "step": 9643, "time_per_iteration": 2.715672254562378 }, { "auxiliary_loss_clip": 0.01121691, "auxiliary_loss_mlp": 0.01033759, "balance_loss_clip": 1.04416251, "balance_loss_mlp": 1.02062845, "epoch": 0.5798286487298963, "flos": 22705167778560.0, "grad_norm": 1.8659489070776951, "language_loss": 0.67181957, "learning_rate": 1.5830465331546352e-06, "loss": 0.69337404, "num_input_tokens_seen": 207696980, "step": 9644, "time_per_iteration": 2.6038551330566406 }, { "auxiliary_loss_clip": 0.01116177, "auxiliary_loss_mlp": 0.0103453, "balance_loss_clip": 1.04553771, "balance_loss_mlp": 1.02103531, "epoch": 0.5798887719825643, "flos": 23148988225920.0, "grad_norm": 2.1679759651263044, "language_loss": 0.85346615, "learning_rate": 1.5826656368953496e-06, "loss": 0.8749733, "num_input_tokens_seen": 207714065, "step": 9645, "time_per_iteration": 2.667259931564331 }, { "auxiliary_loss_clip": 0.01122251, "auxiliary_loss_mlp": 0.01030168, "balance_loss_clip": 1.04620934, "balance_loss_mlp": 1.01735902, "epoch": 0.5799488952352322, "flos": 24426043441920.0, "grad_norm": 2.1123906469300935, "language_loss": 0.75605559, "learning_rate": 1.5822847564614244e-06, "loss": 0.77757978, "num_input_tokens_seen": 207734720, "step": 9646, "time_per_iteration": 2.559659481048584 }, { "auxiliary_loss_clip": 0.01099999, "auxiliary_loss_mlp": 0.01037708, "balance_loss_clip": 1.04342473, "balance_loss_mlp": 1.02371335, "epoch": 0.5800090184879002, "flos": 38395903829760.0, "grad_norm": 1.698650252646941, "language_loss": 0.59495735, "learning_rate": 1.5819038918673038e-06, "loss": 0.61633444, "num_input_tokens_seen": 207755435, "step": 9647, "time_per_iteration": 2.7939651012420654 }, { "auxiliary_loss_clip": 0.0107788, "auxiliary_loss_mlp": 0.0105249, "balance_loss_clip": 1.04142165, "balance_loss_mlp": 1.03642702, "epoch": 0.5800691417405681, "flos": 19784840232960.0, "grad_norm": 1.6988187353884752, "language_loss": 0.84499681, "learning_rate": 1.5815230431274288e-06, "loss": 0.86630046, "num_input_tokens_seen": 207773570, "step": 9648, "time_per_iteration": 2.7750449180603027 }, { "auxiliary_loss_clip": 0.01032269, "auxiliary_loss_mlp": 0.01003411, "balance_loss_clip": 1.01776171, "balance_loss_mlp": 1.0021714, "epoch": 0.5801292649932361, "flos": 70314565783680.0, "grad_norm": 0.8432525659417933, "language_loss": 0.62929457, "learning_rate": 1.581142210256242e-06, "loss": 0.64965135, "num_input_tokens_seen": 207830095, "step": 9649, "time_per_iteration": 3.21219801902771 }, { "auxiliary_loss_clip": 0.01078275, "auxiliary_loss_mlp": 0.0103905, "balance_loss_clip": 1.03673697, "balance_loss_mlp": 1.02525127, "epoch": 0.5801893882459042, "flos": 18734812928640.0, "grad_norm": 1.587591091557097, "language_loss": 0.82462633, "learning_rate": 1.5807613932681857e-06, "loss": 0.84579957, "num_input_tokens_seen": 207848555, "step": 9650, "time_per_iteration": 2.8374016284942627 }, { "auxiliary_loss_clip": 0.0108491, "auxiliary_loss_mlp": 0.01036427, "balance_loss_clip": 1.03912425, "balance_loss_mlp": 1.0230515, "epoch": 0.5802495114985721, "flos": 15596507698560.0, "grad_norm": 3.679017793776146, "language_loss": 0.7786057, "learning_rate": 1.580380592177698e-06, "loss": 0.79981905, "num_input_tokens_seen": 207867060, "step": 9651, "time_per_iteration": 2.728508949279785 }, { "auxiliary_loss_clip": 0.01103104, "auxiliary_loss_mlp": 0.01039294, "balance_loss_clip": 1.04429924, "balance_loss_mlp": 1.02555537, "epoch": 0.5803096347512401, "flos": 18255405081600.0, "grad_norm": 1.8929228958840072, "language_loss": 0.74471784, "learning_rate": 1.5799998069992213e-06, "loss": 0.76614177, "num_input_tokens_seen": 207884520, "step": 9652, "time_per_iteration": 2.6977131366729736 }, { "auxiliary_loss_clip": 0.01092621, "auxiliary_loss_mlp": 0.01028533, "balance_loss_clip": 1.04145324, "balance_loss_mlp": 1.0150857, "epoch": 0.580369758003908, "flos": 22893160584960.0, "grad_norm": 2.031010770866024, "language_loss": 0.7703613, "learning_rate": 1.579619037747193e-06, "loss": 0.79157287, "num_input_tokens_seen": 207905370, "step": 9653, "time_per_iteration": 2.7233431339263916 }, { "auxiliary_loss_clip": 0.01121993, "auxiliary_loss_mlp": 0.01034522, "balance_loss_clip": 1.04465187, "balance_loss_mlp": 1.02035964, "epoch": 0.580429881256576, "flos": 18697681244160.0, "grad_norm": 1.9204408515131524, "language_loss": 0.74248046, "learning_rate": 1.5792382844360534e-06, "loss": 0.76404566, "num_input_tokens_seen": 207923790, "step": 9654, "time_per_iteration": 2.595330238342285 }, { "auxiliary_loss_clip": 0.01054131, "auxiliary_loss_mlp": 0.01037747, "balance_loss_clip": 1.04102838, "balance_loss_mlp": 1.02466965, "epoch": 0.5804900045092439, "flos": 24681978823680.0, "grad_norm": 1.627345886244452, "language_loss": 0.70138443, "learning_rate": 1.5788575470802408e-06, "loss": 0.72230321, "num_input_tokens_seen": 207942335, "step": 9655, "time_per_iteration": 2.8097565174102783 }, { "auxiliary_loss_clip": 0.01125048, "auxiliary_loss_mlp": 0.01038459, "balance_loss_clip": 1.04366922, "balance_loss_mlp": 1.02495217, "epoch": 0.580550127761912, "flos": 23112790295040.0, "grad_norm": 1.8908787804935243, "language_loss": 0.69673449, "learning_rate": 1.5784768256941915e-06, "loss": 0.71836954, "num_input_tokens_seen": 207961975, "step": 9656, "time_per_iteration": 2.6233110427856445 }, { "auxiliary_loss_clip": 0.01107455, "auxiliary_loss_mlp": 0.01034723, "balance_loss_clip": 1.04619503, "balance_loss_mlp": 1.02208686, "epoch": 0.5806102510145799, "flos": 18475681236480.0, "grad_norm": 1.5577317145380594, "language_loss": 0.71972537, "learning_rate": 1.5780961202923433e-06, "loss": 0.7411471, "num_input_tokens_seen": 207979520, "step": 9657, "time_per_iteration": 2.616337537765503 }, { "auxiliary_loss_clip": 0.01111294, "auxiliary_loss_mlp": 0.01037621, "balance_loss_clip": 1.04370785, "balance_loss_mlp": 1.0237869, "epoch": 0.5806703742672479, "flos": 23915645136000.0, "grad_norm": 1.9819747060784367, "language_loss": 0.70975304, "learning_rate": 1.5777154308891328e-06, "loss": 0.73124212, "num_input_tokens_seen": 207998375, "step": 9658, "time_per_iteration": 2.6383109092712402 }, { "auxiliary_loss_clip": 0.01031383, "auxiliary_loss_mlp": 0.01001283, "balance_loss_clip": 1.01641989, "balance_loss_mlp": 1.00009727, "epoch": 0.5807304975199158, "flos": 66311999412480.0, "grad_norm": 0.7167527277810166, "language_loss": 0.5357672, "learning_rate": 1.5773347574989953e-06, "loss": 0.55609381, "num_input_tokens_seen": 208060605, "step": 9659, "time_per_iteration": 3.1848106384277344 }, { "auxiliary_loss_clip": 0.0111162, "auxiliary_loss_mlp": 0.01040087, "balance_loss_clip": 1.04272866, "balance_loss_mlp": 1.02638984, "epoch": 0.5807906207725838, "flos": 31722444933120.0, "grad_norm": 1.8377682291636406, "language_loss": 0.61835778, "learning_rate": 1.576954100136366e-06, "loss": 0.63987488, "num_input_tokens_seen": 208080320, "step": 9660, "time_per_iteration": 2.7875893115997314 }, { "auxiliary_loss_clip": 0.01108259, "auxiliary_loss_mlp": 0.01035512, "balance_loss_clip": 1.03933334, "balance_loss_mlp": 1.02131391, "epoch": 0.5808507440252517, "flos": 23801161512960.0, "grad_norm": 1.4582842247400174, "language_loss": 0.65268373, "learning_rate": 1.5765734588156797e-06, "loss": 0.6741215, "num_input_tokens_seen": 208099305, "step": 9661, "time_per_iteration": 2.640033721923828 }, { "auxiliary_loss_clip": 0.01060469, "auxiliary_loss_mlp": 0.01027812, "balance_loss_clip": 1.03416336, "balance_loss_mlp": 1.01562285, "epoch": 0.5809108672779197, "flos": 13698449222400.0, "grad_norm": 13.818010552074016, "language_loss": 0.74664855, "learning_rate": 1.5761928335513704e-06, "loss": 0.76753139, "num_input_tokens_seen": 208116960, "step": 9662, "time_per_iteration": 2.78912091255188 }, { "auxiliary_loss_clip": 0.0103935, "auxiliary_loss_mlp": 0.01000149, "balance_loss_clip": 1.01472378, "balance_loss_mlp": 0.99883789, "epoch": 0.5809709905305876, "flos": 69134866381440.0, "grad_norm": 0.8720581464390529, "language_loss": 0.58341724, "learning_rate": 1.5758122243578709e-06, "loss": 0.60381216, "num_input_tokens_seen": 208182190, "step": 9663, "time_per_iteration": 3.2206766605377197 }, { "auxiliary_loss_clip": 0.01099545, "auxiliary_loss_mlp": 0.01034444, "balance_loss_clip": 1.04324317, "balance_loss_mlp": 1.02127123, "epoch": 0.5810311137832557, "flos": 19827538525440.0, "grad_norm": 2.2012699158511073, "language_loss": 0.82044816, "learning_rate": 1.5754316312496152e-06, "loss": 0.84178805, "num_input_tokens_seen": 208197015, "step": 9664, "time_per_iteration": 5.9192726612091064 }, { "auxiliary_loss_clip": 0.01089768, "auxiliary_loss_mlp": 0.00771212, "balance_loss_clip": 1.03780138, "balance_loss_mlp": 1.0000962, "epoch": 0.5810912370359237, "flos": 29238503719680.0, "grad_norm": 4.331316838714664, "language_loss": 0.81583905, "learning_rate": 1.5750510542410337e-06, "loss": 0.83444887, "num_input_tokens_seen": 208215795, "step": 9665, "time_per_iteration": 2.7813103199005127 }, { "auxiliary_loss_clip": 0.01104588, "auxiliary_loss_mlp": 0.01035909, "balance_loss_clip": 1.0461179, "balance_loss_mlp": 1.02123475, "epoch": 0.5811513602885916, "flos": 22785572373120.0, "grad_norm": 1.7229241789226792, "language_loss": 0.81392443, "learning_rate": 1.5746704933465599e-06, "loss": 0.83532941, "num_input_tokens_seen": 208234655, "step": 9666, "time_per_iteration": 2.7249464988708496 }, { "auxiliary_loss_clip": 0.01101961, "auxiliary_loss_mlp": 0.01035898, "balance_loss_clip": 1.04181623, "balance_loss_mlp": 1.02339292, "epoch": 0.5812114835412596, "flos": 18734346051840.0, "grad_norm": 1.7975787773576042, "language_loss": 0.80100554, "learning_rate": 1.5742899485806227e-06, "loss": 0.82238424, "num_input_tokens_seen": 208251300, "step": 9667, "time_per_iteration": 2.600576639175415 }, { "auxiliary_loss_clip": 0.01117108, "auxiliary_loss_mlp": 0.01037273, "balance_loss_clip": 1.04451418, "balance_loss_mlp": 1.02237177, "epoch": 0.5812716067939275, "flos": 26431295080320.0, "grad_norm": 1.4400303722288619, "language_loss": 0.78809667, "learning_rate": 1.573909419957653e-06, "loss": 0.80964047, "num_input_tokens_seen": 208272685, "step": 9668, "time_per_iteration": 4.22690486907959 }, { "auxiliary_loss_clip": 0.01098312, "auxiliary_loss_mlp": 0.01033665, "balance_loss_clip": 1.04209864, "balance_loss_mlp": 1.02148795, "epoch": 0.5813317300465956, "flos": 43397865285120.0, "grad_norm": 1.8465293320084986, "language_loss": 0.64245093, "learning_rate": 1.5735289074920819e-06, "loss": 0.66377068, "num_input_tokens_seen": 208294315, "step": 9669, "time_per_iteration": 2.8652687072753906 }, { "auxiliary_loss_clip": 0.01069091, "auxiliary_loss_mlp": 0.01041038, "balance_loss_clip": 1.03997946, "balance_loss_mlp": 1.02672672, "epoch": 0.5813918532992635, "flos": 24785472885120.0, "grad_norm": 1.4411692985545548, "language_loss": 0.7307651, "learning_rate": 1.5731484111983363e-06, "loss": 0.75186646, "num_input_tokens_seen": 208315610, "step": 9670, "time_per_iteration": 2.829456329345703 }, { "auxiliary_loss_clip": 0.01086705, "auxiliary_loss_mlp": 0.01034661, "balance_loss_clip": 1.03999424, "balance_loss_mlp": 1.02194691, "epoch": 0.5814519765519315, "flos": 22857357703680.0, "grad_norm": 2.0479138475359844, "language_loss": 0.7874738, "learning_rate": 1.5727679310908464e-06, "loss": 0.80868745, "num_input_tokens_seen": 208334725, "step": 9671, "time_per_iteration": 2.7991318702697754 }, { "auxiliary_loss_clip": 0.0107985, "auxiliary_loss_mlp": 0.01044541, "balance_loss_clip": 1.0416975, "balance_loss_mlp": 1.02910936, "epoch": 0.5815120998045994, "flos": 24060831909120.0, "grad_norm": 1.9838213735263186, "language_loss": 0.61369407, "learning_rate": 1.5723874671840399e-06, "loss": 0.634938, "num_input_tokens_seen": 208353825, "step": 9672, "time_per_iteration": 2.8498592376708984 }, { "auxiliary_loss_clip": 0.01065855, "auxiliary_loss_mlp": 0.01038617, "balance_loss_clip": 1.04000103, "balance_loss_mlp": 1.02496195, "epoch": 0.5815722230572674, "flos": 24279491952000.0, "grad_norm": 2.0691966635939365, "language_loss": 0.81397313, "learning_rate": 1.572007019492342e-06, "loss": 0.83501786, "num_input_tokens_seen": 208374160, "step": 9673, "time_per_iteration": 2.8208439350128174 }, { "auxiliary_loss_clip": 0.0108779, "auxiliary_loss_mlp": 0.01038429, "balance_loss_clip": 1.04342866, "balance_loss_mlp": 1.0242784, "epoch": 0.5816323463099353, "flos": 22200371994240.0, "grad_norm": 1.86389400550988, "language_loss": 0.88404083, "learning_rate": 1.5716265880301817e-06, "loss": 0.905303, "num_input_tokens_seen": 208392105, "step": 9674, "time_per_iteration": 2.7522170543670654 }, { "auxiliary_loss_clip": 0.01120808, "auxiliary_loss_mlp": 0.00770234, "balance_loss_clip": 1.04347241, "balance_loss_mlp": 1.00026846, "epoch": 0.5816924695626033, "flos": 24134448833280.0, "grad_norm": 1.4106486697266074, "language_loss": 0.78974068, "learning_rate": 1.571246172811984e-06, "loss": 0.80865109, "num_input_tokens_seen": 208411755, "step": 9675, "time_per_iteration": 2.6588079929351807 }, { "auxiliary_loss_clip": 0.01106314, "auxiliary_loss_mlp": 0.01035578, "balance_loss_clip": 1.04066849, "balance_loss_mlp": 1.02178526, "epoch": 0.5817525928152713, "flos": 21324223451520.0, "grad_norm": 1.863415006013356, "language_loss": 0.70507479, "learning_rate": 1.5708657738521748e-06, "loss": 0.72649372, "num_input_tokens_seen": 208429995, "step": 9676, "time_per_iteration": 2.64201283454895 }, { "auxiliary_loss_clip": 0.01058756, "auxiliary_loss_mlp": 0.01033649, "balance_loss_clip": 1.0396111, "balance_loss_mlp": 1.02030993, "epoch": 0.5818127160679393, "flos": 26934510666240.0, "grad_norm": 2.6670948708651636, "language_loss": 0.63821483, "learning_rate": 1.5704853911651779e-06, "loss": 0.65913892, "num_input_tokens_seen": 208443655, "step": 9677, "time_per_iteration": 2.818047523498535 }, { "auxiliary_loss_clip": 0.01020823, "auxiliary_loss_mlp": 0.01010612, "balance_loss_clip": 1.02114296, "balance_loss_mlp": 1.00937831, "epoch": 0.5818728393206073, "flos": 63918626342400.0, "grad_norm": 0.8047469836092298, "language_loss": 0.54188442, "learning_rate": 1.5701050247654182e-06, "loss": 0.56219876, "num_input_tokens_seen": 208498405, "step": 9678, "time_per_iteration": 3.2669215202331543 }, { "auxiliary_loss_clip": 0.01019281, "auxiliary_loss_mlp": 0.0100911, "balance_loss_clip": 1.01330447, "balance_loss_mlp": 1.00782299, "epoch": 0.5819329625732752, "flos": 64954108638720.0, "grad_norm": 0.7377482843760589, "language_loss": 0.56218177, "learning_rate": 1.569724674667319e-06, "loss": 0.58246571, "num_input_tokens_seen": 208559075, "step": 9679, "time_per_iteration": 3.130009174346924 }, { "auxiliary_loss_clip": 0.01118656, "auxiliary_loss_mlp": 0.01031808, "balance_loss_clip": 1.04236495, "balance_loss_mlp": 1.01982164, "epoch": 0.5819930858259432, "flos": 21215270522880.0, "grad_norm": 1.65967573029577, "language_loss": 0.65638047, "learning_rate": 1.5693443408853032e-06, "loss": 0.67788512, "num_input_tokens_seen": 208577770, "step": 9680, "time_per_iteration": 2.63765811920166 }, { "auxiliary_loss_clip": 0.01095966, "auxiliary_loss_mlp": 0.01030097, "balance_loss_clip": 1.04104781, "balance_loss_mlp": 1.01797342, "epoch": 0.5820532090786111, "flos": 19458520151040.0, "grad_norm": 1.9145859585775957, "language_loss": 0.83394265, "learning_rate": 1.5689640234337933e-06, "loss": 0.85520327, "num_input_tokens_seen": 208595110, "step": 9681, "time_per_iteration": 2.6886913776397705 }, { "auxiliary_loss_clip": 0.0112012, "auxiliary_loss_mlp": 0.01033373, "balance_loss_clip": 1.04263687, "balance_loss_mlp": 1.02064157, "epoch": 0.5821133323312792, "flos": 17712615686400.0, "grad_norm": 1.6180763493056738, "language_loss": 0.76095504, "learning_rate": 1.5685837223272109e-06, "loss": 0.78248996, "num_input_tokens_seen": 208612080, "step": 9682, "time_per_iteration": 2.616946220397949 }, { "auxiliary_loss_clip": 0.01054825, "auxiliary_loss_mlp": 0.01035748, "balance_loss_clip": 1.03545356, "balance_loss_mlp": 1.0205251, "epoch": 0.5821734555839471, "flos": 24571804832640.0, "grad_norm": 1.897202579717977, "language_loss": 0.7534517, "learning_rate": 1.568203437579977e-06, "loss": 0.77435744, "num_input_tokens_seen": 208630235, "step": 9683, "time_per_iteration": 2.7519571781158447 }, { "auxiliary_loss_clip": 0.01098515, "auxiliary_loss_mlp": 0.01032751, "balance_loss_clip": 1.04482961, "balance_loss_mlp": 1.0191133, "epoch": 0.5822335788366151, "flos": 22382259488640.0, "grad_norm": 1.7304603651050097, "language_loss": 0.73967683, "learning_rate": 1.5678231692065116e-06, "loss": 0.76098949, "num_input_tokens_seen": 208647925, "step": 9684, "time_per_iteration": 2.585839033126831 }, { "auxiliary_loss_clip": 0.01095398, "auxiliary_loss_mlp": 0.01040225, "balance_loss_clip": 1.04306865, "balance_loss_mlp": 1.02714145, "epoch": 0.582293702089283, "flos": 26722494639360.0, "grad_norm": 1.9911340281622987, "language_loss": 0.78017914, "learning_rate": 1.5674429172212348e-06, "loss": 0.80153537, "num_input_tokens_seen": 208666180, "step": 9685, "time_per_iteration": 2.6262004375457764 }, { "auxiliary_loss_clip": 0.01119541, "auxiliary_loss_mlp": 0.01037721, "balance_loss_clip": 1.04301238, "balance_loss_mlp": 1.02463138, "epoch": 0.582353825341951, "flos": 17348661129600.0, "grad_norm": 1.534499166945951, "language_loss": 0.75514185, "learning_rate": 1.5670626816385667e-06, "loss": 0.7767145, "num_input_tokens_seen": 208684240, "step": 9686, "time_per_iteration": 2.4799644947052 }, { "auxiliary_loss_clip": 0.01029752, "auxiliary_loss_mlp": 0.00999968, "balance_loss_clip": 1.01506877, "balance_loss_mlp": 0.99893057, "epoch": 0.5824139485946189, "flos": 55473261534720.0, "grad_norm": 0.8130045203422185, "language_loss": 0.57394326, "learning_rate": 1.5666824624729244e-06, "loss": 0.59424043, "num_input_tokens_seen": 208736090, "step": 9687, "time_per_iteration": 2.9722440242767334 }, { "auxiliary_loss_clip": 0.01079028, "auxiliary_loss_mlp": 0.01038321, "balance_loss_clip": 1.03950655, "balance_loss_mlp": 1.02262747, "epoch": 0.582474071847287, "flos": 20303031790080.0, "grad_norm": 1.7516030258378996, "language_loss": 0.70063931, "learning_rate": 1.566302259738727e-06, "loss": 0.72181278, "num_input_tokens_seen": 208754600, "step": 9688, "time_per_iteration": 2.802976369857788 }, { "auxiliary_loss_clip": 0.01110989, "auxiliary_loss_mlp": 0.01033418, "balance_loss_clip": 1.04311526, "balance_loss_mlp": 1.02075768, "epoch": 0.5825341950999549, "flos": 23878010661120.0, "grad_norm": 2.126323858989827, "language_loss": 0.65013343, "learning_rate": 1.5659220734503918e-06, "loss": 0.67157751, "num_input_tokens_seen": 208773140, "step": 9689, "time_per_iteration": 2.6299288272857666 }, { "auxiliary_loss_clip": 0.01095981, "auxiliary_loss_mlp": 0.00770437, "balance_loss_clip": 1.04142618, "balance_loss_mlp": 1.00009274, "epoch": 0.5825943183526229, "flos": 23113041690240.0, "grad_norm": 1.599269729220552, "language_loss": 0.7352339, "learning_rate": 1.5655419036223341e-06, "loss": 0.75389808, "num_input_tokens_seen": 208793410, "step": 9690, "time_per_iteration": 2.6903798580169678 }, { "auxiliary_loss_clip": 0.01096107, "auxiliary_loss_mlp": 0.01038726, "balance_loss_clip": 1.03903055, "balance_loss_mlp": 1.02372348, "epoch": 0.5826544416052909, "flos": 22857429530880.0, "grad_norm": 1.61399606195473, "language_loss": 0.75654376, "learning_rate": 1.5651617502689717e-06, "loss": 0.77789205, "num_input_tokens_seen": 208811920, "step": 9691, "time_per_iteration": 2.7056210041046143 }, { "auxiliary_loss_clip": 0.01109061, "auxiliary_loss_mlp": 0.01032641, "balance_loss_clip": 1.04082966, "balance_loss_mlp": 1.01972461, "epoch": 0.5827145648579588, "flos": 31501845555840.0, "grad_norm": 2.2562223304416755, "language_loss": 0.80682158, "learning_rate": 1.5647816134047184e-06, "loss": 0.82823855, "num_input_tokens_seen": 208834720, "step": 9692, "time_per_iteration": 2.7577641010284424 }, { "auxiliary_loss_clip": 0.01028968, "auxiliary_loss_mlp": 0.01002786, "balance_loss_clip": 1.01420581, "balance_loss_mlp": 1.00161159, "epoch": 0.5827746881106268, "flos": 69811817074560.0, "grad_norm": 0.7560919402716259, "language_loss": 0.5693723, "learning_rate": 1.5644014930439907e-06, "loss": 0.58968985, "num_input_tokens_seen": 208898415, "step": 9693, "time_per_iteration": 3.145176887512207 }, { "auxiliary_loss_clip": 0.01105496, "auxiliary_loss_mlp": 0.0076985, "balance_loss_clip": 1.04020321, "balance_loss_mlp": 1.00010538, "epoch": 0.5828348113632947, "flos": 23112395245440.0, "grad_norm": 2.61225629767126, "language_loss": 0.79375291, "learning_rate": 1.5640213892012025e-06, "loss": 0.81250644, "num_input_tokens_seen": 208919045, "step": 9694, "time_per_iteration": 2.7443995475769043 }, { "auxiliary_loss_clip": 0.01083069, "auxiliary_loss_mlp": 0.01042673, "balance_loss_clip": 1.03822398, "balance_loss_mlp": 1.02909541, "epoch": 0.5828949346159628, "flos": 21873082245120.0, "grad_norm": 1.4254101237523094, "language_loss": 0.76205015, "learning_rate": 1.5636413018907656e-06, "loss": 0.78330755, "num_input_tokens_seen": 208939375, "step": 9695, "time_per_iteration": 2.688107490539551 }, { "auxiliary_loss_clip": 0.01027446, "auxiliary_loss_mlp": 0.01003052, "balance_loss_clip": 1.01271224, "balance_loss_mlp": 1.00191391, "epoch": 0.5829550578686307, "flos": 65962553950080.0, "grad_norm": 0.7742487055111029, "language_loss": 0.54982823, "learning_rate": 1.563261231127095e-06, "loss": 0.57013327, "num_input_tokens_seen": 209004760, "step": 9696, "time_per_iteration": 3.239593029022217 }, { "auxiliary_loss_clip": 0.0108245, "auxiliary_loss_mlp": 0.01030212, "balance_loss_clip": 1.04170382, "balance_loss_mlp": 1.01751041, "epoch": 0.5830151811212987, "flos": 16289799079680.0, "grad_norm": 2.124266497676036, "language_loss": 0.76664579, "learning_rate": 1.5628811769246021e-06, "loss": 0.78777242, "num_input_tokens_seen": 209022930, "step": 9697, "time_per_iteration": 2.6790308952331543 }, { "auxiliary_loss_clip": 0.01121339, "auxiliary_loss_mlp": 0.01035657, "balance_loss_clip": 1.04233479, "balance_loss_mlp": 1.02154899, "epoch": 0.5830753043739666, "flos": 24168851084160.0, "grad_norm": 1.5579611092820027, "language_loss": 0.77714729, "learning_rate": 1.5625011392976991e-06, "loss": 0.79871726, "num_input_tokens_seen": 209043740, "step": 9698, "time_per_iteration": 2.635885715484619 }, { "auxiliary_loss_clip": 0.01079274, "auxiliary_loss_mlp": 0.01038337, "balance_loss_clip": 1.0413661, "balance_loss_mlp": 1.02498519, "epoch": 0.5831354276266346, "flos": 27059050097280.0, "grad_norm": 1.5784163010462595, "language_loss": 0.84167337, "learning_rate": 1.5621211182607966e-06, "loss": 0.86284947, "num_input_tokens_seen": 209068885, "step": 9699, "time_per_iteration": 2.8312487602233887 }, { "auxiliary_loss_clip": 0.01095092, "auxiliary_loss_mlp": 0.010366, "balance_loss_clip": 1.03954756, "balance_loss_mlp": 1.02281952, "epoch": 0.5831955508793025, "flos": 23623475909760.0, "grad_norm": 2.065302984121428, "language_loss": 0.65489984, "learning_rate": 1.561741113828305e-06, "loss": 0.67621672, "num_input_tokens_seen": 209087340, "step": 9700, "time_per_iteration": 2.784442901611328 }, { "auxiliary_loss_clip": 0.01108875, "auxiliary_loss_mlp": 0.01034575, "balance_loss_clip": 1.04089403, "balance_loss_mlp": 1.02150953, "epoch": 0.5832556741319705, "flos": 24973250209920.0, "grad_norm": 1.5991522353668115, "language_loss": 0.71547067, "learning_rate": 1.5613611260146344e-06, "loss": 0.73690522, "num_input_tokens_seen": 209108840, "step": 9701, "time_per_iteration": 2.6895313262939453 }, { "auxiliary_loss_clip": 0.01096283, "auxiliary_loss_mlp": 0.01041435, "balance_loss_clip": 1.04180253, "balance_loss_mlp": 1.02841139, "epoch": 0.5833157973846385, "flos": 23221563655680.0, "grad_norm": 1.6635802287235106, "language_loss": 0.85541105, "learning_rate": 1.5609811548341936e-06, "loss": 0.87678826, "num_input_tokens_seen": 209127985, "step": 9702, "time_per_iteration": 2.6746225357055664 }, { "auxiliary_loss_clip": 0.01102319, "auxiliary_loss_mlp": 0.01033634, "balance_loss_clip": 1.04071856, "balance_loss_mlp": 1.02131367, "epoch": 0.5833759206373065, "flos": 21977941023360.0, "grad_norm": 1.4183987857502756, "language_loss": 0.77847046, "learning_rate": 1.560601200301392e-06, "loss": 0.79983002, "num_input_tokens_seen": 209146885, "step": 9703, "time_per_iteration": 4.3035502433776855 }, { "auxiliary_loss_clip": 0.01122779, "auxiliary_loss_mlp": 0.01034804, "balance_loss_clip": 1.04359257, "balance_loss_mlp": 1.0208385, "epoch": 0.5834360438899745, "flos": 21762405463680.0, "grad_norm": 1.8064531110729998, "language_loss": 0.71067387, "learning_rate": 1.5602212624306366e-06, "loss": 0.73224974, "num_input_tokens_seen": 209166130, "step": 9704, "time_per_iteration": 4.107022762298584 }, { "auxiliary_loss_clip": 0.01094563, "auxiliary_loss_mlp": 0.01038062, "balance_loss_clip": 1.04187346, "balance_loss_mlp": 1.02561641, "epoch": 0.5834961671426424, "flos": 15992566035840.0, "grad_norm": 1.6675564380890735, "language_loss": 0.81363106, "learning_rate": 1.559841341236335e-06, "loss": 0.8349573, "num_input_tokens_seen": 209183350, "step": 9705, "time_per_iteration": 2.7058465480804443 }, { "auxiliary_loss_clip": 0.010702, "auxiliary_loss_mlp": 0.01034129, "balance_loss_clip": 1.03672004, "balance_loss_mlp": 1.02125466, "epoch": 0.5835562903953104, "flos": 22818322598400.0, "grad_norm": 1.7137147806220967, "language_loss": 0.80614948, "learning_rate": 1.5594614367328937e-06, "loss": 0.82719278, "num_input_tokens_seen": 209203945, "step": 9706, "time_per_iteration": 2.776280164718628 }, { "auxiliary_loss_clip": 0.01105997, "auxiliary_loss_mlp": 0.0103669, "balance_loss_clip": 1.04129124, "balance_loss_mlp": 1.02315402, "epoch": 0.5836164136479783, "flos": 48468056624640.0, "grad_norm": 2.0771057832537414, "language_loss": 0.74647468, "learning_rate": 1.5590815489347187e-06, "loss": 0.76790154, "num_input_tokens_seen": 209227080, "step": 9707, "time_per_iteration": 2.857609272003174 }, { "auxiliary_loss_clip": 0.01081909, "auxiliary_loss_mlp": 0.01031553, "balance_loss_clip": 1.03649998, "balance_loss_mlp": 1.01878548, "epoch": 0.5836765369006464, "flos": 26905998245760.0, "grad_norm": 2.7159127892637067, "language_loss": 0.81819087, "learning_rate": 1.5587016778562163e-06, "loss": 0.83932543, "num_input_tokens_seen": 209248170, "step": 9708, "time_per_iteration": 4.28432822227478 }, { "auxiliary_loss_clip": 0.01102304, "auxiliary_loss_mlp": 0.01032201, "balance_loss_clip": 1.0439347, "balance_loss_mlp": 1.01914191, "epoch": 0.5837366601533143, "flos": 20084048524800.0, "grad_norm": 1.4146539482815383, "language_loss": 0.78367102, "learning_rate": 1.5583218235117896e-06, "loss": 0.80501604, "num_input_tokens_seen": 209267730, "step": 9709, "time_per_iteration": 2.6337647438049316 }, { "auxiliary_loss_clip": 0.01017869, "auxiliary_loss_mlp": 0.00999553, "balance_loss_clip": 1.01163578, "balance_loss_mlp": 0.99844998, "epoch": 0.5837967834059823, "flos": 65363885971200.0, "grad_norm": 0.7723563596720286, "language_loss": 0.5654794, "learning_rate": 1.557941985915844e-06, "loss": 0.58565366, "num_input_tokens_seen": 209332510, "step": 9710, "time_per_iteration": 3.255643844604492 }, { "auxiliary_loss_clip": 0.01084064, "auxiliary_loss_mlp": 0.01035883, "balance_loss_clip": 1.03939962, "balance_loss_mlp": 1.02429581, "epoch": 0.5838569066586502, "flos": 25338641310720.0, "grad_norm": 1.5220841159249796, "language_loss": 0.6560964, "learning_rate": 1.5575621650827833e-06, "loss": 0.67729586, "num_input_tokens_seen": 209353355, "step": 9711, "time_per_iteration": 2.7771286964416504 }, { "auxiliary_loss_clip": 0.01124372, "auxiliary_loss_mlp": 0.01037032, "balance_loss_clip": 1.04342008, "balance_loss_mlp": 1.02279854, "epoch": 0.5839170299113182, "flos": 22229243550720.0, "grad_norm": 1.6457925309868888, "language_loss": 0.78601259, "learning_rate": 1.5571823610270085e-06, "loss": 0.80762661, "num_input_tokens_seen": 209370960, "step": 9712, "time_per_iteration": 2.6130564212799072 }, { "auxiliary_loss_clip": 0.01079932, "auxiliary_loss_mlp": 0.0077171, "balance_loss_clip": 1.03610897, "balance_loss_mlp": 1.00007439, "epoch": 0.5839771531639861, "flos": 22200012858240.0, "grad_norm": 1.6123088749448828, "language_loss": 0.73624194, "learning_rate": 1.5568025737629234e-06, "loss": 0.75475836, "num_input_tokens_seen": 209390955, "step": 9713, "time_per_iteration": 2.752688407897949 }, { "auxiliary_loss_clip": 0.01098855, "auxiliary_loss_mlp": 0.0103448, "balance_loss_clip": 1.03949571, "balance_loss_mlp": 1.02000761, "epoch": 0.5840372764166541, "flos": 22419355259520.0, "grad_norm": 2.057640389539287, "language_loss": 0.69393289, "learning_rate": 1.5564228033049292e-06, "loss": 0.71526623, "num_input_tokens_seen": 209410260, "step": 9714, "time_per_iteration": 2.697676181793213 }, { "auxiliary_loss_clip": 0.01118564, "auxiliary_loss_mlp": 0.01037008, "balance_loss_clip": 1.04040492, "balance_loss_mlp": 1.02368677, "epoch": 0.5840973996693221, "flos": 19828256797440.0, "grad_norm": 1.733937894535342, "language_loss": 0.80418617, "learning_rate": 1.5560430496674268e-06, "loss": 0.82574189, "num_input_tokens_seen": 209429920, "step": 9715, "time_per_iteration": 2.5865848064422607 }, { "auxiliary_loss_clip": 0.01094879, "auxiliary_loss_mlp": 0.0103561, "balance_loss_clip": 1.03690863, "balance_loss_mlp": 1.02182388, "epoch": 0.5841575229219901, "flos": 21142982401920.0, "grad_norm": 2.4772648960449586, "language_loss": 0.72541732, "learning_rate": 1.5556633128648167e-06, "loss": 0.74672222, "num_input_tokens_seen": 209449470, "step": 9716, "time_per_iteration": 2.760240077972412 }, { "auxiliary_loss_clip": 0.01088946, "auxiliary_loss_mlp": 0.01033627, "balance_loss_clip": 1.03793585, "balance_loss_mlp": 1.02124131, "epoch": 0.5842176461746581, "flos": 24640322025600.0, "grad_norm": 1.7815945401286815, "language_loss": 0.75058079, "learning_rate": 1.5552835929114976e-06, "loss": 0.7718066, "num_input_tokens_seen": 209467695, "step": 9717, "time_per_iteration": 2.7470862865448 }, { "auxiliary_loss_clip": 0.01109202, "auxiliary_loss_mlp": 0.01038785, "balance_loss_clip": 1.04155052, "balance_loss_mlp": 1.02575004, "epoch": 0.584277769427326, "flos": 19131158574720.0, "grad_norm": 3.2108802254609827, "language_loss": 0.79614913, "learning_rate": 1.5549038898218697e-06, "loss": 0.81762898, "num_input_tokens_seen": 209484250, "step": 9718, "time_per_iteration": 2.6843111515045166 }, { "auxiliary_loss_clip": 0.01094695, "auxiliary_loss_mlp": 0.01032977, "balance_loss_clip": 1.03992128, "balance_loss_mlp": 1.01880288, "epoch": 0.584337892679994, "flos": 22675111073280.0, "grad_norm": 1.6948464280827684, "language_loss": 0.67670137, "learning_rate": 1.5545242036103306e-06, "loss": 0.69797808, "num_input_tokens_seen": 209502830, "step": 9719, "time_per_iteration": 2.658722400665283 }, { "auxiliary_loss_clip": 0.01119777, "auxiliary_loss_mlp": 0.01038004, "balance_loss_clip": 1.04168653, "balance_loss_mlp": 1.02466464, "epoch": 0.5843980159326619, "flos": 31284083352960.0, "grad_norm": 1.997670996956063, "language_loss": 0.75795102, "learning_rate": 1.5541445342912786e-06, "loss": 0.77952886, "num_input_tokens_seen": 209525995, "step": 9720, "time_per_iteration": 2.6901891231536865 }, { "auxiliary_loss_clip": 0.01082891, "auxiliary_loss_mlp": 0.01039482, "balance_loss_clip": 1.04280281, "balance_loss_mlp": 1.02657783, "epoch": 0.58445813918533, "flos": 22748117466240.0, "grad_norm": 1.7155190503214905, "language_loss": 0.83123529, "learning_rate": 1.5537648818791105e-06, "loss": 0.85245907, "num_input_tokens_seen": 209545895, "step": 9721, "time_per_iteration": 2.71907639503479 }, { "auxiliary_loss_clip": 0.01037273, "auxiliary_loss_mlp": 0.01006637, "balance_loss_clip": 1.01290512, "balance_loss_mlp": 1.00543344, "epoch": 0.5845182624379979, "flos": 60686556658560.0, "grad_norm": 0.9400176499911559, "language_loss": 0.7134223, "learning_rate": 1.5533852463882226e-06, "loss": 0.73386145, "num_input_tokens_seen": 209602315, "step": 9722, "time_per_iteration": 3.1959645748138428 }, { "auxiliary_loss_clip": 0.01099534, "auxiliary_loss_mlp": 0.01040774, "balance_loss_clip": 1.03890538, "balance_loss_mlp": 1.02751184, "epoch": 0.5845783856906659, "flos": 16362446336640.0, "grad_norm": 1.9834511811038693, "language_loss": 0.89731622, "learning_rate": 1.5530056278330113e-06, "loss": 0.91871929, "num_input_tokens_seen": 209617615, "step": 9723, "time_per_iteration": 2.592627763748169 }, { "auxiliary_loss_clip": 0.01094383, "auxiliary_loss_mlp": 0.01038255, "balance_loss_clip": 1.04275918, "balance_loss_mlp": 1.02554142, "epoch": 0.5846385089433338, "flos": 20083402080000.0, "grad_norm": 1.398468813522248, "language_loss": 0.68486446, "learning_rate": 1.5526260262278709e-06, "loss": 0.70619082, "num_input_tokens_seen": 209637005, "step": 9724, "time_per_iteration": 2.655640125274658 }, { "auxiliary_loss_clip": 0.01110347, "auxiliary_loss_mlp": 0.01036604, "balance_loss_clip": 1.04291487, "balance_loss_mlp": 1.02341366, "epoch": 0.5846986321960018, "flos": 17311062568320.0, "grad_norm": 1.717409456716096, "language_loss": 0.86049938, "learning_rate": 1.552246441587197e-06, "loss": 0.88196886, "num_input_tokens_seen": 209653170, "step": 9725, "time_per_iteration": 2.6035261154174805 }, { "auxiliary_loss_clip": 0.01095255, "auxiliary_loss_mlp": 0.010422, "balance_loss_clip": 1.04249406, "balance_loss_mlp": 1.02926588, "epoch": 0.5847587554486697, "flos": 17197907748480.0, "grad_norm": 1.6193535846243259, "language_loss": 0.82923484, "learning_rate": 1.5518668739253821e-06, "loss": 0.85060942, "num_input_tokens_seen": 209671275, "step": 9726, "time_per_iteration": 2.655017137527466 }, { "auxiliary_loss_clip": 0.01055108, "auxiliary_loss_mlp": 0.00770936, "balance_loss_clip": 1.03983736, "balance_loss_mlp": 1.00008965, "epoch": 0.5848188787013378, "flos": 24529106540160.0, "grad_norm": 1.736262693329601, "language_loss": 0.66609311, "learning_rate": 1.5514873232568206e-06, "loss": 0.68435353, "num_input_tokens_seen": 209690380, "step": 9727, "time_per_iteration": 2.820906639099121 }, { "auxiliary_loss_clip": 0.01083507, "auxiliary_loss_mlp": 0.01045274, "balance_loss_clip": 1.03799105, "balance_loss_mlp": 1.03056347, "epoch": 0.5848790019540057, "flos": 20628382204800.0, "grad_norm": 1.7999573427153348, "language_loss": 0.81628853, "learning_rate": 1.5511077895959055e-06, "loss": 0.83757633, "num_input_tokens_seen": 209708845, "step": 9728, "time_per_iteration": 2.7597923278808594 }, { "auxiliary_loss_clip": 0.01103874, "auxiliary_loss_mlp": 0.01042076, "balance_loss_clip": 1.03965843, "balance_loss_mlp": 1.0296309, "epoch": 0.5849391252066737, "flos": 22418852469120.0, "grad_norm": 2.078641796720901, "language_loss": 0.77696002, "learning_rate": 1.550728272957027e-06, "loss": 0.79841954, "num_input_tokens_seen": 209729000, "step": 9729, "time_per_iteration": 2.663864850997925 }, { "auxiliary_loss_clip": 0.01102359, "auxiliary_loss_mlp": 0.0103712, "balance_loss_clip": 1.03954148, "balance_loss_mlp": 1.022475, "epoch": 0.5849992484593417, "flos": 25410929431680.0, "grad_norm": 1.8450519403802392, "language_loss": 0.70192915, "learning_rate": 1.5503487733545782e-06, "loss": 0.72332394, "num_input_tokens_seen": 209747435, "step": 9730, "time_per_iteration": 2.6668407917022705 }, { "auxiliary_loss_clip": 0.01124849, "auxiliary_loss_mlp": 0.01036601, "balance_loss_clip": 1.04504502, "balance_loss_mlp": 1.02224803, "epoch": 0.5850593717120096, "flos": 21065163586560.0, "grad_norm": 1.6923527463370078, "language_loss": 0.78973091, "learning_rate": 1.5499692908029482e-06, "loss": 0.81134546, "num_input_tokens_seen": 209764910, "step": 9731, "time_per_iteration": 2.6093108654022217 }, { "auxiliary_loss_clip": 0.01103256, "auxiliary_loss_mlp": 0.01046113, "balance_loss_clip": 1.04004776, "balance_loss_mlp": 1.03114593, "epoch": 0.5851194949646776, "flos": 25301545539840.0, "grad_norm": 2.322897025480009, "language_loss": 0.70276213, "learning_rate": 1.549589825316528e-06, "loss": 0.7242558, "num_input_tokens_seen": 209786115, "step": 9732, "time_per_iteration": 2.6483914852142334 }, { "auxiliary_loss_clip": 0.01068434, "auxiliary_loss_mlp": 0.01041994, "balance_loss_clip": 1.03862739, "balance_loss_mlp": 1.02584136, "epoch": 0.5851796182173455, "flos": 23587242065280.0, "grad_norm": 1.8361177860467572, "language_loss": 0.53096974, "learning_rate": 1.5492103769097075e-06, "loss": 0.55207402, "num_input_tokens_seen": 209806095, "step": 9733, "time_per_iteration": 2.7837493419647217 }, { "auxiliary_loss_clip": 0.0110623, "auxiliary_loss_mlp": 0.01037809, "balance_loss_clip": 1.04327631, "balance_loss_mlp": 1.023206, "epoch": 0.5852397414700136, "flos": 24822712310400.0, "grad_norm": 2.1555850580582945, "language_loss": 0.87172639, "learning_rate": 1.5488309455968739e-06, "loss": 0.89316678, "num_input_tokens_seen": 209823650, "step": 9734, "time_per_iteration": 2.647822618484497 }, { "auxiliary_loss_clip": 0.0109023, "auxiliary_loss_mlp": 0.01035437, "balance_loss_clip": 1.03915906, "balance_loss_mlp": 1.02305174, "epoch": 0.5852998647226815, "flos": 19937784343680.0, "grad_norm": 1.6523754491187739, "language_loss": 0.72117126, "learning_rate": 1.5484515313924163e-06, "loss": 0.74242795, "num_input_tokens_seen": 209843220, "step": 9735, "time_per_iteration": 2.6707499027252197 }, { "auxiliary_loss_clip": 0.01111823, "auxiliary_loss_mlp": 0.01038537, "balance_loss_clip": 1.04385519, "balance_loss_mlp": 1.02448797, "epoch": 0.5853599879753495, "flos": 16720367408640.0, "grad_norm": 5.660280505854459, "language_loss": 0.74303764, "learning_rate": 1.5480721343107217e-06, "loss": 0.76454127, "num_input_tokens_seen": 209854880, "step": 9736, "time_per_iteration": 2.6474769115448 }, { "auxiliary_loss_clip": 0.01084732, "auxiliary_loss_mlp": 0.01038896, "balance_loss_clip": 1.03950977, "balance_loss_mlp": 1.0241437, "epoch": 0.5854201112280174, "flos": 44456583680640.0, "grad_norm": 1.705724680337342, "language_loss": 0.7066859, "learning_rate": 1.5476927543661772e-06, "loss": 0.72792208, "num_input_tokens_seen": 209877870, "step": 9737, "time_per_iteration": 2.8703529834747314 }, { "auxiliary_loss_clip": 0.01079098, "auxiliary_loss_mlp": 0.01042352, "balance_loss_clip": 1.03875983, "balance_loss_mlp": 1.02830887, "epoch": 0.5854802344806854, "flos": 20339193807360.0, "grad_norm": 1.7465210824086157, "language_loss": 0.82571793, "learning_rate": 1.547313391573169e-06, "loss": 0.84693247, "num_input_tokens_seen": 209896690, "step": 9738, "time_per_iteration": 2.6930525302886963 }, { "auxiliary_loss_clip": 0.01123353, "auxiliary_loss_mlp": 0.00771973, "balance_loss_clip": 1.04294574, "balance_loss_mlp": 1.00014758, "epoch": 0.5855403577333533, "flos": 20921054221440.0, "grad_norm": 1.6403149295747592, "language_loss": 0.68084544, "learning_rate": 1.546934045946082e-06, "loss": 0.6997987, "num_input_tokens_seen": 209914640, "step": 9739, "time_per_iteration": 2.6120223999023438 }, { "auxiliary_loss_clip": 0.01122823, "auxiliary_loss_mlp": 0.01028069, "balance_loss_clip": 1.04343581, "balance_loss_mlp": 1.01383555, "epoch": 0.5856004809860214, "flos": 20448649526400.0, "grad_norm": 2.346965983276941, "language_loss": 0.5878849, "learning_rate": 1.5465547174993017e-06, "loss": 0.60939384, "num_input_tokens_seen": 209933375, "step": 9740, "time_per_iteration": 2.6393442153930664 }, { "auxiliary_loss_clip": 0.01091861, "auxiliary_loss_mlp": 0.01034284, "balance_loss_clip": 1.03964174, "balance_loss_mlp": 1.01996112, "epoch": 0.5856606042386893, "flos": 19640766781440.0, "grad_norm": 1.8171598434150709, "language_loss": 0.75508714, "learning_rate": 1.5461754062472113e-06, "loss": 0.77634859, "num_input_tokens_seen": 209952055, "step": 9741, "time_per_iteration": 2.6550915241241455 }, { "auxiliary_loss_clip": 0.01085436, "auxiliary_loss_mlp": 0.01034709, "balance_loss_clip": 1.03900838, "balance_loss_mlp": 1.02109587, "epoch": 0.5857207274913573, "flos": 21686166846720.0, "grad_norm": 1.6487285096737663, "language_loss": 0.75935274, "learning_rate": 1.5457961122041959e-06, "loss": 0.78055418, "num_input_tokens_seen": 209971190, "step": 9742, "time_per_iteration": 4.381955146789551 }, { "auxiliary_loss_clip": 0.01098042, "auxiliary_loss_mlp": 0.01033792, "balance_loss_clip": 1.04340363, "balance_loss_mlp": 1.0209775, "epoch": 0.5857808507440253, "flos": 23182708118400.0, "grad_norm": 1.6035533638401356, "language_loss": 0.74864548, "learning_rate": 1.5454168353846369e-06, "loss": 0.76996386, "num_input_tokens_seen": 209990695, "step": 9743, "time_per_iteration": 5.72803258895874 }, { "auxiliary_loss_clip": 0.01098389, "auxiliary_loss_mlp": 0.01032176, "balance_loss_clip": 1.04424453, "balance_loss_mlp": 1.01949835, "epoch": 0.5858409739966932, "flos": 27235299156480.0, "grad_norm": 1.98808093933083, "language_loss": 0.81046313, "learning_rate": 1.5450375758029172e-06, "loss": 0.83176875, "num_input_tokens_seen": 210010210, "step": 9744, "time_per_iteration": 2.7265267372131348 }, { "auxiliary_loss_clip": 0.01094798, "auxiliary_loss_mlp": 0.01030607, "balance_loss_clip": 1.04087067, "balance_loss_mlp": 1.01669562, "epoch": 0.5859010972493612, "flos": 27855512317440.0, "grad_norm": 1.7065591540492446, "language_loss": 0.71426034, "learning_rate": 1.5446583334734183e-06, "loss": 0.73551434, "num_input_tokens_seen": 210030030, "step": 9745, "time_per_iteration": 2.737842082977295 }, { "auxiliary_loss_clip": 0.01023206, "auxiliary_loss_mlp": 0.01004158, "balance_loss_clip": 1.01973987, "balance_loss_mlp": 1.00301957, "epoch": 0.5859612205020291, "flos": 70007064428160.0, "grad_norm": 0.7272764484566879, "language_loss": 0.53267932, "learning_rate": 1.5442791084105204e-06, "loss": 0.552953, "num_input_tokens_seen": 210094840, "step": 9746, "time_per_iteration": 3.3027215003967285 }, { "auxiliary_loss_clip": 0.01094571, "auxiliary_loss_mlp": 0.01035687, "balance_loss_clip": 1.04237437, "balance_loss_mlp": 1.02163196, "epoch": 0.5860213437546972, "flos": 24056019486720.0, "grad_norm": 2.0261235602549466, "language_loss": 0.73138428, "learning_rate": 1.5438999006286054e-06, "loss": 0.75268686, "num_input_tokens_seen": 210114660, "step": 9747, "time_per_iteration": 4.224852085113525 }, { "auxiliary_loss_clip": 0.01092652, "auxiliary_loss_mlp": 0.01046673, "balance_loss_clip": 1.03909874, "balance_loss_mlp": 1.03123569, "epoch": 0.5860814670073651, "flos": 18947583141120.0, "grad_norm": 1.867050340664373, "language_loss": 0.81183696, "learning_rate": 1.543520710142051e-06, "loss": 0.83323026, "num_input_tokens_seen": 210132770, "step": 9748, "time_per_iteration": 2.6568126678466797 }, { "auxiliary_loss_clip": 0.01111974, "auxiliary_loss_mlp": 0.01038317, "balance_loss_clip": 1.04387689, "balance_loss_mlp": 1.0241785, "epoch": 0.5861415902600331, "flos": 22561848512640.0, "grad_norm": 1.7272716772059427, "language_loss": 0.72221619, "learning_rate": 1.5431415369652375e-06, "loss": 0.7437191, "num_input_tokens_seen": 210151895, "step": 9749, "time_per_iteration": 2.6895384788513184 }, { "auxiliary_loss_clip": 0.01101508, "auxiliary_loss_mlp": 0.01035837, "balance_loss_clip": 1.04664361, "balance_loss_mlp": 1.02205098, "epoch": 0.586201713512701, "flos": 14392027912320.0, "grad_norm": 2.592210537631562, "language_loss": 0.75040287, "learning_rate": 1.5427623811125428e-06, "loss": 0.77177632, "num_input_tokens_seen": 210168040, "step": 9750, "time_per_iteration": 2.737083911895752 }, { "auxiliary_loss_clip": 0.0108729, "auxiliary_loss_mlp": 0.01036704, "balance_loss_clip": 1.04378581, "balance_loss_mlp": 1.02202928, "epoch": 0.586261836765369, "flos": 19498560837120.0, "grad_norm": 1.8612157402372733, "language_loss": 0.70927167, "learning_rate": 1.542383242598344e-06, "loss": 0.73051161, "num_input_tokens_seen": 210187720, "step": 9751, "time_per_iteration": 2.7111241817474365 }, { "auxiliary_loss_clip": 0.01125805, "auxiliary_loss_mlp": 0.01043313, "balance_loss_clip": 1.04531717, "balance_loss_mlp": 1.02769637, "epoch": 0.5863219600180369, "flos": 20701819560960.0, "grad_norm": 1.7129799601344229, "language_loss": 0.74548101, "learning_rate": 1.5420041214370184e-06, "loss": 0.76717222, "num_input_tokens_seen": 210206080, "step": 9752, "time_per_iteration": 2.626716136932373 }, { "auxiliary_loss_clip": 0.01108046, "auxiliary_loss_mlp": 0.01031989, "balance_loss_clip": 1.04339004, "balance_loss_mlp": 1.01842308, "epoch": 0.586382083270705, "flos": 19792130693760.0, "grad_norm": 1.767262069370236, "language_loss": 0.77331054, "learning_rate": 1.541625017642943e-06, "loss": 0.79471087, "num_input_tokens_seen": 210225660, "step": 9753, "time_per_iteration": 2.6093239784240723 }, { "auxiliary_loss_clip": 0.01116295, "auxiliary_loss_mlp": 0.01029138, "balance_loss_clip": 1.04288065, "balance_loss_mlp": 1.01651943, "epoch": 0.5864422065233729, "flos": 16500558130560.0, "grad_norm": 1.6790243104766265, "language_loss": 0.70988512, "learning_rate": 1.5412459312304927e-06, "loss": 0.73133945, "num_input_tokens_seen": 210242725, "step": 9754, "time_per_iteration": 2.5604028701782227 }, { "auxiliary_loss_clip": 0.01095441, "auxiliary_loss_mlp": 0.01034082, "balance_loss_clip": 1.0401392, "balance_loss_mlp": 1.0194732, "epoch": 0.5865023297760409, "flos": 20413277608320.0, "grad_norm": 2.0857561604768065, "language_loss": 0.72379315, "learning_rate": 1.540866862214043e-06, "loss": 0.7450884, "num_input_tokens_seen": 210263225, "step": 9755, "time_per_iteration": 2.656785011291504 }, { "auxiliary_loss_clip": 0.01012678, "auxiliary_loss_mlp": 0.01004177, "balance_loss_clip": 1.01731849, "balance_loss_mlp": 1.00294328, "epoch": 0.5865624530287089, "flos": 63350769254400.0, "grad_norm": 0.7450356800362308, "language_loss": 0.56920898, "learning_rate": 1.540487810607967e-06, "loss": 0.58937752, "num_input_tokens_seen": 210322310, "step": 9756, "time_per_iteration": 3.2905054092407227 }, { "auxiliary_loss_clip": 0.01115752, "auxiliary_loss_mlp": 0.01031709, "balance_loss_clip": 1.04039788, "balance_loss_mlp": 1.01922202, "epoch": 0.5866225762813768, "flos": 27016279977600.0, "grad_norm": 11.015446509800649, "language_loss": 0.76104087, "learning_rate": 1.5401087764266396e-06, "loss": 0.78251553, "num_input_tokens_seen": 210340845, "step": 9757, "time_per_iteration": 2.6325418949127197 }, { "auxiliary_loss_clip": 0.01021435, "auxiliary_loss_mlp": 0.01009977, "balance_loss_clip": 1.01624918, "balance_loss_mlp": 1.00884426, "epoch": 0.5866826995340448, "flos": 72987038507520.0, "grad_norm": 0.8546616305193999, "language_loss": 0.60420328, "learning_rate": 1.5397297596844337e-06, "loss": 0.62451738, "num_input_tokens_seen": 210397815, "step": 9758, "time_per_iteration": 3.227780342102051 }, { "auxiliary_loss_clip": 0.0112535, "auxiliary_loss_mlp": 0.01036264, "balance_loss_clip": 1.0447619, "balance_loss_mlp": 1.02245307, "epoch": 0.5867428227867127, "flos": 21285727050240.0, "grad_norm": 2.191365428773927, "language_loss": 0.71787071, "learning_rate": 1.5393507603957212e-06, "loss": 0.73948681, "num_input_tokens_seen": 210413900, "step": 9759, "time_per_iteration": 2.593574047088623 }, { "auxiliary_loss_clip": 0.01096792, "auxiliary_loss_mlp": 0.0103787, "balance_loss_clip": 1.04106188, "balance_loss_mlp": 1.02525759, "epoch": 0.5868029460393808, "flos": 33468852188160.0, "grad_norm": 1.6194048366561686, "language_loss": 0.72730052, "learning_rate": 1.5389717785748742e-06, "loss": 0.74864709, "num_input_tokens_seen": 210434110, "step": 9760, "time_per_iteration": 2.7872965335845947 }, { "auxiliary_loss_clip": 0.01107006, "auxiliary_loss_mlp": 0.01032523, "balance_loss_clip": 1.04269731, "balance_loss_mlp": 1.01910627, "epoch": 0.5868630692920487, "flos": 17889475276800.0, "grad_norm": 1.9662195987833622, "language_loss": 0.72611898, "learning_rate": 1.5385928142362637e-06, "loss": 0.74751425, "num_input_tokens_seen": 210451685, "step": 9761, "time_per_iteration": 2.701533317565918 }, { "auxiliary_loss_clip": 0.01106159, "auxiliary_loss_mlp": 0.01036709, "balance_loss_clip": 1.04491735, "balance_loss_mlp": 1.02211809, "epoch": 0.5869231925447167, "flos": 21035035054080.0, "grad_norm": 1.7395731063260564, "language_loss": 0.75217378, "learning_rate": 1.5382138673942597e-06, "loss": 0.77360249, "num_input_tokens_seen": 210470825, "step": 9762, "time_per_iteration": 2.721714496612549 }, { "auxiliary_loss_clip": 0.01082216, "auxiliary_loss_mlp": 0.01036155, "balance_loss_clip": 1.03985929, "balance_loss_mlp": 1.02164149, "epoch": 0.5869833157973846, "flos": 74738219293440.0, "grad_norm": 4.660992958273475, "language_loss": 0.72322762, "learning_rate": 1.5378349380632317e-06, "loss": 0.74441129, "num_input_tokens_seen": 210500075, "step": 9763, "time_per_iteration": 3.1116628646850586 }, { "auxiliary_loss_clip": 0.01101878, "auxiliary_loss_mlp": 0.01034613, "balance_loss_clip": 1.03773355, "balance_loss_mlp": 1.02203679, "epoch": 0.5870434390500526, "flos": 17638998762240.0, "grad_norm": 1.815727939349207, "language_loss": 0.80352604, "learning_rate": 1.53745602625755e-06, "loss": 0.82489097, "num_input_tokens_seen": 210518150, "step": 9764, "time_per_iteration": 2.682579278945923 }, { "auxiliary_loss_clip": 0.01091583, "auxiliary_loss_mlp": 0.01034941, "balance_loss_clip": 1.04217017, "balance_loss_mlp": 1.02132726, "epoch": 0.5871035623027205, "flos": 21506146859520.0, "grad_norm": 1.83004906571999, "language_loss": 0.79265928, "learning_rate": 1.5370771319915819e-06, "loss": 0.81392443, "num_input_tokens_seen": 210537760, "step": 9765, "time_per_iteration": 2.6972546577453613 }, { "auxiliary_loss_clip": 0.01088979, "auxiliary_loss_mlp": 0.01039927, "balance_loss_clip": 1.04256606, "balance_loss_mlp": 1.02595556, "epoch": 0.5871636855553886, "flos": 13551861818880.0, "grad_norm": 1.76294195099967, "language_loss": 0.83693898, "learning_rate": 1.5366982552796947e-06, "loss": 0.85822797, "num_input_tokens_seen": 210555515, "step": 9766, "time_per_iteration": 2.7466630935668945 }, { "auxiliary_loss_clip": 0.01111118, "auxiliary_loss_mlp": 0.01037087, "balance_loss_clip": 1.04195547, "balance_loss_mlp": 1.02393794, "epoch": 0.5872238088080565, "flos": 26212922346240.0, "grad_norm": 1.5937380342892973, "language_loss": 0.6981988, "learning_rate": 1.536319396136257e-06, "loss": 0.71968091, "num_input_tokens_seen": 210575000, "step": 9767, "time_per_iteration": 2.6740965843200684 }, { "auxiliary_loss_clip": 0.0110439, "auxiliary_loss_mlp": 0.0077267, "balance_loss_clip": 1.04049277, "balance_loss_mlp": 1.00008368, "epoch": 0.5872839320607245, "flos": 30665198995200.0, "grad_norm": 2.1136221747138095, "language_loss": 0.6360091, "learning_rate": 1.5359405545756336e-06, "loss": 0.65477967, "num_input_tokens_seen": 210595185, "step": 9768, "time_per_iteration": 2.7575178146362305 }, { "auxiliary_loss_clip": 0.01037412, "auxiliary_loss_mlp": 0.00751529, "balance_loss_clip": 1.01318574, "balance_loss_mlp": 0.99987358, "epoch": 0.5873440553133924, "flos": 60303570871680.0, "grad_norm": 0.7223687744232398, "language_loss": 0.53866827, "learning_rate": 1.5355617306121914e-06, "loss": 0.55655766, "num_input_tokens_seen": 210653210, "step": 9769, "time_per_iteration": 3.1609816551208496 }, { "auxiliary_loss_clip": 0.01084812, "auxiliary_loss_mlp": 0.01042021, "balance_loss_clip": 1.03922772, "balance_loss_mlp": 1.02880073, "epoch": 0.5874041785660604, "flos": 21539292134400.0, "grad_norm": 1.4066762666706196, "language_loss": 0.70984697, "learning_rate": 1.5351829242602945e-06, "loss": 0.73111528, "num_input_tokens_seen": 210673750, "step": 9770, "time_per_iteration": 2.7312963008880615 }, { "auxiliary_loss_clip": 0.01073411, "auxiliary_loss_mlp": 0.01035898, "balance_loss_clip": 1.0386194, "balance_loss_mlp": 1.02226591, "epoch": 0.5874643018187284, "flos": 24388947671040.0, "grad_norm": 1.7359405395861034, "language_loss": 0.681171, "learning_rate": 1.5348041355343077e-06, "loss": 0.70226407, "num_input_tokens_seen": 210692960, "step": 9771, "time_per_iteration": 2.7748193740844727 }, { "auxiliary_loss_clip": 0.01072231, "auxiliary_loss_mlp": 0.01041976, "balance_loss_clip": 1.03671551, "balance_loss_mlp": 1.02564466, "epoch": 0.5875244250713964, "flos": 28147717457280.0, "grad_norm": 1.5217173137024316, "language_loss": 0.661672, "learning_rate": 1.5344253644485954e-06, "loss": 0.68281412, "num_input_tokens_seen": 210714040, "step": 9772, "time_per_iteration": 2.841942071914673 }, { "auxiliary_loss_clip": 0.01124952, "auxiliary_loss_mlp": 0.01044932, "balance_loss_clip": 1.045434, "balance_loss_mlp": 1.03047216, "epoch": 0.5875845483240644, "flos": 25812410722560.0, "grad_norm": 1.4922365157265927, "language_loss": 0.74535245, "learning_rate": 1.534046611017519e-06, "loss": 0.76705128, "num_input_tokens_seen": 210733710, "step": 9773, "time_per_iteration": 2.6284871101379395 }, { "auxiliary_loss_clip": 0.01087977, "auxiliary_loss_mlp": 0.0104147, "balance_loss_clip": 1.04292727, "balance_loss_mlp": 1.02706945, "epoch": 0.5876446715767323, "flos": 26906572863360.0, "grad_norm": 1.947316209295704, "language_loss": 0.52915788, "learning_rate": 1.5336678752554421e-06, "loss": 0.55045235, "num_input_tokens_seen": 210753580, "step": 9774, "time_per_iteration": 2.7891509532928467 }, { "auxiliary_loss_clip": 0.01113387, "auxiliary_loss_mlp": 0.01039669, "balance_loss_clip": 1.04437912, "balance_loss_mlp": 1.02526808, "epoch": 0.5877047948294003, "flos": 36684832579200.0, "grad_norm": 2.3607783176851824, "language_loss": 0.64713901, "learning_rate": 1.5332891571767264e-06, "loss": 0.66866958, "num_input_tokens_seen": 210773495, "step": 9775, "time_per_iteration": 2.771148920059204 }, { "auxiliary_loss_clip": 0.01105141, "auxiliary_loss_mlp": 0.01036995, "balance_loss_clip": 1.04033184, "balance_loss_mlp": 1.02344131, "epoch": 0.5877649180820682, "flos": 26724721282560.0, "grad_norm": 1.636403069820384, "language_loss": 0.73844278, "learning_rate": 1.5329104567957326e-06, "loss": 0.75986409, "num_input_tokens_seen": 210793645, "step": 9776, "time_per_iteration": 2.690695285797119 }, { "auxiliary_loss_clip": 0.01119488, "auxiliary_loss_mlp": 0.01039689, "balance_loss_clip": 1.0420121, "balance_loss_mlp": 1.0264504, "epoch": 0.5878250413347362, "flos": 21032197879680.0, "grad_norm": 1.5421458331894318, "language_loss": 0.73914766, "learning_rate": 1.532531774126821e-06, "loss": 0.76073945, "num_input_tokens_seen": 210813415, "step": 9777, "time_per_iteration": 2.6284945011138916 }, { "auxiliary_loss_clip": 0.01083567, "auxiliary_loss_mlp": 0.01038914, "balance_loss_clip": 1.04067087, "balance_loss_mlp": 1.02573574, "epoch": 0.5878851645874041, "flos": 25484259047040.0, "grad_norm": 1.8412101918270336, "language_loss": 0.74325955, "learning_rate": 1.5321531091843512e-06, "loss": 0.76448435, "num_input_tokens_seen": 210833850, "step": 9778, "time_per_iteration": 2.7255308628082275 }, { "auxiliary_loss_clip": 0.01072977, "auxiliary_loss_mlp": 0.01040231, "balance_loss_clip": 1.03567362, "balance_loss_mlp": 1.0246737, "epoch": 0.5879452878400722, "flos": 23769129559680.0, "grad_norm": 1.8337946976743424, "language_loss": 0.70162809, "learning_rate": 1.5317744619826824e-06, "loss": 0.72276014, "num_input_tokens_seen": 210853115, "step": 9779, "time_per_iteration": 2.715529680252075 }, { "auxiliary_loss_clip": 0.01121839, "auxiliary_loss_mlp": 0.00771635, "balance_loss_clip": 1.04201186, "balance_loss_mlp": 1.00009024, "epoch": 0.5880054110927401, "flos": 17824513530240.0, "grad_norm": 2.202026224542238, "language_loss": 0.66388619, "learning_rate": 1.5313958325361727e-06, "loss": 0.68282098, "num_input_tokens_seen": 210872090, "step": 9780, "time_per_iteration": 2.628286361694336 }, { "auxiliary_loss_clip": 0.01091434, "auxiliary_loss_mlp": 0.01038369, "balance_loss_clip": 1.04466867, "balance_loss_mlp": 1.02406991, "epoch": 0.5880655343454081, "flos": 19463404400640.0, "grad_norm": 1.8753551233884636, "language_loss": 0.72474289, "learning_rate": 1.5310172208591807e-06, "loss": 0.74604088, "num_input_tokens_seen": 210888490, "step": 9781, "time_per_iteration": 4.2804930210113525 }, { "auxiliary_loss_clip": 0.01092565, "auxiliary_loss_mlp": 0.00771373, "balance_loss_clip": 1.04225159, "balance_loss_mlp": 1.00005984, "epoch": 0.588125657598076, "flos": 21397588980480.0, "grad_norm": 1.5003005055277707, "language_loss": 0.70744377, "learning_rate": 1.5306386269660622e-06, "loss": 0.72608316, "num_input_tokens_seen": 210908220, "step": 9782, "time_per_iteration": 4.278367519378662 }, { "auxiliary_loss_clip": 0.01105689, "auxiliary_loss_mlp": 0.01041859, "balance_loss_clip": 1.03929675, "balance_loss_mlp": 1.02716005, "epoch": 0.588185780850744, "flos": 16034653797120.0, "grad_norm": 2.093864455539888, "language_loss": 0.70450729, "learning_rate": 1.5302600508711741e-06, "loss": 0.72598279, "num_input_tokens_seen": 210923945, "step": 9783, "time_per_iteration": 4.194809436798096 }, { "auxiliary_loss_clip": 0.01085302, "auxiliary_loss_mlp": 0.01036158, "balance_loss_clip": 1.04440248, "balance_loss_mlp": 1.02117932, "epoch": 0.588245904103412, "flos": 23728226947200.0, "grad_norm": 2.1947417455944653, "language_loss": 0.69071788, "learning_rate": 1.5298814925888719e-06, "loss": 0.71193242, "num_input_tokens_seen": 210941955, "step": 9784, "time_per_iteration": 2.7187066078186035 }, { "auxiliary_loss_clip": 0.01072816, "auxiliary_loss_mlp": 0.01034537, "balance_loss_clip": 1.03863633, "balance_loss_mlp": 1.02094078, "epoch": 0.58830602735608, "flos": 33802534558080.0, "grad_norm": 24.973572945721454, "language_loss": 0.69460654, "learning_rate": 1.5295029521335102e-06, "loss": 0.71568, "num_input_tokens_seen": 210963105, "step": 9785, "time_per_iteration": 2.878143548965454 }, { "auxiliary_loss_clip": 0.01107899, "auxiliary_loss_mlp": 0.01029541, "balance_loss_clip": 1.04268789, "balance_loss_mlp": 1.01706553, "epoch": 0.588366150608748, "flos": 17090714586240.0, "grad_norm": 1.9508012380203874, "language_loss": 0.77078086, "learning_rate": 1.5291244295194448e-06, "loss": 0.79215527, "num_input_tokens_seen": 210978720, "step": 9786, "time_per_iteration": 2.6095898151397705 }, { "auxiliary_loss_clip": 0.01101968, "auxiliary_loss_mlp": 0.01029534, "balance_loss_clip": 1.04132032, "balance_loss_mlp": 1.01609302, "epoch": 0.5884262738614159, "flos": 22127186033280.0, "grad_norm": 1.4529797212559594, "language_loss": 0.79197991, "learning_rate": 1.5287459247610276e-06, "loss": 0.81329501, "num_input_tokens_seen": 210998750, "step": 9787, "time_per_iteration": 4.223788261413574 }, { "auxiliary_loss_clip": 0.01081001, "auxiliary_loss_mlp": 0.01036004, "balance_loss_clip": 1.04142892, "balance_loss_mlp": 1.02382052, "epoch": 0.5884863971140839, "flos": 21031838743680.0, "grad_norm": 2.5032495709629186, "language_loss": 0.6604932, "learning_rate": 1.5283674378726116e-06, "loss": 0.68166327, "num_input_tokens_seen": 211017550, "step": 9788, "time_per_iteration": 2.770289659500122 }, { "auxiliary_loss_clip": 0.01089935, "auxiliary_loss_mlp": 0.01038801, "balance_loss_clip": 1.04031539, "balance_loss_mlp": 1.02356613, "epoch": 0.5885465203667518, "flos": 23805112008960.0, "grad_norm": 2.4491161231159495, "language_loss": 0.80353689, "learning_rate": 1.5279889688685506e-06, "loss": 0.82482433, "num_input_tokens_seen": 211034135, "step": 9789, "time_per_iteration": 2.7129344940185547 }, { "auxiliary_loss_clip": 0.01088956, "auxiliary_loss_mlp": 0.00771498, "balance_loss_clip": 1.04013371, "balance_loss_mlp": 0.99999416, "epoch": 0.5886066436194198, "flos": 18880574319360.0, "grad_norm": 1.8752240370073765, "language_loss": 0.7074194, "learning_rate": 1.5276105177631944e-06, "loss": 0.72602391, "num_input_tokens_seen": 211053850, "step": 9790, "time_per_iteration": 2.7234628200531006 }, { "auxiliary_loss_clip": 0.01082257, "auxiliary_loss_mlp": 0.01034309, "balance_loss_clip": 1.04143536, "balance_loss_mlp": 1.02096915, "epoch": 0.5886667668720877, "flos": 24790141653120.0, "grad_norm": 1.7147674530197825, "language_loss": 0.83315635, "learning_rate": 1.527232084570895e-06, "loss": 0.85432208, "num_input_tokens_seen": 211072165, "step": 9791, "time_per_iteration": 2.711566686630249 }, { "auxiliary_loss_clip": 0.0110606, "auxiliary_loss_mlp": 0.01044469, "balance_loss_clip": 1.04232645, "balance_loss_mlp": 1.0296278, "epoch": 0.5887268901247558, "flos": 21614381516160.0, "grad_norm": 1.5737373299770356, "language_loss": 0.7653091, "learning_rate": 1.5268536693060026e-06, "loss": 0.78681433, "num_input_tokens_seen": 211089630, "step": 9792, "time_per_iteration": 2.634300947189331 }, { "auxiliary_loss_clip": 0.0105802, "auxiliary_loss_mlp": 0.01047083, "balance_loss_clip": 1.03111851, "balance_loss_mlp": 1.03123975, "epoch": 0.5887870133774237, "flos": 20481722974080.0, "grad_norm": 2.6665803472381935, "language_loss": 0.68956935, "learning_rate": 1.5264752719828662e-06, "loss": 0.7106204, "num_input_tokens_seen": 211106120, "step": 9793, "time_per_iteration": 2.7154650688171387 }, { "auxiliary_loss_clip": 0.01116924, "auxiliary_loss_mlp": 0.01033012, "balance_loss_clip": 1.04252207, "balance_loss_mlp": 1.01923692, "epoch": 0.5888471366300917, "flos": 19206283870080.0, "grad_norm": 1.9062241907170245, "language_loss": 0.60218275, "learning_rate": 1.5260968926158353e-06, "loss": 0.62368208, "num_input_tokens_seen": 211122450, "step": 9794, "time_per_iteration": 2.584721088409424 }, { "auxiliary_loss_clip": 0.01087928, "auxiliary_loss_mlp": 0.01038963, "balance_loss_clip": 1.04045248, "balance_loss_mlp": 1.0248251, "epoch": 0.5889072598827596, "flos": 19972904866560.0, "grad_norm": 1.5367259931320274, "language_loss": 0.65087652, "learning_rate": 1.525718531219257e-06, "loss": 0.67214543, "num_input_tokens_seen": 211141765, "step": 9795, "time_per_iteration": 2.6578221321105957 }, { "auxiliary_loss_clip": 0.01080946, "auxiliary_loss_mlp": 0.01041808, "balance_loss_clip": 1.03947282, "balance_loss_mlp": 1.02942848, "epoch": 0.5889673831354276, "flos": 20741249715840.0, "grad_norm": 1.5439612087123358, "language_loss": 0.74185097, "learning_rate": 1.5253401878074801e-06, "loss": 0.76307845, "num_input_tokens_seen": 211160475, "step": 9796, "time_per_iteration": 2.7106168270111084 }, { "auxiliary_loss_clip": 0.01094109, "auxiliary_loss_mlp": 0.01035258, "balance_loss_clip": 1.03922713, "balance_loss_mlp": 1.02194858, "epoch": 0.5890275063880956, "flos": 25300935008640.0, "grad_norm": 1.398085740010997, "language_loss": 0.82796204, "learning_rate": 1.5249618623948507e-06, "loss": 0.84925568, "num_input_tokens_seen": 211180480, "step": 9797, "time_per_iteration": 2.7226924896240234 }, { "auxiliary_loss_clip": 0.01089451, "auxiliary_loss_mlp": 0.01032137, "balance_loss_clip": 1.03643203, "balance_loss_mlp": 1.01857686, "epoch": 0.5890876296407636, "flos": 11765377964160.0, "grad_norm": 2.441249596431382, "language_loss": 0.792216, "learning_rate": 1.5245835549957152e-06, "loss": 0.81343186, "num_input_tokens_seen": 211198000, "step": 9798, "time_per_iteration": 2.661177396774292 }, { "auxiliary_loss_clip": 0.01116784, "auxiliary_loss_mlp": 0.01033567, "balance_loss_clip": 1.04251814, "balance_loss_mlp": 1.02085924, "epoch": 0.5891477528934316, "flos": 13589460380160.0, "grad_norm": 4.031600606780585, "language_loss": 0.74594498, "learning_rate": 1.5242052656244186e-06, "loss": 0.76744843, "num_input_tokens_seen": 211214765, "step": 9799, "time_per_iteration": 2.597598075866699 }, { "auxiliary_loss_clip": 0.0108372, "auxiliary_loss_mlp": 0.01033117, "balance_loss_clip": 1.03822446, "balance_loss_mlp": 1.01848447, "epoch": 0.5892078761460995, "flos": 15049193189760.0, "grad_norm": 1.9844034954522878, "language_loss": 0.7639305, "learning_rate": 1.5238269942953064e-06, "loss": 0.78509891, "num_input_tokens_seen": 211232335, "step": 9800, "time_per_iteration": 2.6959407329559326 }, { "auxiliary_loss_clip": 0.01068975, "auxiliary_loss_mlp": 0.01043567, "balance_loss_clip": 1.03649104, "balance_loss_mlp": 1.02863002, "epoch": 0.5892679993987675, "flos": 15778215624960.0, "grad_norm": 2.091540130493628, "language_loss": 0.78984964, "learning_rate": 1.523448741022722e-06, "loss": 0.81097507, "num_input_tokens_seen": 211249985, "step": 9801, "time_per_iteration": 2.7329885959625244 }, { "auxiliary_loss_clip": 0.01084752, "auxiliary_loss_mlp": 0.01033442, "balance_loss_clip": 1.04138374, "balance_loss_mlp": 1.01958394, "epoch": 0.5893281226514354, "flos": 25265203954560.0, "grad_norm": 1.6724920210450809, "language_loss": 0.66076094, "learning_rate": 1.5230705058210088e-06, "loss": 0.68194282, "num_input_tokens_seen": 211268425, "step": 9802, "time_per_iteration": 2.9191880226135254 }, { "auxiliary_loss_clip": 0.01106682, "auxiliary_loss_mlp": 0.01030935, "balance_loss_clip": 1.04172468, "balance_loss_mlp": 1.01782823, "epoch": 0.5893882459041034, "flos": 19458232842240.0, "grad_norm": 1.576394450599596, "language_loss": 0.78281248, "learning_rate": 1.5226922887045108e-06, "loss": 0.80418861, "num_input_tokens_seen": 211286680, "step": 9803, "time_per_iteration": 2.6395671367645264 }, { "auxiliary_loss_clip": 0.01110111, "auxiliary_loss_mlp": 0.01036458, "balance_loss_clip": 1.04354095, "balance_loss_mlp": 1.0227071, "epoch": 0.5894483691567713, "flos": 20634056553600.0, "grad_norm": 1.421228889325947, "language_loss": 0.73083454, "learning_rate": 1.5223140896875686e-06, "loss": 0.75230026, "num_input_tokens_seen": 211307700, "step": 9804, "time_per_iteration": 2.7451324462890625 }, { "auxiliary_loss_clip": 0.01091882, "auxiliary_loss_mlp": 0.01030959, "balance_loss_clip": 1.04156733, "balance_loss_mlp": 1.01769745, "epoch": 0.5895084924094394, "flos": 17778223877760.0, "grad_norm": 1.6374324136970364, "language_loss": 0.74669635, "learning_rate": 1.5219359087845234e-06, "loss": 0.76792479, "num_input_tokens_seen": 211324835, "step": 9805, "time_per_iteration": 2.6853296756744385 }, { "auxiliary_loss_clip": 0.01113863, "auxiliary_loss_mlp": 0.00772031, "balance_loss_clip": 1.04102564, "balance_loss_mlp": 1.00008976, "epoch": 0.5895686156621073, "flos": 20121072468480.0, "grad_norm": 1.677515475610003, "language_loss": 0.77973545, "learning_rate": 1.5215577460097174e-06, "loss": 0.79859436, "num_input_tokens_seen": 211344130, "step": 9806, "time_per_iteration": 2.6450774669647217 }, { "auxiliary_loss_clip": 0.01117555, "auxiliary_loss_mlp": 0.01031595, "balance_loss_clip": 1.0410825, "balance_loss_mlp": 1.01801682, "epoch": 0.5896287389147753, "flos": 20850058990080.0, "grad_norm": 1.7162663032269994, "language_loss": 0.76973009, "learning_rate": 1.5211796013774887e-06, "loss": 0.79122162, "num_input_tokens_seen": 211362915, "step": 9807, "time_per_iteration": 2.5557191371917725 }, { "auxiliary_loss_clip": 0.01115136, "auxiliary_loss_mlp": 0.01029659, "balance_loss_clip": 1.04593015, "balance_loss_mlp": 1.01563966, "epoch": 0.5896888621674432, "flos": 14537897043840.0, "grad_norm": 1.9630689597763404, "language_loss": 0.74407029, "learning_rate": 1.5208014749021786e-06, "loss": 0.76551819, "num_input_tokens_seen": 211380700, "step": 9808, "time_per_iteration": 2.649773359298706 }, { "auxiliary_loss_clip": 0.01069554, "auxiliary_loss_mlp": 0.01030007, "balance_loss_clip": 1.03687882, "balance_loss_mlp": 1.01540375, "epoch": 0.5897489854201112, "flos": 20886759711360.0, "grad_norm": 2.8224307817464194, "language_loss": 0.72173887, "learning_rate": 1.5204233665981236e-06, "loss": 0.74273449, "num_input_tokens_seen": 211400095, "step": 9809, "time_per_iteration": 2.8795154094696045 }, { "auxiliary_loss_clip": 0.01097105, "auxiliary_loss_mlp": 0.01035609, "balance_loss_clip": 1.03962231, "balance_loss_mlp": 1.02155459, "epoch": 0.5898091086727792, "flos": 20011149872640.0, "grad_norm": 1.9654509433248524, "language_loss": 0.82251418, "learning_rate": 1.5200452764796627e-06, "loss": 0.84384131, "num_input_tokens_seen": 211417810, "step": 9810, "time_per_iteration": 2.7300972938537598 }, { "auxiliary_loss_clip": 0.01108515, "auxiliary_loss_mlp": 0.01035386, "balance_loss_clip": 1.04266787, "balance_loss_mlp": 1.02213001, "epoch": 0.5898692319254472, "flos": 16253242012800.0, "grad_norm": 2.8325616643541043, "language_loss": 0.80945516, "learning_rate": 1.5196672045611336e-06, "loss": 0.83089411, "num_input_tokens_seen": 211436020, "step": 9811, "time_per_iteration": 2.6033973693847656 }, { "auxiliary_loss_clip": 0.01114433, "auxiliary_loss_mlp": 0.01031742, "balance_loss_clip": 1.04528522, "balance_loss_mlp": 1.01666236, "epoch": 0.5899293551781152, "flos": 20448541785600.0, "grad_norm": 2.9067717634400174, "language_loss": 0.77026772, "learning_rate": 1.5192891508568715e-06, "loss": 0.79172945, "num_input_tokens_seen": 211454335, "step": 9812, "time_per_iteration": 2.6283788681030273 }, { "auxiliary_loss_clip": 0.01085179, "auxiliary_loss_mlp": 0.01030145, "balance_loss_clip": 1.04126084, "balance_loss_mlp": 1.01832533, "epoch": 0.5899894784307831, "flos": 13881701433600.0, "grad_norm": 2.0160065726104426, "language_loss": 0.70596051, "learning_rate": 1.5189111153812133e-06, "loss": 0.72711378, "num_input_tokens_seen": 211472775, "step": 9813, "time_per_iteration": 2.7235190868377686 }, { "auxiliary_loss_clip": 0.01094818, "auxiliary_loss_mlp": 0.01038761, "balance_loss_clip": 1.04338694, "balance_loss_mlp": 1.02489126, "epoch": 0.5900496016834511, "flos": 20083797129600.0, "grad_norm": 10.075807478503481, "language_loss": 0.72172022, "learning_rate": 1.518533098148494e-06, "loss": 0.74305606, "num_input_tokens_seen": 211492195, "step": 9814, "time_per_iteration": 2.7245450019836426 }, { "auxiliary_loss_clip": 0.01093647, "auxiliary_loss_mlp": 0.01037117, "balance_loss_clip": 1.04272461, "balance_loss_mlp": 1.02348518, "epoch": 0.590109724936119, "flos": 20259148348800.0, "grad_norm": 1.7959189057174523, "language_loss": 0.78608483, "learning_rate": 1.5181550991730476e-06, "loss": 0.80739248, "num_input_tokens_seen": 211510220, "step": 9815, "time_per_iteration": 2.624587297439575 }, { "auxiliary_loss_clip": 0.0109595, "auxiliary_loss_mlp": 0.0077231, "balance_loss_clip": 1.04222631, "balance_loss_mlp": 1.00011277, "epoch": 0.590169848188787, "flos": 24235069806720.0, "grad_norm": 1.934955250523914, "language_loss": 0.75605524, "learning_rate": 1.5177771184692083e-06, "loss": 0.77473778, "num_input_tokens_seen": 211526260, "step": 9816, "time_per_iteration": 2.805889844894409 }, { "auxiliary_loss_clip": 0.01120987, "auxiliary_loss_mlp": 0.01039982, "balance_loss_clip": 1.04457593, "balance_loss_mlp": 1.02636814, "epoch": 0.590229971441455, "flos": 17784724239360.0, "grad_norm": 1.761702620923252, "language_loss": 0.81330854, "learning_rate": 1.517399156051309e-06, "loss": 0.8349182, "num_input_tokens_seen": 211542890, "step": 9817, "time_per_iteration": 2.5694470405578613 }, { "auxiliary_loss_clip": 0.0106411, "auxiliary_loss_mlp": 0.01046757, "balance_loss_clip": 1.03651428, "balance_loss_mlp": 1.03204691, "epoch": 0.590290094694123, "flos": 22236893147520.0, "grad_norm": 1.6227389463072333, "language_loss": 0.7634322, "learning_rate": 1.517021211933682e-06, "loss": 0.78454089, "num_input_tokens_seen": 211562685, "step": 9818, "time_per_iteration": 2.7369279861450195 }, { "auxiliary_loss_clip": 0.0108334, "auxiliary_loss_mlp": 0.01037737, "balance_loss_clip": 1.04248178, "balance_loss_mlp": 1.02451682, "epoch": 0.5903502179467909, "flos": 19098623831040.0, "grad_norm": 2.2508579930127333, "language_loss": 0.66751575, "learning_rate": 1.5166432861306592e-06, "loss": 0.68872648, "num_input_tokens_seen": 211579960, "step": 9819, "time_per_iteration": 2.683518648147583 }, { "auxiliary_loss_clip": 0.01121974, "auxiliary_loss_mlp": 0.01034215, "balance_loss_clip": 1.04451931, "balance_loss_mlp": 1.02100039, "epoch": 0.5904103411994589, "flos": 24235500769920.0, "grad_norm": 1.5861802995785013, "language_loss": 0.78221858, "learning_rate": 1.5162653786565714e-06, "loss": 0.80378044, "num_input_tokens_seen": 211599310, "step": 9820, "time_per_iteration": 2.67228627204895 }, { "auxiliary_loss_clip": 0.01010393, "auxiliary_loss_mlp": 0.01023264, "balance_loss_clip": 1.01880515, "balance_loss_mlp": 1.02123773, "epoch": 0.5904704644521268, "flos": 64876613045760.0, "grad_norm": 0.9671648573222682, "language_loss": 0.65189892, "learning_rate": 1.5158874895257487e-06, "loss": 0.67223543, "num_input_tokens_seen": 211658790, "step": 9821, "time_per_iteration": 4.79486083984375 }, { "auxiliary_loss_clip": 0.01079974, "auxiliary_loss_mlp": 0.01038386, "balance_loss_clip": 1.04072082, "balance_loss_mlp": 1.0247488, "epoch": 0.5905305877047948, "flos": 19609991804160.0, "grad_norm": 1.8549459171527238, "language_loss": 0.61307114, "learning_rate": 1.515509618752521e-06, "loss": 0.63425475, "num_input_tokens_seen": 211677240, "step": 9822, "time_per_iteration": 5.756153345108032 }, { "auxiliary_loss_clip": 0.01122858, "auxiliary_loss_mlp": 0.01041517, "balance_loss_clip": 1.04382062, "balance_loss_mlp": 1.02788556, "epoch": 0.5905907109574628, "flos": 18989634988800.0, "grad_norm": 2.151764899841445, "language_loss": 0.82442653, "learning_rate": 1.5151317663512173e-06, "loss": 0.84607029, "num_input_tokens_seen": 211695485, "step": 9823, "time_per_iteration": 2.6660759449005127 }, { "auxiliary_loss_clip": 0.01098497, "auxiliary_loss_mlp": 0.01032384, "balance_loss_clip": 1.04229546, "balance_loss_mlp": 1.0183413, "epoch": 0.5906508342101308, "flos": 22200407907840.0, "grad_norm": 1.984006151976339, "language_loss": 0.72755098, "learning_rate": 1.514753932336165e-06, "loss": 0.74885976, "num_input_tokens_seen": 211713090, "step": 9824, "time_per_iteration": 2.679081439971924 }, { "auxiliary_loss_clip": 0.01095276, "auxiliary_loss_mlp": 0.00772718, "balance_loss_clip": 1.04067087, "balance_loss_mlp": 1.00008035, "epoch": 0.5907109574627988, "flos": 20886687884160.0, "grad_norm": 2.158910240340413, "language_loss": 0.82870126, "learning_rate": 1.514376116721693e-06, "loss": 0.84738123, "num_input_tokens_seen": 211732510, "step": 9825, "time_per_iteration": 2.719106674194336 }, { "auxiliary_loss_clip": 0.0110445, "auxiliary_loss_mlp": 0.01034274, "balance_loss_clip": 1.04120886, "balance_loss_mlp": 1.02252591, "epoch": 0.5907710807154667, "flos": 21506649649920.0, "grad_norm": 1.7542204465206233, "language_loss": 0.76779485, "learning_rate": 1.5139983195221272e-06, "loss": 0.78918207, "num_input_tokens_seen": 211748695, "step": 9826, "time_per_iteration": 4.231219291687012 }, { "auxiliary_loss_clip": 0.01094981, "auxiliary_loss_mlp": 0.01031253, "balance_loss_clip": 1.04213846, "balance_loss_mlp": 1.01828933, "epoch": 0.5908312039681347, "flos": 22018376759040.0, "grad_norm": 1.9593281360323977, "language_loss": 0.72049826, "learning_rate": 1.513620540751793e-06, "loss": 0.74176061, "num_input_tokens_seen": 211768545, "step": 9827, "time_per_iteration": 2.654449462890625 }, { "auxiliary_loss_clip": 0.01073518, "auxiliary_loss_mlp": 0.010335, "balance_loss_clip": 1.03849053, "balance_loss_mlp": 1.02111387, "epoch": 0.5908913272208026, "flos": 18479523991680.0, "grad_norm": 1.6640399072146284, "language_loss": 0.79552126, "learning_rate": 1.5132427804250178e-06, "loss": 0.8165915, "num_input_tokens_seen": 211786665, "step": 9828, "time_per_iteration": 2.8060965538024902 }, { "auxiliary_loss_clip": 0.01065495, "auxiliary_loss_mlp": 0.01038324, "balance_loss_clip": 1.04091191, "balance_loss_mlp": 1.02510321, "epoch": 0.5909514504734706, "flos": 12312189682560.0, "grad_norm": 1.8739746775685384, "language_loss": 0.88231647, "learning_rate": 1.5128650385561241e-06, "loss": 0.90335464, "num_input_tokens_seen": 211801215, "step": 9829, "time_per_iteration": 2.819425106048584 }, { "auxiliary_loss_clip": 0.01023107, "auxiliary_loss_mlp": 0.01007549, "balance_loss_clip": 1.01821566, "balance_loss_mlp": 1.00632119, "epoch": 0.5910115737261386, "flos": 70213262451840.0, "grad_norm": 0.7698473487867592, "language_loss": 0.57849222, "learning_rate": 1.5124873151594376e-06, "loss": 0.59879881, "num_input_tokens_seen": 211857005, "step": 9830, "time_per_iteration": 3.1567955017089844 }, { "auxiliary_loss_clip": 0.01114755, "auxiliary_loss_mlp": 0.00772402, "balance_loss_clip": 1.04340577, "balance_loss_mlp": 1.0002377, "epoch": 0.5910716969788066, "flos": 22017766227840.0, "grad_norm": 2.1363303387386723, "language_loss": 0.75768107, "learning_rate": 1.5121096102492812e-06, "loss": 0.77655268, "num_input_tokens_seen": 211876675, "step": 9831, "time_per_iteration": 2.7048380374908447 }, { "auxiliary_loss_clip": 0.01089263, "auxiliary_loss_mlp": 0.01034604, "balance_loss_clip": 1.04322839, "balance_loss_mlp": 1.02142549, "epoch": 0.5911318202314745, "flos": 21251648021760.0, "grad_norm": 1.6552693507472749, "language_loss": 0.77847427, "learning_rate": 1.5117319238399767e-06, "loss": 0.79971302, "num_input_tokens_seen": 211895725, "step": 9832, "time_per_iteration": 2.716529369354248 }, { "auxiliary_loss_clip": 0.01105775, "auxiliary_loss_mlp": 0.01031029, "balance_loss_clip": 1.04159164, "balance_loss_mlp": 1.01780295, "epoch": 0.5911919434841425, "flos": 17821604528640.0, "grad_norm": 1.9563179904860062, "language_loss": 0.83245647, "learning_rate": 1.511354255945847e-06, "loss": 0.8538245, "num_input_tokens_seen": 211913860, "step": 9833, "time_per_iteration": 2.641958236694336 }, { "auxiliary_loss_clip": 0.0110771, "auxiliary_loss_mlp": 0.01038041, "balance_loss_clip": 1.04046118, "balance_loss_mlp": 1.02435589, "epoch": 0.5912520667368104, "flos": 20374781207040.0, "grad_norm": 1.5336556134798032, "language_loss": 0.74267918, "learning_rate": 1.5109766065812123e-06, "loss": 0.76413667, "num_input_tokens_seen": 211932880, "step": 9834, "time_per_iteration": 2.628453016281128 }, { "auxiliary_loss_clip": 0.01119479, "auxiliary_loss_mlp": 0.01034016, "balance_loss_clip": 1.04244208, "balance_loss_mlp": 1.02121329, "epoch": 0.5913121899894784, "flos": 17930557457280.0, "grad_norm": 2.771797648904754, "language_loss": 0.78298235, "learning_rate": 1.5105989757603942e-06, "loss": 0.80451727, "num_input_tokens_seen": 211948625, "step": 9835, "time_per_iteration": 2.5689404010772705 }, { "auxiliary_loss_clip": 0.01095655, "auxiliary_loss_mlp": 0.01036768, "balance_loss_clip": 1.03806067, "balance_loss_mlp": 1.0237323, "epoch": 0.5913723132421465, "flos": 22126934638080.0, "grad_norm": 1.8733256786117318, "language_loss": 0.73799431, "learning_rate": 1.5102213634977117e-06, "loss": 0.75931853, "num_input_tokens_seen": 211965355, "step": 9836, "time_per_iteration": 2.695117712020874 }, { "auxiliary_loss_clip": 0.01083057, "auxiliary_loss_mlp": 0.01035766, "balance_loss_clip": 1.03943884, "balance_loss_mlp": 1.02149653, "epoch": 0.5914324364948144, "flos": 15697918771200.0, "grad_norm": 1.9392468028622023, "language_loss": 0.82138634, "learning_rate": 1.5098437698074841e-06, "loss": 0.84257448, "num_input_tokens_seen": 211982245, "step": 9837, "time_per_iteration": 2.6912343502044678 }, { "auxiliary_loss_clip": 0.01078463, "auxiliary_loss_mlp": 0.01035071, "balance_loss_clip": 1.03632522, "balance_loss_mlp": 1.02026534, "epoch": 0.5914925597474824, "flos": 22747327367040.0, "grad_norm": 2.27741138864597, "language_loss": 0.79637218, "learning_rate": 1.5094661947040304e-06, "loss": 0.81750751, "num_input_tokens_seen": 212000250, "step": 9838, "time_per_iteration": 2.6449244022369385 }, { "auxiliary_loss_clip": 0.010718, "auxiliary_loss_mlp": 0.01039396, "balance_loss_clip": 1.04010475, "balance_loss_mlp": 1.02605057, "epoch": 0.5915526830001503, "flos": 18292788161280.0, "grad_norm": 1.9685283368258655, "language_loss": 0.69672906, "learning_rate": 1.5090886382016673e-06, "loss": 0.71784103, "num_input_tokens_seen": 212017505, "step": 9839, "time_per_iteration": 2.76196026802063 }, { "auxiliary_loss_clip": 0.01093291, "auxiliary_loss_mlp": 0.01043789, "balance_loss_clip": 1.04008913, "balance_loss_mlp": 1.0308131, "epoch": 0.5916128062528183, "flos": 17019072910080.0, "grad_norm": 2.7566603972322943, "language_loss": 0.65802211, "learning_rate": 1.5087111003147124e-06, "loss": 0.67939293, "num_input_tokens_seen": 212034595, "step": 9840, "time_per_iteration": 2.647179365158081 }, { "auxiliary_loss_clip": 0.01095524, "auxiliary_loss_mlp": 0.01030956, "balance_loss_clip": 1.04105091, "balance_loss_mlp": 1.0170027, "epoch": 0.5916729295054862, "flos": 24754231031040.0, "grad_norm": 1.7835451737672352, "language_loss": 0.81441593, "learning_rate": 1.5083335810574813e-06, "loss": 0.83568072, "num_input_tokens_seen": 212055775, "step": 9841, "time_per_iteration": 2.693742036819458 }, { "auxiliary_loss_clip": 0.01090733, "auxiliary_loss_mlp": 0.01030377, "balance_loss_clip": 1.04020691, "balance_loss_mlp": 1.01772296, "epoch": 0.5917330527581542, "flos": 15958199698560.0, "grad_norm": 1.7111294758223268, "language_loss": 0.69152761, "learning_rate": 1.507956080444291e-06, "loss": 0.71273863, "num_input_tokens_seen": 212074000, "step": 9842, "time_per_iteration": 2.6797986030578613 }, { "auxiliary_loss_clip": 0.01093141, "auxiliary_loss_mlp": 0.0103715, "balance_loss_clip": 1.03811431, "balance_loss_mlp": 1.02367949, "epoch": 0.5917931760108222, "flos": 23800730549760.0, "grad_norm": 3.159007391867861, "language_loss": 0.83409858, "learning_rate": 1.5075785984894549e-06, "loss": 0.85540152, "num_input_tokens_seen": 212091415, "step": 9843, "time_per_iteration": 2.7194371223449707 }, { "auxiliary_loss_clip": 0.01090728, "auxiliary_loss_mlp": 0.01031987, "balance_loss_clip": 1.03646731, "balance_loss_mlp": 1.01762211, "epoch": 0.5918532992634902, "flos": 23249609199360.0, "grad_norm": 5.395713728013965, "language_loss": 0.81329596, "learning_rate": 1.5072011352072875e-06, "loss": 0.83452308, "num_input_tokens_seen": 212105255, "step": 9844, "time_per_iteration": 2.7136270999908447 }, { "auxiliary_loss_clip": 0.01068008, "auxiliary_loss_mlp": 0.01030142, "balance_loss_clip": 1.03874016, "balance_loss_mlp": 1.01633775, "epoch": 0.5919134225161581, "flos": 19499853726720.0, "grad_norm": 1.8542895008446525, "language_loss": 0.74591327, "learning_rate": 1.5068236906121032e-06, "loss": 0.7668947, "num_input_tokens_seen": 212122765, "step": 9845, "time_per_iteration": 2.781914710998535 }, { "auxiliary_loss_clip": 0.01077949, "auxiliary_loss_mlp": 0.0103435, "balance_loss_clip": 1.03821266, "balance_loss_mlp": 1.01837575, "epoch": 0.5919735457688261, "flos": 38800940567040.0, "grad_norm": 1.69458434045341, "language_loss": 0.63799906, "learning_rate": 1.506446264718213e-06, "loss": 0.65912199, "num_input_tokens_seen": 212143960, "step": 9846, "time_per_iteration": 2.8427982330322266 }, { "auxiliary_loss_clip": 0.01076538, "auxiliary_loss_mlp": 0.00769552, "balance_loss_clip": 1.03801441, "balance_loss_mlp": 1.00004482, "epoch": 0.592033669021494, "flos": 22163994495360.0, "grad_norm": 1.809865828874733, "language_loss": 0.76013452, "learning_rate": 1.506068857539931e-06, "loss": 0.77859539, "num_input_tokens_seen": 212162005, "step": 9847, "time_per_iteration": 2.737806797027588 }, { "auxiliary_loss_clip": 0.01092495, "auxiliary_loss_mlp": 0.01031315, "balance_loss_clip": 1.03829622, "balance_loss_mlp": 1.01720047, "epoch": 0.592093792274162, "flos": 22710985781760.0, "grad_norm": 1.7217593328479819, "language_loss": 0.62444723, "learning_rate": 1.5056914690915667e-06, "loss": 0.64568532, "num_input_tokens_seen": 212181635, "step": 9848, "time_per_iteration": 2.768158197402954 }, { "auxiliary_loss_clip": 0.01108627, "auxiliary_loss_mlp": 0.01039243, "balance_loss_clip": 1.04256344, "balance_loss_mlp": 1.02609384, "epoch": 0.59215391552683, "flos": 22528954632960.0, "grad_norm": 1.7269094299177161, "language_loss": 0.75832105, "learning_rate": 1.5053140993874312e-06, "loss": 0.7797997, "num_input_tokens_seen": 212201615, "step": 9849, "time_per_iteration": 2.6506807804107666 }, { "auxiliary_loss_clip": 0.01095576, "auxiliary_loss_mlp": 0.01036342, "balance_loss_clip": 1.04088306, "balance_loss_mlp": 1.02223945, "epoch": 0.592214038779498, "flos": 24499013921280.0, "grad_norm": 2.077646783474588, "language_loss": 0.75440395, "learning_rate": 1.5049367484418353e-06, "loss": 0.7757231, "num_input_tokens_seen": 212219355, "step": 9850, "time_per_iteration": 2.738163471221924 }, { "auxiliary_loss_clip": 0.01079223, "auxiliary_loss_mlp": 0.01038556, "balance_loss_clip": 1.0389607, "balance_loss_mlp": 1.02532411, "epoch": 0.592274162032166, "flos": 21831353619840.0, "grad_norm": 2.0657919494048094, "language_loss": 0.75485742, "learning_rate": 1.5045594162690868e-06, "loss": 0.77603519, "num_input_tokens_seen": 212236710, "step": 9851, "time_per_iteration": 2.7006642818450928 }, { "auxiliary_loss_clip": 0.0109594, "auxiliary_loss_mlp": 0.0103171, "balance_loss_clip": 1.04149699, "balance_loss_mlp": 1.01846635, "epoch": 0.5923342852848339, "flos": 24608146417920.0, "grad_norm": 1.9468749498411155, "language_loss": 0.7089386, "learning_rate": 1.5041821028834954e-06, "loss": 0.73021513, "num_input_tokens_seen": 212256195, "step": 9852, "time_per_iteration": 2.706106424331665 }, { "auxiliary_loss_clip": 0.01104361, "auxiliary_loss_mlp": 0.0077249, "balance_loss_clip": 1.04451549, "balance_loss_mlp": 1.00008225, "epoch": 0.5923944085375019, "flos": 19938143479680.0, "grad_norm": 1.600717143056076, "language_loss": 0.80555183, "learning_rate": 1.5038048082993685e-06, "loss": 0.82432032, "num_input_tokens_seen": 212274085, "step": 9853, "time_per_iteration": 2.7119646072387695 }, { "auxiliary_loss_clip": 0.01088586, "auxiliary_loss_mlp": 0.01028953, "balance_loss_clip": 1.03719842, "balance_loss_mlp": 1.01654959, "epoch": 0.5924545317901698, "flos": 28658510812800.0, "grad_norm": 1.9598293021275044, "language_loss": 0.67597294, "learning_rate": 1.5034275325310124e-06, "loss": 0.69714832, "num_input_tokens_seen": 212295530, "step": 9854, "time_per_iteration": 2.7060039043426514 }, { "auxiliary_loss_clip": 0.01081304, "auxiliary_loss_mlp": 0.01029538, "balance_loss_clip": 1.03990042, "balance_loss_mlp": 1.01680636, "epoch": 0.5925146550428378, "flos": 19864885691520.0, "grad_norm": 1.7821900938554989, "language_loss": 0.88811159, "learning_rate": 1.5030502755927344e-06, "loss": 0.90921998, "num_input_tokens_seen": 212313770, "step": 9855, "time_per_iteration": 2.749842882156372 }, { "auxiliary_loss_clip": 0.01097397, "auxiliary_loss_mlp": 0.01031382, "balance_loss_clip": 1.04023433, "balance_loss_mlp": 1.01912177, "epoch": 0.5925747782955058, "flos": 15122989681920.0, "grad_norm": 1.7553886735756365, "language_loss": 0.86097872, "learning_rate": 1.5026730374988397e-06, "loss": 0.8822664, "num_input_tokens_seen": 212331525, "step": 9856, "time_per_iteration": 2.8213181495666504 }, { "auxiliary_loss_clip": 0.0110594, "auxiliary_loss_mlp": 0.01036211, "balance_loss_clip": 1.03984308, "balance_loss_mlp": 1.02389097, "epoch": 0.5926349015481738, "flos": 18405440190720.0, "grad_norm": 3.6746631679389536, "language_loss": 0.77349007, "learning_rate": 1.5022958182636332e-06, "loss": 0.79491156, "num_input_tokens_seen": 212347295, "step": 9857, "time_per_iteration": 2.6580264568328857 }, { "auxiliary_loss_clip": 0.0107388, "auxiliary_loss_mlp": 0.01051977, "balance_loss_clip": 1.03587079, "balance_loss_mlp": 1.03689682, "epoch": 0.5926950248008417, "flos": 23111138269440.0, "grad_norm": 2.383524132494838, "language_loss": 0.64598405, "learning_rate": 1.501918617901419e-06, "loss": 0.66724265, "num_input_tokens_seen": 212365750, "step": 9858, "time_per_iteration": 2.7002615928649902 }, { "auxiliary_loss_clip": 0.01103608, "auxiliary_loss_mlp": 0.01033595, "balance_loss_clip": 1.04055738, "balance_loss_mlp": 1.02088773, "epoch": 0.5927551480535097, "flos": 28033916192640.0, "grad_norm": 1.88700094462338, "language_loss": 0.77598989, "learning_rate": 1.501541436426501e-06, "loss": 0.79736185, "num_input_tokens_seen": 212385300, "step": 9859, "time_per_iteration": 4.434144496917725 }, { "auxiliary_loss_clip": 0.01078779, "auxiliary_loss_mlp": 0.00771508, "balance_loss_clip": 1.04448819, "balance_loss_mlp": 1.00007796, "epoch": 0.5928152713061776, "flos": 21798675221760.0, "grad_norm": 4.274702781757113, "language_loss": 0.74740881, "learning_rate": 1.5011642738531818e-06, "loss": 0.7659117, "num_input_tokens_seen": 212402140, "step": 9860, "time_per_iteration": 2.8576431274414062 }, { "auxiliary_loss_clip": 0.01080315, "auxiliary_loss_mlp": 0.01034538, "balance_loss_clip": 1.04223692, "balance_loss_mlp": 1.02289104, "epoch": 0.5928753945588456, "flos": 24316839118080.0, "grad_norm": 1.6207851458155365, "language_loss": 0.7622723, "learning_rate": 1.500787130195763e-06, "loss": 0.7834208, "num_input_tokens_seen": 212421790, "step": 9861, "time_per_iteration": 5.779749393463135 }, { "auxiliary_loss_clip": 0.01079641, "auxiliary_loss_mlp": 0.01032307, "balance_loss_clip": 1.03737628, "balance_loss_mlp": 1.0201298, "epoch": 0.5929355178115137, "flos": 26464619923200.0, "grad_norm": 2.31911103307255, "language_loss": 0.70733476, "learning_rate": 1.5004100054685465e-06, "loss": 0.72845423, "num_input_tokens_seen": 212442115, "step": 9862, "time_per_iteration": 2.7879045009613037 }, { "auxiliary_loss_clip": 0.01057596, "auxiliary_loss_mlp": 0.01034108, "balance_loss_clip": 1.03278732, "balance_loss_mlp": 1.02148342, "epoch": 0.5929956410641816, "flos": 24965995662720.0, "grad_norm": 1.7884457502004503, "language_loss": 0.78123254, "learning_rate": 1.500032899685832e-06, "loss": 0.80214959, "num_input_tokens_seen": 212459535, "step": 9863, "time_per_iteration": 2.7296791076660156 }, { "auxiliary_loss_clip": 0.01089944, "auxiliary_loss_mlp": 0.01040962, "balance_loss_clip": 1.03986549, "balance_loss_mlp": 1.02770567, "epoch": 0.5930557643168496, "flos": 26208325405440.0, "grad_norm": 2.4622472815237506, "language_loss": 0.70487082, "learning_rate": 1.499655812861921e-06, "loss": 0.72617984, "num_input_tokens_seen": 212479385, "step": 9864, "time_per_iteration": 2.6773011684417725 }, { "auxiliary_loss_clip": 0.01089195, "auxiliary_loss_mlp": 0.01036172, "balance_loss_clip": 1.03835356, "balance_loss_mlp": 1.0226891, "epoch": 0.5931158875695175, "flos": 27854937699840.0, "grad_norm": 1.4468399758370936, "language_loss": 0.67205417, "learning_rate": 1.4992787450111112e-06, "loss": 0.69330788, "num_input_tokens_seen": 212500060, "step": 9865, "time_per_iteration": 4.260905981063843 }, { "auxiliary_loss_clip": 0.01098878, "auxiliary_loss_mlp": 0.0103771, "balance_loss_clip": 1.04014802, "balance_loss_mlp": 1.02411962, "epoch": 0.5931760108221855, "flos": 15413650536960.0, "grad_norm": 1.9702875461989908, "language_loss": 0.77913535, "learning_rate": 1.4989016961477015e-06, "loss": 0.80050123, "num_input_tokens_seen": 212518590, "step": 9866, "time_per_iteration": 2.6692967414855957 }, { "auxiliary_loss_clip": 0.01090663, "auxiliary_loss_mlp": 0.01031022, "balance_loss_clip": 1.04043937, "balance_loss_mlp": 1.01891649, "epoch": 0.5932361340748534, "flos": 30188520581760.0, "grad_norm": 2.3223854732809364, "language_loss": 0.71955562, "learning_rate": 1.4985246662859903e-06, "loss": 0.74077249, "num_input_tokens_seen": 212538190, "step": 9867, "time_per_iteration": 2.73850679397583 }, { "auxiliary_loss_clip": 0.01094459, "auxiliary_loss_mlp": 0.0103051, "balance_loss_clip": 1.04182947, "balance_loss_mlp": 1.01644969, "epoch": 0.5932962573275214, "flos": 20157557708160.0, "grad_norm": 1.577108097655746, "language_loss": 0.66789985, "learning_rate": 1.4981476554402732e-06, "loss": 0.68914956, "num_input_tokens_seen": 212557820, "step": 9868, "time_per_iteration": 2.776890277862549 }, { "auxiliary_loss_clip": 0.01060162, "auxiliary_loss_mlp": 0.00771363, "balance_loss_clip": 1.03597963, "balance_loss_mlp": 1.00004768, "epoch": 0.5933563805801894, "flos": 25445906300160.0, "grad_norm": 1.613226423561444, "language_loss": 0.75353992, "learning_rate": 1.4977706636248478e-06, "loss": 0.77185524, "num_input_tokens_seen": 212577645, "step": 9869, "time_per_iteration": 2.8630988597869873 }, { "auxiliary_loss_clip": 0.010636, "auxiliary_loss_mlp": 0.01038006, "balance_loss_clip": 1.03897762, "balance_loss_mlp": 1.02469635, "epoch": 0.5934165038328574, "flos": 59995740337920.0, "grad_norm": 1.8583969258808255, "language_loss": 0.74005115, "learning_rate": 1.4973936908540091e-06, "loss": 0.76106727, "num_input_tokens_seen": 212603430, "step": 9870, "time_per_iteration": 3.0915732383728027 }, { "auxiliary_loss_clip": 0.01071863, "auxiliary_loss_mlp": 0.01030945, "balance_loss_clip": 1.03705025, "balance_loss_mlp": 1.01810646, "epoch": 0.5934766270855253, "flos": 24420548661120.0, "grad_norm": 2.145127507644007, "language_loss": 0.7232281, "learning_rate": 1.4970167371420517e-06, "loss": 0.7442562, "num_input_tokens_seen": 212620730, "step": 9871, "time_per_iteration": 2.7629406452178955 }, { "auxiliary_loss_clip": 0.01086004, "auxiliary_loss_mlp": 0.01031261, "balance_loss_clip": 1.04104018, "balance_loss_mlp": 1.01764774, "epoch": 0.5935367503381933, "flos": 23513158264320.0, "grad_norm": 2.0164353140130835, "language_loss": 0.74587923, "learning_rate": 1.496639802503271e-06, "loss": 0.76705188, "num_input_tokens_seen": 212639745, "step": 9872, "time_per_iteration": 2.74772310256958 }, { "auxiliary_loss_clip": 0.01111382, "auxiliary_loss_mlp": 0.01038485, "balance_loss_clip": 1.04180598, "balance_loss_mlp": 1.02414966, "epoch": 0.5935968735908612, "flos": 18948337326720.0, "grad_norm": 2.3277369002939388, "language_loss": 0.79620034, "learning_rate": 1.4962628869519583e-06, "loss": 0.81769902, "num_input_tokens_seen": 212655915, "step": 9873, "time_per_iteration": 2.663547992706299 }, { "auxiliary_loss_clip": 0.01108216, "auxiliary_loss_mlp": 0.01034928, "balance_loss_clip": 1.04269648, "balance_loss_mlp": 1.021523, "epoch": 0.5936569968435292, "flos": 25483433034240.0, "grad_norm": 1.6892324145577737, "language_loss": 0.8490203, "learning_rate": 1.4958859905024078e-06, "loss": 0.87045169, "num_input_tokens_seen": 212676115, "step": 9874, "time_per_iteration": 2.654606580734253 }, { "auxiliary_loss_clip": 0.01019729, "auxiliary_loss_mlp": 0.01001192, "balance_loss_clip": 1.01379979, "balance_loss_mlp": 0.99991626, "epoch": 0.5937171200961973, "flos": 66378361789440.0, "grad_norm": 0.7079839888277836, "language_loss": 0.59980857, "learning_rate": 1.4955091131689115e-06, "loss": 0.62001777, "num_input_tokens_seen": 212737560, "step": 9875, "time_per_iteration": 3.3108227252960205 }, { "auxiliary_loss_clip": 0.01094208, "auxiliary_loss_mlp": 0.01033507, "balance_loss_clip": 1.03624558, "balance_loss_mlp": 1.01859426, "epoch": 0.5937772433488652, "flos": 14903467712640.0, "grad_norm": 5.919714877847386, "language_loss": 0.7768054, "learning_rate": 1.4951322549657594e-06, "loss": 0.79808253, "num_input_tokens_seen": 212755365, "step": 9876, "time_per_iteration": 2.6835005283355713 }, { "auxiliary_loss_clip": 0.01097876, "auxiliary_loss_mlp": 0.01028372, "balance_loss_clip": 1.03590453, "balance_loss_mlp": 1.01630843, "epoch": 0.5938373666015332, "flos": 22561489376640.0, "grad_norm": 1.528829961767438, "language_loss": 0.75805295, "learning_rate": 1.494755415907243e-06, "loss": 0.77931547, "num_input_tokens_seen": 212773875, "step": 9877, "time_per_iteration": 2.703756332397461 }, { "auxiliary_loss_clip": 0.0110632, "auxiliary_loss_mlp": 0.01028449, "balance_loss_clip": 1.03964424, "balance_loss_mlp": 1.01493096, "epoch": 0.5938974898542011, "flos": 18440883936000.0, "grad_norm": 2.6694319382348666, "language_loss": 0.81408948, "learning_rate": 1.4943785960076522e-06, "loss": 0.83543718, "num_input_tokens_seen": 212790590, "step": 9878, "time_per_iteration": 2.6299495697021484 }, { "auxiliary_loss_clip": 0.01090649, "auxiliary_loss_mlp": 0.00772164, "balance_loss_clip": 1.03885496, "balance_loss_mlp": 1.00006008, "epoch": 0.5939576131068691, "flos": 45586728270720.0, "grad_norm": 1.7408999007224344, "language_loss": 0.71310401, "learning_rate": 1.4940017952812754e-06, "loss": 0.73173207, "num_input_tokens_seen": 212812265, "step": 9879, "time_per_iteration": 2.9403438568115234 }, { "auxiliary_loss_clip": 0.01107517, "auxiliary_loss_mlp": 0.01037191, "balance_loss_clip": 1.04333889, "balance_loss_mlp": 1.02471602, "epoch": 0.594017736359537, "flos": 23587708942080.0, "grad_norm": 1.6220417937962182, "language_loss": 0.5754692, "learning_rate": 1.493625013742401e-06, "loss": 0.59691632, "num_input_tokens_seen": 212831915, "step": 9880, "time_per_iteration": 2.722222089767456 }, { "auxiliary_loss_clip": 0.01108825, "auxiliary_loss_mlp": 0.01034905, "balance_loss_clip": 1.04171181, "balance_loss_mlp": 1.02144003, "epoch": 0.594077859612205, "flos": 29457235589760.0, "grad_norm": 1.8505883622927, "language_loss": 0.77141905, "learning_rate": 1.4932482514053177e-06, "loss": 0.79285634, "num_input_tokens_seen": 212851350, "step": 9881, "time_per_iteration": 2.7424824237823486 }, { "auxiliary_loss_clip": 0.01104617, "auxiliary_loss_mlp": 0.01027481, "balance_loss_clip": 1.0387702, "balance_loss_mlp": 1.01456428, "epoch": 0.594137982864873, "flos": 16800089644800.0, "grad_norm": 2.611625845648677, "language_loss": 0.82625538, "learning_rate": 1.4928715082843112e-06, "loss": 0.84757638, "num_input_tokens_seen": 212867995, "step": 9882, "time_per_iteration": 2.6125638484954834 }, { "auxiliary_loss_clip": 0.01108328, "auxiliary_loss_mlp": 0.01036545, "balance_loss_clip": 1.04283643, "balance_loss_mlp": 1.02419496, "epoch": 0.594198106117541, "flos": 12750263953920.0, "grad_norm": 2.4545417723722434, "language_loss": 0.79556072, "learning_rate": 1.492494784393667e-06, "loss": 0.81700939, "num_input_tokens_seen": 212885220, "step": 9883, "time_per_iteration": 2.6739277839660645 }, { "auxiliary_loss_clip": 0.01090609, "auxiliary_loss_mlp": 0.00770805, "balance_loss_clip": 1.04405499, "balance_loss_mlp": 1.00010085, "epoch": 0.5942582293702089, "flos": 20996538652800.0, "grad_norm": 2.530798381383893, "language_loss": 0.7459439, "learning_rate": 1.4921180797476725e-06, "loss": 0.76455808, "num_input_tokens_seen": 212903195, "step": 9884, "time_per_iteration": 2.720139503479004 }, { "auxiliary_loss_clip": 0.01118755, "auxiliary_loss_mlp": 0.01030314, "balance_loss_clip": 1.04366493, "balance_loss_mlp": 1.01757646, "epoch": 0.5943183526228769, "flos": 28291431772800.0, "grad_norm": 2.040352336443274, "language_loss": 0.66608262, "learning_rate": 1.4917413943606106e-06, "loss": 0.68757325, "num_input_tokens_seen": 212923340, "step": 9885, "time_per_iteration": 2.6618847846984863 }, { "auxiliary_loss_clip": 0.01093907, "auxiliary_loss_mlp": 0.01041351, "balance_loss_clip": 1.04138327, "balance_loss_mlp": 1.02835155, "epoch": 0.5943784758755448, "flos": 26614619118720.0, "grad_norm": 2.630158617128694, "language_loss": 0.77534634, "learning_rate": 1.4913647282467667e-06, "loss": 0.79669893, "num_input_tokens_seen": 212942755, "step": 9886, "time_per_iteration": 2.7532429695129395 }, { "auxiliary_loss_clip": 0.01025813, "auxiliary_loss_mlp": 0.01001276, "balance_loss_clip": 1.01382208, "balance_loss_mlp": 0.99997658, "epoch": 0.5944385991282128, "flos": 64190935347840.0, "grad_norm": 0.9149518659336237, "language_loss": 0.64530778, "learning_rate": 1.490988081420423e-06, "loss": 0.66557866, "num_input_tokens_seen": 212999355, "step": 9887, "time_per_iteration": 3.060612440109253 }, { "auxiliary_loss_clip": 0.01097622, "auxiliary_loss_mlp": 0.01032109, "balance_loss_clip": 1.03770781, "balance_loss_mlp": 1.01940084, "epoch": 0.5944987223808808, "flos": 19571998193280.0, "grad_norm": 1.6915419105373903, "language_loss": 0.69181025, "learning_rate": 1.4906114538958615e-06, "loss": 0.71310759, "num_input_tokens_seen": 213018570, "step": 9888, "time_per_iteration": 2.617629051208496 }, { "auxiliary_loss_clip": 0.01088883, "auxiliary_loss_mlp": 0.01034911, "balance_loss_clip": 1.03844309, "balance_loss_mlp": 1.02113008, "epoch": 0.5945588456335488, "flos": 26177586341760.0, "grad_norm": 2.5005305893435685, "language_loss": 0.79495192, "learning_rate": 1.490234845687366e-06, "loss": 0.81618989, "num_input_tokens_seen": 213037735, "step": 9889, "time_per_iteration": 2.685150146484375 }, { "auxiliary_loss_clip": 0.01080162, "auxiliary_loss_mlp": 0.01026954, "balance_loss_clip": 1.03793621, "balance_loss_mlp": 1.01496744, "epoch": 0.5946189688862168, "flos": 20446494710400.0, "grad_norm": 1.6110540672551508, "language_loss": 0.70713383, "learning_rate": 1.4898582568092154e-06, "loss": 0.72820497, "num_input_tokens_seen": 213057160, "step": 9890, "time_per_iteration": 2.7299606800079346 }, { "auxiliary_loss_clip": 0.01088716, "auxiliary_loss_mlp": 0.01032845, "balance_loss_clip": 1.04451787, "balance_loss_mlp": 1.01896358, "epoch": 0.5946790921388847, "flos": 13437521850240.0, "grad_norm": 1.9451498476517268, "language_loss": 0.69461864, "learning_rate": 1.489481687275691e-06, "loss": 0.71583426, "num_input_tokens_seen": 213073630, "step": 9891, "time_per_iteration": 2.7253577709198 }, { "auxiliary_loss_clip": 0.01104108, "auxiliary_loss_mlp": 0.01040464, "balance_loss_clip": 1.04076028, "balance_loss_mlp": 1.02784514, "epoch": 0.5947392153915527, "flos": 20412272027520.0, "grad_norm": 1.8738043279095635, "language_loss": 0.53252602, "learning_rate": 1.4891051371010726e-06, "loss": 0.55397171, "num_input_tokens_seen": 213092450, "step": 9892, "time_per_iteration": 2.630176067352295 }, { "auxiliary_loss_clip": 0.01007775, "auxiliary_loss_mlp": 0.01004642, "balance_loss_clip": 1.01469183, "balance_loss_mlp": 1.00331867, "epoch": 0.5947993386442206, "flos": 65619138994560.0, "grad_norm": 0.662438980473289, "language_loss": 0.54533142, "learning_rate": 1.4887286062996375e-06, "loss": 0.56545562, "num_input_tokens_seen": 213155465, "step": 9893, "time_per_iteration": 3.3319764137268066 }, { "auxiliary_loss_clip": 0.01079474, "auxiliary_loss_mlp": 0.01035837, "balance_loss_clip": 1.04197478, "balance_loss_mlp": 1.02362406, "epoch": 0.5948594618968887, "flos": 23183103168000.0, "grad_norm": 1.5803116085974762, "language_loss": 0.74965519, "learning_rate": 1.4883520948856658e-06, "loss": 0.77080828, "num_input_tokens_seen": 213174875, "step": 9894, "time_per_iteration": 2.708012104034424 }, { "auxiliary_loss_clip": 0.01084394, "auxiliary_loss_mlp": 0.01031071, "balance_loss_clip": 1.04066491, "balance_loss_mlp": 1.01860142, "epoch": 0.5949195851495566, "flos": 13626771632640.0, "grad_norm": 1.7370359553625463, "language_loss": 0.77732074, "learning_rate": 1.487975602873434e-06, "loss": 0.79847538, "num_input_tokens_seen": 213192695, "step": 9895, "time_per_iteration": 2.6831347942352295 }, { "auxiliary_loss_clip": 0.01067508, "auxiliary_loss_mlp": 0.01037328, "balance_loss_clip": 1.03781974, "balance_loss_mlp": 1.0233922, "epoch": 0.5949797084022246, "flos": 19751012599680.0, "grad_norm": 1.6095460497638086, "language_loss": 0.79347014, "learning_rate": 1.4875991302772182e-06, "loss": 0.81451851, "num_input_tokens_seen": 213211195, "step": 9896, "time_per_iteration": 2.7621328830718994 }, { "auxiliary_loss_clip": 0.01106477, "auxiliary_loss_mlp": 0.01035793, "balance_loss_clip": 1.04062951, "balance_loss_mlp": 1.02315736, "epoch": 0.5950398316548925, "flos": 25773878407680.0, "grad_norm": 1.5421424712505716, "language_loss": 0.83955193, "learning_rate": 1.4872226771112954e-06, "loss": 0.86097461, "num_input_tokens_seen": 213231975, "step": 9897, "time_per_iteration": 2.7152647972106934 }, { "auxiliary_loss_clip": 0.01092695, "auxiliary_loss_mlp": 0.01037147, "balance_loss_clip": 1.04191113, "balance_loss_mlp": 1.02490425, "epoch": 0.5950999549075605, "flos": 23039029716480.0, "grad_norm": 1.9245000057416703, "language_loss": 0.70950294, "learning_rate": 1.486846243389939e-06, "loss": 0.73080134, "num_input_tokens_seen": 213249760, "step": 9898, "time_per_iteration": 4.332275867462158 }, { "auxiliary_loss_clip": 0.01105674, "auxiliary_loss_mlp": 0.01044981, "balance_loss_clip": 1.03863168, "balance_loss_mlp": 1.02892375, "epoch": 0.5951600781602284, "flos": 32446367637120.0, "grad_norm": 2.443382879492767, "language_loss": 0.64050412, "learning_rate": 1.4864698291274251e-06, "loss": 0.66201067, "num_input_tokens_seen": 213269890, "step": 9899, "time_per_iteration": 2.747209072113037 }, { "auxiliary_loss_clip": 0.01117539, "auxiliary_loss_mlp": 0.01028742, "balance_loss_clip": 1.04378319, "balance_loss_mlp": 1.01740563, "epoch": 0.5952202014128964, "flos": 23800874204160.0, "grad_norm": 1.865552618204713, "language_loss": 0.71956146, "learning_rate": 1.4860934343380267e-06, "loss": 0.74102432, "num_input_tokens_seen": 213289400, "step": 9900, "time_per_iteration": 5.790768146514893 }, { "auxiliary_loss_clip": 0.01114892, "auxiliary_loss_mlp": 0.01032325, "balance_loss_clip": 1.04192626, "balance_loss_mlp": 1.01949835, "epoch": 0.5952803246655644, "flos": 22492182084480.0, "grad_norm": 1.7457638078039162, "language_loss": 0.84428406, "learning_rate": 1.4857170590360169e-06, "loss": 0.86575621, "num_input_tokens_seen": 213308040, "step": 9901, "time_per_iteration": 2.7782936096191406 }, { "auxiliary_loss_clip": 0.00993307, "auxiliary_loss_mlp": 0.01008976, "balance_loss_clip": 1.01768923, "balance_loss_mlp": 1.00779581, "epoch": 0.5953404479182324, "flos": 51234688851840.0, "grad_norm": 0.8002603783256921, "language_loss": 0.58178693, "learning_rate": 1.4853407032356674e-06, "loss": 0.60180974, "num_input_tokens_seen": 213358585, "step": 9902, "time_per_iteration": 3.245389699935913 }, { "auxiliary_loss_clip": 0.01059574, "auxiliary_loss_mlp": 0.01029206, "balance_loss_clip": 1.03823233, "balance_loss_mlp": 1.01596808, "epoch": 0.5954005711709004, "flos": 23112682554240.0, "grad_norm": 2.326170730098328, "language_loss": 0.77513373, "learning_rate": 1.4849643669512503e-06, "loss": 0.79602152, "num_input_tokens_seen": 213379585, "step": 9903, "time_per_iteration": 2.938472032546997 }, { "auxiliary_loss_clip": 0.01080471, "auxiliary_loss_mlp": 0.01035506, "balance_loss_clip": 1.04236233, "balance_loss_mlp": 1.02275109, "epoch": 0.5954606944235683, "flos": 35954732736000.0, "grad_norm": 3.664262182530453, "language_loss": 0.7767508, "learning_rate": 1.4845880501970362e-06, "loss": 0.79791057, "num_input_tokens_seen": 213401465, "step": 9904, "time_per_iteration": 4.397410869598389 }, { "auxiliary_loss_clip": 0.01102001, "auxiliary_loss_mlp": 0.01038114, "balance_loss_clip": 1.04016399, "balance_loss_mlp": 1.02507877, "epoch": 0.5955208176762363, "flos": 30443665864320.0, "grad_norm": 1.9431813333035064, "language_loss": 0.72943354, "learning_rate": 1.4842117529872942e-06, "loss": 0.7508347, "num_input_tokens_seen": 213422720, "step": 9905, "time_per_iteration": 2.7936177253723145 }, { "auxiliary_loss_clip": 0.01109363, "auxiliary_loss_mlp": 0.01030507, "balance_loss_clip": 1.04223228, "balance_loss_mlp": 1.01717925, "epoch": 0.5955809409289042, "flos": 17640112083840.0, "grad_norm": 1.9824269605474862, "language_loss": 0.70172507, "learning_rate": 1.483835475336295e-06, "loss": 0.72312379, "num_input_tokens_seen": 213439480, "step": 9906, "time_per_iteration": 2.6985738277435303 }, { "auxiliary_loss_clip": 0.01106299, "auxiliary_loss_mlp": 0.01032912, "balance_loss_clip": 1.04149914, "balance_loss_mlp": 1.01987052, "epoch": 0.5956410641815723, "flos": 24279887001600.0, "grad_norm": 1.8692952809001842, "language_loss": 0.75197554, "learning_rate": 1.4834592172583057e-06, "loss": 0.77336764, "num_input_tokens_seen": 213458895, "step": 9907, "time_per_iteration": 2.6980481147766113 }, { "auxiliary_loss_clip": 0.01088924, "auxiliary_loss_mlp": 0.01032034, "balance_loss_clip": 1.03741193, "balance_loss_mlp": 1.0194813, "epoch": 0.5957011874342402, "flos": 35734277013120.0, "grad_norm": 1.635771489703633, "language_loss": 0.67245162, "learning_rate": 1.483082978767595e-06, "loss": 0.69366121, "num_input_tokens_seen": 213481730, "step": 9908, "time_per_iteration": 2.7698655128479004 }, { "auxiliary_loss_clip": 0.01040116, "auxiliary_loss_mlp": 0.01031975, "balance_loss_clip": 1.03187275, "balance_loss_mlp": 1.0195055, "epoch": 0.5957613106869082, "flos": 21245004005760.0, "grad_norm": 1.9181869047737456, "language_loss": 0.76516539, "learning_rate": 1.4827067598784298e-06, "loss": 0.78588629, "num_input_tokens_seen": 213497225, "step": 9909, "time_per_iteration": 2.8098058700561523 }, { "auxiliary_loss_clip": 0.0103764, "auxiliary_loss_mlp": 0.01004774, "balance_loss_clip": 1.01340699, "balance_loss_mlp": 1.00373673, "epoch": 0.5958214339395761, "flos": 65940969876480.0, "grad_norm": 0.9280508663350204, "language_loss": 0.73383075, "learning_rate": 1.4823305606050753e-06, "loss": 0.75425494, "num_input_tokens_seen": 213556890, "step": 9910, "time_per_iteration": 3.228283166885376 }, { "auxiliary_loss_clip": 0.0109102, "auxiliary_loss_mlp": 0.01035168, "balance_loss_clip": 1.03882253, "balance_loss_mlp": 1.02188206, "epoch": 0.5958815571922441, "flos": 23218690567680.0, "grad_norm": 2.4798653486938544, "language_loss": 0.69676727, "learning_rate": 1.481954380961799e-06, "loss": 0.71802914, "num_input_tokens_seen": 213575800, "step": 9911, "time_per_iteration": 2.6699378490448 }, { "auxiliary_loss_clip": 0.01116036, "auxiliary_loss_mlp": 0.01033392, "balance_loss_clip": 1.04485154, "balance_loss_mlp": 1.01942098, "epoch": 0.595941680444912, "flos": 16538623568640.0, "grad_norm": 1.9669774674890577, "language_loss": 0.65873277, "learning_rate": 1.4815782209628631e-06, "loss": 0.68022704, "num_input_tokens_seen": 213592740, "step": 9912, "time_per_iteration": 2.642876386642456 }, { "auxiliary_loss_clip": 0.0108881, "auxiliary_loss_mlp": 0.01037146, "balance_loss_clip": 1.04177618, "balance_loss_mlp": 1.02360988, "epoch": 0.59600180369758, "flos": 27818883423360.0, "grad_norm": 1.9028573243158677, "language_loss": 0.73863906, "learning_rate": 1.4812020806225337e-06, "loss": 0.7598986, "num_input_tokens_seen": 213611970, "step": 9913, "time_per_iteration": 2.860369920730591 }, { "auxiliary_loss_clip": 0.01083137, "auxiliary_loss_mlp": 0.00770309, "balance_loss_clip": 1.03919995, "balance_loss_mlp": 1.0000217, "epoch": 0.596061926950248, "flos": 29491566013440.0, "grad_norm": 2.1966155200103907, "language_loss": 0.79778421, "learning_rate": 1.4808259599550738e-06, "loss": 0.81631863, "num_input_tokens_seen": 213632230, "step": 9914, "time_per_iteration": 2.790907382965088 }, { "auxiliary_loss_clip": 0.01079867, "auxiliary_loss_mlp": 0.01029281, "balance_loss_clip": 1.03796613, "balance_loss_mlp": 1.01610804, "epoch": 0.596122050202916, "flos": 16836790366080.0, "grad_norm": 1.724717360749454, "language_loss": 0.67540228, "learning_rate": 1.4804498589747448e-06, "loss": 0.69649374, "num_input_tokens_seen": 213649645, "step": 9915, "time_per_iteration": 2.701197385787964 }, { "auxiliary_loss_clip": 0.01088406, "auxiliary_loss_mlp": 0.01030943, "balance_loss_clip": 1.03837395, "balance_loss_mlp": 1.0187242, "epoch": 0.596182173455584, "flos": 20996646393600.0, "grad_norm": 1.462048268018942, "language_loss": 0.78788066, "learning_rate": 1.4800737776958095e-06, "loss": 0.8090741, "num_input_tokens_seen": 213668850, "step": 9916, "time_per_iteration": 2.7466511726379395 }, { "auxiliary_loss_clip": 0.01093274, "auxiliary_loss_mlp": 0.01031597, "balance_loss_clip": 1.03742838, "balance_loss_mlp": 1.01851332, "epoch": 0.5962422967082519, "flos": 16065680169600.0, "grad_norm": 1.8319257164110343, "language_loss": 0.8272475, "learning_rate": 1.4796977161325286e-06, "loss": 0.84849626, "num_input_tokens_seen": 213685695, "step": 9917, "time_per_iteration": 2.6762564182281494 }, { "auxiliary_loss_clip": 0.01090404, "auxiliary_loss_mlp": 0.01034476, "balance_loss_clip": 1.04083288, "balance_loss_mlp": 1.02195954, "epoch": 0.5963024199609199, "flos": 12166966995840.0, "grad_norm": 1.8036319058685593, "language_loss": 0.76979315, "learning_rate": 1.4793216742991625e-06, "loss": 0.79104197, "num_input_tokens_seen": 213703515, "step": 9918, "time_per_iteration": 2.707718849182129 }, { "auxiliary_loss_clip": 0.01108865, "auxiliary_loss_mlp": 0.01038575, "balance_loss_clip": 1.04414129, "balance_loss_mlp": 1.02538431, "epoch": 0.5963625432135878, "flos": 28074280101120.0, "grad_norm": 2.6956936924639012, "language_loss": 0.78955698, "learning_rate": 1.4789456522099707e-06, "loss": 0.8110314, "num_input_tokens_seen": 213724170, "step": 9919, "time_per_iteration": 2.732933759689331 }, { "auxiliary_loss_clip": 0.01091105, "auxiliary_loss_mlp": 0.01037217, "balance_loss_clip": 1.04111147, "balance_loss_mlp": 1.02323401, "epoch": 0.5964226664662559, "flos": 19860324664320.0, "grad_norm": 1.8773735409019414, "language_loss": 0.77863061, "learning_rate": 1.4785696498792122e-06, "loss": 0.79991376, "num_input_tokens_seen": 213740620, "step": 9920, "time_per_iteration": 2.6758365631103516 }, { "auxiliary_loss_clip": 0.01105504, "auxiliary_loss_mlp": 0.01037014, "balance_loss_clip": 1.04226005, "balance_loss_mlp": 1.02303123, "epoch": 0.5964827897189238, "flos": 12932618325120.0, "grad_norm": 2.199993791667526, "language_loss": 0.82559252, "learning_rate": 1.4781936673211446e-06, "loss": 0.84701777, "num_input_tokens_seen": 213755390, "step": 9921, "time_per_iteration": 2.631972312927246 }, { "auxiliary_loss_clip": 0.0110339, "auxiliary_loss_mlp": 0.01032591, "balance_loss_clip": 1.0396421, "balance_loss_mlp": 1.01888192, "epoch": 0.5965429129715918, "flos": 18150797698560.0, "grad_norm": 3.5044992121063103, "language_loss": 0.80699342, "learning_rate": 1.4778177045500252e-06, "loss": 0.82835329, "num_input_tokens_seen": 213773225, "step": 9922, "time_per_iteration": 2.646479606628418 }, { "auxiliary_loss_clip": 0.01107944, "auxiliary_loss_mlp": 0.00770214, "balance_loss_clip": 1.04096532, "balance_loss_mlp": 1.000036, "epoch": 0.5966030362242597, "flos": 21763231476480.0, "grad_norm": 1.7423002236659255, "language_loss": 0.77125442, "learning_rate": 1.477441761580111e-06, "loss": 0.79003608, "num_input_tokens_seen": 213791860, "step": 9923, "time_per_iteration": 2.646597385406494 }, { "auxiliary_loss_clip": 0.01105997, "auxiliary_loss_mlp": 0.01038842, "balance_loss_clip": 1.04343677, "balance_loss_mlp": 1.02382815, "epoch": 0.5966631594769277, "flos": 18807208790400.0, "grad_norm": 1.7872252192325138, "language_loss": 0.76111019, "learning_rate": 1.4770658384256573e-06, "loss": 0.78255856, "num_input_tokens_seen": 213809455, "step": 9924, "time_per_iteration": 2.784302234649658 }, { "auxiliary_loss_clip": 0.01098024, "auxiliary_loss_mlp": 0.0103727, "balance_loss_clip": 1.03841281, "balance_loss_mlp": 1.02270854, "epoch": 0.5967232827295956, "flos": 14064163545600.0, "grad_norm": 2.5918588496554222, "language_loss": 0.66627729, "learning_rate": 1.4766899351009204e-06, "loss": 0.6876303, "num_input_tokens_seen": 213826615, "step": 9925, "time_per_iteration": 2.6964471340179443 }, { "auxiliary_loss_clip": 0.01088743, "auxiliary_loss_mlp": 0.01035345, "balance_loss_clip": 1.04202008, "balance_loss_mlp": 1.0219934, "epoch": 0.5967834059822636, "flos": 17238235743360.0, "grad_norm": 2.607968523577736, "language_loss": 0.71629661, "learning_rate": 1.4763140516201528e-06, "loss": 0.7375375, "num_input_tokens_seen": 213844495, "step": 9926, "time_per_iteration": 2.739656448364258 }, { "auxiliary_loss_clip": 0.01076071, "auxiliary_loss_mlp": 0.00771823, "balance_loss_clip": 1.04067254, "balance_loss_mlp": 1.0001483, "epoch": 0.5968435292349316, "flos": 42520244284800.0, "grad_norm": 1.798681806501109, "language_loss": 0.70456839, "learning_rate": 1.4759381879976088e-06, "loss": 0.72304738, "num_input_tokens_seen": 213869125, "step": 9927, "time_per_iteration": 2.9877870082855225 }, { "auxiliary_loss_clip": 0.01071922, "auxiliary_loss_mlp": 0.01029018, "balance_loss_clip": 1.03775859, "balance_loss_mlp": 1.01547647, "epoch": 0.5969036524875996, "flos": 37630898945280.0, "grad_norm": 1.7276883821850428, "language_loss": 0.63847625, "learning_rate": 1.4755623442475415e-06, "loss": 0.6594857, "num_input_tokens_seen": 213891115, "step": 9928, "time_per_iteration": 2.889533042907715 }, { "auxiliary_loss_clip": 0.01115406, "auxiliary_loss_mlp": 0.0103325, "balance_loss_clip": 1.04134023, "balance_loss_mlp": 1.02103138, "epoch": 0.5969637757402676, "flos": 23148377694720.0, "grad_norm": 1.6663701476220254, "language_loss": 0.69803309, "learning_rate": 1.4751865203842022e-06, "loss": 0.71951973, "num_input_tokens_seen": 213911925, "step": 9929, "time_per_iteration": 2.6571357250213623 }, { "auxiliary_loss_clip": 0.01073832, "auxiliary_loss_mlp": 0.01034603, "balance_loss_clip": 1.04385591, "balance_loss_mlp": 1.02244925, "epoch": 0.5970238989929355, "flos": 24020934877440.0, "grad_norm": 1.7972325287685906, "language_loss": 0.76839757, "learning_rate": 1.4748107164218431e-06, "loss": 0.78948194, "num_input_tokens_seen": 213930715, "step": 9930, "time_per_iteration": 2.7475857734680176 }, { "auxiliary_loss_clip": 0.0109514, "auxiliary_loss_mlp": 0.01034752, "balance_loss_clip": 1.04357862, "balance_loss_mlp": 1.02017856, "epoch": 0.5970840222456035, "flos": 19426883247360.0, "grad_norm": 1.7574249474808616, "language_loss": 0.68748617, "learning_rate": 1.4744349323747146e-06, "loss": 0.70878506, "num_input_tokens_seen": 213950015, "step": 9931, "time_per_iteration": 2.713695526123047 }, { "auxiliary_loss_clip": 0.01025314, "auxiliary_loss_mlp": 0.01000381, "balance_loss_clip": 1.01468325, "balance_loss_mlp": 0.99920666, "epoch": 0.5971441454982714, "flos": 62976615235200.0, "grad_norm": 0.8553027537300191, "language_loss": 0.64182514, "learning_rate": 1.474059168257065e-06, "loss": 0.66208208, "num_input_tokens_seen": 214003330, "step": 9932, "time_per_iteration": 3.106821060180664 }, { "auxiliary_loss_clip": 0.01084112, "auxiliary_loss_mlp": 0.01032121, "balance_loss_clip": 1.03818321, "balance_loss_mlp": 1.01869833, "epoch": 0.5972042687509395, "flos": 20266223328000.0, "grad_norm": 2.9993889514324463, "language_loss": 0.73966062, "learning_rate": 1.4736834240831454e-06, "loss": 0.76082295, "num_input_tokens_seen": 214021680, "step": 9933, "time_per_iteration": 2.718324899673462 }, { "auxiliary_loss_clip": 0.01028586, "auxiliary_loss_mlp": 0.01004687, "balance_loss_clip": 1.02009809, "balance_loss_mlp": 1.00334597, "epoch": 0.5972643920036074, "flos": 71652383832960.0, "grad_norm": 0.6592973095113355, "language_loss": 0.52000248, "learning_rate": 1.473307699867203e-06, "loss": 0.54033524, "num_input_tokens_seen": 214090265, "step": 9934, "time_per_iteration": 3.265408515930176 }, { "auxiliary_loss_clip": 0.01038691, "auxiliary_loss_mlp": 0.01008472, "balance_loss_clip": 1.01466894, "balance_loss_mlp": 1.00733399, "epoch": 0.5973245152562754, "flos": 56892702263040.0, "grad_norm": 0.8334850866606021, "language_loss": 0.54153717, "learning_rate": 1.4729319956234849e-06, "loss": 0.5620088, "num_input_tokens_seen": 214146375, "step": 9935, "time_per_iteration": 3.07120680809021 }, { "auxiliary_loss_clip": 0.01095451, "auxiliary_loss_mlp": 0.01033243, "balance_loss_clip": 1.04008901, "balance_loss_mlp": 1.01956391, "epoch": 0.5973846385089433, "flos": 24164361884160.0, "grad_norm": 1.5706852760220016, "language_loss": 0.66061485, "learning_rate": 1.4725563113662394e-06, "loss": 0.68190181, "num_input_tokens_seen": 214165340, "step": 9936, "time_per_iteration": 2.724457263946533 }, { "auxiliary_loss_clip": 0.01060903, "auxiliary_loss_mlp": 0.01035654, "balance_loss_clip": 1.03609622, "balance_loss_mlp": 1.02246332, "epoch": 0.5974447617616113, "flos": 17670599752320.0, "grad_norm": 1.9876387260879245, "language_loss": 0.6771605, "learning_rate": 1.4721806471097103e-06, "loss": 0.69812608, "num_input_tokens_seen": 214181360, "step": 9937, "time_per_iteration": 2.75978422164917 }, { "auxiliary_loss_clip": 0.0111018, "auxiliary_loss_mlp": 0.01032313, "balance_loss_clip": 1.04208851, "balance_loss_mlp": 1.01846123, "epoch": 0.5975048850142792, "flos": 22892514140160.0, "grad_norm": 2.408863368051578, "language_loss": 0.77660179, "learning_rate": 1.4718050028681442e-06, "loss": 0.79802668, "num_input_tokens_seen": 214198525, "step": 9938, "time_per_iteration": 4.499311447143555 }, { "auxiliary_loss_clip": 0.01105785, "auxiliary_loss_mlp": 0.01034855, "balance_loss_clip": 1.03925014, "balance_loss_mlp": 1.02100301, "epoch": 0.5975650082669473, "flos": 24353108876160.0, "grad_norm": 1.4410606641316148, "language_loss": 0.75726342, "learning_rate": 1.4714293786557855e-06, "loss": 0.77866983, "num_input_tokens_seen": 214218710, "step": 9939, "time_per_iteration": 4.202291011810303 }, { "auxiliary_loss_clip": 0.01073866, "auxiliary_loss_mlp": 0.01032947, "balance_loss_clip": 1.04116249, "balance_loss_mlp": 1.01718175, "epoch": 0.5976251315196152, "flos": 20923352691840.0, "grad_norm": 4.812638761028828, "language_loss": 0.68618965, "learning_rate": 1.471053774486878e-06, "loss": 0.70725775, "num_input_tokens_seen": 214237800, "step": 9940, "time_per_iteration": 4.418368339538574 }, { "auxiliary_loss_clip": 0.01090139, "auxiliary_loss_mlp": 0.01036739, "balance_loss_clip": 1.04158998, "balance_loss_mlp": 1.02415049, "epoch": 0.5976852547722832, "flos": 35844594658560.0, "grad_norm": 1.3494600203677949, "language_loss": 0.70370513, "learning_rate": 1.470678190375664e-06, "loss": 0.72497392, "num_input_tokens_seen": 214260355, "step": 9941, "time_per_iteration": 2.7807397842407227 }, { "auxiliary_loss_clip": 0.01092498, "auxiliary_loss_mlp": 0.01034522, "balance_loss_clip": 1.03824401, "balance_loss_mlp": 1.02123034, "epoch": 0.5977453780249512, "flos": 12855948744960.0, "grad_norm": 1.9808022638780955, "language_loss": 0.77407408, "learning_rate": 1.470302626336386e-06, "loss": 0.79534429, "num_input_tokens_seen": 214277120, "step": 9942, "time_per_iteration": 2.6881802082061768 }, { "auxiliary_loss_clip": 0.01071168, "auxiliary_loss_mlp": 0.01037338, "balance_loss_clip": 1.03963232, "balance_loss_mlp": 1.02418923, "epoch": 0.5978055012776191, "flos": 20959155573120.0, "grad_norm": 1.9541019064521015, "language_loss": 0.76172185, "learning_rate": 1.4699270823832857e-06, "loss": 0.78280699, "num_input_tokens_seen": 214295300, "step": 9943, "time_per_iteration": 4.4215734004974365 }, { "auxiliary_loss_clip": 0.0105205, "auxiliary_loss_mlp": 0.01034121, "balance_loss_clip": 1.03876281, "balance_loss_mlp": 1.02149105, "epoch": 0.5978656245302871, "flos": 34058003063040.0, "grad_norm": 1.735048648764757, "language_loss": 0.62473679, "learning_rate": 1.4695515585306032e-06, "loss": 0.64559853, "num_input_tokens_seen": 214317050, "step": 9944, "time_per_iteration": 2.8701138496398926 }, { "auxiliary_loss_clip": 0.0109987, "auxiliary_loss_mlp": 0.0103879, "balance_loss_clip": 1.04420114, "balance_loss_mlp": 1.02530718, "epoch": 0.597925747782955, "flos": 37373275624320.0, "grad_norm": 1.7121148929704375, "language_loss": 0.72442955, "learning_rate": 1.4691760547925795e-06, "loss": 0.74581611, "num_input_tokens_seen": 214337470, "step": 9945, "time_per_iteration": 2.7868094444274902 }, { "auxiliary_loss_clip": 0.01063078, "auxiliary_loss_mlp": 0.01035839, "balance_loss_clip": 1.03817308, "balance_loss_mlp": 1.02280903, "epoch": 0.5979858710356231, "flos": 25374803328000.0, "grad_norm": 2.215747344961558, "language_loss": 0.66905904, "learning_rate": 1.4688005711834522e-06, "loss": 0.6900481, "num_input_tokens_seen": 214357975, "step": 9946, "time_per_iteration": 2.83195161819458 }, { "auxiliary_loss_clip": 0.01104512, "auxiliary_loss_mlp": 0.01042624, "balance_loss_clip": 1.03969336, "balance_loss_mlp": 1.0275619, "epoch": 0.598045994288291, "flos": 13698413308800.0, "grad_norm": 1.928704516420183, "language_loss": 0.88898396, "learning_rate": 1.468425107717461e-06, "loss": 0.91045535, "num_input_tokens_seen": 214374125, "step": 9947, "time_per_iteration": 2.5993123054504395 }, { "auxiliary_loss_clip": 0.01112155, "auxiliary_loss_mlp": 0.01032443, "balance_loss_clip": 1.04039431, "balance_loss_mlp": 1.02080822, "epoch": 0.598106117540959, "flos": 21981352815360.0, "grad_norm": 1.8699586676771087, "language_loss": 0.72236538, "learning_rate": 1.4680496644088432e-06, "loss": 0.74381137, "num_input_tokens_seen": 214393395, "step": 9948, "time_per_iteration": 2.6766860485076904 }, { "auxiliary_loss_clip": 0.01093809, "auxiliary_loss_mlp": 0.01035478, "balance_loss_clip": 1.03969812, "balance_loss_mlp": 1.02129257, "epoch": 0.5981662407936269, "flos": 20559362221440.0, "grad_norm": 1.8848269321833362, "language_loss": 0.89223683, "learning_rate": 1.4676742412718347e-06, "loss": 0.91352975, "num_input_tokens_seen": 214411550, "step": 9949, "time_per_iteration": 2.731804370880127 }, { "auxiliary_loss_clip": 0.01105698, "auxiliary_loss_mlp": 0.01030419, "balance_loss_clip": 1.0420059, "balance_loss_mlp": 1.01814604, "epoch": 0.5982263640462949, "flos": 14063840323200.0, "grad_norm": 2.0634992965968917, "language_loss": 0.70250058, "learning_rate": 1.467298838320673e-06, "loss": 0.72386169, "num_input_tokens_seen": 214429780, "step": 9950, "time_per_iteration": 2.666879415512085 }, { "auxiliary_loss_clip": 0.01103442, "auxiliary_loss_mlp": 0.01031809, "balance_loss_clip": 1.0392406, "balance_loss_mlp": 1.01904809, "epoch": 0.5982864872989628, "flos": 17707228646400.0, "grad_norm": 1.610292824709656, "language_loss": 0.78345191, "learning_rate": 1.4669234555695921e-06, "loss": 0.80480444, "num_input_tokens_seen": 214447775, "step": 9951, "time_per_iteration": 2.624361753463745 }, { "auxiliary_loss_clip": 0.01096152, "auxiliary_loss_mlp": 0.01038536, "balance_loss_clip": 1.0411104, "balance_loss_mlp": 1.02471995, "epoch": 0.5983466105516309, "flos": 16764789553920.0, "grad_norm": 1.4677439185999286, "language_loss": 0.73951542, "learning_rate": 1.4665480930328275e-06, "loss": 0.76086229, "num_input_tokens_seen": 214467245, "step": 9952, "time_per_iteration": 2.780212640762329 }, { "auxiliary_loss_clip": 0.01097597, "auxiliary_loss_mlp": 0.00771764, "balance_loss_clip": 1.04058945, "balance_loss_mlp": 1.0000577, "epoch": 0.5984067338042988, "flos": 20042714949120.0, "grad_norm": 2.0876696722134493, "language_loss": 0.79496032, "learning_rate": 1.466172750724613e-06, "loss": 0.81365395, "num_input_tokens_seen": 214484385, "step": 9953, "time_per_iteration": 2.6629557609558105 }, { "auxiliary_loss_clip": 0.01088175, "auxiliary_loss_mlp": 0.01034264, "balance_loss_clip": 1.04368794, "balance_loss_mlp": 1.02172363, "epoch": 0.5984668570569668, "flos": 26319900026880.0, "grad_norm": 1.571611875852805, "language_loss": 0.69577867, "learning_rate": 1.4657974286591807e-06, "loss": 0.71700311, "num_input_tokens_seen": 214503465, "step": 9954, "time_per_iteration": 2.772745132446289 }, { "auxiliary_loss_clip": 0.01092663, "auxiliary_loss_mlp": 0.0103537, "balance_loss_clip": 1.03927422, "balance_loss_mlp": 1.02299023, "epoch": 0.5985269803096348, "flos": 20593728558720.0, "grad_norm": 1.8709505635033254, "language_loss": 0.73055756, "learning_rate": 1.4654221268507637e-06, "loss": 0.75183785, "num_input_tokens_seen": 214520725, "step": 9955, "time_per_iteration": 2.6827971935272217 }, { "auxiliary_loss_clip": 0.01118308, "auxiliary_loss_mlp": 0.01034246, "balance_loss_clip": 1.04205883, "balance_loss_mlp": 1.0209837, "epoch": 0.5985871035623027, "flos": 26865382942080.0, "grad_norm": 1.5476020192092728, "language_loss": 0.68627518, "learning_rate": 1.4650468453135934e-06, "loss": 0.70780075, "num_input_tokens_seen": 214540675, "step": 9956, "time_per_iteration": 2.6055126190185547 }, { "auxiliary_loss_clip": 0.01120333, "auxiliary_loss_mlp": 0.01033532, "balance_loss_clip": 1.0435667, "balance_loss_mlp": 1.02041864, "epoch": 0.5986472268149707, "flos": 19609704495360.0, "grad_norm": 5.767015828905461, "language_loss": 0.74026513, "learning_rate": 1.4646715840618999e-06, "loss": 0.76180387, "num_input_tokens_seen": 214559910, "step": 9957, "time_per_iteration": 2.670759677886963 }, { "auxiliary_loss_clip": 0.01082315, "auxiliary_loss_mlp": 0.01029692, "balance_loss_clip": 1.04125023, "balance_loss_mlp": 1.01696002, "epoch": 0.5987073500676386, "flos": 21794616984960.0, "grad_norm": 2.0517993540808157, "language_loss": 0.84612942, "learning_rate": 1.4642963431099138e-06, "loss": 0.86724949, "num_input_tokens_seen": 214575960, "step": 9958, "time_per_iteration": 2.710693597793579 }, { "auxiliary_loss_clip": 0.01088695, "auxiliary_loss_mlp": 0.00771117, "balance_loss_clip": 1.04130435, "balance_loss_mlp": 1.00005364, "epoch": 0.5987674733203067, "flos": 24314361079680.0, "grad_norm": 1.9589439151063424, "language_loss": 0.6649909, "learning_rate": 1.463921122471864e-06, "loss": 0.68358904, "num_input_tokens_seen": 214594230, "step": 9959, "time_per_iteration": 2.7052528858184814 }, { "auxiliary_loss_clip": 0.0110604, "auxiliary_loss_mlp": 0.01031714, "balance_loss_clip": 1.04048181, "balance_loss_mlp": 1.01915514, "epoch": 0.5988275965729746, "flos": 21320201128320.0, "grad_norm": 1.6803724665796522, "language_loss": 0.83453488, "learning_rate": 1.4635459221619796e-06, "loss": 0.85591239, "num_input_tokens_seen": 214613130, "step": 9960, "time_per_iteration": 2.698373317718506 }, { "auxiliary_loss_clip": 0.0110105, "auxiliary_loss_mlp": 0.01026917, "balance_loss_clip": 1.04384398, "balance_loss_mlp": 1.01451361, "epoch": 0.5988877198256426, "flos": 25118041933440.0, "grad_norm": 1.4637618649833892, "language_loss": 0.79449862, "learning_rate": 1.4631707421944868e-06, "loss": 0.81577832, "num_input_tokens_seen": 214634470, "step": 9961, "time_per_iteration": 2.763143539428711 }, { "auxiliary_loss_clip": 0.01115923, "auxiliary_loss_mlp": 0.01034214, "balance_loss_clip": 1.04150534, "balance_loss_mlp": 1.02107751, "epoch": 0.5989478430783105, "flos": 26429104350720.0, "grad_norm": 1.7720947984672266, "language_loss": 0.66938126, "learning_rate": 1.4627955825836136e-06, "loss": 0.69088268, "num_input_tokens_seen": 214654030, "step": 9962, "time_per_iteration": 2.6398210525512695 }, { "auxiliary_loss_clip": 0.01100963, "auxiliary_loss_mlp": 0.01040353, "balance_loss_clip": 1.03867447, "balance_loss_mlp": 1.02583313, "epoch": 0.5990079663309785, "flos": 25778439434880.0, "grad_norm": 1.3371562951805418, "language_loss": 0.74043596, "learning_rate": 1.4624204433435857e-06, "loss": 0.76184916, "num_input_tokens_seen": 214676985, "step": 9963, "time_per_iteration": 2.716456651687622 }, { "auxiliary_loss_clip": 0.01105789, "auxiliary_loss_mlp": 0.010335, "balance_loss_clip": 1.04120398, "balance_loss_mlp": 1.02003562, "epoch": 0.5990680895836464, "flos": 36831779118720.0, "grad_norm": 1.8119605341465645, "language_loss": 0.68010569, "learning_rate": 1.4620453244886281e-06, "loss": 0.70149863, "num_input_tokens_seen": 214700105, "step": 9964, "time_per_iteration": 2.764112710952759 }, { "auxiliary_loss_clip": 0.01082495, "auxiliary_loss_mlp": 0.01029274, "balance_loss_clip": 1.04189765, "balance_loss_mlp": 1.0158987, "epoch": 0.5991282128363145, "flos": 24133550993280.0, "grad_norm": 1.838028773427246, "language_loss": 0.76536453, "learning_rate": 1.4616702260329662e-06, "loss": 0.78648221, "num_input_tokens_seen": 214717885, "step": 9965, "time_per_iteration": 2.6872916221618652 }, { "auxiliary_loss_clip": 0.01100107, "auxiliary_loss_mlp": 0.01029644, "balance_loss_clip": 1.03997707, "balance_loss_mlp": 1.01664448, "epoch": 0.5991883360889824, "flos": 10304064956160.0, "grad_norm": 1.881941118756219, "language_loss": 0.77352554, "learning_rate": 1.4612951479908229e-06, "loss": 0.79482305, "num_input_tokens_seen": 214733680, "step": 9966, "time_per_iteration": 2.645473003387451 }, { "auxiliary_loss_clip": 0.01080024, "auxiliary_loss_mlp": 0.01029432, "balance_loss_clip": 1.04003799, "balance_loss_mlp": 1.01742721, "epoch": 0.5992484593416504, "flos": 23951196622080.0, "grad_norm": 1.4675663731632993, "language_loss": 0.73089266, "learning_rate": 1.460920090376422e-06, "loss": 0.75198722, "num_input_tokens_seen": 214753285, "step": 9967, "time_per_iteration": 2.7043392658233643 }, { "auxiliary_loss_clip": 0.0111042, "auxiliary_loss_mlp": 0.01035802, "balance_loss_clip": 1.04168642, "balance_loss_mlp": 1.02200305, "epoch": 0.5993085825943184, "flos": 11944105061760.0, "grad_norm": 2.0432757361111724, "language_loss": 0.68492925, "learning_rate": 1.4605450532039847e-06, "loss": 0.70639145, "num_input_tokens_seen": 214767810, "step": 9968, "time_per_iteration": 2.618802070617676 }, { "auxiliary_loss_clip": 0.01104497, "auxiliary_loss_mlp": 0.01037187, "balance_loss_clip": 1.03805614, "balance_loss_mlp": 1.02315605, "epoch": 0.5993687058469863, "flos": 19026838500480.0, "grad_norm": 1.5933947258371375, "language_loss": 0.79251635, "learning_rate": 1.4601700364877334e-06, "loss": 0.81393319, "num_input_tokens_seen": 214786040, "step": 9969, "time_per_iteration": 2.6758008003234863 }, { "auxiliary_loss_clip": 0.01100647, "auxiliary_loss_mlp": 0.01031223, "balance_loss_clip": 1.03998137, "balance_loss_mlp": 1.0176506, "epoch": 0.5994288290996543, "flos": 14282967242880.0, "grad_norm": 1.6601112189929519, "language_loss": 0.80936122, "learning_rate": 1.4597950402418889e-06, "loss": 0.83067989, "num_input_tokens_seen": 214803110, "step": 9970, "time_per_iteration": 2.7434401512145996 }, { "auxiliary_loss_clip": 0.01064445, "auxiliary_loss_mlp": 0.01044271, "balance_loss_clip": 1.0378437, "balance_loss_mlp": 1.02879751, "epoch": 0.5994889523523222, "flos": 19206643006080.0, "grad_norm": 2.015109530583561, "language_loss": 0.61666113, "learning_rate": 1.4594200644806697e-06, "loss": 0.6377483, "num_input_tokens_seen": 214819945, "step": 9971, "time_per_iteration": 2.6593470573425293 }, { "auxiliary_loss_clip": 0.01112816, "auxiliary_loss_mlp": 0.01033245, "balance_loss_clip": 1.04096997, "balance_loss_mlp": 1.02065659, "epoch": 0.5995490756049903, "flos": 28037040675840.0, "grad_norm": 1.7466561522631148, "language_loss": 0.79054534, "learning_rate": 1.4590451092182962e-06, "loss": 0.81200594, "num_input_tokens_seen": 214838810, "step": 9972, "time_per_iteration": 2.657733917236328 }, { "auxiliary_loss_clip": 0.01077287, "auxiliary_loss_mlp": 0.0103561, "balance_loss_clip": 1.03948355, "balance_loss_mlp": 1.0220139, "epoch": 0.5996091988576582, "flos": 29052953038080.0, "grad_norm": 2.7295276371688657, "language_loss": 0.76414442, "learning_rate": 1.4586701744689864e-06, "loss": 0.78527337, "num_input_tokens_seen": 214857040, "step": 9973, "time_per_iteration": 2.804370880126953 }, { "auxiliary_loss_clip": 0.01080222, "auxiliary_loss_mlp": 0.01031483, "balance_loss_clip": 1.03798461, "balance_loss_mlp": 1.01820862, "epoch": 0.5996693221103262, "flos": 20813968800000.0, "grad_norm": 2.687412315258338, "language_loss": 0.65429473, "learning_rate": 1.4582952602469578e-06, "loss": 0.6754117, "num_input_tokens_seen": 214873375, "step": 9974, "time_per_iteration": 2.7193095684051514 }, { "auxiliary_loss_clip": 0.01106109, "auxiliary_loss_mlp": 0.01032556, "balance_loss_clip": 1.0399034, "balance_loss_mlp": 1.01984227, "epoch": 0.5997294453629941, "flos": 23768914078080.0, "grad_norm": 1.3699302504221633, "language_loss": 0.74378854, "learning_rate": 1.457920366566428e-06, "loss": 0.76517522, "num_input_tokens_seen": 214893900, "step": 9975, "time_per_iteration": 2.6727962493896484 }, { "auxiliary_loss_clip": 0.01117306, "auxiliary_loss_mlp": 0.01031631, "balance_loss_clip": 1.04184341, "balance_loss_mlp": 1.01771951, "epoch": 0.5997895686156621, "flos": 20960017499520.0, "grad_norm": 1.8689128111534072, "language_loss": 0.77081978, "learning_rate": 1.457545493441611e-06, "loss": 0.79230917, "num_input_tokens_seen": 214912110, "step": 9976, "time_per_iteration": 2.5855295658111572 }, { "auxiliary_loss_clip": 0.01101132, "auxiliary_loss_mlp": 0.01036343, "balance_loss_clip": 1.04325271, "balance_loss_mlp": 1.0225029, "epoch": 0.59984969186833, "flos": 28365443746560.0, "grad_norm": 2.489782776024688, "language_loss": 0.74998355, "learning_rate": 1.4571706408867237e-06, "loss": 0.77135837, "num_input_tokens_seen": 214930140, "step": 9977, "time_per_iteration": 4.355423212051392 }, { "auxiliary_loss_clip": 0.01081083, "auxiliary_loss_mlp": 0.01029688, "balance_loss_clip": 1.03771675, "balance_loss_mlp": 1.01639032, "epoch": 0.5999098151209981, "flos": 22565906749440.0, "grad_norm": 1.7961745328309484, "language_loss": 0.69053113, "learning_rate": 1.4567958089159802e-06, "loss": 0.71163881, "num_input_tokens_seen": 214949200, "step": 9978, "time_per_iteration": 2.687735080718994 }, { "auxiliary_loss_clip": 0.01124045, "auxiliary_loss_mlp": 0.01035056, "balance_loss_clip": 1.04541636, "balance_loss_mlp": 1.02081037, "epoch": 0.599969938373666, "flos": 18768712389120.0, "grad_norm": 1.9378111201967976, "language_loss": 0.81427479, "learning_rate": 1.456420997543594e-06, "loss": 0.8358658, "num_input_tokens_seen": 214965775, "step": 9979, "time_per_iteration": 5.60455322265625 }, { "auxiliary_loss_clip": 0.01113469, "auxiliary_loss_mlp": 0.0103294, "balance_loss_clip": 1.04139137, "balance_loss_mlp": 1.02011895, "epoch": 0.600030061626334, "flos": 11327231865600.0, "grad_norm": 2.0199004568827577, "language_loss": 0.70054936, "learning_rate": 1.4560462067837782e-06, "loss": 0.72201335, "num_input_tokens_seen": 214982480, "step": 9980, "time_per_iteration": 2.5815303325653076 }, { "auxiliary_loss_clip": 0.01105293, "auxiliary_loss_mlp": 0.01032543, "balance_loss_clip": 1.03971553, "balance_loss_mlp": 1.01786244, "epoch": 0.600090184879002, "flos": 16578664254720.0, "grad_norm": 2.2746227330860327, "language_loss": 0.686566, "learning_rate": 1.4556714366507445e-06, "loss": 0.70794439, "num_input_tokens_seen": 214998110, "step": 9981, "time_per_iteration": 2.635133743286133 }, { "auxiliary_loss_clip": 0.01106547, "auxiliary_loss_mlp": 0.01036545, "balance_loss_clip": 1.04316497, "balance_loss_mlp": 1.02458215, "epoch": 0.6001503081316699, "flos": 23618627573760.0, "grad_norm": 1.8281310539755133, "language_loss": 0.78525096, "learning_rate": 1.4552966871587048e-06, "loss": 0.80668187, "num_input_tokens_seen": 215017995, "step": 9982, "time_per_iteration": 4.6227052211761475 }, { "auxiliary_loss_clip": 0.01066865, "auxiliary_loss_mlp": 0.01043371, "balance_loss_clip": 1.03895831, "balance_loss_mlp": 1.02730179, "epoch": 0.6002104313843379, "flos": 20667668705280.0, "grad_norm": 1.558592797835216, "language_loss": 0.73127562, "learning_rate": 1.4549219583218686e-06, "loss": 0.75237799, "num_input_tokens_seen": 215038285, "step": 9983, "time_per_iteration": 2.851017951965332 }, { "auxiliary_loss_clip": 0.01075266, "auxiliary_loss_mlp": 0.01033335, "balance_loss_clip": 1.03699243, "balance_loss_mlp": 1.01962018, "epoch": 0.6002705546370058, "flos": 22455229968000.0, "grad_norm": 4.7484025968689325, "language_loss": 0.78227878, "learning_rate": 1.454547250154447e-06, "loss": 0.80336481, "num_input_tokens_seen": 215057825, "step": 9984, "time_per_iteration": 2.6935315132141113 } ], "logging_steps": 1.0, "max_steps": 16632, "num_input_tokens_seen": 215057825, "num_train_epochs": 1, "save_steps": 3328, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 8.384161463224238e+17, "train_batch_size": 5, "trial_name": null, "trial_params": null }