{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.20009018487900196, "eval_steps": 500, "global_step": 3328, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "auxiliary_loss_clip": 0.05028445, "auxiliary_loss_mlp": 0.02215396, "balance_loss_clip": 2.43573999, "balance_loss_mlp": 1.76983953, "epoch": 6.012325266796934e-05, "flos": 24456507091200.0, "grad_norm": 55.00561300220404, "language_loss": 2.85272503, "learning_rate": 0.0, "loss": 1.94613922, "num_input_tokens_seen": 19155, "step": 1, "time_per_iteration": 18.059409618377686 }, { "auxiliary_loss_clip": 0.03380539, "auxiliary_loss_mlp": 0.01459449, "balance_loss_clip": 1.62786555, "balance_loss_mlp": 1.18936849, "epoch": 0.00012024650533593868, "flos": 20225931246720.0, "grad_norm": 34.93149751452764, "language_loss": 1.82606053, "learning_rate": 4.4628432569317594e-07, "loss": 1.87446034, "num_input_tokens_seen": 36175, "step": 2, "time_per_iteration": 2.6318798065185547 }, { "auxiliary_loss_clip": 0.03320229, "auxiliary_loss_mlp": 0.01440978, "balance_loss_clip": 1.62577581, "balance_loss_mlp": 1.18882656, "epoch": 0.000180369758003908, "flos": 22309935454080.0, "grad_norm": 32.71870482280511, "language_loss": 1.57573509, "learning_rate": 7.073439208833112e-07, "loss": 1.62334716, "num_input_tokens_seen": 54870, "step": 3, "time_per_iteration": 2.6362481117248535 }, { "auxiliary_loss_clip": 0.03361497, "auxiliary_loss_mlp": 0.01451404, "balance_loss_clip": 1.62418985, "balance_loss_mlp": 1.15500188, "epoch": 0.00024049301067187735, "flos": 22414650577920.0, "grad_norm": 51.2387172839747, "language_loss": 1.67362881, "learning_rate": 8.925686513863519e-07, "loss": 1.72175777, "num_input_tokens_seen": 74575, "step": 4, "time_per_iteration": 2.7070822715759277 }, { "auxiliary_loss_clip": 0.03402497, "auxiliary_loss_mlp": 0.01505358, "balance_loss_clip": 1.62493396, "balance_loss_mlp": 1.21715808, "epoch": 0.0003006162633398467, "flos": 21396978449280.0, "grad_norm": 56.088721215944275, "language_loss": 1.91627169, "learning_rate": 1.0362401141348472e-06, "loss": 1.96535027, "num_input_tokens_seen": 92580, "step": 5, "time_per_iteration": 2.91436767578125 }, { "auxiliary_loss_clip": 0.03370454, "auxiliary_loss_mlp": 0.01515599, "balance_loss_clip": 1.61556244, "balance_loss_mlp": 1.22110426, "epoch": 0.000360739516007816, "flos": 21652375127040.0, "grad_norm": 33.397169652317885, "language_loss": 1.60591149, "learning_rate": 1.153628246576487e-06, "loss": 1.65477204, "num_input_tokens_seen": 109705, "step": 6, "time_per_iteration": 2.994969367980957 }, { "auxiliary_loss_clip": 0.03354239, "auxiliary_loss_mlp": 0.01486417, "balance_loss_clip": 1.61577415, "balance_loss_mlp": 1.20336628, "epoch": 0.0004208627686757854, "flos": 27159742897920.0, "grad_norm": 24.6270766983672, "language_loss": 1.53276002, "learning_rate": 1.2528784983718962e-06, "loss": 1.58116663, "num_input_tokens_seen": 129425, "step": 7, "time_per_iteration": 3.0675876140594482 }, { "auxiliary_loss_clip": 0.03321216, "auxiliary_loss_mlp": 0.0144328, "balance_loss_clip": 1.61205018, "balance_loss_mlp": 1.16499734, "epoch": 0.0004809860213437547, "flos": 31319096135040.0, "grad_norm": 31.71613063643349, "language_loss": 1.43881059, "learning_rate": 1.338852977079528e-06, "loss": 1.48645568, "num_input_tokens_seen": 149210, "step": 8, "time_per_iteration": 3.172358751296997 }, { "auxiliary_loss_clip": 0.03368839, "auxiliary_loss_mlp": 0.01496105, "balance_loss_clip": 1.6120348, "balance_loss_mlp": 1.21229148, "epoch": 0.000541109274011724, "flos": 32160411463680.0, "grad_norm": 28.204490849684397, "language_loss": 1.4969244, "learning_rate": 1.4146878417666224e-06, "loss": 1.54557395, "num_input_tokens_seen": 169055, "step": 9, "time_per_iteration": 3.112215280532837 }, { "auxiliary_loss_clip": 0.03308365, "auxiliary_loss_mlp": 0.01475035, "balance_loss_clip": 1.61541438, "balance_loss_mlp": 1.20647991, "epoch": 0.0006012325266796934, "flos": 18916808163840.0, "grad_norm": 23.420774723604698, "language_loss": 1.44714785, "learning_rate": 1.4825244398280232e-06, "loss": 1.49498188, "num_input_tokens_seen": 188045, "step": 10, "time_per_iteration": 2.9495606422424316 }, { "auxiliary_loss_clip": 0.03364194, "auxiliary_loss_mlp": 0.01494262, "balance_loss_clip": 1.62042511, "balance_loss_mlp": 1.22036684, "epoch": 0.0006613557793476627, "flos": 20774861867520.0, "grad_norm": 18.353281468858004, "language_loss": 1.4520936, "learning_rate": 1.5438901072051983e-06, "loss": 1.50067806, "num_input_tokens_seen": 207035, "step": 11, "time_per_iteration": 3.0797431468963623 }, { "auxiliary_loss_clip": 0.03292683, "auxiliary_loss_mlp": 0.0145154, "balance_loss_clip": 1.60771322, "balance_loss_mlp": 1.17554641, "epoch": 0.000721479032015632, "flos": 16581680997120.0, "grad_norm": 16.61869254675767, "language_loss": 1.45121813, "learning_rate": 1.5999125722696629e-06, "loss": 1.49866033, "num_input_tokens_seen": 223225, "step": 12, "time_per_iteration": 2.9887659549713135 }, { "auxiliary_loss_clip": 0.03321669, "auxiliary_loss_mlp": 0.01405912, "balance_loss_clip": 1.61740756, "balance_loss_mlp": 1.14765704, "epoch": 0.0007816022846836014, "flos": 23805471144960.0, "grad_norm": 14.02187318243825, "language_loss": 1.23759985, "learning_rate": 1.6514482443788434e-06, "loss": 1.28487587, "num_input_tokens_seen": 242570, "step": 13, "time_per_iteration": 3.032742977142334 }, { "auxiliary_loss_clip": 0.03287474, "auxiliary_loss_mlp": 0.01470749, "balance_loss_clip": 1.61299658, "balance_loss_mlp": 1.20257616, "epoch": 0.0008417255373515708, "flos": 19172204841600.0, "grad_norm": 5.790568956401358, "language_loss": 1.20684385, "learning_rate": 1.6991628240650723e-06, "loss": 1.254426, "num_input_tokens_seen": 261215, "step": 14, "time_per_iteration": 3.002887487411499 }, { "auxiliary_loss_clip": 0.03272826, "auxiliary_loss_mlp": 0.01431255, "balance_loss_clip": 1.6181426, "balance_loss_mlp": 1.16804111, "epoch": 0.00090184879001954, "flos": 26395564026240.0, "grad_norm": 6.353887091300461, "language_loss": 1.12925518, "learning_rate": 1.7435840350181584e-06, "loss": 1.176296, "num_input_tokens_seen": 280035, "step": 15, "time_per_iteration": 3.0238780975341797 }, { "auxiliary_loss_clip": 0.03238489, "auxiliary_loss_mlp": 0.01411651, "balance_loss_clip": 1.60288334, "balance_loss_mlp": 1.16197944, "epoch": 0.0009619720426875094, "flos": 24679500785280.0, "grad_norm": 4.670310144758637, "language_loss": 1.11125767, "learning_rate": 1.7851373027727038e-06, "loss": 1.15775907, "num_input_tokens_seen": 300265, "step": 16, "time_per_iteration": 4.605847120285034 }, { "auxiliary_loss_clip": 0.03223993, "auxiliary_loss_mlp": 0.01417304, "balance_loss_clip": 1.60910368, "balance_loss_mlp": 1.17774093, "epoch": 0.0010220952953554788, "flos": 18624531196800.0, "grad_norm": 8.838429022323517, "language_loss": 1.12645221, "learning_rate": 1.8241705979033208e-06, "loss": 1.17286515, "num_input_tokens_seen": 317375, "step": 17, "time_per_iteration": 4.579033851623535 }, { "auxiliary_loss_clip": 0.03161492, "auxiliary_loss_mlp": 0.01379312, "balance_loss_clip": 1.60685277, "balance_loss_mlp": 1.1475693, "epoch": 0.001082218548023448, "flos": 26142537646080.0, "grad_norm": 3.823557061532633, "language_loss": 1.08069181, "learning_rate": 1.860972167459798e-06, "loss": 1.12609982, "num_input_tokens_seen": 337975, "step": 18, "time_per_iteration": 3.0132579803466797 }, { "auxiliary_loss_clip": 0.0318761, "auxiliary_loss_mlp": 0.01403306, "balance_loss_clip": 1.60585093, "balance_loss_mlp": 1.13799417, "epoch": 0.0011423418006914173, "flos": 19609776322560.0, "grad_norm": 4.403621106373983, "language_loss": 1.02445412, "learning_rate": 1.89578346593066e-06, "loss": 1.07036328, "num_input_tokens_seen": 356635, "step": 19, "time_per_iteration": 3.016176462173462 }, { "auxiliary_loss_clip": 0.0313029, "auxiliary_loss_mlp": 0.01342049, "balance_loss_clip": 1.60759044, "balance_loss_mlp": 1.12155962, "epoch": 0.0012024650533593868, "flos": 17895365107200.0, "grad_norm": 3.958333686933058, "language_loss": 1.16706228, "learning_rate": 1.928808765521199e-06, "loss": 1.21178555, "num_input_tokens_seen": 375625, "step": 20, "time_per_iteration": 3.0274486541748047 }, { "auxiliary_loss_clip": 0.03118109, "auxiliary_loss_mlp": 0.01378536, "balance_loss_clip": 1.58886433, "balance_loss_mlp": 1.1298182, "epoch": 0.001262588306027356, "flos": 21252043071360.0, "grad_norm": 4.333519066420982, "language_loss": 1.06129968, "learning_rate": 1.9602224192552076e-06, "loss": 1.10626626, "num_input_tokens_seen": 394350, "step": 21, "time_per_iteration": 2.9418578147888184 }, { "auxiliary_loss_clip": 0.03013912, "auxiliary_loss_mlp": 0.0137937, "balance_loss_clip": 1.57028937, "balance_loss_mlp": 1.14552903, "epoch": 0.0013227115586953253, "flos": 26104077158400.0, "grad_norm": 3.63841390311849, "language_loss": 1.05861485, "learning_rate": 1.9901744328983746e-06, "loss": 1.10254765, "num_input_tokens_seen": 413255, "step": 22, "time_per_iteration": 2.9651288986206055 }, { "auxiliary_loss_clip": 0.02966296, "auxiliary_loss_mlp": 0.01334065, "balance_loss_clip": 1.57175612, "balance_loss_mlp": 1.12377954, "epoch": 0.0013828348113632948, "flos": 23951376190080.0, "grad_norm": 2.8746130742538347, "language_loss": 0.9177655, "learning_rate": 2.018794797290208e-06, "loss": 0.96076906, "num_input_tokens_seen": 433065, "step": 23, "time_per_iteration": 3.049853563308716 }, { "auxiliary_loss_clip": 0.02932793, "auxiliary_loss_mlp": 0.01362183, "balance_loss_clip": 1.56404662, "balance_loss_mlp": 1.14236116, "epoch": 0.001442958064031264, "flos": 15959851724160.0, "grad_norm": 3.0897201135857735, "language_loss": 1.08192635, "learning_rate": 2.046196897962839e-06, "loss": 1.12487614, "num_input_tokens_seen": 451175, "step": 24, "time_per_iteration": 3.0543172359466553 }, { "auxiliary_loss_clip": 0.02823838, "auxiliary_loss_mlp": 0.01329007, "balance_loss_clip": 1.55692792, "balance_loss_mlp": 1.11853111, "epoch": 0.0015030813166992333, "flos": 18108350801280.0, "grad_norm": 4.111246686692462, "language_loss": 1.01367807, "learning_rate": 2.0724802282696944e-06, "loss": 1.05520654, "num_input_tokens_seen": 468775, "step": 25, "time_per_iteration": 3.0059614181518555 }, { "auxiliary_loss_clip": 0.02818207, "auxiliary_loss_mlp": 0.01309454, "balance_loss_clip": 1.55974329, "balance_loss_mlp": 1.10012197, "epoch": 0.0015632045693672028, "flos": 22234558763520.0, "grad_norm": 2.7163042439620018, "language_loss": 1.0669204, "learning_rate": 2.0977325700720194e-06, "loss": 1.10819697, "num_input_tokens_seen": 488530, "step": 26, "time_per_iteration": 3.1159534454345703 }, { "auxiliary_loss_clip": 0.0276047, "auxiliary_loss_mlp": 0.01325034, "balance_loss_clip": 1.54973662, "balance_loss_mlp": 1.12533486, "epoch": 0.001623327822035172, "flos": 23991955580160.0, "grad_norm": 2.562596284241794, "language_loss": 0.95537072, "learning_rate": 2.122031762649933e-06, "loss": 0.99622583, "num_input_tokens_seen": 510495, "step": 27, "time_per_iteration": 3.018643617630005 }, { "auxiliary_loss_clip": 0.02736222, "auxiliary_loss_mlp": 0.01311707, "balance_loss_clip": 1.55399776, "balance_loss_mlp": 1.13089037, "epoch": 0.0016834510747031415, "flos": 19677647070720.0, "grad_norm": 2.42975125432869, "language_loss": 1.06393945, "learning_rate": 2.1454471497582483e-06, "loss": 1.10441875, "num_input_tokens_seen": 528605, "step": 28, "time_per_iteration": 2.9263083934783936 }, { "auxiliary_loss_clip": 0.0270011, "auxiliary_loss_mlp": 0.0131913, "balance_loss_clip": 1.53841436, "balance_loss_mlp": 1.13297284, "epoch": 0.0017435743273711108, "flos": 20923819568640.0, "grad_norm": 4.42090805909513, "language_loss": 1.02493238, "learning_rate": 2.1680407726407727e-06, "loss": 1.06512475, "num_input_tokens_seen": 548515, "step": 29, "time_per_iteration": 3.0062997341156006 }, { "auxiliary_loss_clip": 0.0269246, "auxiliary_loss_mlp": 0.01312758, "balance_loss_clip": 1.53459728, "balance_loss_mlp": 1.12631428, "epoch": 0.00180369758003908, "flos": 19528976678400.0, "grad_norm": 3.1534114534186446, "language_loss": 1.19265521, "learning_rate": 2.189868360711334e-06, "loss": 1.23270726, "num_input_tokens_seen": 564025, "step": 30, "time_per_iteration": 2.931145429611206 }, { "auxiliary_loss_clip": 0.02610377, "auxiliary_loss_mlp": 0.01337183, "balance_loss_clip": 1.52116311, "balance_loss_mlp": 1.15665221, "epoch": 0.0018638208327070496, "flos": 27453169100160.0, "grad_norm": 2.735994596991484, "language_loss": 1.02616811, "learning_rate": 2.2109801597326265e-06, "loss": 1.06564379, "num_input_tokens_seen": 583345, "step": 31, "time_per_iteration": 2.993251085281372 }, { "auxiliary_loss_clip": 0.02582044, "auxiliary_loss_mlp": 0.01331305, "balance_loss_clip": 1.522609, "balance_loss_mlp": 1.15163302, "epoch": 0.0019239440853750188, "flos": 13589460380160.0, "grad_norm": 3.9056907796043654, "language_loss": 0.95266509, "learning_rate": 2.2314216284658796e-06, "loss": 0.99179864, "num_input_tokens_seen": 600010, "step": 32, "time_per_iteration": 2.9459571838378906 }, { "auxiliary_loss_clip": 0.02564836, "auxiliary_loss_mlp": 0.01302659, "balance_loss_clip": 1.51811624, "balance_loss_mlp": 1.13586164, "epoch": 0.001984067338042988, "flos": 11253866336640.0, "grad_norm": 3.226486022987097, "language_loss": 0.95143497, "learning_rate": 2.2512340280885094e-06, "loss": 0.99010992, "num_input_tokens_seen": 616295, "step": 33, "time_per_iteration": 2.9855570793151855 }, { "auxiliary_loss_clip": 0.02421202, "auxiliary_loss_mlp": 0.01304214, "balance_loss_clip": 1.48474145, "balance_loss_mlp": 1.14676213, "epoch": 0.0020441905907109576, "flos": 22386245898240.0, "grad_norm": 2.1714659525821247, "language_loss": 0.91547924, "learning_rate": 2.270454923596497e-06, "loss": 0.9527334, "num_input_tokens_seen": 637640, "step": 34, "time_per_iteration": 2.981541872024536 }, { "auxiliary_loss_clip": 0.02375249, "auxiliary_loss_mlp": 0.01271963, "balance_loss_clip": 1.45095515, "balance_loss_mlp": 1.11689591, "epoch": 0.0021043138433789266, "flos": 49778580337920.0, "grad_norm": 2.2635429103650386, "language_loss": 0.76603377, "learning_rate": 2.2891186125067434e-06, "loss": 0.80250585, "num_input_tokens_seen": 659710, "step": 35, "time_per_iteration": 3.2267208099365234 }, { "auxiliary_loss_clip": 0.02347187, "auxiliary_loss_mlp": 0.01276388, "balance_loss_clip": 1.46356034, "balance_loss_mlp": 1.13238275, "epoch": 0.002164437096046896, "flos": 20557961591040.0, "grad_norm": 2.3605884715298506, "language_loss": 0.88713098, "learning_rate": 2.307256493152974e-06, "loss": 0.92336679, "num_input_tokens_seen": 679670, "step": 36, "time_per_iteration": 2.948162078857422 }, { "auxiliary_loss_clip": 0.02289192, "auxiliary_loss_mlp": 0.01338204, "balance_loss_clip": 1.45043015, "balance_loss_mlp": 1.19105196, "epoch": 0.0022245603487148656, "flos": 26542295084160.0, "grad_norm": 2.4929063351918166, "language_loss": 0.93038809, "learning_rate": 2.3248973825097614e-06, "loss": 0.96666199, "num_input_tokens_seen": 700170, "step": 37, "time_per_iteration": 2.9556422233581543 }, { "auxiliary_loss_clip": 0.02249098, "auxiliary_loss_mlp": 0.01276785, "balance_loss_clip": 1.44485605, "balance_loss_mlp": 1.15500069, "epoch": 0.0022846836013828346, "flos": 20338188226560.0, "grad_norm": 2.177909778954084, "language_loss": 1.03952074, "learning_rate": 2.3420677916238357e-06, "loss": 1.07477951, "num_input_tokens_seen": 718545, "step": 38, "time_per_iteration": 2.9959065914154053 }, { "auxiliary_loss_clip": 0.02216028, "auxiliary_loss_mlp": 0.01260768, "balance_loss_clip": 1.43807542, "balance_loss_mlp": 1.13726676, "epoch": 0.002344806854050804, "flos": 26247575992320.0, "grad_norm": 2.22652515093943, "language_loss": 0.85297108, "learning_rate": 2.358792165262154e-06, "loss": 0.887739, "num_input_tokens_seen": 739865, "step": 39, "time_per_iteration": 3.035399913787842 }, { "auxiliary_loss_clip": 0.02192275, "auxiliary_loss_mlp": 0.01250434, "balance_loss_clip": 1.4289664, "balance_loss_mlp": 1.12216496, "epoch": 0.0024049301067187736, "flos": 11801539981440.0, "grad_norm": 3.258228308703562, "language_loss": 0.90279335, "learning_rate": 2.3750930912143747e-06, "loss": 0.93722045, "num_input_tokens_seen": 755770, "step": 40, "time_per_iteration": 3.060368299484253 }, { "auxiliary_loss_clip": 0.02142113, "auxiliary_loss_mlp": 0.01273783, "balance_loss_clip": 1.41895449, "balance_loss_mlp": 1.16086745, "epoch": 0.0024650533593867426, "flos": 20631506688000.0, "grad_norm": 3.245861029799582, "language_loss": 0.93271625, "learning_rate": 2.3909914837471044e-06, "loss": 0.9668752, "num_input_tokens_seen": 773440, "step": 41, "time_per_iteration": 2.9518353939056396 }, { "auxiliary_loss_clip": 0.02105753, "auxiliary_loss_mlp": 0.01254821, "balance_loss_clip": 1.41097844, "balance_loss_mlp": 1.15168142, "epoch": 0.002525176612054712, "flos": 18406122549120.0, "grad_norm": 3.3039479788253536, "language_loss": 0.97533798, "learning_rate": 2.4065067449483835e-06, "loss": 1.0089438, "num_input_tokens_seen": 790455, "step": 42, "time_per_iteration": 2.933177947998047 }, { "auxiliary_loss_clip": 0.020675, "auxiliary_loss_mlp": 0.01298422, "balance_loss_clip": 1.41198874, "balance_loss_mlp": 1.19189644, "epoch": 0.0025852998647226816, "flos": 28184023128960.0, "grad_norm": 3.15071165872949, "language_loss": 0.97562659, "learning_rate": 2.4216569070848724e-06, "loss": 1.00928593, "num_input_tokens_seen": 810645, "step": 43, "time_per_iteration": 2.9760589599609375 }, { "auxiliary_loss_clip": 0.02086351, "auxiliary_loss_mlp": 0.01314601, "balance_loss_clip": 1.41042757, "balance_loss_mlp": 1.20283043, "epoch": 0.0026454231173906506, "flos": 14283110897280.0, "grad_norm": 2.3612650137146574, "language_loss": 0.93435001, "learning_rate": 2.4364587585915504e-06, "loss": 0.96835947, "num_input_tokens_seen": 827470, "step": 44, "time_per_iteration": 2.9239895343780518 }, { "auxiliary_loss_clip": 0.02043996, "auxiliary_loss_mlp": 0.01272131, "balance_loss_clip": 1.40557313, "balance_loss_mlp": 1.17399764, "epoch": 0.00270554637005862, "flos": 22419211605120.0, "grad_norm": 2.1476860292916644, "language_loss": 0.98677421, "learning_rate": 2.450927955901469e-06, "loss": 1.01993537, "num_input_tokens_seen": 847285, "step": 45, "time_per_iteration": 2.9626305103302 }, { "auxiliary_loss_clip": 0.02018804, "auxiliary_loss_mlp": 0.01228873, "balance_loss_clip": 1.39126372, "balance_loss_mlp": 1.14208817, "epoch": 0.0027656696227265896, "flos": 23985778440960.0, "grad_norm": 1.8862192248435494, "language_loss": 1.02800822, "learning_rate": 2.465079122983384e-06, "loss": 1.06048501, "num_input_tokens_seen": 867545, "step": 46, "time_per_iteration": 2.9913573265075684 }, { "auxiliary_loss_clip": 0.0198766, "auxiliary_loss_mlp": 0.01272862, "balance_loss_clip": 1.38388658, "balance_loss_mlp": 1.182549, "epoch": 0.0028257928753945586, "flos": 37669503087360.0, "grad_norm": 2.1076645953887696, "language_loss": 0.87839413, "learning_rate": 2.4789259401737868e-06, "loss": 0.9109993, "num_input_tokens_seen": 889915, "step": 47, "time_per_iteration": 3.0189881324768066 }, { "auxiliary_loss_clip": 0.01949271, "auxiliary_loss_mlp": 0.01255947, "balance_loss_clip": 1.37360096, "balance_loss_mlp": 1.16963911, "epoch": 0.002885916128062528, "flos": 22454547609600.0, "grad_norm": 4.4561049138068, "language_loss": 0.87809587, "learning_rate": 2.492481223656015e-06, "loss": 0.91014802, "num_input_tokens_seen": 908975, "step": 48, "time_per_iteration": 2.863565444946289 }, { "auxiliary_loss_clip": 0.01949016, "auxiliary_loss_mlp": 0.0124182, "balance_loss_clip": 1.36337733, "balance_loss_mlp": 1.15069616, "epoch": 0.0029460393807304976, "flos": 27012796358400.0, "grad_norm": 2.9451035624229855, "language_loss": 0.89691317, "learning_rate": 2.5057569967437924e-06, "loss": 0.9288215, "num_input_tokens_seen": 929810, "step": 49, "time_per_iteration": 2.9967453479766846 }, { "auxiliary_loss_clip": 0.0194038, "auxiliary_loss_mlp": 0.01234077, "balance_loss_clip": 1.35742152, "balance_loss_mlp": 1.14996314, "epoch": 0.0030061626333984666, "flos": 15851832549120.0, "grad_norm": 3.162716210197168, "language_loss": 0.90914285, "learning_rate": 2.51876455396287e-06, "loss": 0.94088745, "num_input_tokens_seen": 948650, "step": 50, "time_per_iteration": 2.8832523822784424 }, { "auxiliary_loss_clip": 0.01938537, "auxiliary_loss_mlp": 0.01199505, "balance_loss_clip": 1.36240602, "balance_loss_mlp": 1.11844242, "epoch": 0.003066285886066436, "flos": 31827052316160.0, "grad_norm": 6.098010360158733, "language_loss": 0.86977792, "learning_rate": 2.5315145187866316e-06, "loss": 0.90115827, "num_input_tokens_seen": 966455, "step": 51, "time_per_iteration": 2.9061717987060547 }, { "auxiliary_loss_clip": 0.01895637, "auxiliary_loss_mlp": 0.01206588, "balance_loss_clip": 1.35252357, "balance_loss_mlp": 1.12829173, "epoch": 0.0031264091387344056, "flos": 41427482774400.0, "grad_norm": 2.043292881862276, "language_loss": 0.95171362, "learning_rate": 2.5440168957651953e-06, "loss": 0.98273587, "num_input_tokens_seen": 988110, "step": 52, "time_per_iteration": 3.0266616344451904 }, { "auxiliary_loss_clip": 0.01893195, "auxiliary_loss_mlp": 0.01241159, "balance_loss_clip": 1.34894896, "balance_loss_mlp": 1.16162264, "epoch": 0.0031865323914023747, "flos": 23440941970560.0, "grad_norm": 4.2358840345824635, "language_loss": 0.92323011, "learning_rate": 2.5562811176888872e-06, "loss": 0.95457363, "num_input_tokens_seen": 1008550, "step": 53, "time_per_iteration": 2.8850226402282715 }, { "auxiliary_loss_clip": 0.01882736, "auxiliary_loss_mlp": 0.01197045, "balance_loss_clip": 1.35264134, "balance_loss_mlp": 1.11669779, "epoch": 0.003246655644070344, "flos": 14429195510400.0, "grad_norm": 2.290226623360683, "language_loss": 0.8260113, "learning_rate": 2.5683160883431093e-06, "loss": 0.85680908, "num_input_tokens_seen": 1026840, "step": 54, "time_per_iteration": 2.9433553218841553 }, { "auxiliary_loss_clip": 0.01880073, "auxiliary_loss_mlp": 0.01210775, "balance_loss_clip": 1.34162152, "balance_loss_mlp": 1.13233542, "epoch": 0.0033067788967383136, "flos": 35918247496320.0, "grad_norm": 2.911577423572303, "language_loss": 0.81303245, "learning_rate": 2.580130221340046e-06, "loss": 0.84394085, "num_input_tokens_seen": 1048875, "step": 55, "time_per_iteration": 3.0040643215179443 }, { "auxiliary_loss_clip": 0.01870075, "auxiliary_loss_mlp": 0.0120375, "balance_loss_clip": 1.33644819, "balance_loss_mlp": 1.12521541, "epoch": 0.003366902149406283, "flos": 22958732862720.0, "grad_norm": 2.639118679342801, "language_loss": 0.87089968, "learning_rate": 2.5917314754514246e-06, "loss": 0.90163803, "num_input_tokens_seen": 1066435, "step": 56, "time_per_iteration": 2.830453395843506 }, { "auxiliary_loss_clip": 0.01869912, "auxiliary_loss_mlp": 0.01161425, "balance_loss_clip": 1.32921791, "balance_loss_mlp": 1.08851671, "epoch": 0.003427025402074252, "flos": 26582838560640.0, "grad_norm": 2.101574700040827, "language_loss": 0.92890096, "learning_rate": 2.6031273868139713e-06, "loss": 0.95921433, "num_input_tokens_seen": 1090330, "step": 57, "time_per_iteration": 7.0071024894714355 }, { "auxiliary_loss_clip": 0.01833802, "auxiliary_loss_mlp": 0.0121675, "balance_loss_clip": 1.33333457, "balance_loss_mlp": 1.14493799, "epoch": 0.0034871486547422216, "flos": 23951196622080.0, "grad_norm": 14.610065921505914, "language_loss": 0.9972856, "learning_rate": 2.614325098333948e-06, "loss": 1.02779114, "num_input_tokens_seen": 1109840, "step": 58, "time_per_iteration": 2.830960273742676 }, { "auxiliary_loss_clip": 0.0181804, "auxiliary_loss_mlp": 0.01199311, "balance_loss_clip": 1.32073379, "balance_loss_mlp": 1.12835753, "epoch": 0.003547271907410191, "flos": 21214983214080.0, "grad_norm": 2.120622270947527, "language_loss": 0.88172519, "learning_rate": 2.625331386578098e-06, "loss": 0.91189873, "num_input_tokens_seen": 1128415, "step": 59, "time_per_iteration": 2.8507089614868164 }, { "auxiliary_loss_clip": 0.01839573, "auxiliary_loss_mlp": 0.01163328, "balance_loss_clip": 1.32924581, "balance_loss_mlp": 1.09075332, "epoch": 0.00360739516007816, "flos": 16504903676160.0, "grad_norm": 2.021991994360373, "language_loss": 0.93542433, "learning_rate": 2.63615268640451e-06, "loss": 0.96545339, "num_input_tokens_seen": 1146515, "step": 60, "time_per_iteration": 2.8517534732818604 }, { "auxiliary_loss_clip": 0.0181893, "auxiliary_loss_mlp": 0.01176948, "balance_loss_clip": 1.31414318, "balance_loss_mlp": 1.10923755, "epoch": 0.0036675184127461296, "flos": 19464805031040.0, "grad_norm": 2.908283338489548, "language_loss": 0.90021706, "learning_rate": 2.6467951135575943e-06, "loss": 0.9301759, "num_input_tokens_seen": 1166330, "step": 61, "time_per_iteration": 2.8853390216827393 }, { "auxiliary_loss_clip": 0.01803943, "auxiliary_loss_mlp": 0.01142904, "balance_loss_clip": 1.31131864, "balance_loss_mlp": 1.07581341, "epoch": 0.003727641665414099, "flos": 20957323979520.0, "grad_norm": 1.8428161811646855, "language_loss": 0.88479733, "learning_rate": 2.657264485425803e-06, "loss": 0.91426575, "num_input_tokens_seen": 1186010, "step": 62, "time_per_iteration": 2.8860812187194824 }, { "auxiliary_loss_clip": 0.01785338, "auxiliary_loss_mlp": 0.0116457, "balance_loss_clip": 1.30233741, "balance_loss_mlp": 1.09504724, "epoch": 0.003787764918082068, "flos": 18406050721920.0, "grad_norm": 2.4385306002926512, "language_loss": 0.96280968, "learning_rate": 2.6675663401385186e-06, "loss": 0.99230874, "num_input_tokens_seen": 1204985, "step": 63, "time_per_iteration": 2.9081404209136963 }, { "auxiliary_loss_clip": 0.01795068, "auxiliary_loss_mlp": 0.01171321, "balance_loss_clip": 1.31071985, "balance_loss_mlp": 1.10499322, "epoch": 0.0038478881707500376, "flos": 12459243962880.0, "grad_norm": 3.0781639748926697, "language_loss": 0.98840165, "learning_rate": 2.677705954159056e-06, "loss": 1.01806557, "num_input_tokens_seen": 1223545, "step": 64, "time_per_iteration": 2.893603801727295 }, { "auxiliary_loss_clip": 0.01801311, "auxiliary_loss_mlp": 0.01151112, "balance_loss_clip": 1.30960393, "balance_loss_mlp": 1.08368695, "epoch": 0.003908011423418007, "flos": 13553334276480.0, "grad_norm": 2.4813676281781554, "language_loss": 0.85397774, "learning_rate": 2.6876883585136904e-06, "loss": 0.88350195, "num_input_tokens_seen": 1241175, "step": 65, "time_per_iteration": 2.8768796920776367 }, { "auxiliary_loss_clip": 0.01777474, "auxiliary_loss_mlp": 0.01155217, "balance_loss_clip": 1.29563761, "balance_loss_mlp": 1.087888, "epoch": 0.003968134676085976, "flos": 18333475292160.0, "grad_norm": 1.8550079005121831, "language_loss": 0.85281348, "learning_rate": 2.697518353781685e-06, "loss": 0.88214046, "num_input_tokens_seen": 1259315, "step": 66, "time_per_iteration": 2.769274950027466 }, { "auxiliary_loss_clip": 0.01779987, "auxiliary_loss_mlp": 0.01151372, "balance_loss_clip": 1.29312515, "balance_loss_mlp": 1.07650828, "epoch": 0.004028257928753946, "flos": 20485242506880.0, "grad_norm": 2.74895944689593, "language_loss": 0.96567476, "learning_rate": 2.7072005239581103e-06, "loss": 0.99498826, "num_input_tokens_seen": 1277055, "step": 67, "time_per_iteration": 2.889369249343872 }, { "auxiliary_loss_clip": 0.01752442, "auxiliary_loss_mlp": 0.01152779, "balance_loss_clip": 1.28765118, "balance_loss_mlp": 1.08120584, "epoch": 0.004088381181421915, "flos": 18843837684480.0, "grad_norm": 2.109359538419204, "language_loss": 0.94516367, "learning_rate": 2.7167392492896727e-06, "loss": 0.97421581, "num_input_tokens_seen": 1294355, "step": 68, "time_per_iteration": 2.8107409477233887 }, { "auxiliary_loss_clip": 0.01747204, "auxiliary_loss_mlp": 0.0115424, "balance_loss_clip": 1.28511512, "balance_loss_mlp": 1.08476448, "epoch": 0.004148504434089885, "flos": 19427817000960.0, "grad_norm": 2.2931216646069092, "language_loss": 0.96014255, "learning_rate": 2.7261387181735195e-06, "loss": 0.98915702, "num_input_tokens_seen": 1313525, "step": 69, "time_per_iteration": 2.8138387203216553 }, { "auxiliary_loss_clip": 0.01741342, "auxiliary_loss_mlp": 0.01160375, "balance_loss_clip": 1.28807163, "balance_loss_mlp": 1.09581161, "epoch": 0.004208627686757853, "flos": 20811023884800.0, "grad_norm": 2.1764096137707494, "language_loss": 0.98070192, "learning_rate": 2.7354029381999196e-06, "loss": 1.00971913, "num_input_tokens_seen": 1330505, "step": 70, "time_per_iteration": 2.8319084644317627 }, { "auxiliary_loss_clip": 0.0174721, "auxiliary_loss_mlp": 0.01145619, "balance_loss_clip": 1.27791202, "balance_loss_mlp": 1.07685876, "epoch": 0.004268750939425823, "flos": 19098623831040.0, "grad_norm": 2.9300158782571324, "language_loss": 0.94016141, "learning_rate": 2.7445357464116983e-06, "loss": 0.96908975, "num_input_tokens_seen": 1349615, "step": 71, "time_per_iteration": 2.8469433784484863 }, { "auxiliary_loss_clip": 0.01815227, "auxiliary_loss_mlp": 0.01294388, "balance_loss_clip": 1.43495834, "balance_loss_mlp": 1.25490558, "epoch": 0.004328874192093792, "flos": 52439635514880.0, "grad_norm": 2.409331683106634, "language_loss": 0.65682542, "learning_rate": 2.75354081884615e-06, "loss": 0.68792164, "num_input_tokens_seen": 1410275, "step": 72, "time_per_iteration": 3.2019593715667725 }, { "auxiliary_loss_clip": 0.01799527, "auxiliary_loss_mlp": 0.01271558, "balance_loss_clip": 1.43197393, "balance_loss_mlp": 1.2316941, "epoch": 0.004388997444761762, "flos": 66473239564800.0, "grad_norm": 2.25068040880696, "language_loss": 0.63694263, "learning_rate": 2.7624216794188286e-06, "loss": 0.66765356, "num_input_tokens_seen": 1473020, "step": 73, "time_per_iteration": 3.3545596599578857 }, { "auxiliary_loss_clip": 0.01720805, "auxiliary_loss_mlp": 0.01140553, "balance_loss_clip": 1.26912856, "balance_loss_mlp": 1.07279444, "epoch": 0.004449120697429731, "flos": 18952970181120.0, "grad_norm": 2.554977860093902, "language_loss": 0.86212188, "learning_rate": 2.771181708202938e-06, "loss": 0.89073551, "num_input_tokens_seen": 1490385, "step": 74, "time_per_iteration": 2.823498487472534 }, { "auxiliary_loss_clip": 0.0172287, "auxiliary_loss_mlp": 0.01162493, "balance_loss_clip": 1.26811171, "balance_loss_mlp": 1.09344697, "epoch": 0.004509243950097701, "flos": 21105491581440.0, "grad_norm": 3.0087618017840105, "language_loss": 0.97196102, "learning_rate": 2.779824149153005e-06, "loss": 1.00081468, "num_input_tokens_seen": 1509725, "step": 75, "time_per_iteration": 2.888415575027466 }, { "auxiliary_loss_clip": 0.0170198, "auxiliary_loss_mlp": 0.01142315, "balance_loss_clip": 1.26420689, "balance_loss_mlp": 1.07608271, "epoch": 0.004569367202765669, "flos": 20698730991360.0, "grad_norm": 2.6610382542709043, "language_loss": 0.87740695, "learning_rate": 2.788352117317012e-06, "loss": 0.90584993, "num_input_tokens_seen": 1527245, "step": 76, "time_per_iteration": 2.9226863384246826 }, { "auxiliary_loss_clip": 0.01702512, "auxiliary_loss_mlp": 0.01145374, "balance_loss_clip": 1.26239479, "balance_loss_mlp": 1.07656646, "epoch": 0.004629490455433639, "flos": 28658474899200.0, "grad_norm": 2.4272090643104574, "language_loss": 0.91791159, "learning_rate": 2.796768605577095e-06, "loss": 0.94639051, "num_input_tokens_seen": 1548930, "step": 77, "time_per_iteration": 2.8720929622650146 }, { "auxiliary_loss_clip": 0.01693018, "auxiliary_loss_mlp": 0.01165978, "balance_loss_clip": 1.26398146, "balance_loss_mlp": 1.09569168, "epoch": 0.004689613708101608, "flos": 11072409805440.0, "grad_norm": 2.2822185142383034, "language_loss": 0.9211635, "learning_rate": 2.80507649095533e-06, "loss": 0.94975346, "num_input_tokens_seen": 1565695, "step": 78, "time_per_iteration": 2.7832391262054443 }, { "auxiliary_loss_clip": 0.01689271, "auxiliary_loss_mlp": 0.01153255, "balance_loss_clip": 1.25836253, "balance_loss_mlp": 1.08482933, "epoch": 0.004749736960769578, "flos": 21799106184960.0, "grad_norm": 2.263191265943929, "language_loss": 0.82771945, "learning_rate": 2.813278540517843e-06, "loss": 0.85614467, "num_input_tokens_seen": 1582625, "step": 79, "time_per_iteration": 2.7723355293273926 }, { "auxiliary_loss_clip": 0.01702468, "auxiliary_loss_mlp": 0.01130708, "balance_loss_clip": 1.26147008, "balance_loss_mlp": 1.0609467, "epoch": 0.004809860213437547, "flos": 19792597570560.0, "grad_norm": 1.9992491725405546, "language_loss": 0.91272199, "learning_rate": 2.8213774169075505e-06, "loss": 0.94105375, "num_input_tokens_seen": 1601725, "step": 80, "time_per_iteration": 2.742046356201172 }, { "auxiliary_loss_clip": 0.01671156, "auxiliary_loss_mlp": 0.01144048, "balance_loss_clip": 1.25365841, "balance_loss_mlp": 1.07371473, "epoch": 0.004869983466105517, "flos": 26574327037440.0, "grad_norm": 2.0371265012476742, "language_loss": 0.95241439, "learning_rate": 2.829375683533245e-06, "loss": 0.9805665, "num_input_tokens_seen": 1622420, "step": 81, "time_per_iteration": 2.8996386528015137 }, { "auxiliary_loss_clip": 0.01686092, "auxiliary_loss_mlp": 0.01147828, "balance_loss_clip": 1.25779653, "balance_loss_mlp": 1.08149946, "epoch": 0.004930106718773485, "flos": 12823378087680.0, "grad_norm": 2.9441337112970296, "language_loss": 0.96288472, "learning_rate": 2.8372758094402803e-06, "loss": 0.99122393, "num_input_tokens_seen": 1640715, "step": 82, "time_per_iteration": 2.819120407104492 }, { "auxiliary_loss_clip": 0.01668255, "auxiliary_loss_mlp": 0.01156428, "balance_loss_clip": 1.2461338, "balance_loss_mlp": 1.08709574, "epoch": 0.004990229971441455, "flos": 25774919902080.0, "grad_norm": 2.6601797838877856, "language_loss": 0.86762071, "learning_rate": 2.84508017388607e-06, "loss": 0.89586747, "num_input_tokens_seen": 1662210, "step": 83, "time_per_iteration": 2.7959344387054443 }, { "auxiliary_loss_clip": 0.01662665, "auxiliary_loss_mlp": 0.01154043, "balance_loss_clip": 1.24844718, "balance_loss_mlp": 1.084234, "epoch": 0.005050353224109424, "flos": 17457254922240.0, "grad_norm": 2.5416281292503986, "language_loss": 0.92081314, "learning_rate": 2.852791070641559e-06, "loss": 0.94898021, "num_input_tokens_seen": 1681070, "step": 84, "time_per_iteration": 2.7176246643066406 }, { "auxiliary_loss_clip": 0.01647627, "auxiliary_loss_mlp": 0.01154949, "balance_loss_clip": 1.36429358, "balance_loss_mlp": 1.11527622, "epoch": 0.005110476476777394, "flos": 69805460367360.0, "grad_norm": 1.4023430227621099, "language_loss": 0.6252538, "learning_rate": 2.8604107120381682e-06, "loss": 0.65327954, "num_input_tokens_seen": 1747140, "step": 85, "time_per_iteration": 3.296835422515869 }, { "auxiliary_loss_clip": 0.01649469, "auxiliary_loss_mlp": 0.0112642, "balance_loss_clip": 1.23797417, "balance_loss_mlp": 1.05642033, "epoch": 0.005170599729445363, "flos": 24790105739520.0, "grad_norm": 1.805253124779358, "language_loss": 0.90709531, "learning_rate": 2.8679412327780482e-06, "loss": 0.93485421, "num_input_tokens_seen": 1767475, "step": 86, "time_per_iteration": 2.761484146118164 }, { "auxiliary_loss_clip": 0.01653351, "auxiliary_loss_mlp": 0.01158608, "balance_loss_clip": 1.24437881, "balance_loss_mlp": 1.08741617, "epoch": 0.005230722982113333, "flos": 23258048895360.0, "grad_norm": 2.3398213465495776, "language_loss": 0.81961077, "learning_rate": 2.8753846935240833e-06, "loss": 0.8477304, "num_input_tokens_seen": 1784980, "step": 87, "time_per_iteration": 2.763185739517212 }, { "auxiliary_loss_clip": 0.01641581, "auxiliary_loss_mlp": 0.01152623, "balance_loss_clip": 1.24129367, "balance_loss_mlp": 1.08457828, "epoch": 0.005290846234781301, "flos": 16727909264640.0, "grad_norm": 3.1951080427559857, "language_loss": 0.95790672, "learning_rate": 2.8827430842847267e-06, "loss": 0.98584872, "num_input_tokens_seen": 1803030, "step": 88, "time_per_iteration": 2.7855517864227295 }, { "auxiliary_loss_clip": 0.01658657, "auxiliary_loss_mlp": 0.01147064, "balance_loss_clip": 1.24130976, "balance_loss_mlp": 1.07978201, "epoch": 0.005350969487449271, "flos": 20886077352960.0, "grad_norm": 3.405407923072192, "language_loss": 0.86023164, "learning_rate": 2.8900183276075957e-06, "loss": 0.88828892, "num_input_tokens_seen": 1822865, "step": 89, "time_per_iteration": 2.7517924308776855 }, { "auxiliary_loss_clip": 0.01647446, "auxiliary_loss_mlp": 0.01133456, "balance_loss_clip": 1.23541856, "balance_loss_mlp": 1.06727123, "epoch": 0.00541109274011724, "flos": 26209977431040.0, "grad_norm": 2.130771496386599, "language_loss": 0.9150058, "learning_rate": 2.8972122815946455e-06, "loss": 0.94281483, "num_input_tokens_seen": 1842435, "step": 90, "time_per_iteration": 2.7526872158050537 }, { "auxiliary_loss_clip": 0.01629409, "auxiliary_loss_mlp": 0.01133822, "balance_loss_clip": 1.23219132, "balance_loss_mlp": 1.06582534, "epoch": 0.00547121599278521, "flos": 21178569801600.0, "grad_norm": 2.6928798867856796, "language_loss": 0.86073506, "learning_rate": 2.90432674275074e-06, "loss": 0.88836741, "num_input_tokens_seen": 1860065, "step": 91, "time_per_iteration": 2.7995588779449463 }, { "auxiliary_loss_clip": 0.01628638, "auxiliary_loss_mlp": 0.01138916, "balance_loss_clip": 1.22774827, "balance_loss_mlp": 1.07335091, "epoch": 0.005531339245453179, "flos": 19718801078400.0, "grad_norm": 5.062847798051961, "language_loss": 0.87041199, "learning_rate": 2.91136344867656e-06, "loss": 0.8980875, "num_input_tokens_seen": 1878135, "step": 92, "time_per_iteration": 2.7813079357147217 }, { "auxiliary_loss_clip": 0.01620799, "auxiliary_loss_mlp": 0.01174163, "balance_loss_clip": 1.21933174, "balance_loss_mlp": 1.10650027, "epoch": 0.005591462498121149, "flos": 17636089760640.0, "grad_norm": 4.340668874696889, "language_loss": 0.9210887, "learning_rate": 2.918324080615938e-06, "loss": 0.94903833, "num_input_tokens_seen": 1894895, "step": 93, "time_per_iteration": 2.7582218647003174 }, { "auxiliary_loss_clip": 0.0163427, "auxiliary_loss_mlp": 0.01153574, "balance_loss_clip": 1.22659743, "balance_loss_mlp": 1.08238208, "epoch": 0.005651585750789117, "flos": 20011221699840.0, "grad_norm": 4.327341326162078, "language_loss": 0.87578797, "learning_rate": 2.925210265866963e-06, "loss": 0.90366644, "num_input_tokens_seen": 1913220, "step": 94, "time_per_iteration": 2.783581256866455 }, { "auxiliary_loss_clip": 0.01570285, "auxiliary_loss_mlp": 0.01051726, "balance_loss_clip": 1.31970167, "balance_loss_mlp": 1.01376939, "epoch": 0.005711709003457087, "flos": 59812957981440.0, "grad_norm": 1.3608185384271176, "language_loss": 0.68098927, "learning_rate": 2.932023580065507e-06, "loss": 0.70720935, "num_input_tokens_seen": 1970970, "step": 95, "time_per_iteration": 3.1328847408294678 }, { "auxiliary_loss_clip": 0.01612519, "auxiliary_loss_mlp": 0.01150182, "balance_loss_clip": 1.21488237, "balance_loss_mlp": 1.08318627, "epoch": 0.005771832256125056, "flos": 15559591495680.0, "grad_norm": 6.736145376327001, "language_loss": 0.90221369, "learning_rate": 2.9387655493491906e-06, "loss": 0.92984068, "num_input_tokens_seen": 1988930, "step": 96, "time_per_iteration": 2.8015241622924805 }, { "auxiliary_loss_clip": 0.01605814, "auxiliary_loss_mlp": 0.01142022, "balance_loss_clip": 1.21851277, "balance_loss_mlp": 1.08003318, "epoch": 0.005831955508793026, "flos": 22528380015360.0, "grad_norm": 3.8307865500968044, "language_loss": 0.89869905, "learning_rate": 2.9454376524092147e-06, "loss": 0.92617744, "num_input_tokens_seen": 2006285, "step": 97, "time_per_iteration": 4.387299060821533 }, { "auxiliary_loss_clip": 0.01593214, "auxiliary_loss_mlp": 0.01140673, "balance_loss_clip": 1.2102325, "balance_loss_mlp": 1.07200789, "epoch": 0.005892078761460995, "flos": 22049834094720.0, "grad_norm": 2.291581893082518, "language_loss": 0.76274347, "learning_rate": 2.952041322436969e-06, "loss": 0.79008234, "num_input_tokens_seen": 2024905, "step": 98, "time_per_iteration": 2.751507043838501 }, { "auxiliary_loss_clip": 0.01533926, "auxiliary_loss_mlp": 0.01036775, "balance_loss_clip": 1.29271698, "balance_loss_mlp": 1.00129879, "epoch": 0.005952202014128965, "flos": 68539143317760.0, "grad_norm": 1.0388395506080574, "language_loss": 0.65518898, "learning_rate": 2.9585779489718204e-06, "loss": 0.68089598, "num_input_tokens_seen": 2086220, "step": 99, "time_per_iteration": 3.3125040531158447 }, { "auxiliary_loss_clip": 0.01595694, "auxiliary_loss_mlp": 0.01142556, "balance_loss_clip": 1.21028757, "balance_loss_mlp": 1.07217503, "epoch": 0.006012325266796933, "flos": 22960887678720.0, "grad_norm": 2.051483688350497, "language_loss": 0.90885437, "learning_rate": 2.9650488796560464e-06, "loss": 0.93623686, "num_input_tokens_seen": 2103365, "step": 100, "time_per_iteration": 2.7632548809051514 }, { "auxiliary_loss_clip": 0.01607235, "auxiliary_loss_mlp": 0.01150276, "balance_loss_clip": 1.21294045, "balance_loss_mlp": 1.08394814, "epoch": 0.006072448519464903, "flos": 17347942857600.0, "grad_norm": 2.0181737234491566, "language_loss": 0.91081136, "learning_rate": 2.971455421902446e-06, "loss": 0.9383865, "num_input_tokens_seen": 2121995, "step": 101, "time_per_iteration": 2.7214279174804688 }, { "auxiliary_loss_clip": 0.015938, "auxiliary_loss_mlp": 0.01152009, "balance_loss_clip": 1.21248627, "balance_loss_mlp": 1.08124638, "epoch": 0.006132571772132872, "flos": 24681116897280.0, "grad_norm": 2.076276442041171, "language_loss": 0.90774924, "learning_rate": 2.9777988444798075e-06, "loss": 0.93520737, "num_input_tokens_seen": 2141815, "step": 102, "time_per_iteration": 2.8389108180999756 }, { "auxiliary_loss_clip": 0.01588155, "auxiliary_loss_mlp": 0.01133785, "balance_loss_clip": 1.20914173, "balance_loss_mlp": 1.06912589, "epoch": 0.006192695024800842, "flos": 21465675210240.0, "grad_norm": 2.3272829989328456, "language_loss": 0.88006896, "learning_rate": 2.9840803790210285e-06, "loss": 0.90728837, "num_input_tokens_seen": 2161125, "step": 103, "time_per_iteration": 2.768784761428833 }, { "auxiliary_loss_clip": 0.01588751, "auxiliary_loss_mlp": 0.01136216, "balance_loss_clip": 1.21138883, "balance_loss_mlp": 1.06998372, "epoch": 0.006252818277468811, "flos": 17420410546560.0, "grad_norm": 1.9182889224259552, "language_loss": 0.93644351, "learning_rate": 2.990301221458371e-06, "loss": 0.96369314, "num_input_tokens_seen": 2179510, "step": 104, "time_per_iteration": 2.7109038829803467 }, { "auxiliary_loss_clip": 0.01579421, "auxiliary_loss_mlp": 0.01146524, "balance_loss_clip": 1.20086741, "balance_loss_mlp": 1.08258009, "epoch": 0.006312941530136781, "flos": 19099557584640.0, "grad_norm": 3.0437899698059367, "language_loss": 0.96655375, "learning_rate": 2.9964625333900544e-06, "loss": 0.99381316, "num_input_tokens_seen": 2197870, "step": 105, "time_per_iteration": 2.7254133224487305 }, { "auxiliary_loss_clip": 0.01578331, "auxiliary_loss_mlp": 0.01158544, "balance_loss_clip": 1.20144236, "balance_loss_mlp": 1.08768642, "epoch": 0.006373064782804749, "flos": 24060831909120.0, "grad_norm": 3.1837681777002302, "language_loss": 0.87119448, "learning_rate": 3.002565443382063e-06, "loss": 0.89856327, "num_input_tokens_seen": 2217495, "step": 106, "time_per_iteration": 2.7705447673797607 }, { "auxiliary_loss_clip": 0.01561845, "auxiliary_loss_mlp": 0.01143018, "balance_loss_clip": 1.18746924, "balance_loss_mlp": 1.0751636, "epoch": 0.006433188035472719, "flos": 18332433797760.0, "grad_norm": 2.228856706842439, "language_loss": 0.83398581, "learning_rate": 3.008611048208843e-06, "loss": 0.86103439, "num_input_tokens_seen": 2236520, "step": 107, "time_per_iteration": 2.6885263919830322 }, { "auxiliary_loss_clip": 0.01469631, "auxiliary_loss_mlp": 0.0103327, "balance_loss_clip": 1.25210869, "balance_loss_mlp": 1.00179863, "epoch": 0.006493311288140688, "flos": 62562387594240.0, "grad_norm": 0.9900995959758047, "language_loss": 0.64796811, "learning_rate": 3.014600414036285e-06, "loss": 0.67299712, "num_input_tokens_seen": 2300140, "step": 108, "time_per_iteration": 3.278621196746826 }, { "auxiliary_loss_clip": 0.01552898, "auxiliary_loss_mlp": 0.01132858, "balance_loss_clip": 1.18960094, "balance_loss_mlp": 1.06424141, "epoch": 0.006553434540808658, "flos": 19500141035520.0, "grad_norm": 2.019247660217844, "language_loss": 0.97709465, "learning_rate": 3.0205345775501937e-06, "loss": 1.00395215, "num_input_tokens_seen": 2317320, "step": 109, "time_per_iteration": 2.750502347946167 }, { "auxiliary_loss_clip": 0.01550996, "auxiliary_loss_mlp": 0.01140204, "balance_loss_clip": 1.19136214, "balance_loss_mlp": 1.07430482, "epoch": 0.006613557793476627, "flos": 21105132445440.0, "grad_norm": 1.9540987754213832, "language_loss": 0.84243041, "learning_rate": 3.0264145470332218e-06, "loss": 0.86934245, "num_input_tokens_seen": 2337820, "step": 110, "time_per_iteration": 2.82443904876709 }, { "auxiliary_loss_clip": 0.01544634, "auxiliary_loss_mlp": 0.01151549, "balance_loss_clip": 1.18396342, "balance_loss_mlp": 1.08493507, "epoch": 0.006673681046144597, "flos": 26030747543040.0, "grad_norm": 2.4580319150483563, "language_loss": 0.82940048, "learning_rate": 3.032241303393073e-06, "loss": 0.85636234, "num_input_tokens_seen": 2358560, "step": 111, "time_per_iteration": 2.8308968544006348 }, { "auxiliary_loss_clip": 0.0154596, "auxiliary_loss_mlp": 0.01133366, "balance_loss_clip": 1.18776846, "balance_loss_mlp": 1.06970847, "epoch": 0.006733804298812566, "flos": 23147767163520.0, "grad_norm": 2.356589096997363, "language_loss": 0.93989801, "learning_rate": 3.0380158011446e-06, "loss": 0.9666912, "num_input_tokens_seen": 2379005, "step": 112, "time_per_iteration": 2.8007922172546387 }, { "auxiliary_loss_clip": 0.01549647, "auxiliary_loss_mlp": 0.01136979, "balance_loss_clip": 1.18394601, "balance_loss_mlp": 1.07322621, "epoch": 0.006793927551480535, "flos": 11764444210560.0, "grad_norm": 2.521639841990545, "language_loss": 0.79509294, "learning_rate": 3.0437389693482466e-06, "loss": 0.82195914, "num_input_tokens_seen": 2395610, "step": 113, "time_per_iteration": 2.7599966526031494 }, { "auxiliary_loss_clip": 0.0153736, "auxiliary_loss_mlp": 0.01131524, "balance_loss_clip": 1.18028498, "balance_loss_mlp": 1.06562555, "epoch": 0.006854050804148504, "flos": 19171953446400.0, "grad_norm": 2.343117351168218, "language_loss": 0.93439317, "learning_rate": 3.0494117125071475e-06, "loss": 0.96108204, "num_input_tokens_seen": 2415005, "step": 114, "time_per_iteration": 2.723540782928467 }, { "auxiliary_loss_clip": 0.01544971, "auxiliary_loss_mlp": 0.01138932, "balance_loss_clip": 1.17997146, "balance_loss_mlp": 1.07918465, "epoch": 0.006914174056816474, "flos": 21981891519360.0, "grad_norm": 1.9509019191057126, "language_loss": 0.9463321, "learning_rate": 3.055034911425055e-06, "loss": 0.97317111, "num_input_tokens_seen": 2433965, "step": 115, "time_per_iteration": 2.7077698707580566 }, { "auxiliary_loss_clip": 0.01537699, "auxiliary_loss_mlp": 0.01118178, "balance_loss_clip": 1.17675614, "balance_loss_mlp": 1.05151677, "epoch": 0.006974297309484443, "flos": 16289152634880.0, "grad_norm": 10.363795807176915, "language_loss": 0.82148951, "learning_rate": 3.0606094240271244e-06, "loss": 0.84804827, "num_input_tokens_seen": 2451605, "step": 116, "time_per_iteration": 2.681190013885498 }, { "auxiliary_loss_clip": 0.01528803, "auxiliary_loss_mlp": 0.01126189, "balance_loss_clip": 1.17677391, "balance_loss_mlp": 1.06219721, "epoch": 0.007034420562152413, "flos": 26104005331200.0, "grad_norm": 2.4150591879391627, "language_loss": 0.88368428, "learning_rate": 3.0661360861454656e-06, "loss": 0.91023421, "num_input_tokens_seen": 2472035, "step": 117, "time_per_iteration": 2.776143789291382 }, { "auxiliary_loss_clip": 0.01527909, "auxiliary_loss_mlp": 0.01146127, "balance_loss_clip": 1.17495561, "balance_loss_mlp": 1.08041906, "epoch": 0.007094543814820382, "flos": 14204609723520.0, "grad_norm": 2.3639764059040265, "language_loss": 0.8454417, "learning_rate": 3.071615712271274e-06, "loss": 0.87218207, "num_input_tokens_seen": 2489285, "step": 118, "time_per_iteration": 2.7110469341278076 }, { "auxiliary_loss_clip": 0.01538161, "auxiliary_loss_mlp": 0.01163868, "balance_loss_clip": 1.1759789, "balance_loss_mlp": 1.0984937, "epoch": 0.007154667067488351, "flos": 14976007228800.0, "grad_norm": 2.231843342078736, "language_loss": 0.99319011, "learning_rate": 3.0770490962752172e-06, "loss": 1.02021039, "num_input_tokens_seen": 2506460, "step": 119, "time_per_iteration": 2.674121856689453 }, { "auxiliary_loss_clip": 0.01540018, "auxiliary_loss_mlp": 0.01120611, "balance_loss_clip": 1.17242217, "balance_loss_mlp": 1.05738258, "epoch": 0.00721479032015632, "flos": 20193288762240.0, "grad_norm": 2.7981733983226764, "language_loss": 0.8963809, "learning_rate": 3.082437012097686e-06, "loss": 0.92298722, "num_input_tokens_seen": 2525565, "step": 120, "time_per_iteration": 2.745962381362915 }, { "auxiliary_loss_clip": 0.01524916, "auxiliary_loss_mlp": 0.01129465, "balance_loss_clip": 1.1734432, "balance_loss_mlp": 1.06513989, "epoch": 0.00727491357282429, "flos": 23147228459520.0, "grad_norm": 1.797716104424251, "language_loss": 0.93491542, "learning_rate": 3.0877802144103967e-06, "loss": 0.96145928, "num_input_tokens_seen": 2546605, "step": 121, "time_per_iteration": 2.7924466133117676 }, { "auxiliary_loss_clip": 0.01526294, "auxiliary_loss_mlp": 0.0114832, "balance_loss_clip": 1.17395604, "balance_loss_mlp": 1.08490098, "epoch": 0.007335036825492259, "flos": 15521669712000.0, "grad_norm": 2.3704869501778285, "language_loss": 0.90462255, "learning_rate": 3.09307943925077e-06, "loss": 0.93136871, "num_input_tokens_seen": 2560730, "step": 122, "time_per_iteration": 2.930413246154785 }, { "auxiliary_loss_clip": 0.01521826, "auxiliary_loss_mlp": 0.01146566, "balance_loss_clip": 1.1681807, "balance_loss_mlp": 1.07861674, "epoch": 0.007395160078160229, "flos": 24243365848320.0, "grad_norm": 2.4867163179710037, "language_loss": 0.92660481, "learning_rate": 3.0983354046304154e-06, "loss": 0.95328873, "num_input_tokens_seen": 2579550, "step": 123, "time_per_iteration": 2.7484309673309326 }, { "auxiliary_loss_clip": 0.01519363, "auxiliary_loss_mlp": 0.01127611, "balance_loss_clip": 1.16324139, "balance_loss_mlp": 1.0651449, "epoch": 0.007455283330828198, "flos": 31759792099200.0, "grad_norm": 2.366639004459226, "language_loss": 0.71187961, "learning_rate": 3.103548811118979e-06, "loss": 0.73834932, "num_input_tokens_seen": 2600390, "step": 124, "time_per_iteration": 2.8419976234436035 }, { "auxiliary_loss_clip": 0.01506936, "auxiliary_loss_mlp": 0.01125571, "balance_loss_clip": 1.16464007, "balance_loss_mlp": 1.06167519, "epoch": 0.007515406583496167, "flos": 26615157822720.0, "grad_norm": 2.1632751269766106, "language_loss": 0.88450015, "learning_rate": 3.108720342404542e-06, "loss": 0.91082525, "num_input_tokens_seen": 2620770, "step": 125, "time_per_iteration": 2.823296308517456 }, { "auxiliary_loss_clip": 0.01522239, "auxiliary_loss_mlp": 0.01142214, "balance_loss_clip": 1.16456664, "balance_loss_mlp": 1.07912827, "epoch": 0.007575529836164136, "flos": 18223696350720.0, "grad_norm": 2.6632616920164067, "language_loss": 0.82381976, "learning_rate": 3.1138506658316945e-06, "loss": 0.85046428, "num_input_tokens_seen": 2639900, "step": 126, "time_per_iteration": 2.7325809001922607 }, { "auxiliary_loss_clip": 0.015153, "auxiliary_loss_mlp": 0.01142869, "balance_loss_clip": 1.16330886, "balance_loss_mlp": 1.08088017, "epoch": 0.007635653088832106, "flos": 21580410228480.0, "grad_norm": 3.925284628341409, "language_loss": 0.6743899, "learning_rate": 3.1189404329183404e-06, "loss": 0.7009716, "num_input_tokens_seen": 2657450, "step": 127, "time_per_iteration": 2.709821939468384 }, { "auxiliary_loss_clip": 0.01503057, "auxiliary_loss_mlp": 0.01132416, "balance_loss_clip": 1.165169, "balance_loss_mlp": 1.06861567, "epoch": 0.007695776341500075, "flos": 25375054723200.0, "grad_norm": 2.0535131533503734, "language_loss": 0.8819322, "learning_rate": 3.1239902798522317e-06, "loss": 0.90828693, "num_input_tokens_seen": 2678150, "step": 128, "time_per_iteration": 2.764707565307617 }, { "auxiliary_loss_clip": 0.01505955, "auxiliary_loss_mlp": 0.01144223, "balance_loss_clip": 1.16043079, "balance_loss_mlp": 1.08042252, "epoch": 0.007755899594168045, "flos": 22343906741760.0, "grad_norm": 2.6427711693827005, "language_loss": 0.84719259, "learning_rate": 3.129000827968184e-06, "loss": 0.87369436, "num_input_tokens_seen": 2698290, "step": 129, "time_per_iteration": 2.7472774982452393 }, { "auxiliary_loss_clip": 0.01497871, "auxiliary_loss_mlp": 0.01130211, "balance_loss_clip": 1.15871263, "balance_loss_mlp": 1.06655347, "epoch": 0.007816022846836013, "flos": 22638230784000.0, "grad_norm": 2.366492959329914, "language_loss": 0.97564614, "learning_rate": 3.133972684206866e-06, "loss": 1.00192702, "num_input_tokens_seen": 2717630, "step": 130, "time_per_iteration": 2.6955018043518066 }, { "auxiliary_loss_clip": 0.01492272, "auxiliary_loss_mlp": 0.01134965, "balance_loss_clip": 1.15630865, "balance_loss_mlp": 1.06987715, "epoch": 0.007876146099503984, "flos": 18182901479040.0, "grad_norm": 2.2164470079204572, "language_loss": 0.82658112, "learning_rate": 3.138906441556014e-06, "loss": 0.85285342, "num_input_tokens_seen": 2735835, "step": 131, "time_per_iteration": 2.722247362136841 }, { "auxiliary_loss_clip": 0.01500937, "auxiliary_loss_mlp": 0.01128359, "balance_loss_clip": 1.15885806, "balance_loss_mlp": 1.06694245, "epoch": 0.007936269352171952, "flos": 27119486730240.0, "grad_norm": 2.7663180664822193, "language_loss": 0.82781422, "learning_rate": 3.143802679474861e-06, "loss": 0.85410714, "num_input_tokens_seen": 2756335, "step": 132, "time_per_iteration": 2.7937612533569336 }, { "auxiliary_loss_clip": 0.01491919, "auxiliary_loss_mlp": 0.01128624, "balance_loss_clip": 1.15346444, "balance_loss_mlp": 1.0664922, "epoch": 0.007996392604839923, "flos": 19026335710080.0, "grad_norm": 2.182366740159355, "language_loss": 0.95499313, "learning_rate": 3.1486619643025565e-06, "loss": 0.98119843, "num_input_tokens_seen": 2775090, "step": 133, "time_per_iteration": 2.7380354404449463 }, { "auxiliary_loss_clip": 0.01487746, "auxiliary_loss_mlp": 0.0112871, "balance_loss_clip": 1.16170454, "balance_loss_mlp": 1.06843781, "epoch": 0.008056515857507891, "flos": 25484151306240.0, "grad_norm": 1.8164116645967854, "language_loss": 0.73478442, "learning_rate": 3.153484849651286e-06, "loss": 0.76094896, "num_input_tokens_seen": 2795320, "step": 134, "time_per_iteration": 2.7483408451080322 }, { "auxiliary_loss_clip": 0.01484621, "auxiliary_loss_mlp": 0.01132134, "balance_loss_clip": 1.15115011, "balance_loss_mlp": 1.06695068, "epoch": 0.00811663911017586, "flos": 20557566541440.0, "grad_norm": 5.027018494085059, "language_loss": 0.88792509, "learning_rate": 3.1582718767847806e-06, "loss": 0.91409266, "num_input_tokens_seen": 2812815, "step": 135, "time_per_iteration": 2.6838128566741943 }, { "auxiliary_loss_clip": 0.01487119, "auxiliary_loss_mlp": 0.0113257, "balance_loss_clip": 1.15490174, "balance_loss_mlp": 1.06714821, "epoch": 0.00817676236284383, "flos": 18799738761600.0, "grad_norm": 1.9282722528396903, "language_loss": 0.89138198, "learning_rate": 3.1630235749828485e-06, "loss": 0.91757882, "num_input_tokens_seen": 2830445, "step": 136, "time_per_iteration": 2.726475238800049 }, { "auxiliary_loss_clip": 0.01483417, "auxiliary_loss_mlp": 0.01110724, "balance_loss_clip": 1.1494019, "balance_loss_mlp": 1.05078554, "epoch": 0.008236885615511799, "flos": 23873593288320.0, "grad_norm": 2.2984339413846078, "language_loss": 0.84091324, "learning_rate": 3.1677404618925676e-06, "loss": 0.86685467, "num_input_tokens_seen": 2846965, "step": 137, "time_per_iteration": 7.4708640575408936 }, { "auxiliary_loss_clip": 0.01481848, "auxiliary_loss_mlp": 0.01118837, "balance_loss_clip": 1.1500535, "balance_loss_mlp": 1.05894589, "epoch": 0.00829700886817977, "flos": 24643626076800.0, "grad_norm": 1.69378413504035, "language_loss": 0.9018681, "learning_rate": 3.1724230438666953e-06, "loss": 0.92787492, "num_input_tokens_seen": 2867520, "step": 138, "time_per_iteration": 4.311830520629883 }, { "auxiliary_loss_clip": 0.01469655, "auxiliary_loss_mlp": 0.01123604, "balance_loss_clip": 1.14824438, "balance_loss_mlp": 1.05904007, "epoch": 0.008357132120847738, "flos": 25262007644160.0, "grad_norm": 2.1515203004813785, "language_loss": 0.91478992, "learning_rate": 3.177071816289865e-06, "loss": 0.94072247, "num_input_tokens_seen": 2885675, "step": 139, "time_per_iteration": 2.7678122520446777 }, { "auxiliary_loss_clip": 0.01486799, "auxiliary_loss_mlp": 0.01124947, "balance_loss_clip": 1.15521085, "balance_loss_mlp": 1.06195688, "epoch": 0.008417255373515706, "flos": 27344898529920.0, "grad_norm": 2.305315677890536, "language_loss": 0.85667789, "learning_rate": 3.181687263893095e-06, "loss": 0.88279533, "num_input_tokens_seen": 2905960, "step": 140, "time_per_iteration": 2.8557639122009277 }, { "auxiliary_loss_clip": 0.01473538, "auxiliary_loss_mlp": 0.01122701, "balance_loss_clip": 1.14923954, "balance_loss_mlp": 1.06166625, "epoch": 0.008477378626183677, "flos": 17639070589440.0, "grad_norm": 2.3443620963590455, "language_loss": 0.84346074, "learning_rate": 3.186269861057098e-06, "loss": 0.86942315, "num_input_tokens_seen": 2922780, "step": 141, "time_per_iteration": 2.7656807899475098 }, { "auxiliary_loss_clip": 0.01477141, "auxiliary_loss_mlp": 0.01135217, "balance_loss_clip": 1.14718878, "balance_loss_mlp": 1.07360983, "epoch": 0.008537501878851645, "flos": 13881342297600.0, "grad_norm": 2.29020652115343, "language_loss": 0.8105557, "learning_rate": 3.1908200721048745e-06, "loss": 0.83667928, "num_input_tokens_seen": 2938765, "step": 142, "time_per_iteration": 2.747598171234131 }, { "auxiliary_loss_clip": 0.01378886, "auxiliary_loss_mlp": 0.01060004, "balance_loss_clip": 1.19240355, "balance_loss_mlp": 1.03406358, "epoch": 0.008597625131519616, "flos": 71248101281280.0, "grad_norm": 1.056887207538052, "language_loss": 0.66899812, "learning_rate": 3.195338351584042e-06, "loss": 0.69338703, "num_input_tokens_seen": 3006665, "step": 143, "time_per_iteration": 3.346982002258301 }, { "auxiliary_loss_clip": 0.01467707, "auxiliary_loss_mlp": 0.01123721, "balance_loss_clip": 1.14666772, "balance_loss_mlp": 1.06273365, "epoch": 0.008657748384187584, "flos": 17602836744960.0, "grad_norm": 2.6467048454978523, "language_loss": 0.84356761, "learning_rate": 3.1998251445393258e-06, "loss": 0.86948192, "num_input_tokens_seen": 3024335, "step": 144, "time_per_iteration": 2.762087345123291 }, { "auxiliary_loss_clip": 0.01455701, "auxiliary_loss_mlp": 0.01114511, "balance_loss_clip": 1.14058816, "balance_loss_mlp": 1.05085373, "epoch": 0.008717871636855555, "flos": 19715317459200.0, "grad_norm": 1.8692883316747366, "language_loss": 0.88353741, "learning_rate": 3.204280886775619e-06, "loss": 0.90923953, "num_input_tokens_seen": 3043300, "step": 145, "time_per_iteration": 2.7050039768218994 }, { "auxiliary_loss_clip": 0.01470385, "auxiliary_loss_mlp": 0.01121817, "balance_loss_clip": 1.14247775, "balance_loss_mlp": 1.05873132, "epoch": 0.008777994889523523, "flos": 24717422568960.0, "grad_norm": 1.860830881508538, "language_loss": 0.86182559, "learning_rate": 3.208706005112005e-06, "loss": 0.88774765, "num_input_tokens_seen": 3064610, "step": 146, "time_per_iteration": 2.741013288497925 }, { "auxiliary_loss_clip": 0.01356998, "auxiliary_loss_mlp": 0.01029681, "balance_loss_clip": 1.18072379, "balance_loss_mlp": 1.00431335, "epoch": 0.008838118142191492, "flos": 70132067758080.0, "grad_norm": 0.8598047517885464, "language_loss": 0.60122073, "learning_rate": 3.213100917627104e-06, "loss": 0.6250875, "num_input_tokens_seen": 3130385, "step": 147, "time_per_iteration": 3.27382230758667 }, { "auxiliary_loss_clip": 0.01463009, "auxiliary_loss_mlp": 0.01123472, "balance_loss_clip": 1.14658976, "balance_loss_mlp": 1.06548882, "epoch": 0.008898241394859462, "flos": 20044797937920.0, "grad_norm": 1.8116070485228748, "language_loss": 0.84620225, "learning_rate": 3.2174660338961135e-06, "loss": 0.87206709, "num_input_tokens_seen": 3149760, "step": 148, "time_per_iteration": 2.72910475730896 }, { "auxiliary_loss_clip": 0.01466623, "auxiliary_loss_mlp": 0.01144944, "balance_loss_clip": 1.14777792, "balance_loss_mlp": 1.07985532, "epoch": 0.008958364647527431, "flos": 10743611685120.0, "grad_norm": 2.5530775415688205, "language_loss": 0.88680327, "learning_rate": 3.2218017552198588e-06, "loss": 0.91291893, "num_input_tokens_seen": 3164500, "step": 149, "time_per_iteration": 2.688528537750244 }, { "auxiliary_loss_clip": 0.01463954, "auxiliary_loss_mlp": 0.01114885, "balance_loss_clip": 1.14290714, "balance_loss_mlp": 1.05728304, "epoch": 0.009018487900195401, "flos": 29127467802240.0, "grad_norm": 2.1996557200804823, "language_loss": 0.93269086, "learning_rate": 3.226108474846181e-06, "loss": 0.95847929, "num_input_tokens_seen": 3182455, "step": 150, "time_per_iteration": 2.7901580333709717 }, { "auxiliary_loss_clip": 0.01450819, "auxiliary_loss_mlp": 0.01114571, "balance_loss_clip": 1.13812149, "balance_loss_mlp": 1.05839944, "epoch": 0.00907861115286337, "flos": 32963661354240.0, "grad_norm": 4.690239135210318, "language_loss": 0.7421813, "learning_rate": 3.2303865781839817e-06, "loss": 0.7678352, "num_input_tokens_seen": 3203995, "step": 151, "time_per_iteration": 2.79590106010437 }, { "auxiliary_loss_clip": 0.01463077, "auxiliary_loss_mlp": 0.01128244, "balance_loss_clip": 1.14311624, "balance_loss_mlp": 1.06954527, "epoch": 0.009138734405531338, "flos": 21762441377280.0, "grad_norm": 4.291097242497492, "language_loss": 0.88460332, "learning_rate": 3.234636443010188e-06, "loss": 0.9105165, "num_input_tokens_seen": 3222575, "step": 152, "time_per_iteration": 2.701775550842285 }, { "auxiliary_loss_clip": 0.01462099, "auxiliary_loss_mlp": 0.01122264, "balance_loss_clip": 1.14743185, "balance_loss_mlp": 1.06275451, "epoch": 0.009198857658199309, "flos": 20842517134080.0, "grad_norm": 3.861411936226758, "language_loss": 0.83918798, "learning_rate": 3.238858439669943e-06, "loss": 0.8650316, "num_input_tokens_seen": 3240180, "step": 153, "time_per_iteration": 2.730654716491699 }, { "auxiliary_loss_clip": 0.01453756, "auxiliary_loss_mlp": 0.01136244, "balance_loss_clip": 1.14024806, "balance_loss_mlp": 1.07554269, "epoch": 0.009258980910867277, "flos": 24827381078400.0, "grad_norm": 1.8788427995178905, "language_loss": 0.89924759, "learning_rate": 3.2430529312702712e-06, "loss": 0.92514759, "num_input_tokens_seen": 3259800, "step": 154, "time_per_iteration": 2.8150386810302734 }, { "auxiliary_loss_clip": 0.01457041, "auxiliary_loss_mlp": 0.01148182, "balance_loss_clip": 1.1422174, "balance_loss_mlp": 1.08934021, "epoch": 0.009319104163535248, "flos": 28767786963840.0, "grad_norm": 2.155148564981828, "language_loss": 0.89730597, "learning_rate": 3.2472202738674737e-06, "loss": 0.9233582, "num_input_tokens_seen": 3280400, "step": 155, "time_per_iteration": 2.7780215740203857 }, { "auxiliary_loss_clip": 0.01462257, "auxiliary_loss_mlp": 0.01115972, "balance_loss_clip": 1.14140153, "balance_loss_mlp": 1.0580368, "epoch": 0.009379227416203216, "flos": 16582004219520.0, "grad_norm": 2.6722626388977986, "language_loss": 0.86758631, "learning_rate": 3.2513608166485063e-06, "loss": 0.8933686, "num_input_tokens_seen": 3297600, "step": 156, "time_per_iteration": 2.7195818424224854 }, { "auxiliary_loss_clip": 0.01460326, "auxiliary_loss_mlp": 0.01116019, "balance_loss_clip": 1.14530039, "balance_loss_mlp": 1.05770147, "epoch": 0.009439350668871187, "flos": 18329919845760.0, "grad_norm": 2.3212743339319926, "language_loss": 0.99652225, "learning_rate": 3.2554749021065498e-06, "loss": 1.0222857, "num_input_tokens_seen": 3313635, "step": 157, "time_per_iteration": 2.7530624866485596 }, { "auxiliary_loss_clip": 0.01445494, "auxiliary_loss_mlp": 0.01139991, "balance_loss_clip": 1.14011836, "balance_loss_mlp": 1.08162606, "epoch": 0.009499473921539155, "flos": 24349912565760.0, "grad_norm": 2.2650385025378834, "language_loss": 0.88388717, "learning_rate": 3.2595628662110186e-06, "loss": 0.90974212, "num_input_tokens_seen": 3333735, "step": 158, "time_per_iteration": 2.744640588760376 }, { "auxiliary_loss_clip": 0.01451838, "auxiliary_loss_mlp": 0.01122147, "balance_loss_clip": 1.13977575, "balance_loss_mlp": 1.0630666, "epoch": 0.009559597174207124, "flos": 16399326625920.0, "grad_norm": 2.1807440045696165, "language_loss": 0.86407602, "learning_rate": 3.2636250385721982e-06, "loss": 0.88981581, "num_input_tokens_seen": 3348800, "step": 159, "time_per_iteration": 2.7330005168914795 }, { "auxiliary_loss_clip": 0.01441743, "auxiliary_loss_mlp": 0.01137796, "balance_loss_clip": 1.13474953, "balance_loss_mlp": 1.07752383, "epoch": 0.009619720426875094, "flos": 22856890826880.0, "grad_norm": 1.7296815250329798, "language_loss": 0.86756837, "learning_rate": 3.2676617426007263e-06, "loss": 0.89336377, "num_input_tokens_seen": 3368595, "step": 160, "time_per_iteration": 2.844817876815796 }, { "auxiliary_loss_clip": 0.01447614, "auxiliary_loss_mlp": 0.0112266, "balance_loss_clip": 1.13978457, "balance_loss_mlp": 1.06725168, "epoch": 0.009679843679543063, "flos": 19135001329920.0, "grad_norm": 2.462408333273543, "language_loss": 0.91543746, "learning_rate": 3.2716732956621042e-06, "loss": 0.94114017, "num_input_tokens_seen": 3384975, "step": 161, "time_per_iteration": 2.667666435241699 }, { "auxiliary_loss_clip": 0.01453392, "auxiliary_loss_mlp": 0.01111804, "balance_loss_clip": 1.14104879, "balance_loss_mlp": 1.05610919, "epoch": 0.009739966932211033, "flos": 20302995876480.0, "grad_norm": 1.7914334411859298, "language_loss": 0.91582954, "learning_rate": 3.2756600092264203e-06, "loss": 0.94148147, "num_input_tokens_seen": 3404755, "step": 162, "time_per_iteration": 2.6779961585998535 }, { "auxiliary_loss_clip": 0.0131522, "auxiliary_loss_mlp": 0.01056953, "balance_loss_clip": 1.15019548, "balance_loss_mlp": 1.03358769, "epoch": 0.009800090184879002, "flos": 67034234177280.0, "grad_norm": 1.183297200633083, "language_loss": 0.72292268, "learning_rate": 3.279622189013474e-06, "loss": 0.74664438, "num_input_tokens_seen": 3467210, "step": 163, "time_per_iteration": 3.226755142211914 }, { "auxiliary_loss_clip": 0.01439788, "auxiliary_loss_mlp": 0.01116102, "balance_loss_clip": 1.13873029, "balance_loss_mlp": 1.05921507, "epoch": 0.00986021343754697, "flos": 17164690646400.0, "grad_norm": 3.3372881081540937, "language_loss": 0.84684807, "learning_rate": 3.283560135133457e-06, "loss": 0.87240696, "num_input_tokens_seen": 3483220, "step": 164, "time_per_iteration": 2.768935203552246 }, { "auxiliary_loss_clip": 0.01430933, "auxiliary_loss_mlp": 0.0110117, "balance_loss_clip": 1.13048434, "balance_loss_mlp": 1.04533219, "epoch": 0.00992033669021494, "flos": 17749424148480.0, "grad_norm": 4.079659732294038, "language_loss": 0.89080763, "learning_rate": 3.2874741422233565e-06, "loss": 0.91612864, "num_input_tokens_seen": 3501465, "step": 165, "time_per_iteration": 2.673292875289917 }, { "auxiliary_loss_clip": 0.01433192, "auxiliary_loss_mlp": 0.01128138, "balance_loss_clip": 1.13111067, "balance_loss_mlp": 1.06819916, "epoch": 0.00998045994288291, "flos": 25297164080640.0, "grad_norm": 1.7359539169577796, "language_loss": 0.79931343, "learning_rate": 3.2913644995792465e-06, "loss": 0.82492673, "num_input_tokens_seen": 3520480, "step": 166, "time_per_iteration": 2.762742757797241 }, { "auxiliary_loss_clip": 0.01438026, "auxiliary_loss_mlp": 0.01129718, "balance_loss_clip": 1.13488948, "balance_loss_mlp": 1.07066131, "epoch": 0.01004058319555088, "flos": 32298954220800.0, "grad_norm": 2.3252666324684585, "language_loss": 0.92125285, "learning_rate": 3.2952314912845914e-06, "loss": 0.94693023, "num_input_tokens_seen": 3539570, "step": 167, "time_per_iteration": 2.970964193344116 }, { "auxiliary_loss_clip": 0.01429698, "auxiliary_loss_mlp": 0.01133324, "balance_loss_clip": 1.13294363, "balance_loss_mlp": 1.07734346, "epoch": 0.010100706448218848, "flos": 11319941404800.0, "grad_norm": 13.512238716069085, "language_loss": 0.90781063, "learning_rate": 3.299075396334735e-06, "loss": 0.93344086, "num_input_tokens_seen": 3555465, "step": 168, "time_per_iteration": 2.8039841651916504 }, { "auxiliary_loss_clip": 0.01424367, "auxiliary_loss_mlp": 0.01104795, "balance_loss_clip": 1.12848639, "balance_loss_mlp": 1.04700291, "epoch": 0.010160829700886819, "flos": 29719491765120.0, "grad_norm": 1.6705351130563955, "language_loss": 0.87173021, "learning_rate": 3.3028964887576868e-06, "loss": 0.89702177, "num_input_tokens_seen": 3578970, "step": 169, "time_per_iteration": 2.8215444087982178 }, { "auxiliary_loss_clip": 0.01425902, "auxiliary_loss_mlp": 0.01110538, "balance_loss_clip": 1.13139379, "balance_loss_mlp": 1.05317438, "epoch": 0.010220952953554787, "flos": 20412343854720.0, "grad_norm": 1.7404257397879006, "language_loss": 0.84622329, "learning_rate": 3.306695037731344e-06, "loss": 0.87158769, "num_input_tokens_seen": 3597275, "step": 170, "time_per_iteration": 2.6759181022644043 }, { "auxiliary_loss_clip": 0.0143612, "auxiliary_loss_mlp": 0.01137162, "balance_loss_clip": 1.13149834, "balance_loss_mlp": 1.07874942, "epoch": 0.010281076206222756, "flos": 31285124847360.0, "grad_norm": 2.174517661608974, "language_loss": 0.89936447, "learning_rate": 3.3104713076972827e-06, "loss": 0.92509729, "num_input_tokens_seen": 3618905, "step": 171, "time_per_iteration": 2.800394058227539 }, { "auxiliary_loss_clip": 0.01430673, "auxiliary_loss_mlp": 0.01108779, "balance_loss_clip": 1.1347487, "balance_loss_mlp": 1.05382347, "epoch": 0.010341199458890726, "flos": 21982286568960.0, "grad_norm": 1.938241860949196, "language_loss": 0.88895655, "learning_rate": 3.314225558471224e-06, "loss": 0.91435111, "num_input_tokens_seen": 3639610, "step": 172, "time_per_iteration": 2.755190849304199 }, { "auxiliary_loss_clip": 0.01418638, "auxiliary_loss_mlp": 0.01118471, "balance_loss_clip": 1.12744904, "balance_loss_mlp": 1.06270456, "epoch": 0.010401322711558695, "flos": 30810529422720.0, "grad_norm": 1.7925778946034159, "language_loss": 0.80943549, "learning_rate": 3.317958045350308e-06, "loss": 0.83480656, "num_input_tokens_seen": 3664030, "step": 173, "time_per_iteration": 2.751945734024048 }, { "auxiliary_loss_clip": 0.01429615, "auxiliary_loss_mlp": 0.01107965, "balance_loss_clip": 1.13108575, "balance_loss_mlp": 1.05534625, "epoch": 0.010461445964226665, "flos": 24715124098560.0, "grad_norm": 2.1644843911099216, "language_loss": 0.82763064, "learning_rate": 3.3216690192172596e-06, "loss": 0.85300648, "num_input_tokens_seen": 3683615, "step": 174, "time_per_iteration": 2.676630735397339 }, { "auxiliary_loss_clip": 0.01423443, "auxiliary_loss_mlp": 0.01120976, "balance_loss_clip": 1.12816644, "balance_loss_mlp": 1.06523335, "epoch": 0.010521569216894634, "flos": 27710361457920.0, "grad_norm": 2.331494685324117, "language_loss": 0.72837007, "learning_rate": 3.325358726641591e-06, "loss": 0.75381434, "num_input_tokens_seen": 3704540, "step": 175, "time_per_iteration": 2.6876866817474365 }, { "auxiliary_loss_clip": 0.01425333, "auxiliary_loss_mlp": 0.01127215, "balance_loss_clip": 1.12866652, "balance_loss_mlp": 1.06980324, "epoch": 0.010581692469562603, "flos": 12458346122880.0, "grad_norm": 4.811985773634618, "language_loss": 0.97983754, "learning_rate": 3.329027409977902e-06, "loss": 1.00536299, "num_input_tokens_seen": 3721320, "step": 176, "time_per_iteration": 2.8159937858581543 }, { "auxiliary_loss_clip": 0.0141033, "auxiliary_loss_mlp": 0.01130651, "balance_loss_clip": 1.12546706, "balance_loss_mlp": 1.07738805, "epoch": 0.010641815722230573, "flos": 19427601519360.0, "grad_norm": 2.8326118759658585, "language_loss": 0.76926064, "learning_rate": 3.3326753074614087e-06, "loss": 0.7946704, "num_input_tokens_seen": 3739385, "step": 177, "time_per_iteration": 5.7707555294036865 }, { "auxiliary_loss_clip": 0.01421858, "auxiliary_loss_mlp": 0.01104718, "balance_loss_clip": 1.12455702, "balance_loss_mlp": 1.05002475, "epoch": 0.010701938974898541, "flos": 18332577452160.0, "grad_norm": 2.6517911185675014, "language_loss": 0.76942402, "learning_rate": 3.3363026533007716e-06, "loss": 0.79468977, "num_input_tokens_seen": 3756360, "step": 178, "time_per_iteration": 4.337082386016846 }, { "auxiliary_loss_clip": 0.01430293, "auxiliary_loss_mlp": 0.01109414, "balance_loss_clip": 1.1303575, "balance_loss_mlp": 1.05252683, "epoch": 0.010762062227566512, "flos": 19203985399680.0, "grad_norm": 2.6843360372821925, "language_loss": 0.84022826, "learning_rate": 3.3399096777683303e-06, "loss": 0.86562538, "num_input_tokens_seen": 3773930, "step": 179, "time_per_iteration": 2.6826629638671875 }, { "auxiliary_loss_clip": 0.01418094, "auxiliary_loss_mlp": 0.01108667, "balance_loss_clip": 1.12202275, "balance_loss_mlp": 1.05158973, "epoch": 0.01082218548023448, "flos": 31425427370880.0, "grad_norm": 2.0256655839140083, "language_loss": 0.83674574, "learning_rate": 3.3434966072878213e-06, "loss": 0.86201334, "num_input_tokens_seen": 3793630, "step": 180, "time_per_iteration": 2.7483785152435303 }, { "auxiliary_loss_clip": 0.01421326, "auxiliary_loss_mlp": 0.01120347, "balance_loss_clip": 1.12740374, "balance_loss_mlp": 1.0646286, "epoch": 0.01088230873290245, "flos": 25046436170880.0, "grad_norm": 3.253139118534122, "language_loss": 0.77958715, "learning_rate": 3.3470636645196674e-06, "loss": 0.80500388, "num_input_tokens_seen": 3813610, "step": 181, "time_per_iteration": 2.698941469192505 }, { "auxiliary_loss_clip": 0.01414948, "auxiliary_loss_mlp": 0.01130231, "balance_loss_clip": 1.12188053, "balance_loss_mlp": 1.07577634, "epoch": 0.01094243198557042, "flos": 22893411980160.0, "grad_norm": 2.56637338396407, "language_loss": 0.76438594, "learning_rate": 3.3506110684439156e-06, "loss": 0.78983772, "num_input_tokens_seen": 3831390, "step": 182, "time_per_iteration": 2.6951375007629395 }, { "auxiliary_loss_clip": 0.01412526, "auxiliary_loss_mlp": 0.01126665, "balance_loss_clip": 1.12167537, "balance_loss_mlp": 1.0702554, "epoch": 0.011002555238238388, "flos": 17165049782400.0, "grad_norm": 2.083158831639218, "language_loss": 0.87484097, "learning_rate": 3.3541390344409054e-06, "loss": 0.90023291, "num_input_tokens_seen": 3849705, "step": 183, "time_per_iteration": 2.733753204345703 }, { "auxiliary_loss_clip": 0.01415922, "auxiliary_loss_mlp": 0.01110585, "balance_loss_clip": 1.12529624, "balance_loss_mlp": 1.05922985, "epoch": 0.011062678490906358, "flos": 22310150935680.0, "grad_norm": 3.105080129831269, "language_loss": 0.86911464, "learning_rate": 3.357647774369736e-06, "loss": 0.89437973, "num_input_tokens_seen": 3869230, "step": 184, "time_per_iteration": 2.6783828735351562 }, { "auxiliary_loss_clip": 0.01410648, "auxiliary_loss_mlp": 0.01108321, "balance_loss_clip": 1.12499499, "balance_loss_mlp": 1.05203021, "epoch": 0.011122801743574327, "flos": 24388373053440.0, "grad_norm": 1.8650514063709744, "language_loss": 0.83885491, "learning_rate": 3.3611374966446085e-06, "loss": 0.86404455, "num_input_tokens_seen": 3889735, "step": 185, "time_per_iteration": 2.6863327026367188 }, { "auxiliary_loss_clip": 0.01419384, "auxiliary_loss_mlp": 0.01107812, "balance_loss_clip": 1.12355363, "balance_loss_mlp": 1.04999495, "epoch": 0.011182924996242297, "flos": 18150258994560.0, "grad_norm": 2.8933407749520743, "language_loss": 0.71027243, "learning_rate": 3.3646084063091142e-06, "loss": 0.73554444, "num_input_tokens_seen": 3908855, "step": 186, "time_per_iteration": 2.819805383682251 }, { "auxiliary_loss_clip": 0.01415699, "auxiliary_loss_mlp": 0.01108312, "balance_loss_clip": 1.12262082, "balance_loss_mlp": 1.05574071, "epoch": 0.011243048248910266, "flos": 15486800584320.0, "grad_norm": 2.4244794785226733, "language_loss": 1.01999915, "learning_rate": 3.3680607051085194e-06, "loss": 1.04523933, "num_input_tokens_seen": 3923865, "step": 187, "time_per_iteration": 2.65875506401062 }, { "auxiliary_loss_clip": 0.01404987, "auxiliary_loss_mlp": 0.01107995, "balance_loss_clip": 1.12269068, "balance_loss_mlp": 1.05253887, "epoch": 0.011303171501578235, "flos": 40916868986880.0, "grad_norm": 2.0089158406542524, "language_loss": 0.74998611, "learning_rate": 3.371494591560139e-06, "loss": 0.77511597, "num_input_tokens_seen": 3946870, "step": 188, "time_per_iteration": 2.8631174564361572 }, { "auxiliary_loss_clip": 0.01298557, "auxiliary_loss_mlp": 0.01067058, "balance_loss_clip": 1.14124644, "balance_loss_mlp": 1.04474187, "epoch": 0.011363294754246205, "flos": 66302697790080.0, "grad_norm": 0.7620731385906954, "language_loss": 0.56192517, "learning_rate": 3.3749102610218297e-06, "loss": 0.5855813, "num_input_tokens_seen": 4010005, "step": 189, "time_per_iteration": 3.2704074382781982 }, { "auxiliary_loss_clip": 0.01402206, "auxiliary_loss_mlp": 0.011217, "balance_loss_clip": 1.11730003, "balance_loss_mlp": 1.06662548, "epoch": 0.011423418006914174, "flos": 24900279730560.0, "grad_norm": 2.640219984380571, "language_loss": 0.95085573, "learning_rate": 3.3783079057586833e-06, "loss": 0.97609472, "num_input_tokens_seen": 4029035, "step": 190, "time_per_iteration": 2.6898255348205566 }, { "auxiliary_loss_clip": 0.01405088, "auxiliary_loss_mlp": 0.01103893, "balance_loss_clip": 1.11979234, "balance_loss_mlp": 1.05167961, "epoch": 0.011483541259582144, "flos": 19791879298560.0, "grad_norm": 4.133813113517846, "language_loss": 0.8463847, "learning_rate": 3.3816877150079665e-06, "loss": 0.8714745, "num_input_tokens_seen": 4046995, "step": 191, "time_per_iteration": 2.71589994430542 }, { "auxiliary_loss_clip": 0.01403196, "auxiliary_loss_mlp": 0.01118385, "balance_loss_clip": 1.11570346, "balance_loss_mlp": 1.06624269, "epoch": 0.011543664512250112, "flos": 26176939896960.0, "grad_norm": 2.0065119945705887, "language_loss": 0.91894913, "learning_rate": 3.385049875042367e-06, "loss": 0.94416493, "num_input_tokens_seen": 4065865, "step": 192, "time_per_iteration": 2.775974988937378 }, { "auxiliary_loss_clip": 0.01398496, "auxiliary_loss_mlp": 0.01118924, "balance_loss_clip": 1.11665678, "balance_loss_mlp": 1.06117916, "epoch": 0.011603787764918083, "flos": 23768985905280.0, "grad_norm": 2.10033302347605, "language_loss": 0.86923265, "learning_rate": 3.3883945692315938e-06, "loss": 0.89440691, "num_input_tokens_seen": 4085305, "step": 193, "time_per_iteration": 2.792947292327881 }, { "auxiliary_loss_clip": 0.01402535, "auxiliary_loss_mlp": 0.01102276, "balance_loss_clip": 1.11514282, "balance_loss_mlp": 1.05061066, "epoch": 0.011663911017586051, "flos": 25954688494080.0, "grad_norm": 2.2253165290939076, "language_loss": 0.92296255, "learning_rate": 3.3917219781023906e-06, "loss": 0.94801068, "num_input_tokens_seen": 4105185, "step": 194, "time_per_iteration": 2.6886558532714844 }, { "auxiliary_loss_clip": 0.01407209, "auxiliary_loss_mlp": 0.01108641, "balance_loss_clip": 1.11930478, "balance_loss_mlp": 1.05630851, "epoch": 0.01172403427025402, "flos": 17895149625600.0, "grad_norm": 2.4241235245311503, "language_loss": 0.89768875, "learning_rate": 3.3950322793970014e-06, "loss": 0.92284721, "num_input_tokens_seen": 4123160, "step": 195, "time_per_iteration": 2.654517889022827 }, { "auxiliary_loss_clip": 0.01400339, "auxiliary_loss_mlp": 0.01114485, "balance_loss_clip": 1.11779022, "balance_loss_mlp": 1.05981565, "epoch": 0.01178415752292199, "flos": 17894539094400.0, "grad_norm": 3.1130999341447385, "language_loss": 0.86019921, "learning_rate": 3.3983256481301445e-06, "loss": 0.88534749, "num_input_tokens_seen": 4140425, "step": 196, "time_per_iteration": 2.643598794937134 }, { "auxiliary_loss_clip": 0.01398067, "auxiliary_loss_mlp": 0.01107082, "balance_loss_clip": 1.11464977, "balance_loss_mlp": 1.05308056, "epoch": 0.011844280775589959, "flos": 22893555634560.0, "grad_norm": 3.666533247373141, "language_loss": 0.93052697, "learning_rate": 3.4016022566445335e-06, "loss": 0.95557845, "num_input_tokens_seen": 4159555, "step": 197, "time_per_iteration": 2.7120354175567627 }, { "auxiliary_loss_clip": 0.01396424, "auxiliary_loss_mlp": 0.01112388, "balance_loss_clip": 1.11625624, "balance_loss_mlp": 1.05943501, "epoch": 0.01190440402825793, "flos": 26980333441920.0, "grad_norm": 1.9614954763997827, "language_loss": 0.79043806, "learning_rate": 3.4048622746649966e-06, "loss": 0.81552619, "num_input_tokens_seen": 4180480, "step": 198, "time_per_iteration": 2.774059772491455 }, { "auxiliary_loss_clip": 0.0139305, "auxiliary_loss_mlp": 0.01120527, "balance_loss_clip": 1.11708748, "balance_loss_mlp": 1.06821764, "epoch": 0.011964527280925898, "flos": 20521584092160.0, "grad_norm": 1.8823459083646328, "language_loss": 0.88239717, "learning_rate": 3.4081058693512278e-06, "loss": 0.90753293, "num_input_tokens_seen": 4198835, "step": 199, "time_per_iteration": 2.6808881759643555 }, { "auxiliary_loss_clip": 0.01403709, "auxiliary_loss_mlp": 0.0112899, "balance_loss_clip": 1.11951399, "balance_loss_mlp": 1.07200766, "epoch": 0.012024650533593867, "flos": 27745984771200.0, "grad_norm": 2.0663906916258497, "language_loss": 0.81151628, "learning_rate": 3.411333205349222e-06, "loss": 0.83684325, "num_input_tokens_seen": 4219335, "step": 200, "time_per_iteration": 2.625380516052246 }, { "auxiliary_loss_clip": 0.0140201, "auxiliary_loss_mlp": 0.01104413, "balance_loss_clip": 1.11633158, "balance_loss_mlp": 1.05048287, "epoch": 0.012084773786261837, "flos": 10452017076480.0, "grad_norm": 2.253120238884594, "language_loss": 0.87696433, "learning_rate": 3.4145444448414217e-06, "loss": 0.90202856, "num_input_tokens_seen": 4236940, "step": 201, "time_per_iteration": 2.6062326431274414 }, { "auxiliary_loss_clip": 0.01399494, "auxiliary_loss_mlp": 0.01115643, "balance_loss_clip": 1.11764228, "balance_loss_mlp": 1.0614028, "epoch": 0.012144897038929806, "flos": 23105751229440.0, "grad_norm": 2.088192664231089, "language_loss": 0.84052485, "learning_rate": 3.4177397475956223e-06, "loss": 0.86567622, "num_input_tokens_seen": 4256755, "step": 202, "time_per_iteration": 2.6981592178344727 }, { "auxiliary_loss_clip": 0.01388741, "auxiliary_loss_mlp": 0.0111019, "balance_loss_clip": 1.11006808, "balance_loss_mlp": 1.05771446, "epoch": 0.012205020291597776, "flos": 21033203460480.0, "grad_norm": 1.7861279575653157, "language_loss": 0.89964712, "learning_rate": 3.4209192710126685e-06, "loss": 0.92463642, "num_input_tokens_seen": 4276505, "step": 203, "time_per_iteration": 2.668757438659668 }, { "auxiliary_loss_clip": 0.01276289, "auxiliary_loss_mlp": 0.01095021, "balance_loss_clip": 1.12578154, "balance_loss_mlp": 1.07470798, "epoch": 0.012265143544265745, "flos": 68447785075200.0, "grad_norm": 1.0265297625980543, "language_loss": 0.61255801, "learning_rate": 3.4240831701729837e-06, "loss": 0.63627112, "num_input_tokens_seen": 4330965, "step": 204, "time_per_iteration": 3.161599636077881 }, { "auxiliary_loss_clip": 0.01396271, "auxiliary_loss_mlp": 0.01111806, "balance_loss_clip": 1.11291122, "balance_loss_mlp": 1.05930579, "epoch": 0.012325266796933715, "flos": 17019252478080.0, "grad_norm": 2.3248674300118184, "language_loss": 0.91324663, "learning_rate": 3.4272315978819516e-06, "loss": 0.93832743, "num_input_tokens_seen": 4348200, "step": 205, "time_per_iteration": 2.6764047145843506 }, { "auxiliary_loss_clip": 0.01404558, "auxiliary_loss_mlp": 0.0112167, "balance_loss_clip": 1.11773109, "balance_loss_mlp": 1.06773925, "epoch": 0.012385390049601683, "flos": 20190056538240.0, "grad_norm": 2.1088315130515207, "language_loss": 0.89305568, "learning_rate": 3.4303647047142043e-06, "loss": 0.91831797, "num_input_tokens_seen": 4365460, "step": 206, "time_per_iteration": 2.7157227993011475 }, { "auxiliary_loss_clip": 0.0139534, "auxiliary_loss_mlp": 0.01100957, "balance_loss_clip": 1.11176991, "balance_loss_mlp": 1.04888678, "epoch": 0.012445513302269652, "flos": 16253134272000.0, "grad_norm": 2.399816031687551, "language_loss": 0.95542914, "learning_rate": 3.43348263905683e-06, "loss": 0.9803921, "num_input_tokens_seen": 4383650, "step": 207, "time_per_iteration": 2.611348867416382 }, { "auxiliary_loss_clip": 0.01393005, "auxiliary_loss_mlp": 0.01117764, "balance_loss_clip": 1.11658561, "balance_loss_mlp": 1.06497812, "epoch": 0.012505636554937622, "flos": 23769380954880.0, "grad_norm": 1.8144323603981871, "language_loss": 0.75985783, "learning_rate": 3.436585547151547e-06, "loss": 0.78496552, "num_input_tokens_seen": 4403765, "step": 208, "time_per_iteration": 2.7184154987335205 }, { "auxiliary_loss_clip": 0.0138146, "auxiliary_loss_mlp": 0.01108623, "balance_loss_clip": 1.11071992, "balance_loss_mlp": 1.05576587, "epoch": 0.012565759807605591, "flos": 30591546157440.0, "grad_norm": 2.2326965650696855, "language_loss": 0.98386943, "learning_rate": 3.4396735731358586e-06, "loss": 1.00877023, "num_input_tokens_seen": 4421935, "step": 209, "time_per_iteration": 2.7354249954223633 }, { "auxiliary_loss_clip": 0.01387012, "auxiliary_loss_mlp": 0.0111836, "balance_loss_clip": 1.11136842, "balance_loss_mlp": 1.06490695, "epoch": 0.012625883060273561, "flos": 40113511355520.0, "grad_norm": 9.084733304650118, "language_loss": 0.85514843, "learning_rate": 3.4427468590832302e-06, "loss": 0.88020217, "num_input_tokens_seen": 4441470, "step": 210, "time_per_iteration": 2.888749122619629 }, { "auxiliary_loss_clip": 0.01384384, "auxiliary_loss_mlp": 0.01121559, "balance_loss_clip": 1.11018038, "balance_loss_mlp": 1.07115781, "epoch": 0.01268600631294153, "flos": 27089178629760.0, "grad_norm": 3.431917100192063, "language_loss": 0.97194636, "learning_rate": 3.445805545042314e-06, "loss": 0.99700582, "num_input_tokens_seen": 4459950, "step": 211, "time_per_iteration": 2.7465193271636963 }, { "auxiliary_loss_clip": 0.01393556, "auxiliary_loss_mlp": 0.01123542, "balance_loss_clip": 1.11511767, "balance_loss_mlp": 1.06999326, "epoch": 0.012746129565609499, "flos": 16982767238400.0, "grad_norm": 2.3992368053115163, "language_loss": 0.9508543, "learning_rate": 3.448849769075239e-06, "loss": 0.97602528, "num_input_tokens_seen": 4478390, "step": 212, "time_per_iteration": 2.6340651512145996 }, { "auxiliary_loss_clip": 0.01381697, "auxiliary_loss_mlp": 0.01116386, "balance_loss_clip": 1.112149, "balance_loss_mlp": 1.06381512, "epoch": 0.012806252818277469, "flos": 46533476995200.0, "grad_norm": 1.701444843398511, "language_loss": 0.76078421, "learning_rate": 3.4518796672950093e-06, "loss": 0.78576505, "num_input_tokens_seen": 4501665, "step": 213, "time_per_iteration": 2.9250640869140625 }, { "auxiliary_loss_clip": 0.01385821, "auxiliary_loss_mlp": 0.01111776, "balance_loss_clip": 1.11002433, "balance_loss_mlp": 1.06056333, "epoch": 0.012866376070945438, "flos": 14388616120320.0, "grad_norm": 3.5300370267625922, "language_loss": 0.86698866, "learning_rate": 3.4548953739020187e-06, "loss": 0.89196461, "num_input_tokens_seen": 4519055, "step": 214, "time_per_iteration": 2.645289659500122 }, { "auxiliary_loss_clip": 0.01383455, "auxiliary_loss_mlp": 0.01128262, "balance_loss_clip": 1.1159339, "balance_loss_mlp": 1.07359219, "epoch": 0.012926499323613408, "flos": 26140813793280.0, "grad_norm": 2.14433888305053, "language_loss": 0.77582061, "learning_rate": 3.4578970212197196e-06, "loss": 0.80093777, "num_input_tokens_seen": 4540870, "step": 215, "time_per_iteration": 2.7315175533294678 }, { "auxiliary_loss_clip": 0.01391951, "auxiliary_loss_mlp": 0.01115104, "balance_loss_clip": 1.11440635, "balance_loss_mlp": 1.0638206, "epoch": 0.012986622576281377, "flos": 30117202128000.0, "grad_norm": 2.2964706747038233, "language_loss": 0.90423942, "learning_rate": 3.460884739729461e-06, "loss": 0.92930996, "num_input_tokens_seen": 4560395, "step": 216, "time_per_iteration": 2.724698781967163 }, { "auxiliary_loss_clip": 0.01384729, "auxiliary_loss_mlp": 0.01113374, "balance_loss_clip": 1.10847259, "balance_loss_mlp": 1.06096959, "epoch": 0.013046745828949347, "flos": 13954025468160.0, "grad_norm": 3.60062834696173, "language_loss": 0.93473232, "learning_rate": 3.463858658104523e-06, "loss": 0.95971346, "num_input_tokens_seen": 4575785, "step": 217, "time_per_iteration": 5.762276649475098 }, { "auxiliary_loss_clip": 0.01377712, "auxiliary_loss_mlp": 0.0110874, "balance_loss_clip": 1.10726643, "balance_loss_mlp": 1.05433273, "epoch": 0.013106869081617315, "flos": 17347835116800.0, "grad_norm": 1.943339896357513, "language_loss": 0.93811166, "learning_rate": 3.4668189032433696e-06, "loss": 0.96297616, "num_input_tokens_seen": 4594985, "step": 218, "time_per_iteration": 5.832701206207275 }, { "auxiliary_loss_clip": 0.01372884, "auxiliary_loss_mlp": 0.01106717, "balance_loss_clip": 1.10647273, "balance_loss_mlp": 1.05552888, "epoch": 0.013166992334285284, "flos": 25884914325120.0, "grad_norm": 2.252873600345955, "language_loss": 0.86196327, "learning_rate": 3.46976560030214e-06, "loss": 0.88675928, "num_input_tokens_seen": 4616125, "step": 219, "time_per_iteration": 2.794581651687622 }, { "auxiliary_loss_clip": 0.0137885, "auxiliary_loss_mlp": 0.01102953, "balance_loss_clip": 1.10957599, "balance_loss_mlp": 1.05188394, "epoch": 0.013227115586953254, "flos": 31175956437120.0, "grad_norm": 1.897987121161891, "language_loss": 0.8748548, "learning_rate": 3.4726988727263976e-06, "loss": 0.89967287, "num_input_tokens_seen": 4637795, "step": 220, "time_per_iteration": 2.799927234649658 }, { "auxiliary_loss_clip": 0.01370688, "auxiliary_loss_mlp": 0.01115596, "balance_loss_clip": 1.10440111, "balance_loss_mlp": 1.0679127, "epoch": 0.013287238839621223, "flos": 20409470766720.0, "grad_norm": 3.2557072980071795, "language_loss": 0.86437249, "learning_rate": 3.475618842282164e-06, "loss": 0.88923532, "num_input_tokens_seen": 4656835, "step": 221, "time_per_iteration": 2.7040672302246094 }, { "auxiliary_loss_clip": 0.01376134, "auxiliary_loss_mlp": 0.01116397, "balance_loss_clip": 1.10384834, "balance_loss_mlp": 1.0637064, "epoch": 0.013347362092289193, "flos": 14137134024960.0, "grad_norm": 2.585706849100757, "language_loss": 0.92369294, "learning_rate": 3.4785256290862486e-06, "loss": 0.94861829, "num_input_tokens_seen": 4673015, "step": 222, "time_per_iteration": 2.6648194789886475 }, { "auxiliary_loss_clip": 0.01373283, "auxiliary_loss_mlp": 0.01106423, "balance_loss_clip": 1.10636806, "balance_loss_mlp": 1.05156267, "epoch": 0.013407485344957162, "flos": 21797705554560.0, "grad_norm": 7.739608779999776, "language_loss": 0.95708215, "learning_rate": 3.481419351635897e-06, "loss": 0.98187923, "num_input_tokens_seen": 4692355, "step": 223, "time_per_iteration": 2.7261807918548584 }, { "auxiliary_loss_clip": 0.01374555, "auxiliary_loss_mlp": 0.0110963, "balance_loss_clip": 1.10768425, "balance_loss_mlp": 1.05870414, "epoch": 0.013467608597625132, "flos": 18621622195200.0, "grad_norm": 2.673591615227502, "language_loss": 0.88031876, "learning_rate": 3.484300126837776e-06, "loss": 0.90516055, "num_input_tokens_seen": 4710080, "step": 224, "time_per_iteration": 2.601686477661133 }, { "auxiliary_loss_clip": 0.01374533, "auxiliary_loss_mlp": 0.01103, "balance_loss_clip": 1.10679817, "balance_loss_mlp": 1.04804444, "epoch": 0.013527731850293101, "flos": 18552314903040.0, "grad_norm": 3.0722216996453535, "language_loss": 0.89625597, "learning_rate": 3.487168070036317e-06, "loss": 0.9210313, "num_input_tokens_seen": 4728980, "step": 225, "time_per_iteration": 2.6677513122558594 }, { "auxiliary_loss_clip": 0.01369955, "auxiliary_loss_mlp": 0.0112021, "balance_loss_clip": 1.10561275, "balance_loss_mlp": 1.06675696, "epoch": 0.01358785510296107, "flos": 19165381257600.0, "grad_norm": 1.9576206039109396, "language_loss": 0.98980033, "learning_rate": 3.4900232950414224e-06, "loss": 1.01470196, "num_input_tokens_seen": 4747020, "step": 226, "time_per_iteration": 2.8320930004119873 }, { "auxiliary_loss_clip": 0.01375268, "auxiliary_loss_mlp": 0.01110039, "balance_loss_clip": 1.10837173, "balance_loss_mlp": 1.05572701, "epoch": 0.01364797835562904, "flos": 23329941966720.0, "grad_norm": 2.3303410550109245, "language_loss": 0.90965348, "learning_rate": 3.4928659141555727e-06, "loss": 0.93450654, "num_input_tokens_seen": 4765000, "step": 227, "time_per_iteration": 2.648606061935425 }, { "auxiliary_loss_clip": 0.01255161, "auxiliary_loss_mlp": 0.01079249, "balance_loss_clip": 1.11229861, "balance_loss_mlp": 1.06017554, "epoch": 0.013708101608297009, "flos": 70993746097920.0, "grad_norm": 0.9472069433514878, "language_loss": 0.57650995, "learning_rate": 3.4956960382003234e-06, "loss": 0.59985405, "num_input_tokens_seen": 4833210, "step": 228, "time_per_iteration": 3.246328592300415 }, { "auxiliary_loss_clip": 0.01366835, "auxiliary_loss_mlp": 0.01117377, "balance_loss_clip": 1.10507822, "balance_loss_mlp": 1.06711841, "epoch": 0.013768224860964979, "flos": 16325170997760.0, "grad_norm": 2.957038430634678, "language_loss": 0.87773621, "learning_rate": 3.4985137765422354e-06, "loss": 0.90257835, "num_input_tokens_seen": 4850120, "step": 229, "time_per_iteration": 2.6319024562835693 }, { "auxiliary_loss_clip": 0.01375278, "auxiliary_loss_mlp": 0.01098609, "balance_loss_clip": 1.10567176, "balance_loss_mlp": 1.04873204, "epoch": 0.013828348113632948, "flos": 20193037367040.0, "grad_norm": 4.72663824849547, "language_loss": 0.83937395, "learning_rate": 3.501319237118231e-06, "loss": 0.86411285, "num_input_tokens_seen": 4866215, "step": 230, "time_per_iteration": 2.7026398181915283 }, { "auxiliary_loss_clip": 0.01373544, "auxiliary_loss_mlp": 0.01113683, "balance_loss_clip": 1.10701275, "balance_loss_mlp": 1.06361556, "epoch": 0.013888471366300916, "flos": 20741070147840.0, "grad_norm": 2.2562202151287867, "language_loss": 0.904212, "learning_rate": 3.5041125264604056e-06, "loss": 0.9290843, "num_input_tokens_seen": 4885630, "step": 231, "time_per_iteration": 2.6424474716186523 }, { "auxiliary_loss_clip": 0.01377759, "auxiliary_loss_mlp": 0.01110232, "balance_loss_clip": 1.11118639, "balance_loss_mlp": 1.06030726, "epoch": 0.013948594618968886, "flos": 22090628966400.0, "grad_norm": 2.0229562700819215, "language_loss": 0.83624899, "learning_rate": 3.5068937497203002e-06, "loss": 0.86112887, "num_input_tokens_seen": 4905570, "step": 232, "time_per_iteration": 2.621704339981079 }, { "auxiliary_loss_clip": 0.01377798, "auxiliary_loss_mlp": 0.01094369, "balance_loss_clip": 1.10229027, "balance_loss_mlp": 1.04253721, "epoch": 0.014008717871636855, "flos": 19063108258560.0, "grad_norm": 5.516695444379509, "language_loss": 0.74727643, "learning_rate": 3.509663010692652e-06, "loss": 0.77199805, "num_input_tokens_seen": 4923535, "step": 233, "time_per_iteration": 2.659188747406006 }, { "auxiliary_loss_clip": 0.01382744, "auxiliary_loss_mlp": 0.01125121, "balance_loss_clip": 1.1099937, "balance_loss_mlp": 1.0723356, "epoch": 0.014068841124304825, "flos": 14530822064640.0, "grad_norm": 2.5763093382937483, "language_loss": 0.85633421, "learning_rate": 3.512420411838642e-06, "loss": 0.88141286, "num_input_tokens_seen": 4939200, "step": 234, "time_per_iteration": 2.610635757446289 }, { "auxiliary_loss_clip": 0.01374562, "auxiliary_loss_mlp": 0.01114672, "balance_loss_clip": 1.10890436, "balance_loss_mlp": 1.06467605, "epoch": 0.014128964376972794, "flos": 18077396256000.0, "grad_norm": 2.467487286445388, "language_loss": 0.89192498, "learning_rate": 3.515166054308634e-06, "loss": 0.91681731, "num_input_tokens_seen": 4956620, "step": 235, "time_per_iteration": 2.668769359588623 }, { "auxiliary_loss_clip": 0.01373018, "auxiliary_loss_mlp": 0.01131641, "balance_loss_clip": 1.11011076, "balance_loss_mlp": 1.08073914, "epoch": 0.014189087629640764, "flos": 25334331678720.0, "grad_norm": 2.143165146200321, "language_loss": 0.85535377, "learning_rate": 3.5179000379644498e-06, "loss": 0.88040036, "num_input_tokens_seen": 4975650, "step": 236, "time_per_iteration": 2.7570323944091797 }, { "auxiliary_loss_clip": 0.01369632, "auxiliary_loss_mlp": 0.01100269, "balance_loss_clip": 1.10296702, "balance_loss_mlp": 1.04905629, "epoch": 0.014249210882308733, "flos": 36139744713600.0, "grad_norm": 2.1351980688483136, "language_loss": 0.82550979, "learning_rate": 3.520622461401154e-06, "loss": 0.85020876, "num_input_tokens_seen": 4997415, "step": 237, "time_per_iteration": 2.811617374420166 }, { "auxiliary_loss_clip": 0.01369728, "auxiliary_loss_mlp": 0.01124352, "balance_loss_clip": 1.10659075, "balance_loss_mlp": 1.07085085, "epoch": 0.014309334134976702, "flos": 12932977461120.0, "grad_norm": 2.0241581748099313, "language_loss": 0.77096599, "learning_rate": 3.5233334219683935e-06, "loss": 0.79590684, "num_input_tokens_seen": 5013905, "step": 238, "time_per_iteration": 2.8044662475585938 }, { "auxiliary_loss_clip": 0.01367496, "auxiliary_loss_mlp": 0.01111406, "balance_loss_clip": 1.10897434, "balance_loss_mlp": 1.06343579, "epoch": 0.014369457387644672, "flos": 20777519473920.0, "grad_norm": 1.8300428555870456, "language_loss": 0.8707583, "learning_rate": 3.526033015791284e-06, "loss": 0.89554727, "num_input_tokens_seen": 5033645, "step": 239, "time_per_iteration": 2.681452751159668 }, { "auxiliary_loss_clip": 0.01353036, "auxiliary_loss_mlp": 0.01103184, "balance_loss_clip": 1.10036874, "balance_loss_mlp": 1.05516672, "epoch": 0.01442958064031264, "flos": 25848536826240.0, "grad_norm": 2.109315431148974, "language_loss": 0.93055749, "learning_rate": 3.528721337790862e-06, "loss": 0.95511973, "num_input_tokens_seen": 5052875, "step": 240, "time_per_iteration": 2.679826021194458 }, { "auxiliary_loss_clip": 0.01360794, "auxiliary_loss_mlp": 0.01103084, "balance_loss_clip": 1.10475957, "balance_loss_mlp": 1.05611515, "epoch": 0.014489703892980611, "flos": 28219718269440.0, "grad_norm": 3.7136133710916575, "language_loss": 0.8482846, "learning_rate": 3.531398481704111e-06, "loss": 0.87292337, "num_input_tokens_seen": 5075005, "step": 241, "time_per_iteration": 2.679126262664795 }, { "auxiliary_loss_clip": 0.01359518, "auxiliary_loss_mlp": 0.01119602, "balance_loss_clip": 1.11010456, "balance_loss_mlp": 1.06931913, "epoch": 0.01454982714564858, "flos": 22490925108480.0, "grad_norm": 1.8502491938168453, "language_loss": 0.88590866, "learning_rate": 3.534064540103573e-06, "loss": 0.9106999, "num_input_tokens_seen": 5091875, "step": 242, "time_per_iteration": 2.7366583347320557 }, { "auxiliary_loss_clip": 0.01359534, "auxiliary_loss_mlp": 0.01104713, "balance_loss_clip": 1.10356677, "balance_loss_mlp": 1.05342889, "epoch": 0.014609950398316548, "flos": 21653201139840.0, "grad_norm": 2.261458758817042, "language_loss": 0.86688942, "learning_rate": 3.536719604416555e-06, "loss": 0.89153194, "num_input_tokens_seen": 5111290, "step": 243, "time_per_iteration": 2.764378070831299 }, { "auxiliary_loss_clip": 0.01364897, "auxiliary_loss_mlp": 0.01106776, "balance_loss_clip": 1.10636568, "balance_loss_mlp": 1.05656552, "epoch": 0.014670073650984519, "flos": 21869993675520.0, "grad_norm": 1.6964959858678799, "language_loss": 0.84256208, "learning_rate": 3.5393637649439464e-06, "loss": 0.86727887, "num_input_tokens_seen": 5132265, "step": 244, "time_per_iteration": 2.630441188812256 }, { "auxiliary_loss_clip": 0.01372266, "auxiliary_loss_mlp": 0.01115072, "balance_loss_clip": 1.10771632, "balance_loss_mlp": 1.06328762, "epoch": 0.014730196903652487, "flos": 23183713699200.0, "grad_norm": 8.49550264430495, "language_loss": 0.78613877, "learning_rate": 3.54199711087864e-06, "loss": 0.81101215, "num_input_tokens_seen": 5148575, "step": 245, "time_per_iteration": 2.6991443634033203 }, { "auxiliary_loss_clip": 0.01371598, "auxiliary_loss_mlp": 0.0110404, "balance_loss_clip": 1.10405719, "balance_loss_mlp": 1.05008554, "epoch": 0.014790320156320457, "flos": 23222605150080.0, "grad_norm": 2.2582939339926305, "language_loss": 0.84165329, "learning_rate": 3.5446197303235913e-06, "loss": 0.86640966, "num_input_tokens_seen": 5170415, "step": 246, "time_per_iteration": 2.726743221282959 }, { "auxiliary_loss_clip": 0.01365538, "auxiliary_loss_mlp": 0.01101456, "balance_loss_clip": 1.10242295, "balance_loss_mlp": 1.05062532, "epoch": 0.014850443408988426, "flos": 15815490963840.0, "grad_norm": 1.9870849133800452, "language_loss": 0.89958012, "learning_rate": 3.5472317103095034e-06, "loss": 0.92425001, "num_input_tokens_seen": 5188565, "step": 247, "time_per_iteration": 2.5998406410217285 }, { "auxiliary_loss_clip": 0.01364581, "auxiliary_loss_mlp": 0.01098108, "balance_loss_clip": 1.09896278, "balance_loss_mlp": 1.0489223, "epoch": 0.014910566661656396, "flos": 22781657790720.0, "grad_norm": 2.0527635487774343, "language_loss": 0.783005, "learning_rate": 3.549833136812155e-06, "loss": 0.80763197, "num_input_tokens_seen": 5207810, "step": 248, "time_per_iteration": 2.689784049987793 }, { "auxiliary_loss_clip": 0.01365896, "auxiliary_loss_mlp": 0.01110511, "balance_loss_clip": 1.10732806, "balance_loss_mlp": 1.06044269, "epoch": 0.014970689914324365, "flos": 26865023806080.0, "grad_norm": 1.9405946352322343, "language_loss": 0.83855766, "learning_rate": 3.552424094769381e-06, "loss": 0.86332172, "num_input_tokens_seen": 5226210, "step": 249, "time_per_iteration": 2.8210339546203613 }, { "auxiliary_loss_clip": 0.01358179, "auxiliary_loss_mlp": 0.01106801, "balance_loss_clip": 1.10089588, "balance_loss_mlp": 1.05802023, "epoch": 0.015030813166992334, "flos": 13985662371840.0, "grad_norm": 2.0689026358419786, "language_loss": 0.93631709, "learning_rate": 3.5550046680977174e-06, "loss": 0.96096689, "num_input_tokens_seen": 5241660, "step": 250, "time_per_iteration": 2.7074570655822754 }, { "auxiliary_loss_clip": 0.01368183, "auxiliary_loss_mlp": 0.01115393, "balance_loss_clip": 1.1065619, "balance_loss_mlp": 1.06415713, "epoch": 0.015090936419660304, "flos": 24717817618560.0, "grad_norm": 2.6509740932573127, "language_loss": 0.9678722, "learning_rate": 3.5575749397087034e-06, "loss": 0.99270797, "num_input_tokens_seen": 5261090, "step": 251, "time_per_iteration": 2.6740176677703857 }, { "auxiliary_loss_clip": 0.01361249, "auxiliary_loss_mlp": 0.01108489, "balance_loss_clip": 1.10063529, "balance_loss_mlp": 1.0597558, "epoch": 0.015151059672328273, "flos": 25738793798400.0, "grad_norm": 1.996044018630987, "language_loss": 0.84516245, "learning_rate": 3.5601349915248707e-06, "loss": 0.86985981, "num_input_tokens_seen": 5279175, "step": 252, "time_per_iteration": 2.7198123931884766 }, { "auxiliary_loss_clip": 0.01356789, "auxiliary_loss_mlp": 0.0111346, "balance_loss_clip": 1.1023767, "balance_loss_mlp": 1.06346345, "epoch": 0.015211182924996243, "flos": 21871214737920.0, "grad_norm": 2.3132428526475275, "language_loss": 0.98516917, "learning_rate": 3.5626849044954064e-06, "loss": 1.0098716, "num_input_tokens_seen": 5296975, "step": 253, "time_per_iteration": 2.6751561164855957 }, { "auxiliary_loss_clip": 0.01244193, "auxiliary_loss_mlp": 0.01100072, "balance_loss_clip": 1.1058414, "balance_loss_mlp": 1.08338308, "epoch": 0.015271306177664212, "flos": 66895080888960.0, "grad_norm": 0.8719135194962525, "language_loss": 0.55628473, "learning_rate": 3.5652247586115167e-06, "loss": 0.57972741, "num_input_tokens_seen": 5358375, "step": 254, "time_per_iteration": 3.2305996417999268 }, { "auxiliary_loss_clip": 0.0136146, "auxiliary_loss_mlp": 0.01119692, "balance_loss_clip": 1.0985806, "balance_loss_mlp": 1.06952846, "epoch": 0.01533142943033218, "flos": 26834069260800.0, "grad_norm": 2.113472843461701, "language_loss": 0.90234184, "learning_rate": 3.567754632921479e-06, "loss": 0.92715329, "num_input_tokens_seen": 5377255, "step": 255, "time_per_iteration": 2.7138473987579346 }, { "auxiliary_loss_clip": 0.01357311, "auxiliary_loss_mlp": 0.01137867, "balance_loss_clip": 1.1001389, "balance_loss_mlp": 1.08803785, "epoch": 0.01539155268300015, "flos": 20813753318400.0, "grad_norm": 2.320838285045027, "language_loss": 0.85392761, "learning_rate": 3.5702746055454075e-06, "loss": 0.87887937, "num_input_tokens_seen": 5395320, "step": 256, "time_per_iteration": 2.7135775089263916 }, { "auxiliary_loss_clip": 0.01363873, "auxiliary_loss_mlp": 0.0112257, "balance_loss_clip": 1.10053098, "balance_loss_mlp": 1.07281172, "epoch": 0.01545167593566812, "flos": 15961862885760.0, "grad_norm": 4.480294478847577, "language_loss": 0.71472508, "learning_rate": 3.5727847536897254e-06, "loss": 0.73958945, "num_input_tokens_seen": 5411970, "step": 257, "time_per_iteration": 6.340675592422485 }, { "auxiliary_loss_clip": 0.01355912, "auxiliary_loss_mlp": 0.01112611, "balance_loss_clip": 1.10014856, "balance_loss_mlp": 1.06280565, "epoch": 0.01551179918833609, "flos": 22601745544320.0, "grad_norm": 2.0292888191897673, "language_loss": 0.94713151, "learning_rate": 3.5752851536613596e-06, "loss": 0.97181678, "num_input_tokens_seen": 5430245, "step": 258, "time_per_iteration": 5.674164772033691 }, { "auxiliary_loss_clip": 0.01356656, "auxiliary_loss_mlp": 0.01113313, "balance_loss_clip": 1.09867072, "balance_loss_mlp": 1.0645566, "epoch": 0.015571922441004058, "flos": 22816706486400.0, "grad_norm": 2.3215886633849236, "language_loss": 0.93037683, "learning_rate": 3.577775880881658e-06, "loss": 0.95507646, "num_input_tokens_seen": 5448905, "step": 259, "time_per_iteration": 2.6286497116088867 }, { "auxiliary_loss_clip": 0.01348977, "auxiliary_loss_mlp": 0.01102171, "balance_loss_clip": 1.10076857, "balance_loss_mlp": 1.05625176, "epoch": 0.015632045693672027, "flos": 18947439486720.0, "grad_norm": 1.9575053933526474, "language_loss": 0.97368109, "learning_rate": 3.5802570099000424e-06, "loss": 0.99819261, "num_input_tokens_seen": 5466405, "step": 260, "time_per_iteration": 2.625072717666626 }, { "auxiliary_loss_clip": 0.01362999, "auxiliary_loss_mlp": 0.01127943, "balance_loss_clip": 1.1010474, "balance_loss_mlp": 1.07940137, "epoch": 0.015692168946339995, "flos": 29971728046080.0, "grad_norm": 2.2828802632863305, "language_loss": 0.87807435, "learning_rate": 3.5827286144073947e-06, "loss": 0.90298378, "num_input_tokens_seen": 5487055, "step": 261, "time_per_iteration": 2.6737279891967773 }, { "auxiliary_loss_clip": 0.01357008, "auxiliary_loss_mlp": 0.01125312, "balance_loss_clip": 1.09822345, "balance_loss_mlp": 1.07665133, "epoch": 0.015752292199007967, "flos": 19392085946880.0, "grad_norm": 5.057676675675106, "language_loss": 0.67100549, "learning_rate": 3.5851907672491904e-06, "loss": 0.69582868, "num_input_tokens_seen": 5506600, "step": 262, "time_per_iteration": 2.651690721511841 }, { "auxiliary_loss_clip": 0.01353953, "auxiliary_loss_mlp": 0.01135541, "balance_loss_clip": 1.09924924, "balance_loss_mlp": 1.08499634, "epoch": 0.015812415451675936, "flos": 20339804338560.0, "grad_norm": 3.0820356667611337, "language_loss": 0.68077701, "learning_rate": 3.587643540438383e-06, "loss": 0.70567191, "num_input_tokens_seen": 5524350, "step": 263, "time_per_iteration": 2.6885130405426025 }, { "auxiliary_loss_clip": 0.01355592, "auxiliary_loss_mlp": 0.01116799, "balance_loss_clip": 1.09620881, "balance_loss_mlp": 1.06766081, "epoch": 0.015872538704343905, "flos": 17525412979200.0, "grad_norm": 3.9089218881424674, "language_loss": 0.85002583, "learning_rate": 3.590087005168037e-06, "loss": 0.87474978, "num_input_tokens_seen": 5542145, "step": 264, "time_per_iteration": 2.6557912826538086 }, { "auxiliary_loss_clip": 0.01360388, "auxiliary_loss_mlp": 0.01102763, "balance_loss_clip": 1.10088885, "balance_loss_mlp": 1.056319, "epoch": 0.015932661957011873, "flos": 15260490944640.0, "grad_norm": 2.7020928553211476, "language_loss": 1.04234743, "learning_rate": 3.5925212318237344e-06, "loss": 1.06697881, "num_input_tokens_seen": 5557920, "step": 265, "time_per_iteration": 2.6262216567993164 }, { "auxiliary_loss_clip": 0.01364512, "auxiliary_loss_mlp": 0.01120309, "balance_loss_clip": 1.1033864, "balance_loss_mlp": 1.06835794, "epoch": 0.015992785209679845, "flos": 20302528999680.0, "grad_norm": 3.1220748516520134, "language_loss": 0.74914098, "learning_rate": 3.5949462899957323e-06, "loss": 0.7739892, "num_input_tokens_seen": 5576290, "step": 266, "time_per_iteration": 2.6244583129882812 }, { "auxiliary_loss_clip": 0.01349738, "auxiliary_loss_mlp": 0.0111189, "balance_loss_clip": 1.1000762, "balance_loss_mlp": 1.06206095, "epoch": 0.016052908462347814, "flos": 23362368969600.0, "grad_norm": 1.8166776194063956, "language_loss": 0.90909529, "learning_rate": 3.5973622484909068e-06, "loss": 0.93371153, "num_input_tokens_seen": 5595205, "step": 267, "time_per_iteration": 2.6753580570220947 }, { "auxiliary_loss_clip": 0.01359091, "auxiliary_loss_mlp": 0.01115968, "balance_loss_clip": 1.10122573, "balance_loss_mlp": 1.06797481, "epoch": 0.016113031715015783, "flos": 21286588976640.0, "grad_norm": 2.450608875877181, "language_loss": 0.85636413, "learning_rate": 3.599769175344462e-06, "loss": 0.88111478, "num_input_tokens_seen": 5612645, "step": 268, "time_per_iteration": 2.7161567211151123 }, { "auxiliary_loss_clip": 0.01351132, "auxiliary_loss_mlp": 0.01102276, "balance_loss_clip": 1.10226274, "balance_loss_mlp": 1.05475891, "epoch": 0.01617315496768375, "flos": 18914689261440.0, "grad_norm": 2.1714201716772457, "language_loss": 0.88080788, "learning_rate": 3.602167137831432e-06, "loss": 0.90534198, "num_input_tokens_seen": 5628345, "step": 269, "time_per_iteration": 2.6403756141662598 }, { "auxiliary_loss_clip": 0.01357907, "auxiliary_loss_mlp": 0.01111574, "balance_loss_clip": 1.10001528, "balance_loss_mlp": 1.06021833, "epoch": 0.01623327822035172, "flos": 16546488647040.0, "grad_norm": 2.5848702107942803, "language_loss": 0.97077739, "learning_rate": 3.6045562024779565e-06, "loss": 0.99547219, "num_input_tokens_seen": 5645940, "step": 270, "time_per_iteration": 2.635546922683716 }, { "auxiliary_loss_clip": 0.01356007, "auxiliary_loss_mlp": 0.01118132, "balance_loss_clip": 1.10402, "balance_loss_mlp": 1.06918478, "epoch": 0.016293401473019692, "flos": 23513481486720.0, "grad_norm": 2.1115750591463223, "language_loss": 0.86112005, "learning_rate": 3.606936435072361e-06, "loss": 0.8858614, "num_input_tokens_seen": 5665690, "step": 271, "time_per_iteration": 2.6877286434173584 }, { "auxiliary_loss_clip": 0.013537, "auxiliary_loss_mlp": 0.01105687, "balance_loss_clip": 1.0962286, "balance_loss_mlp": 1.057693, "epoch": 0.01635352472568766, "flos": 29016072748800.0, "grad_norm": 2.5391912683658413, "language_loss": 0.81550127, "learning_rate": 3.609307900676025e-06, "loss": 0.84009504, "num_input_tokens_seen": 5683190, "step": 272, "time_per_iteration": 2.6728365421295166 }, { "auxiliary_loss_clip": 0.01348527, "auxiliary_loss_mlp": 0.01120864, "balance_loss_clip": 1.09806561, "balance_loss_mlp": 1.07368064, "epoch": 0.01641364797835563, "flos": 13370513028480.0, "grad_norm": 2.3613573538590487, "language_loss": 0.81075382, "learning_rate": 3.611670663634051e-06, "loss": 0.83544779, "num_input_tokens_seen": 5699780, "step": 273, "time_per_iteration": 2.595008134841919 }, { "auxiliary_loss_clip": 0.01346135, "auxiliary_loss_mlp": 0.01105539, "balance_loss_clip": 1.09398317, "balance_loss_mlp": 1.05749762, "epoch": 0.016473771231023598, "flos": 18878239935360.0, "grad_norm": 2.1979313648400547, "language_loss": 0.9131726, "learning_rate": 3.614024787585744e-06, "loss": 0.9376893, "num_input_tokens_seen": 5716980, "step": 274, "time_per_iteration": 2.684718132019043 }, { "auxiliary_loss_clip": 0.013432, "auxiliary_loss_mlp": 0.01108715, "balance_loss_clip": 1.09515727, "balance_loss_mlp": 1.06062579, "epoch": 0.016533894483691566, "flos": 22601637803520.0, "grad_norm": 1.9719932168994616, "language_loss": 0.88054645, "learning_rate": 3.6163703354748927e-06, "loss": 0.90506566, "num_input_tokens_seen": 5737780, "step": 275, "time_per_iteration": 2.7204532623291016 }, { "auxiliary_loss_clip": 0.01346726, "auxiliary_loss_mlp": 0.01102856, "balance_loss_clip": 1.09623361, "balance_loss_mlp": 1.05312169, "epoch": 0.01659401773635954, "flos": 21507188353920.0, "grad_norm": 1.7930545784536995, "language_loss": 0.80726624, "learning_rate": 3.6187073695598707e-06, "loss": 0.83176208, "num_input_tokens_seen": 5758330, "step": 276, "time_per_iteration": 3.04716157913208 }, { "auxiliary_loss_clip": 0.0133817, "auxiliary_loss_mlp": 0.01096103, "balance_loss_clip": 1.09588337, "balance_loss_mlp": 1.05220985, "epoch": 0.016654140989027507, "flos": 32850973411200.0, "grad_norm": 1.9196343116615175, "language_loss": 0.80707026, "learning_rate": 3.621035951423551e-06, "loss": 0.83141291, "num_input_tokens_seen": 5778340, "step": 277, "time_per_iteration": 2.809645652770996 }, { "auxiliary_loss_clip": 0.01337061, "auxiliary_loss_mlp": 0.0109637, "balance_loss_clip": 1.08979487, "balance_loss_mlp": 1.04923487, "epoch": 0.016714264241695476, "flos": 12306228024960.0, "grad_norm": 2.3224792061881185, "language_loss": 0.80508065, "learning_rate": 3.623356141983041e-06, "loss": 0.82941496, "num_input_tokens_seen": 5794295, "step": 278, "time_per_iteration": 2.604830741882324 }, { "auxiliary_loss_clip": 0.01341116, "auxiliary_loss_mlp": 0.01101968, "balance_loss_clip": 1.09395671, "balance_loss_mlp": 1.05585837, "epoch": 0.016774387494363444, "flos": 27123796362240.0, "grad_norm": 2.0021377353660057, "language_loss": 0.90582991, "learning_rate": 3.6256680014992486e-06, "loss": 0.93026078, "num_input_tokens_seen": 5814405, "step": 279, "time_per_iteration": 2.7193243503570557 }, { "auxiliary_loss_clip": 0.01346095, "auxiliary_loss_mlp": 0.01112065, "balance_loss_clip": 1.09383631, "balance_loss_mlp": 1.06450009, "epoch": 0.016834510747031413, "flos": 20191493082240.0, "grad_norm": 2.9314445951013988, "language_loss": 0.94049025, "learning_rate": 3.6279715895862713e-06, "loss": 0.96507192, "num_input_tokens_seen": 5832795, "step": 280, "time_per_iteration": 2.680924654006958 }, { "auxiliary_loss_clip": 0.01346658, "auxiliary_loss_mlp": 0.01109166, "balance_loss_clip": 1.09285879, "balance_loss_mlp": 1.06060064, "epoch": 0.016894633999699385, "flos": 27274262434560.0, "grad_norm": 2.6758913403282483, "language_loss": 0.74425459, "learning_rate": 3.6302669652206183e-06, "loss": 0.76881289, "num_input_tokens_seen": 5855750, "step": 281, "time_per_iteration": 2.691152811050415 }, { "auxiliary_loss_clip": 0.01343371, "auxiliary_loss_mlp": 0.01117708, "balance_loss_clip": 1.09609079, "balance_loss_mlp": 1.0724318, "epoch": 0.016954757252367354, "flos": 14902964922240.0, "grad_norm": 3.4878028680462005, "language_loss": 0.80255079, "learning_rate": 3.632554186750274e-06, "loss": 0.82716167, "num_input_tokens_seen": 5872610, "step": 282, "time_per_iteration": 2.592664957046509 }, { "auxiliary_loss_clip": 0.01348082, "auxiliary_loss_mlp": 0.01118449, "balance_loss_clip": 1.09700727, "balance_loss_mlp": 1.07114697, "epoch": 0.017014880505035322, "flos": 21358805270400.0, "grad_norm": 2.296781711700251, "language_loss": 0.77719986, "learning_rate": 3.6348333119035937e-06, "loss": 0.80186516, "num_input_tokens_seen": 5892985, "step": 283, "time_per_iteration": 2.6502227783203125 }, { "auxiliary_loss_clip": 0.01347311, "auxiliary_loss_mlp": 0.01092934, "balance_loss_clip": 1.0977478, "balance_loss_mlp": 1.04804015, "epoch": 0.01707500375770329, "flos": 35333154858240.0, "grad_norm": 2.3467060832193414, "language_loss": 0.84246969, "learning_rate": 3.6371043977980503e-06, "loss": 0.86687213, "num_input_tokens_seen": 5914060, "step": 284, "time_per_iteration": 2.8534958362579346 }, { "auxiliary_loss_clip": 0.01337962, "auxiliary_loss_mlp": 0.01100399, "balance_loss_clip": 1.09212708, "balance_loss_mlp": 1.05297756, "epoch": 0.01713512701037126, "flos": 23582070506880.0, "grad_norm": 2.7335752956200388, "language_loss": 0.96998906, "learning_rate": 3.639367500948819e-06, "loss": 0.99437273, "num_input_tokens_seen": 5932860, "step": 285, "time_per_iteration": 2.6338655948638916 }, { "auxiliary_loss_clip": 0.01341319, "auxiliary_loss_mlp": 0.01095606, "balance_loss_clip": 1.09538078, "balance_loss_mlp": 1.05123687, "epoch": 0.01719525026303923, "flos": 27634661544960.0, "grad_norm": 2.294843469150046, "language_loss": 0.94079655, "learning_rate": 3.6416226772772178e-06, "loss": 0.96516573, "num_input_tokens_seen": 5952725, "step": 286, "time_per_iteration": 2.711087942123413 }, { "auxiliary_loss_clip": 0.01332862, "auxiliary_loss_mlp": 0.0109035, "balance_loss_clip": 1.08986938, "balance_loss_mlp": 1.04409683, "epoch": 0.0172553735157072, "flos": 26979722910720.0, "grad_norm": 1.9277896882465477, "language_loss": 0.92464817, "learning_rate": 3.643869982119001e-06, "loss": 0.94888031, "num_input_tokens_seen": 5970560, "step": 287, "time_per_iteration": 2.640267848968506 }, { "auxiliary_loss_clip": 0.01338192, "auxiliary_loss_mlp": 0.01092315, "balance_loss_clip": 1.09039164, "balance_loss_mlp": 1.04651475, "epoch": 0.01731549676837517, "flos": 14056621689600.0, "grad_norm": 2.7883535936791035, "language_loss": 1.01873291, "learning_rate": 3.646109470232502e-06, "loss": 1.04303789, "num_input_tokens_seen": 5982980, "step": 288, "time_per_iteration": 2.558312177658081 }, { "auxiliary_loss_clip": 0.01225082, "auxiliary_loss_mlp": 0.01188305, "balance_loss_clip": 1.09194219, "balance_loss_mlp": 1.17228377, "epoch": 0.017375620021043137, "flos": 66510694471680.0, "grad_norm": 0.9289960013542303, "language_loss": 0.63867617, "learning_rate": 3.6483411958066417e-06, "loss": 0.66281009, "num_input_tokens_seen": 6049445, "step": 289, "time_per_iteration": 3.386254072189331 }, { "auxiliary_loss_clip": 0.01341215, "auxiliary_loss_mlp": 0.01107788, "balance_loss_clip": 1.09622383, "balance_loss_mlp": 1.06482446, "epoch": 0.01743574327371111, "flos": 15225154940160.0, "grad_norm": 2.368974734045724, "language_loss": 0.88156199, "learning_rate": 3.6505652124687957e-06, "loss": 0.90605205, "num_input_tokens_seen": 6064150, "step": 290, "time_per_iteration": 2.5670948028564453 }, { "auxiliary_loss_clip": 0.0133848, "auxiliary_loss_mlp": 0.010946, "balance_loss_clip": 1.09388971, "balance_loss_mlp": 1.04965782, "epoch": 0.017495866526379078, "flos": 25373869574400.0, "grad_norm": 2.2011772664145504, "language_loss": 0.84472585, "learning_rate": 3.6527815732925258e-06, "loss": 0.8690567, "num_input_tokens_seen": 6083920, "step": 291, "time_per_iteration": 2.648452043533325 }, { "auxiliary_loss_clip": 0.01343563, "auxiliary_loss_mlp": 0.01115116, "balance_loss_clip": 1.10129941, "balance_loss_mlp": 1.06607366, "epoch": 0.017555989779047047, "flos": 26359473836160.0, "grad_norm": 1.7675259544479762, "language_loss": 0.72679955, "learning_rate": 3.6549903308051806e-06, "loss": 0.75138628, "num_input_tokens_seen": 6105460, "step": 292, "time_per_iteration": 2.7239537239074707 }, { "auxiliary_loss_clip": 0.01334066, "auxiliary_loss_mlp": 0.01107289, "balance_loss_clip": 1.09397244, "balance_loss_mlp": 1.06170392, "epoch": 0.017616113031715015, "flos": 22338807010560.0, "grad_norm": 2.419616990787406, "language_loss": 0.86866581, "learning_rate": 3.6571915369953646e-06, "loss": 0.89307928, "num_input_tokens_seen": 6122890, "step": 293, "time_per_iteration": 2.642854690551758 }, { "auxiliary_loss_clip": 0.01333726, "auxiliary_loss_mlp": 0.0110557, "balance_loss_clip": 1.09271646, "balance_loss_mlp": 1.06086659, "epoch": 0.017676236284382984, "flos": 20156911263360.0, "grad_norm": 2.112624444766753, "language_loss": 0.80896151, "learning_rate": 3.6593852433202797e-06, "loss": 0.83335447, "num_input_tokens_seen": 6142890, "step": 294, "time_per_iteration": 2.598176956176758 }, { "auxiliary_loss_clip": 0.01334179, "auxiliary_loss_mlp": 0.01113433, "balance_loss_clip": 1.09030747, "balance_loss_mlp": 1.06892014, "epoch": 0.017736359537050956, "flos": 25223331674880.0, "grad_norm": 2.8289841764142416, "language_loss": 0.83806521, "learning_rate": 3.6615715007129453e-06, "loss": 0.86254132, "num_input_tokens_seen": 6162030, "step": 295, "time_per_iteration": 2.750103712081909 }, { "auxiliary_loss_clip": 0.01339845, "auxiliary_loss_mlp": 0.01121984, "balance_loss_clip": 1.09978509, "balance_loss_mlp": 1.0772326, "epoch": 0.017796482789718925, "flos": 20338798757760.0, "grad_norm": 1.8804378237246864, "language_loss": 0.84576106, "learning_rate": 3.6637503595892897e-06, "loss": 0.87037927, "num_input_tokens_seen": 6180540, "step": 296, "time_per_iteration": 4.154251337051392 }, { "auxiliary_loss_clip": 0.01337678, "auxiliary_loss_mlp": 0.01105295, "balance_loss_clip": 1.09463406, "balance_loss_mlp": 1.06154561, "epoch": 0.017856606042386893, "flos": 22379206832640.0, "grad_norm": 2.055710812588959, "language_loss": 0.87810111, "learning_rate": 3.665921869855132e-06, "loss": 0.90253091, "num_input_tokens_seen": 6199425, "step": 297, "time_per_iteration": 4.379676103591919 }, { "auxiliary_loss_clip": 0.0133717, "auxiliary_loss_mlp": 0.01103766, "balance_loss_clip": 1.09343684, "balance_loss_mlp": 1.06004047, "epoch": 0.017916729295054862, "flos": 20230061310720.0, "grad_norm": 2.689351030321763, "language_loss": 0.88947791, "learning_rate": 3.6680860809130346e-06, "loss": 0.91388726, "num_input_tokens_seen": 6219170, "step": 298, "time_per_iteration": 4.1055779457092285 }, { "auxiliary_loss_clip": 0.01333843, "auxiliary_loss_mlp": 0.01121179, "balance_loss_clip": 1.09470236, "balance_loss_mlp": 1.07499719, "epoch": 0.01797685254772283, "flos": 19390972625280.0, "grad_norm": 1.8935027270905305, "language_loss": 0.88550889, "learning_rate": 3.6702430416690516e-06, "loss": 0.91005915, "num_input_tokens_seen": 6237930, "step": 299, "time_per_iteration": 2.611168622970581 }, { "auxiliary_loss_clip": 0.0133938, "auxiliary_loss_mlp": 0.0110718, "balance_loss_clip": 1.09468794, "balance_loss_mlp": 1.06130886, "epoch": 0.018036975800390802, "flos": 24426007528320.0, "grad_norm": 4.075580609786654, "language_loss": 0.64664406, "learning_rate": 3.672392800539357e-06, "loss": 0.67110968, "num_input_tokens_seen": 6257170, "step": 300, "time_per_iteration": 2.645603656768799 }, { "auxiliary_loss_clip": 0.01338559, "auxiliary_loss_mlp": 0.01111665, "balance_loss_clip": 1.09775913, "balance_loss_mlp": 1.06636548, "epoch": 0.01809709905305877, "flos": 15778933896960.0, "grad_norm": 2.5071418214687515, "language_loss": 0.87940675, "learning_rate": 3.6745354054567686e-06, "loss": 0.90390897, "num_input_tokens_seen": 6274780, "step": 301, "time_per_iteration": 2.6035923957824707 }, { "auxiliary_loss_clip": 0.01238361, "auxiliary_loss_mlp": 0.01073699, "balance_loss_clip": 1.1100142, "balance_loss_mlp": 1.05901265, "epoch": 0.01815722230572674, "flos": 67348382526720.0, "grad_norm": 0.8350739260664176, "language_loss": 0.62219667, "learning_rate": 3.676670903877158e-06, "loss": 0.64531732, "num_input_tokens_seen": 6340435, "step": 302, "time_per_iteration": 3.3307297229766846 }, { "auxiliary_loss_clip": 0.0132981, "auxiliary_loss_mlp": 0.01110918, "balance_loss_clip": 1.0910126, "balance_loss_mlp": 1.06507051, "epoch": 0.01821734555839471, "flos": 15485615435520.0, "grad_norm": 2.115144575016314, "language_loss": 0.89737153, "learning_rate": 3.6787993427857567e-06, "loss": 0.9217788, "num_input_tokens_seen": 6358160, "step": 303, "time_per_iteration": 2.6773293018341064 }, { "auxiliary_loss_clip": 0.01335628, "auxiliary_loss_mlp": 0.01118481, "balance_loss_clip": 1.09579217, "balance_loss_mlp": 1.07237101, "epoch": 0.018277468811062677, "flos": 24097424889600.0, "grad_norm": 1.8670669350935472, "language_loss": 0.80417514, "learning_rate": 3.680920768703364e-06, "loss": 0.82871628, "num_input_tokens_seen": 6378485, "step": 304, "time_per_iteration": 2.691347360610962 }, { "auxiliary_loss_clip": 0.01330802, "auxiliary_loss_mlp": 0.01091671, "balance_loss_clip": 1.09832263, "balance_loss_mlp": 1.04858923, "epoch": 0.01833759206373065, "flos": 20959335141120.0, "grad_norm": 1.6863564291935742, "language_loss": 0.82761526, "learning_rate": 3.6830352276924415e-06, "loss": 0.85184002, "num_input_tokens_seen": 6397845, "step": 305, "time_per_iteration": 2.6883981227874756 }, { "auxiliary_loss_clip": 0.01330759, "auxiliary_loss_mlp": 0.01093908, "balance_loss_clip": 1.09012437, "balance_loss_mlp": 1.05115986, "epoch": 0.018397715316398618, "flos": 19390757143680.0, "grad_norm": 2.1780708917523297, "language_loss": 0.91148543, "learning_rate": 3.685142765363119e-06, "loss": 0.93573213, "num_input_tokens_seen": 6416475, "step": 306, "time_per_iteration": 2.6465187072753906 }, { "auxiliary_loss_clip": 0.01324743, "auxiliary_loss_mlp": 0.01091696, "balance_loss_clip": 1.08900762, "balance_loss_mlp": 1.04882836, "epoch": 0.018457838569066586, "flos": 29132531619840.0, "grad_norm": 3.4680205003751072, "language_loss": 0.86581063, "learning_rate": 3.687243426879095e-06, "loss": 0.88997507, "num_input_tokens_seen": 6437520, "step": 307, "time_per_iteration": 2.7787318229675293 }, { "auxiliary_loss_clip": 0.01326572, "auxiliary_loss_mlp": 0.01110018, "balance_loss_clip": 1.09346747, "balance_loss_mlp": 1.06247783, "epoch": 0.018517961821734555, "flos": 19208654167680.0, "grad_norm": 2.413130156754219, "language_loss": 0.71650648, "learning_rate": 3.6893372569634466e-06, "loss": 0.74087244, "num_input_tokens_seen": 6455680, "step": 308, "time_per_iteration": 2.652973175048828 }, { "auxiliary_loss_clip": 0.01331912, "auxiliary_loss_mlp": 0.01102766, "balance_loss_clip": 1.09061241, "balance_loss_mlp": 1.05911207, "epoch": 0.018578085074402523, "flos": 19863018184320.0, "grad_norm": 2.1869498369051077, "language_loss": 0.91841364, "learning_rate": 3.6914242999043395e-06, "loss": 0.94276047, "num_input_tokens_seen": 6474880, "step": 309, "time_per_iteration": 2.6613030433654785 }, { "auxiliary_loss_clip": 0.01339178, "auxiliary_loss_mlp": 0.01096668, "balance_loss_clip": 1.09145641, "balance_loss_mlp": 1.05084395, "epoch": 0.018638208327070496, "flos": 29606947476480.0, "grad_norm": 2.0400456475786353, "language_loss": 0.72784412, "learning_rate": 3.69350459956065e-06, "loss": 0.75220263, "num_input_tokens_seen": 6495945, "step": 310, "time_per_iteration": 2.705345392227173 }, { "auxiliary_loss_clip": 0.01331019, "auxiliary_loss_mlp": 0.01113021, "balance_loss_clip": 1.09560525, "balance_loss_mlp": 1.06922317, "epoch": 0.018698331579738464, "flos": 45731555907840.0, "grad_norm": 2.1345597100799645, "language_loss": 0.74162471, "learning_rate": 3.695578199367497e-06, "loss": 0.76606506, "num_input_tokens_seen": 6519930, "step": 311, "time_per_iteration": 2.846503496170044 }, { "auxiliary_loss_clip": 0.01338389, "auxiliary_loss_mlp": 0.01104203, "balance_loss_clip": 1.09206033, "balance_loss_mlp": 1.0609777, "epoch": 0.018758454832406433, "flos": 20483662308480.0, "grad_norm": 3.713635021153945, "language_loss": 0.91668129, "learning_rate": 3.6976451423416825e-06, "loss": 0.94110715, "num_input_tokens_seen": 6535070, "step": 312, "time_per_iteration": 2.598400592803955 }, { "auxiliary_loss_clip": 0.01339145, "auxiliary_loss_mlp": 0.01116197, "balance_loss_clip": 1.09512305, "balance_loss_mlp": 1.07034922, "epoch": 0.0188185780850744, "flos": 15777784661760.0, "grad_norm": 4.5530066286460045, "language_loss": 0.89634913, "learning_rate": 3.699705471087043e-06, "loss": 0.92090249, "num_input_tokens_seen": 6554135, "step": 313, "time_per_iteration": 2.6944596767425537 }, { "auxiliary_loss_clip": 0.01340962, "auxiliary_loss_mlp": 0.0109941, "balance_loss_clip": 1.09381938, "balance_loss_mlp": 1.05430174, "epoch": 0.018878701337742373, "flos": 22455732758400.0, "grad_norm": 2.3990870717118455, "language_loss": 0.7335974, "learning_rate": 3.7017592277997256e-06, "loss": 0.75800109, "num_input_tokens_seen": 6572275, "step": 314, "time_per_iteration": 2.6550133228302 }, { "auxiliary_loss_clip": 0.01329658, "auxiliary_loss_mlp": 0.01105546, "balance_loss_clip": 1.09075165, "balance_loss_mlp": 1.06246412, "epoch": 0.018938824590410342, "flos": 30993530238720.0, "grad_norm": 5.81191681220521, "language_loss": 0.89890182, "learning_rate": 3.7038064542733654e-06, "loss": 0.92325383, "num_input_tokens_seen": 6594520, "step": 315, "time_per_iteration": 2.7121222019195557 }, { "auxiliary_loss_clip": 0.0133262, "auxiliary_loss_mlp": 0.01096177, "balance_loss_clip": 1.09287357, "balance_loss_mlp": 1.05209303, "epoch": 0.01899894784307831, "flos": 23258910821760.0, "grad_norm": 2.446494284682687, "language_loss": 0.80517328, "learning_rate": 3.7058471919041945e-06, "loss": 0.82946122, "num_input_tokens_seen": 6614245, "step": 316, "time_per_iteration": 2.640573501586914 }, { "auxiliary_loss_clip": 0.01326654, "auxiliary_loss_mlp": 0.01094904, "balance_loss_clip": 1.09036672, "balance_loss_mlp": 1.05046248, "epoch": 0.01905907109574628, "flos": 17457901367040.0, "grad_norm": 2.3705495670370524, "language_loss": 0.90161496, "learning_rate": 3.7078814816960605e-06, "loss": 0.92583054, "num_input_tokens_seen": 6632015, "step": 317, "time_per_iteration": 2.594388246536255 }, { "auxiliary_loss_clip": 0.01324014, "auxiliary_loss_mlp": 0.01097498, "balance_loss_clip": 1.08944559, "balance_loss_mlp": 1.05281842, "epoch": 0.019119194348414248, "flos": 14970225139200.0, "grad_norm": 7.443622240044352, "language_loss": 0.90836811, "learning_rate": 3.709909364265374e-06, "loss": 0.93258321, "num_input_tokens_seen": 6649015, "step": 318, "time_per_iteration": 2.6647114753723145 }, { "auxiliary_loss_clip": 0.01326579, "auxiliary_loss_mlp": 0.01092817, "balance_loss_clip": 1.0886786, "balance_loss_mlp": 1.05102181, "epoch": 0.01917931760108222, "flos": 25482822503040.0, "grad_norm": 2.232217614618188, "language_loss": 0.93955356, "learning_rate": 3.7119308798459706e-06, "loss": 0.9637475, "num_input_tokens_seen": 6669225, "step": 319, "time_per_iteration": 2.6901800632476807 }, { "auxiliary_loss_clip": 0.01209258, "auxiliary_loss_mlp": 0.01057567, "balance_loss_clip": 1.08611965, "balance_loss_mlp": 1.04288089, "epoch": 0.01923944085375019, "flos": 71556967353600.0, "grad_norm": 1.0009907084180605, "language_loss": 0.59817195, "learning_rate": 3.7139460682939026e-06, "loss": 0.62084019, "num_input_tokens_seen": 6725775, "step": 320, "time_per_iteration": 3.1044812202453613 }, { "auxiliary_loss_clip": 0.01323701, "auxiliary_loss_mlp": 0.01105882, "balance_loss_clip": 1.08827436, "balance_loss_mlp": 1.06291938, "epoch": 0.019299564106418157, "flos": 19682495406720.0, "grad_norm": 3.6735645336458163, "language_loss": 0.89620435, "learning_rate": 3.715954969092154e-06, "loss": 0.92050016, "num_input_tokens_seen": 6744170, "step": 321, "time_per_iteration": 2.650325298309326 }, { "auxiliary_loss_clip": 0.01333523, "auxiliary_loss_mlp": 0.01118534, "balance_loss_clip": 1.09200621, "balance_loss_mlp": 1.07440257, "epoch": 0.019359687359086126, "flos": 24387151991040.0, "grad_norm": 2.289334718991835, "language_loss": 0.82897186, "learning_rate": 3.7179576213552805e-06, "loss": 0.85349244, "num_input_tokens_seen": 6764565, "step": 322, "time_per_iteration": 2.65793514251709 }, { "auxiliary_loss_clip": 0.01332983, "auxiliary_loss_mlp": 0.01092262, "balance_loss_clip": 1.09035325, "balance_loss_mlp": 1.05061018, "epoch": 0.019419810611754094, "flos": 23951376190080.0, "grad_norm": 2.3678949255052912, "language_loss": 0.72983897, "learning_rate": 3.719954063833981e-06, "loss": 0.75409144, "num_input_tokens_seen": 6785310, "step": 323, "time_per_iteration": 2.6827828884124756 }, { "auxiliary_loss_clip": 0.01321298, "auxiliary_loss_mlp": 0.01092254, "balance_loss_clip": 1.08474624, "balance_loss_mlp": 1.04974401, "epoch": 0.019479933864422067, "flos": 22160223567360.0, "grad_norm": 1.9971507164977458, "language_loss": 0.92358303, "learning_rate": 3.721944334919596e-06, "loss": 0.9477185, "num_input_tokens_seen": 6803290, "step": 324, "time_per_iteration": 2.667363405227661 }, { "auxiliary_loss_clip": 0.0133014, "auxiliary_loss_mlp": 0.01089098, "balance_loss_clip": 1.09217644, "balance_loss_mlp": 1.04878139, "epoch": 0.019540057117090035, "flos": 22236821320320.0, "grad_norm": 6.407507213214319, "language_loss": 0.65127969, "learning_rate": 3.7239284726485375e-06, "loss": 0.67547202, "num_input_tokens_seen": 6822570, "step": 325, "time_per_iteration": 2.658700466156006 }, { "auxiliary_loss_clip": 0.01328385, "auxiliary_loss_mlp": 0.01109788, "balance_loss_clip": 1.09598839, "balance_loss_mlp": 1.06675363, "epoch": 0.019600180369758004, "flos": 23076771932160.0, "grad_norm": 1.7177375017641943, "language_loss": 0.76394802, "learning_rate": 3.72590651470665e-06, "loss": 0.78832972, "num_input_tokens_seen": 6841910, "step": 326, "time_per_iteration": 2.6326630115509033 }, { "auxiliary_loss_clip": 0.01322824, "auxiliary_loss_mlp": 0.01103487, "balance_loss_clip": 1.09083152, "balance_loss_mlp": 1.06040514, "epoch": 0.019660303622425972, "flos": 25410857604480.0, "grad_norm": 2.041100065316132, "language_loss": 0.79262185, "learning_rate": 3.727878498433505e-06, "loss": 0.81688493, "num_input_tokens_seen": 6862480, "step": 327, "time_per_iteration": 2.7195518016815186 }, { "auxiliary_loss_clip": 0.0132945, "auxiliary_loss_mlp": 0.01099712, "balance_loss_clip": 1.09292865, "balance_loss_mlp": 1.05832207, "epoch": 0.01972042687509394, "flos": 23657519024640.0, "grad_norm": 2.852301933148325, "language_loss": 0.80569315, "learning_rate": 3.7298444608266328e-06, "loss": 0.82998472, "num_input_tokens_seen": 6882015, "step": 328, "time_per_iteration": 2.6789369583129883 }, { "auxiliary_loss_clip": 0.01327544, "auxiliary_loss_mlp": 0.01094059, "balance_loss_clip": 1.08719349, "balance_loss_mlp": 1.05045235, "epoch": 0.019780550127761913, "flos": 18223480869120.0, "grad_norm": 2.280823996815513, "language_loss": 0.93599927, "learning_rate": 3.731804438545683e-06, "loss": 0.96021533, "num_input_tokens_seen": 6899785, "step": 329, "time_per_iteration": 2.6043548583984375 }, { "auxiliary_loss_clip": 0.0133329, "auxiliary_loss_mlp": 0.0110952, "balance_loss_clip": 1.09211767, "balance_loss_mlp": 1.06629419, "epoch": 0.01984067338042988, "flos": 22418780641920.0, "grad_norm": 2.788704520584699, "language_loss": 0.7476396, "learning_rate": 3.7337584679165324e-06, "loss": 0.77206767, "num_input_tokens_seen": 6918575, "step": 330, "time_per_iteration": 2.706001043319702 }, { "auxiliary_loss_clip": 0.0133006, "auxiliary_loss_mlp": 0.01115344, "balance_loss_clip": 1.09077096, "balance_loss_mlp": 1.07280993, "epoch": 0.01990079663309785, "flos": 17055199013760.0, "grad_norm": 4.201650057157668, "language_loss": 0.93435889, "learning_rate": 3.7357065849353186e-06, "loss": 0.95881295, "num_input_tokens_seen": 6936965, "step": 331, "time_per_iteration": 2.6499180793762207 }, { "auxiliary_loss_clip": 0.01316843, "auxiliary_loss_mlp": 0.01085812, "balance_loss_clip": 1.08825564, "balance_loss_mlp": 1.04563856, "epoch": 0.01996091988576582, "flos": 15961791058560.0, "grad_norm": 2.5475056489813968, "language_loss": 0.9293468, "learning_rate": 3.737648825272422e-06, "loss": 0.95337331, "num_input_tokens_seen": 6953475, "step": 332, "time_per_iteration": 2.5990231037139893 }, { "auxiliary_loss_clip": 0.01325701, "auxiliary_loss_mlp": 0.01091941, "balance_loss_clip": 1.09376514, "balance_loss_mlp": 1.04902601, "epoch": 0.02002104313843379, "flos": 23586451966080.0, "grad_norm": 2.7319388202061106, "language_loss": 0.75380504, "learning_rate": 3.739585224276384e-06, "loss": 0.77798152, "num_input_tokens_seen": 6971630, "step": 333, "time_per_iteration": 2.6225569248199463 }, { "auxiliary_loss_clip": 0.01323488, "auxiliary_loss_mlp": 0.01083816, "balance_loss_clip": 1.08822608, "balance_loss_mlp": 1.04249835, "epoch": 0.02008116639110176, "flos": 34094883352320.0, "grad_norm": 3.3732742696494924, "language_loss": 0.78797042, "learning_rate": 3.7415158169777673e-06, "loss": 0.81204355, "num_input_tokens_seen": 6992775, "step": 334, "time_per_iteration": 2.725562572479248 }, { "auxiliary_loss_clip": 0.01325152, "auxiliary_loss_mlp": 0.01093257, "balance_loss_clip": 1.08535278, "balance_loss_mlp": 1.04867256, "epoch": 0.020141289643769728, "flos": 19683716469120.0, "grad_norm": 1.945115565921162, "language_loss": 0.83465719, "learning_rate": 3.7434406380929575e-06, "loss": 0.8588413, "num_input_tokens_seen": 7011425, "step": 335, "time_per_iteration": 2.638871192932129 }, { "auxiliary_loss_clip": 0.01322365, "auxiliary_loss_mlp": 0.01085854, "balance_loss_clip": 1.08842373, "balance_loss_mlp": 1.04405963, "epoch": 0.020201412896437697, "flos": 20740567357440.0, "grad_norm": 2.3527147371949058, "language_loss": 0.92432821, "learning_rate": 3.745359722027911e-06, "loss": 0.94841033, "num_input_tokens_seen": 7029450, "step": 336, "time_per_iteration": 2.6654980182647705 }, { "auxiliary_loss_clip": 0.01321531, "auxiliary_loss_mlp": 0.01079695, "balance_loss_clip": 1.08577883, "balance_loss_mlp": 1.03818631, "epoch": 0.020261536149105665, "flos": 20266510636800.0, "grad_norm": 1.7223490941555537, "language_loss": 0.88663971, "learning_rate": 3.7472731028818428e-06, "loss": 0.91065204, "num_input_tokens_seen": 7047555, "step": 337, "time_per_iteration": 4.246743440628052 }, { "auxiliary_loss_clip": 0.01312441, "auxiliary_loss_mlp": 0.01102336, "balance_loss_clip": 1.08320296, "balance_loss_mlp": 1.05841899, "epoch": 0.020321659401773638, "flos": 25848752307840.0, "grad_norm": 1.6493597356962735, "language_loss": 0.89869279, "learning_rate": 3.7491808144508626e-06, "loss": 0.92284054, "num_input_tokens_seen": 7068185, "step": 338, "time_per_iteration": 5.869866609573364 }, { "auxiliary_loss_clip": 0.01321566, "auxiliary_loss_mlp": 0.0109858, "balance_loss_clip": 1.08546185, "balance_loss_mlp": 1.05554605, "epoch": 0.020381782654441606, "flos": 17495033051520.0, "grad_norm": 2.1603069065052694, "language_loss": 0.85168982, "learning_rate": 3.7510828902315576e-06, "loss": 0.87589133, "num_input_tokens_seen": 7085955, "step": 339, "time_per_iteration": 2.603130340576172 }, { "auxiliary_loss_clip": 0.01328225, "auxiliary_loss_mlp": 0.01099064, "balance_loss_clip": 1.0902226, "balance_loss_mlp": 1.05524242, "epoch": 0.020441905907109575, "flos": 24243940465920.0, "grad_norm": 2.1746002196087817, "language_loss": 0.88821882, "learning_rate": 3.75297936342452e-06, "loss": 0.91249174, "num_input_tokens_seen": 7106345, "step": 340, "time_per_iteration": 2.7247626781463623 }, { "auxiliary_loss_clip": 0.01322505, "auxiliary_loss_mlp": 0.01085559, "balance_loss_clip": 1.08594203, "balance_loss_mlp": 1.04004502, "epoch": 0.020502029159777543, "flos": 22233301787520.0, "grad_norm": 2.004763613818719, "language_loss": 0.88489276, "learning_rate": 3.7548702669378253e-06, "loss": 0.9089734, "num_input_tokens_seen": 7125070, "step": 341, "time_per_iteration": 2.731411933898926 }, { "auxiliary_loss_clip": 0.01324734, "auxiliary_loss_mlp": 0.01098572, "balance_loss_clip": 1.08451748, "balance_loss_mlp": 1.05479813, "epoch": 0.020562152412445512, "flos": 23987861429760.0, "grad_norm": 2.3638593093640736, "language_loss": 0.80611861, "learning_rate": 3.756755633390458e-06, "loss": 0.83035159, "num_input_tokens_seen": 7144675, "step": 342, "time_per_iteration": 2.6085095405578613 }, { "auxiliary_loss_clip": 0.01313805, "auxiliary_loss_mlp": 0.01098164, "balance_loss_clip": 1.08411694, "balance_loss_mlp": 1.05138612, "epoch": 0.020622275665113484, "flos": 26975305537920.0, "grad_norm": 1.727276092160433, "language_loss": 0.89612651, "learning_rate": 3.7586354951156886e-06, "loss": 0.92024612, "num_input_tokens_seen": 7165505, "step": 343, "time_per_iteration": 2.739912509918213 }, { "auxiliary_loss_clip": 0.01324722, "auxiliary_loss_mlp": 0.01096954, "balance_loss_clip": 1.09109879, "balance_loss_mlp": 1.05518293, "epoch": 0.020682398917781453, "flos": 22600704049920.0, "grad_norm": 2.6902665590614663, "language_loss": 0.78381217, "learning_rate": 3.7605098841644e-06, "loss": 0.80802888, "num_input_tokens_seen": 7184605, "step": 344, "time_per_iteration": 2.638439655303955 }, { "auxiliary_loss_clip": 0.01310552, "auxiliary_loss_mlp": 0.01103983, "balance_loss_clip": 1.08375537, "balance_loss_mlp": 1.05982804, "epoch": 0.02074252217044942, "flos": 15013605790080.0, "grad_norm": 2.2675296623639114, "language_loss": 0.75051636, "learning_rate": 3.7623788323083666e-06, "loss": 0.77466166, "num_input_tokens_seen": 7203065, "step": 345, "time_per_iteration": 2.581258773803711 }, { "auxiliary_loss_clip": 0.01316305, "auxiliary_loss_mlp": 0.01107937, "balance_loss_clip": 1.08855689, "balance_loss_mlp": 1.06447339, "epoch": 0.02080264542311739, "flos": 25337958952320.0, "grad_norm": 2.2144688897761395, "language_loss": 0.90414572, "learning_rate": 3.7642423710434837e-06, "loss": 0.92838824, "num_input_tokens_seen": 7222995, "step": 346, "time_per_iteration": 2.6281676292419434 }, { "auxiliary_loss_clip": 0.01312286, "auxiliary_loss_mlp": 0.01096576, "balance_loss_clip": 1.08357453, "balance_loss_mlp": 1.05621195, "epoch": 0.02086276867578536, "flos": 24388804016640.0, "grad_norm": 3.1106741063140366, "language_loss": 0.79133296, "learning_rate": 3.7661005315929563e-06, "loss": 0.81542158, "num_input_tokens_seen": 7244625, "step": 347, "time_per_iteration": 2.6477038860321045 }, { "auxiliary_loss_clip": 0.01317665, "auxiliary_loss_mlp": 0.01097416, "balance_loss_clip": 1.08921003, "balance_loss_mlp": 1.05328524, "epoch": 0.02092289192845333, "flos": 24462205459200.0, "grad_norm": 3.7065871267995893, "language_loss": 0.71211165, "learning_rate": 3.7679533449104354e-06, "loss": 0.73626244, "num_input_tokens_seen": 7263255, "step": 348, "time_per_iteration": 2.6215686798095703 }, { "auxiliary_loss_clip": 0.01319168, "auxiliary_loss_mlp": 0.01104109, "balance_loss_clip": 1.0859139, "balance_loss_mlp": 1.06066906, "epoch": 0.0209830151811213, "flos": 17451185523840.0, "grad_norm": 2.3976328225512495, "language_loss": 0.77118891, "learning_rate": 3.7698008416831116e-06, "loss": 0.79542166, "num_input_tokens_seen": 7279275, "step": 349, "time_per_iteration": 2.60102915763855 }, { "auxiliary_loss_clip": 0.01304146, "auxiliary_loss_mlp": 0.01101496, "balance_loss_clip": 1.08412242, "balance_loss_mlp": 1.06017756, "epoch": 0.021043138433789268, "flos": 24573995562240.0, "grad_norm": 1.7599420553547571, "language_loss": 0.85191035, "learning_rate": 3.7716430523347664e-06, "loss": 0.87596673, "num_input_tokens_seen": 7300180, "step": 350, "time_per_iteration": 2.7636313438415527 }, { "auxiliary_loss_clip": 0.01310639, "auxiliary_loss_mlp": 0.01090182, "balance_loss_clip": 1.08742464, "balance_loss_mlp": 1.05015147, "epoch": 0.021103261686457236, "flos": 24454053072000.0, "grad_norm": 2.2188224040826956, "language_loss": 0.7998929, "learning_rate": 3.773480007028776e-06, "loss": 0.82390112, "num_input_tokens_seen": 7317430, "step": 351, "time_per_iteration": 2.651803493499756 }, { "auxiliary_loss_clip": 0.01318922, "auxiliary_loss_mlp": 0.01104903, "balance_loss_clip": 1.08851838, "balance_loss_mlp": 1.06093884, "epoch": 0.021163384939125205, "flos": 14683083816960.0, "grad_norm": 2.30399977815629, "language_loss": 0.8746841, "learning_rate": 3.775311735671078e-06, "loss": 0.89892232, "num_input_tokens_seen": 7334875, "step": 352, "time_per_iteration": 2.687080144882202 }, { "auxiliary_loss_clip": 0.01311303, "auxiliary_loss_mlp": 0.01101912, "balance_loss_clip": 1.0859803, "balance_loss_mlp": 1.05861485, "epoch": 0.021223508191793177, "flos": 24493195918080.0, "grad_norm": 2.574621592267882, "language_loss": 0.8247534, "learning_rate": 3.7771382679130878e-06, "loss": 0.84888554, "num_input_tokens_seen": 7355185, "step": 353, "time_per_iteration": 2.7096078395843506 }, { "auxiliary_loss_clip": 0.01308698, "auxiliary_loss_mlp": 0.01092448, "balance_loss_clip": 1.08573294, "balance_loss_mlp": 1.05160654, "epoch": 0.021283631444461146, "flos": 24126978804480.0, "grad_norm": 1.9591973719581535, "language_loss": 0.8089481, "learning_rate": 3.7789596331545845e-06, "loss": 0.83295953, "num_input_tokens_seen": 7374425, "step": 354, "time_per_iteration": 2.658649444580078 }, { "auxiliary_loss_clip": 0.01314249, "auxiliary_loss_mlp": 0.01095812, "balance_loss_clip": 1.08369493, "balance_loss_mlp": 1.05218124, "epoch": 0.021343754697129114, "flos": 25192233475200.0, "grad_norm": 2.22170783568627, "language_loss": 0.81311834, "learning_rate": 3.780775860546545e-06, "loss": 0.837219, "num_input_tokens_seen": 7394175, "step": 355, "time_per_iteration": 2.619551420211792 }, { "auxiliary_loss_clip": 0.01310207, "auxiliary_loss_mlp": 0.01090401, "balance_loss_clip": 1.08222032, "balance_loss_mlp": 1.04851055, "epoch": 0.021403877949797083, "flos": 17274182279040.0, "grad_norm": 2.212340256471132, "language_loss": 0.89746779, "learning_rate": 3.7825869789939474e-06, "loss": 0.92147392, "num_input_tokens_seen": 7412645, "step": 356, "time_per_iteration": 2.5877137184143066 }, { "auxiliary_loss_clip": 0.01308298, "auxiliary_loss_mlp": 0.0108474, "balance_loss_clip": 1.08573771, "balance_loss_mlp": 1.04191971, "epoch": 0.021464001202465055, "flos": 30917435276160.0, "grad_norm": 1.9878508054592678, "language_loss": 0.79956681, "learning_rate": 3.784393017158528e-06, "loss": 0.82349718, "num_input_tokens_seen": 7432275, "step": 357, "time_per_iteration": 2.781755208969116 }, { "auxiliary_loss_clip": 0.0130988, "auxiliary_loss_mlp": 0.01083565, "balance_loss_clip": 1.08250284, "balance_loss_mlp": 1.04417801, "epoch": 0.021524124455133024, "flos": 18186385098240.0, "grad_norm": 2.6679617624252137, "language_loss": 0.76516652, "learning_rate": 3.786194003461506e-06, "loss": 0.78910094, "num_input_tokens_seen": 7450245, "step": 358, "time_per_iteration": 2.63144850730896 }, { "auxiliary_loss_clip": 0.01307251, "auxiliary_loss_mlp": 0.01092013, "balance_loss_clip": 1.08083165, "balance_loss_mlp": 1.04842997, "epoch": 0.021584247707800992, "flos": 13805786039040.0, "grad_norm": 2.344744226979962, "language_loss": 0.88770491, "learning_rate": 3.787989966086264e-06, "loss": 0.91169769, "num_input_tokens_seen": 7466845, "step": 359, "time_per_iteration": 2.641932964324951 }, { "auxiliary_loss_clip": 0.01315087, "auxiliary_loss_mlp": 0.01090441, "balance_loss_clip": 1.08486438, "balance_loss_mlp": 1.05088758, "epoch": 0.02164437096046896, "flos": 23294713703040.0, "grad_norm": 3.6505103877164804, "language_loss": 0.75853801, "learning_rate": 3.789780932980997e-06, "loss": 0.78259325, "num_input_tokens_seen": 7485450, "step": 360, "time_per_iteration": 2.5901477336883545 }, { "auxiliary_loss_clip": 0.01203506, "auxiliary_loss_mlp": 0.0103078, "balance_loss_clip": 1.07682121, "balance_loss_mlp": 1.01781011, "epoch": 0.02170449421313693, "flos": 68899578341760.0, "grad_norm": 0.8439708743577624, "language_loss": 0.64861441, "learning_rate": 3.79156693186132e-06, "loss": 0.67095727, "num_input_tokens_seen": 7553780, "step": 361, "time_per_iteration": 3.278409957885742 }, { "auxiliary_loss_clip": 0.01306068, "auxiliary_loss_mlp": 0.01086116, "balance_loss_clip": 1.0792098, "balance_loss_mlp": 1.04501224, "epoch": 0.0217646174658049, "flos": 25228539146880.0, "grad_norm": 3.144635825096315, "language_loss": 0.78844237, "learning_rate": 3.7933479902128433e-06, "loss": 0.81236422, "num_input_tokens_seen": 7574155, "step": 362, "time_per_iteration": 2.6302051544189453 }, { "auxiliary_loss_clip": 0.01309585, "auxiliary_loss_mlp": 0.01093258, "balance_loss_clip": 1.08188891, "balance_loss_mlp": 1.05244076, "epoch": 0.02182474071847287, "flos": 22893124671360.0, "grad_norm": 2.019833715135914, "language_loss": 0.92474592, "learning_rate": 3.7951241352937077e-06, "loss": 0.94877434, "num_input_tokens_seen": 7592320, "step": 363, "time_per_iteration": 2.6566081047058105 }, { "auxiliary_loss_clip": 0.01305173, "auxiliary_loss_mlp": 0.01096467, "balance_loss_clip": 1.0816617, "balance_loss_mlp": 1.05693769, "epoch": 0.02188486397114084, "flos": 23658991482240.0, "grad_norm": 2.282586403147275, "language_loss": 0.89844346, "learning_rate": 3.7968953941370915e-06, "loss": 0.92245984, "num_input_tokens_seen": 7611185, "step": 364, "time_per_iteration": 2.711911201477051 }, { "auxiliary_loss_clip": 0.01311963, "auxiliary_loss_mlp": 0.0109247, "balance_loss_clip": 1.08607888, "balance_loss_mlp": 1.04955506, "epoch": 0.021944987223808807, "flos": 21543637680000.0, "grad_norm": 1.948927065488749, "language_loss": 0.79460645, "learning_rate": 3.798661793553676e-06, "loss": 0.81865084, "num_input_tokens_seen": 7631970, "step": 365, "time_per_iteration": 2.6396052837371826 }, { "auxiliary_loss_clip": 0.01306043, "auxiliary_loss_mlp": 0.01100405, "balance_loss_clip": 1.08267248, "balance_loss_mlp": 1.05658317, "epoch": 0.022005110476476776, "flos": 16070887641600.0, "grad_norm": 1.85181498507666, "language_loss": 0.84341359, "learning_rate": 3.8004233601340808e-06, "loss": 0.86747801, "num_input_tokens_seen": 7649745, "step": 366, "time_per_iteration": 2.6278867721557617 }, { "auxiliary_loss_clip": 0.01312113, "auxiliary_loss_mlp": 0.01087574, "balance_loss_clip": 1.08304918, "balance_loss_mlp": 1.04859269, "epoch": 0.022065233729144748, "flos": 21433715084160.0, "grad_norm": 1.9326288300300676, "language_loss": 0.87040466, "learning_rate": 3.8021801202512694e-06, "loss": 0.89440155, "num_input_tokens_seen": 7668830, "step": 367, "time_per_iteration": 2.6410560607910156 }, { "auxiliary_loss_clip": 0.01312217, "auxiliary_loss_mlp": 0.01096053, "balance_loss_clip": 1.08074582, "balance_loss_mlp": 1.05335259, "epoch": 0.022125356981812717, "flos": 21543709507200.0, "grad_norm": 2.7247329926128976, "language_loss": 0.8487373, "learning_rate": 3.803932100062912e-06, "loss": 0.87282002, "num_input_tokens_seen": 7687240, "step": 368, "time_per_iteration": 2.652012825012207 }, { "auxiliary_loss_clip": 0.01312089, "auxiliary_loss_mlp": 0.01079926, "balance_loss_clip": 1.0801568, "balance_loss_mlp": 1.04027653, "epoch": 0.022185480234480685, "flos": 20704153944960.0, "grad_norm": 2.4839328990540794, "language_loss": 0.75997221, "learning_rate": 3.8056793255137264e-06, "loss": 0.78389233, "num_input_tokens_seen": 7704440, "step": 369, "time_per_iteration": 2.601384401321411 }, { "auxiliary_loss_clip": 0.01306737, "auxiliary_loss_mlp": 0.01099274, "balance_loss_clip": 1.08232927, "balance_loss_mlp": 1.05836105, "epoch": 0.022245603487148654, "flos": 25193203142400.0, "grad_norm": 2.189428421230448, "language_loss": 0.82977992, "learning_rate": 3.8074218223377844e-06, "loss": 0.85383999, "num_input_tokens_seen": 7727160, "step": 370, "time_per_iteration": 2.6538548469543457 }, { "auxiliary_loss_clip": 0.01306327, "auxiliary_loss_mlp": 0.01099594, "balance_loss_clip": 1.08127654, "balance_loss_mlp": 1.05713177, "epoch": 0.022305726739816623, "flos": 21395936954880.0, "grad_norm": 1.8569755368340455, "language_loss": 0.81588483, "learning_rate": 3.8091596160607834e-06, "loss": 0.83994406, "num_input_tokens_seen": 7747730, "step": 371, "time_per_iteration": 2.6779489517211914 }, { "auxiliary_loss_clip": 0.01311283, "auxiliary_loss_mlp": 0.01093653, "balance_loss_clip": 1.08593988, "balance_loss_mlp": 1.05169153, "epoch": 0.022365849992484595, "flos": 22492146170880.0, "grad_norm": 2.0622769904034817, "language_loss": 0.83493644, "learning_rate": 3.8108927320022896e-06, "loss": 0.85898578, "num_input_tokens_seen": 7766765, "step": 372, "time_per_iteration": 2.676797866821289 }, { "auxiliary_loss_clip": 0.01303906, "auxiliary_loss_mlp": 0.01091688, "balance_loss_clip": 1.08125615, "balance_loss_mlp": 1.05022752, "epoch": 0.022425973245152563, "flos": 17856581397120.0, "grad_norm": 2.8569846697004424, "language_loss": 0.79004842, "learning_rate": 3.8126211952779548e-06, "loss": 0.81400436, "num_input_tokens_seen": 7784010, "step": 373, "time_per_iteration": 2.593186616897583 }, { "auxiliary_loss_clip": 0.01309731, "auxiliary_loss_mlp": 0.01087409, "balance_loss_clip": 1.08431911, "balance_loss_mlp": 1.0448271, "epoch": 0.022486096497820532, "flos": 15483029656320.0, "grad_norm": 2.5442660874947385, "language_loss": 0.77622557, "learning_rate": 3.8143450308016952e-06, "loss": 0.80019701, "num_input_tokens_seen": 7801305, "step": 374, "time_per_iteration": 2.628392457962036 }, { "auxiliary_loss_clip": 0.0129871, "auxiliary_loss_mlp": 0.01076131, "balance_loss_clip": 1.07404125, "balance_loss_mlp": 1.03395462, "epoch": 0.0225462197504885, "flos": 27784157950080.0, "grad_norm": 1.574507922341891, "language_loss": 0.86032569, "learning_rate": 3.8160642632878525e-06, "loss": 0.88407415, "num_input_tokens_seen": 7823965, "step": 375, "time_per_iteration": 2.6783435344696045 }, { "auxiliary_loss_clip": 0.01307026, "auxiliary_loss_mlp": 0.01102393, "balance_loss_clip": 1.08340597, "balance_loss_mlp": 1.0590483, "epoch": 0.02260634300315647, "flos": 19975490645760.0, "grad_norm": 2.1279260859120286, "language_loss": 0.8901403, "learning_rate": 3.817778917253314e-06, "loss": 0.91423446, "num_input_tokens_seen": 7842115, "step": 376, "time_per_iteration": 2.621629476547241 }, { "auxiliary_loss_clip": 0.01306872, "auxiliary_loss_mlp": 0.01087647, "balance_loss_clip": 1.07870364, "balance_loss_mlp": 1.04868913, "epoch": 0.02266646625582444, "flos": 16028189349120.0, "grad_norm": 3.0367767906095917, "language_loss": 0.75437558, "learning_rate": 3.8194890170196155e-06, "loss": 0.77832079, "num_input_tokens_seen": 7857830, "step": 377, "time_per_iteration": 2.5465245246887207 }, { "auxiliary_loss_clip": 0.01298987, "auxiliary_loss_mlp": 0.01093623, "balance_loss_clip": 1.08128345, "balance_loss_mlp": 1.0517087, "epoch": 0.02272658950849241, "flos": 20404622430720.0, "grad_norm": 2.1955644054597374, "language_loss": 0.99231368, "learning_rate": 3.8211945867150055e-06, "loss": 1.01623976, "num_input_tokens_seen": 7875840, "step": 378, "time_per_iteration": 7.184643983840942 }, { "auxiliary_loss_clip": 0.01202133, "auxiliary_loss_mlp": 0.01040839, "balance_loss_clip": 1.0828104, "balance_loss_mlp": 1.0283463, "epoch": 0.02278671276116038, "flos": 69847332647040.0, "grad_norm": 0.9608118941287621, "language_loss": 0.75395739, "learning_rate": 3.822895650276492e-06, "loss": 0.7763871, "num_input_tokens_seen": 7940190, "step": 379, "time_per_iteration": 4.961140394210815 }, { "auxiliary_loss_clip": 0.01308523, "auxiliary_loss_mlp": 0.01087195, "balance_loss_clip": 1.07820678, "balance_loss_mlp": 1.04792738, "epoch": 0.022846836013828347, "flos": 38508771340800.0, "grad_norm": 3.7276648293904375, "language_loss": 0.78197825, "learning_rate": 3.824592231451859e-06, "loss": 0.8059355, "num_input_tokens_seen": 7960840, "step": 380, "time_per_iteration": 2.7892863750457764 }, { "auxiliary_loss_clip": 0.01301718, "auxiliary_loss_mlp": 0.01088822, "balance_loss_clip": 1.07955217, "balance_loss_mlp": 1.04945946, "epoch": 0.02290695926649632, "flos": 20959478795520.0, "grad_norm": 2.0941800643649855, "language_loss": 0.96743369, "learning_rate": 3.826284353801652e-06, "loss": 0.99133915, "num_input_tokens_seen": 7975500, "step": 381, "time_per_iteration": 2.619854688644409 }, { "auxiliary_loss_clip": 0.01311313, "auxiliary_loss_mlp": 0.01093973, "balance_loss_clip": 1.08192921, "balance_loss_mlp": 1.0539186, "epoch": 0.022967082519164288, "flos": 24022407335040.0, "grad_norm": 2.122042453210184, "language_loss": 0.87664795, "learning_rate": 3.827972040701142e-06, "loss": 0.90070075, "num_input_tokens_seen": 7993880, "step": 382, "time_per_iteration": 2.617398500442505 }, { "auxiliary_loss_clip": 0.01304042, "auxiliary_loss_mlp": 0.01096828, "balance_loss_clip": 1.0821979, "balance_loss_mlp": 1.05760849, "epoch": 0.023027205771832256, "flos": 20997149184000.0, "grad_norm": 1.978420170714987, "language_loss": 0.84990942, "learning_rate": 3.829655315342268e-06, "loss": 0.87391812, "num_input_tokens_seen": 8012730, "step": 383, "time_per_iteration": 2.6345314979553223 }, { "auxiliary_loss_clip": 0.01300873, "auxiliary_loss_mlp": 0.0111136, "balance_loss_clip": 1.08199024, "balance_loss_mlp": 1.0716393, "epoch": 0.023087329024500225, "flos": 21360816432000.0, "grad_norm": 2.0575071112917778, "language_loss": 0.83349717, "learning_rate": 3.831334200735543e-06, "loss": 0.8576194, "num_input_tokens_seen": 8031275, "step": 384, "time_per_iteration": 2.6339902877807617 }, { "auxiliary_loss_clip": 0.0129979, "auxiliary_loss_mlp": 0.010893, "balance_loss_clip": 1.08362782, "balance_loss_mlp": 1.05255938, "epoch": 0.023147452277168194, "flos": 21872435800320.0, "grad_norm": 1.7828777740185773, "language_loss": 0.89289594, "learning_rate": 3.8330087197119426e-06, "loss": 0.91678685, "num_input_tokens_seen": 8051600, "step": 385, "time_per_iteration": 2.690460205078125 }, { "auxiliary_loss_clip": 0.01305297, "auxiliary_loss_mlp": 0.01118129, "balance_loss_clip": 1.08288455, "balance_loss_mlp": 1.07926655, "epoch": 0.023207575529836166, "flos": 18916700423040.0, "grad_norm": 1.9487706588237765, "language_loss": 0.70157433, "learning_rate": 3.83467889492477e-06, "loss": 0.72580856, "num_input_tokens_seen": 8070600, "step": 386, "time_per_iteration": 2.681957721710205 }, { "auxiliary_loss_clip": 0.01305989, "auxiliary_loss_mlp": 0.0109088, "balance_loss_clip": 1.08441973, "balance_loss_mlp": 1.05309081, "epoch": 0.023267698782504134, "flos": 25046005207680.0, "grad_norm": 2.354342660334866, "language_loss": 0.87840039, "learning_rate": 3.836344748851495e-06, "loss": 0.90236908, "num_input_tokens_seen": 8090680, "step": 387, "time_per_iteration": 2.6511123180389404 }, { "auxiliary_loss_clip": 0.01304298, "auxiliary_loss_mlp": 0.01075541, "balance_loss_clip": 1.08178413, "balance_loss_mlp": 1.03658366, "epoch": 0.023327822035172103, "flos": 28879217930880.0, "grad_norm": 2.2068948332198643, "language_loss": 0.8341614, "learning_rate": 3.838006303795566e-06, "loss": 0.85795981, "num_input_tokens_seen": 8114610, "step": 388, "time_per_iteration": 2.7062034606933594 }, { "auxiliary_loss_clip": 0.01301997, "auxiliary_loss_mlp": 0.01089724, "balance_loss_clip": 1.08110905, "balance_loss_mlp": 1.05284107, "epoch": 0.02338794528784007, "flos": 27121533805440.0, "grad_norm": 2.1887236217853863, "language_loss": 0.93710232, "learning_rate": 3.839663581888206e-06, "loss": 0.96101958, "num_input_tokens_seen": 8133975, "step": 389, "time_per_iteration": 2.680280923843384 }, { "auxiliary_loss_clip": 0.01296082, "auxiliary_loss_mlp": 0.01083127, "balance_loss_clip": 1.0818491, "balance_loss_mlp": 1.04397893, "epoch": 0.02344806854050804, "flos": 21322355944320.0, "grad_norm": 1.981860280002506, "language_loss": 0.87747037, "learning_rate": 3.841316605090178e-06, "loss": 0.9012624, "num_input_tokens_seen": 8153570, "step": 390, "time_per_iteration": 2.65970516204834 }, { "auxiliary_loss_clip": 0.01301203, "auxiliary_loss_mlp": 0.01092853, "balance_loss_clip": 1.08357048, "balance_loss_mlp": 1.0568521, "epoch": 0.023508191793176012, "flos": 24789997998720.0, "grad_norm": 2.134782100250632, "language_loss": 0.89370871, "learning_rate": 3.842965395193529e-06, "loss": 0.91764927, "num_input_tokens_seen": 8170075, "step": 391, "time_per_iteration": 2.620009660720825 }, { "auxiliary_loss_clip": 0.01296395, "auxiliary_loss_mlp": 0.01072264, "balance_loss_clip": 1.07956719, "balance_loss_mlp": 1.03521371, "epoch": 0.02356831504584398, "flos": 25995375624960.0, "grad_norm": 2.366558958564603, "language_loss": 0.86076117, "learning_rate": 3.84460997382332e-06, "loss": 0.88444775, "num_input_tokens_seen": 8190420, "step": 392, "time_per_iteration": 2.7171695232391357 }, { "auxiliary_loss_clip": 0.01293283, "auxiliary_loss_mlp": 0.01084283, "balance_loss_clip": 1.07891107, "balance_loss_mlp": 1.04763794, "epoch": 0.02362843829851195, "flos": 19062461813760.0, "grad_norm": 2.038818686720474, "language_loss": 0.89096916, "learning_rate": 3.8462503624393256e-06, "loss": 0.91474473, "num_input_tokens_seen": 8208790, "step": 393, "time_per_iteration": 2.632129669189453 }, { "auxiliary_loss_clip": 0.01304158, "auxiliary_loss_mlp": 0.01102255, "balance_loss_clip": 1.08471596, "balance_loss_mlp": 1.06279635, "epoch": 0.023688561551179918, "flos": 16071031296000.0, "grad_norm": 1.7920692319020195, "language_loss": 0.8156364, "learning_rate": 3.84788658233771e-06, "loss": 0.83970058, "num_input_tokens_seen": 8226885, "step": 394, "time_per_iteration": 2.5932936668395996 }, { "auxiliary_loss_clip": 0.01296851, "auxiliary_loss_mlp": 0.01088191, "balance_loss_clip": 1.07939875, "balance_loss_mlp": 1.04920936, "epoch": 0.023748684803847887, "flos": 21724375939200.0, "grad_norm": 4.539737106404062, "language_loss": 0.85808635, "learning_rate": 3.84951865465269e-06, "loss": 0.88193679, "num_input_tokens_seen": 8246825, "step": 395, "time_per_iteration": 2.6112868785858154 }, { "auxiliary_loss_clip": 0.01194704, "auxiliary_loss_mlp": 0.01034684, "balance_loss_clip": 1.07210529, "balance_loss_mlp": 1.02319229, "epoch": 0.02380880805651586, "flos": 61926192881280.0, "grad_norm": 0.9258089920958834, "language_loss": 0.6380353, "learning_rate": 3.851146600358172e-06, "loss": 0.66032922, "num_input_tokens_seen": 8302835, "step": 396, "time_per_iteration": 3.031489133834839 }, { "auxiliary_loss_clip": 0.0129188, "auxiliary_loss_mlp": 0.01071022, "balance_loss_clip": 1.07806754, "balance_loss_mlp": 1.03447223, "epoch": 0.023868931309183827, "flos": 20266331068800.0, "grad_norm": 2.3741099598177624, "language_loss": 0.83878696, "learning_rate": 3.852770440269372e-06, "loss": 0.86241591, "num_input_tokens_seen": 8320745, "step": 397, "time_per_iteration": 2.6049532890319824 }, { "auxiliary_loss_clip": 0.01297108, "auxiliary_loss_mlp": 0.01087341, "balance_loss_clip": 1.08104038, "balance_loss_mlp": 1.04890823, "epoch": 0.023929054561851796, "flos": 21139103733120.0, "grad_norm": 4.6847154905409205, "language_loss": 0.84066498, "learning_rate": 3.854390195044404e-06, "loss": 0.86450952, "num_input_tokens_seen": 8339540, "step": 398, "time_per_iteration": 2.6516692638397217 }, { "auxiliary_loss_clip": 0.01295876, "auxiliary_loss_mlp": 0.01078722, "balance_loss_clip": 1.07671928, "balance_loss_mlp": 1.04007471, "epoch": 0.023989177814519765, "flos": 13698521049600.0, "grad_norm": 2.80358563189936, "language_loss": 0.86029691, "learning_rate": 3.856005885185868e-06, "loss": 0.88404286, "num_input_tokens_seen": 8354890, "step": 399, "time_per_iteration": 2.5452589988708496 }, { "auxiliary_loss_clip": 0.01292698, "auxiliary_loss_mlp": 0.01090822, "balance_loss_clip": 1.08074594, "balance_loss_mlp": 1.05308056, "epoch": 0.024049301067187733, "flos": 26322018929280.0, "grad_norm": 2.021318687641168, "language_loss": 0.86254489, "learning_rate": 3.857617531042398e-06, "loss": 0.88638014, "num_input_tokens_seen": 8375845, "step": 400, "time_per_iteration": 2.6626927852630615 }, { "auxiliary_loss_clip": 0.01299822, "auxiliary_loss_mlp": 0.01083301, "balance_loss_clip": 1.08346462, "balance_loss_mlp": 1.04687035, "epoch": 0.024109424319855705, "flos": 24425432910720.0, "grad_norm": 1.735822397657743, "language_loss": 0.79276752, "learning_rate": 3.8592251528102065e-06, "loss": 0.81659877, "num_input_tokens_seen": 8395240, "step": 401, "time_per_iteration": 2.68418025970459 }, { "auxiliary_loss_clip": 0.0129275, "auxiliary_loss_mlp": 0.01091389, "balance_loss_clip": 1.07852793, "balance_loss_mlp": 1.05493474, "epoch": 0.024169547572523674, "flos": 29604397610880.0, "grad_norm": 3.889755427752258, "language_loss": 0.78890866, "learning_rate": 3.8608287705345976e-06, "loss": 0.81274998, "num_input_tokens_seen": 8416950, "step": 402, "time_per_iteration": 2.7509379386901855 }, { "auxiliary_loss_clip": 0.01296434, "auxiliary_loss_mlp": 0.01082712, "balance_loss_clip": 1.07797897, "balance_loss_mlp": 1.04399323, "epoch": 0.024229670825191642, "flos": 22601458235520.0, "grad_norm": 2.49356632429363, "language_loss": 0.94936156, "learning_rate": 3.86242840411147e-06, "loss": 0.97315305, "num_input_tokens_seen": 8433660, "step": 403, "time_per_iteration": 2.5760560035705566 }, { "auxiliary_loss_clip": 0.0129994, "auxiliary_loss_mlp": 0.01091893, "balance_loss_clip": 1.07754242, "balance_loss_mlp": 1.05315053, "epoch": 0.02428979407785961, "flos": 18150258994560.0, "grad_norm": 2.361656575803209, "language_loss": 0.99877387, "learning_rate": 3.864024073288798e-06, "loss": 1.0226922, "num_input_tokens_seen": 8450180, "step": 404, "time_per_iteration": 2.5966458320617676 }, { "auxiliary_loss_clip": 0.01298911, "auxiliary_loss_mlp": 0.01100127, "balance_loss_clip": 1.08096266, "balance_loss_mlp": 1.06312442, "epoch": 0.024349917330527583, "flos": 15304984917120.0, "grad_norm": 2.3162348618509276, "language_loss": 0.8802169, "learning_rate": 3.865615797668091e-06, "loss": 0.90420723, "num_input_tokens_seen": 8467775, "step": 405, "time_per_iteration": 2.5728275775909424 }, { "auxiliary_loss_clip": 0.01306827, "auxiliary_loss_mlp": 0.01097881, "balance_loss_clip": 1.084512, "balance_loss_mlp": 1.06004393, "epoch": 0.024410040583195552, "flos": 20773892200320.0, "grad_norm": 2.7399607903318275, "language_loss": 0.93386561, "learning_rate": 3.867203596705844e-06, "loss": 0.95791268, "num_input_tokens_seen": 8486765, "step": 406, "time_per_iteration": 2.612668991088867 }, { "auxiliary_loss_clip": 0.01299426, "auxiliary_loss_mlp": 0.01088378, "balance_loss_clip": 1.08213782, "balance_loss_mlp": 1.0500164, "epoch": 0.02447016383586352, "flos": 21798854789760.0, "grad_norm": 2.1742012769968526, "language_loss": 0.87128031, "learning_rate": 3.86878748971496e-06, "loss": 0.89515841, "num_input_tokens_seen": 8506515, "step": 407, "time_per_iteration": 2.5982017517089844 }, { "auxiliary_loss_clip": 0.01298266, "auxiliary_loss_mlp": 0.01083858, "balance_loss_clip": 1.08472157, "balance_loss_mlp": 1.04630709, "epoch": 0.02453028708853149, "flos": 33948116380800.0, "grad_norm": 2.1458430439144234, "language_loss": 0.74102569, "learning_rate": 3.8703674958661596e-06, "loss": 0.76484692, "num_input_tokens_seen": 8528035, "step": 408, "time_per_iteration": 2.708670139312744 }, { "auxiliary_loss_clip": 0.01300128, "auxiliary_loss_mlp": 0.01089985, "balance_loss_clip": 1.08222318, "balance_loss_mlp": 1.05233896, "epoch": 0.024590410341199458, "flos": 21793000872960.0, "grad_norm": 2.4878473813549675, "language_loss": 0.92509401, "learning_rate": 3.871943634189376e-06, "loss": 0.94899511, "num_input_tokens_seen": 8546455, "step": 409, "time_per_iteration": 2.665321111679077 }, { "auxiliary_loss_clip": 0.01296394, "auxiliary_loss_mlp": 0.01077538, "balance_loss_clip": 1.08126342, "balance_loss_mlp": 1.04291987, "epoch": 0.02465053359386743, "flos": 35114782124160.0, "grad_norm": 2.2521095969191722, "language_loss": 0.82792604, "learning_rate": 3.873515923575128e-06, "loss": 0.85166532, "num_input_tokens_seen": 8568450, "step": 410, "time_per_iteration": 2.848928213119507 }, { "auxiliary_loss_clip": 0.01299459, "auxiliary_loss_mlp": 0.01089133, "balance_loss_clip": 1.08187068, "balance_loss_mlp": 1.05284572, "epoch": 0.0247106568465354, "flos": 27451409333760.0, "grad_norm": 2.1393760271628595, "language_loss": 0.77577484, "learning_rate": 3.875084382775879e-06, "loss": 0.79966074, "num_input_tokens_seen": 8589340, "step": 411, "time_per_iteration": 2.6645278930664062 }, { "auxiliary_loss_clip": 0.01298341, "auxiliary_loss_mlp": 0.0110154, "balance_loss_clip": 1.07977521, "balance_loss_mlp": 1.06289268, "epoch": 0.024770780099203367, "flos": 20703794808960.0, "grad_norm": 2.2974658872162665, "language_loss": 0.86379063, "learning_rate": 3.87664903040738e-06, "loss": 0.88778943, "num_input_tokens_seen": 8607150, "step": 412, "time_per_iteration": 2.6091151237487793 }, { "auxiliary_loss_clip": 0.01187014, "auxiliary_loss_mlp": 0.01031436, "balance_loss_clip": 1.07387948, "balance_loss_mlp": 1.02089787, "epoch": 0.024830903351871336, "flos": 69551859369600.0, "grad_norm": 0.8687159185244209, "language_loss": 0.5852263, "learning_rate": 3.878209884949994e-06, "loss": 0.60741079, "num_input_tokens_seen": 8669865, "step": 413, "time_per_iteration": 3.2269625663757324 }, { "auxiliary_loss_clip": 0.0129043, "auxiliary_loss_mlp": 0.01091958, "balance_loss_clip": 1.07709181, "balance_loss_mlp": 1.05249953, "epoch": 0.024891026604539304, "flos": 32270477713920.0, "grad_norm": 1.8280666153990437, "language_loss": 0.80517173, "learning_rate": 3.879766964750006e-06, "loss": 0.82899559, "num_input_tokens_seen": 8690235, "step": 414, "time_per_iteration": 2.720341444015503 }, { "auxiliary_loss_clip": 0.01287097, "auxiliary_loss_mlp": 0.0109242, "balance_loss_clip": 1.0756042, "balance_loss_mlp": 1.0556556, "epoch": 0.024951149857207276, "flos": 18840282238080.0, "grad_norm": 2.1921003994701302, "language_loss": 0.80227423, "learning_rate": 3.881320288020917e-06, "loss": 0.82606936, "num_input_tokens_seen": 8706295, "step": 415, "time_per_iteration": 2.6473400592803955 }, { "auxiliary_loss_clip": 0.01302694, "auxiliary_loss_mlp": 0.01082455, "balance_loss_clip": 1.08156919, "balance_loss_mlp": 1.04497528, "epoch": 0.025011273109875245, "flos": 15377201210880.0, "grad_norm": 2.9318871737289776, "language_loss": 0.96236515, "learning_rate": 3.882869872844723e-06, "loss": 0.9862166, "num_input_tokens_seen": 8724200, "step": 416, "time_per_iteration": 2.596189260482788 }, { "auxiliary_loss_clip": 0.01291636, "auxiliary_loss_mlp": 0.01074465, "balance_loss_clip": 1.07628798, "balance_loss_mlp": 1.0355792, "epoch": 0.025071396362543213, "flos": 18915515274240.0, "grad_norm": 1.741746736079687, "language_loss": 0.77381694, "learning_rate": 3.884415737173176e-06, "loss": 0.79747796, "num_input_tokens_seen": 8744170, "step": 417, "time_per_iteration": 5.610344171524048 }, { "auxiliary_loss_clip": 0.01290746, "auxiliary_loss_mlp": 0.0109022, "balance_loss_clip": 1.08072221, "balance_loss_mlp": 1.05264485, "epoch": 0.025131519615211182, "flos": 25337958952320.0, "grad_norm": 1.554385639735456, "language_loss": 0.77076226, "learning_rate": 3.8859578988290344e-06, "loss": 0.79457194, "num_input_tokens_seen": 8765120, "step": 418, "time_per_iteration": 5.837290525436401 }, { "auxiliary_loss_clip": 0.01297026, "auxiliary_loss_mlp": 0.01071197, "balance_loss_clip": 1.08019948, "balance_loss_mlp": 1.03550553, "epoch": 0.02519164286787915, "flos": 18953149749120.0, "grad_norm": 2.4603268634516207, "language_loss": 0.81445098, "learning_rate": 3.887496375507294e-06, "loss": 0.83813322, "num_input_tokens_seen": 8783500, "step": 419, "time_per_iteration": 2.582590341567993 }, { "auxiliary_loss_clip": 0.01291114, "auxiliary_loss_mlp": 0.01086736, "balance_loss_clip": 1.07929599, "balance_loss_mlp": 1.04708743, "epoch": 0.025251766120547123, "flos": 17421092904960.0, "grad_norm": 1.8078532084212713, "language_loss": 0.73618573, "learning_rate": 3.8890311847764065e-06, "loss": 0.75996423, "num_input_tokens_seen": 8801175, "step": 420, "time_per_iteration": 2.6739418506622314 }, { "auxiliary_loss_clip": 0.01290485, "auxiliary_loss_mlp": 0.01096292, "balance_loss_clip": 1.07605243, "balance_loss_mlp": 1.05924153, "epoch": 0.02531188937321509, "flos": 25045430590080.0, "grad_norm": 1.77336014903074, "language_loss": 0.79040134, "learning_rate": 3.890562344079484e-06, "loss": 0.81426907, "num_input_tokens_seen": 8820215, "step": 421, "time_per_iteration": 2.6928632259368896 }, { "auxiliary_loss_clip": 0.01290689, "auxiliary_loss_mlp": 0.01088863, "balance_loss_clip": 1.07922924, "balance_loss_mlp": 1.04983425, "epoch": 0.02537201262588306, "flos": 30592228515840.0, "grad_norm": 2.2139016136437104, "language_loss": 0.8203755, "learning_rate": 3.89208987073549e-06, "loss": 0.84417105, "num_input_tokens_seen": 8839660, "step": 422, "time_per_iteration": 2.714707851409912 }, { "auxiliary_loss_clip": 0.01293659, "auxiliary_loss_mlp": 0.01078975, "balance_loss_clip": 1.07677865, "balance_loss_mlp": 1.04430926, "epoch": 0.02543213587855103, "flos": 26065365275520.0, "grad_norm": 2.1259138778576356, "language_loss": 0.83458018, "learning_rate": 3.893613781940409e-06, "loss": 0.85830647, "num_input_tokens_seen": 8859280, "step": 423, "time_per_iteration": 2.652757167816162 }, { "auxiliary_loss_clip": 0.01287497, "auxiliary_loss_mlp": 0.01078335, "balance_loss_clip": 1.0742569, "balance_loss_mlp": 1.04221487, "epoch": 0.025492259131218997, "flos": 36022818965760.0, "grad_norm": 2.012741083661608, "language_loss": 0.74129444, "learning_rate": 3.895134094768415e-06, "loss": 0.76495278, "num_input_tokens_seen": 8880560, "step": 424, "time_per_iteration": 2.7724521160125732 }, { "auxiliary_loss_clip": 0.01296446, "auxiliary_loss_mlp": 0.01093799, "balance_loss_clip": 1.07987142, "balance_loss_mlp": 1.05782199, "epoch": 0.02555238238388697, "flos": 18588045957120.0, "grad_norm": 4.623670538116741, "language_loss": 0.83193713, "learning_rate": 3.896650826173015e-06, "loss": 0.85583955, "num_input_tokens_seen": 8899155, "step": 425, "time_per_iteration": 2.608029842376709 }, { "auxiliary_loss_clip": 0.01292462, "auxiliary_loss_mlp": 0.01092376, "balance_loss_clip": 1.07259536, "balance_loss_mlp": 1.0544672, "epoch": 0.025612505636554938, "flos": 24243186280320.0, "grad_norm": 2.5075767706443566, "language_loss": 0.853073, "learning_rate": 3.898163992988186e-06, "loss": 0.87692136, "num_input_tokens_seen": 8917890, "step": 426, "time_per_iteration": 2.6445271968841553 }, { "auxiliary_loss_clip": 0.01175923, "auxiliary_loss_mlp": 0.01017688, "balance_loss_clip": 1.06532824, "balance_loss_mlp": 1.00781715, "epoch": 0.025672628889222907, "flos": 60586941265920.0, "grad_norm": 0.8949637292547264, "language_loss": 0.57219732, "learning_rate": 3.899673611929491e-06, "loss": 0.5941335, "num_input_tokens_seen": 8978260, "step": 427, "time_per_iteration": 3.2690517902374268 }, { "auxiliary_loss_clip": 0.01291989, "auxiliary_loss_mlp": 0.01092649, "balance_loss_clip": 1.08155811, "balance_loss_mlp": 1.05674267, "epoch": 0.025732752141890875, "flos": 19573255169280.0, "grad_norm": 2.4869215225306673, "language_loss": 0.88130605, "learning_rate": 3.901179699595194e-06, "loss": 0.90515244, "num_input_tokens_seen": 8994460, "step": 428, "time_per_iteration": 2.6143813133239746 }, { "auxiliary_loss_clip": 0.01283603, "auxiliary_loss_mlp": 0.0107531, "balance_loss_clip": 1.07418942, "balance_loss_mlp": 1.03735399, "epoch": 0.025792875394558847, "flos": 31284262920960.0, "grad_norm": 2.067247304638145, "language_loss": 0.85790849, "learning_rate": 3.902682272467353e-06, "loss": 0.88149762, "num_input_tokens_seen": 9016670, "step": 429, "time_per_iteration": 2.749328374862671 }, { "auxiliary_loss_clip": 0.01288943, "auxiliary_loss_mlp": 0.01083888, "balance_loss_clip": 1.07337689, "balance_loss_mlp": 1.04590786, "epoch": 0.025852998647226816, "flos": 32379610210560.0, "grad_norm": 2.4411876712444034, "language_loss": 0.8815223, "learning_rate": 3.904181346912895e-06, "loss": 0.90525061, "num_input_tokens_seen": 9039720, "step": 430, "time_per_iteration": 2.7483572959899902 }, { "auxiliary_loss_clip": 0.01290726, "auxiliary_loss_mlp": 0.01080495, "balance_loss_clip": 1.0803287, "balance_loss_mlp": 1.04573333, "epoch": 0.025913121899894784, "flos": 20193288762240.0, "grad_norm": 2.086180078538185, "language_loss": 0.84249514, "learning_rate": 3.905676939184698e-06, "loss": 0.8662073, "num_input_tokens_seen": 9059850, "step": 431, "time_per_iteration": 2.6531126499176025 }, { "auxiliary_loss_clip": 0.01286945, "auxiliary_loss_mlp": 0.01073345, "balance_loss_clip": 1.07570636, "balance_loss_mlp": 1.03951311, "epoch": 0.025973245152562753, "flos": 14720430983040.0, "grad_norm": 2.681931959502968, "language_loss": 0.86511916, "learning_rate": 3.907169065422638e-06, "loss": 0.88872206, "num_input_tokens_seen": 9077590, "step": 432, "time_per_iteration": 2.7582762241363525 }, { "auxiliary_loss_clip": 0.01287429, "auxiliary_loss_mlp": 0.01072961, "balance_loss_clip": 1.07632601, "balance_loss_mlp": 1.03891492, "epoch": 0.02603336840523072, "flos": 30992991534720.0, "grad_norm": 1.95596969308187, "language_loss": 0.76036298, "learning_rate": 3.908657741654636e-06, "loss": 0.7839669, "num_input_tokens_seen": 9099880, "step": 433, "time_per_iteration": 2.707771062850952 }, { "auxiliary_loss_clip": 0.01289436, "auxiliary_loss_mlp": 0.01088504, "balance_loss_clip": 1.07470191, "balance_loss_mlp": 1.04973757, "epoch": 0.026093491657898694, "flos": 17674262939520.0, "grad_norm": 2.157056093147959, "language_loss": 0.8979522, "learning_rate": 3.910142983797699e-06, "loss": 0.92173159, "num_input_tokens_seen": 9118620, "step": 434, "time_per_iteration": 2.5665409564971924 }, { "auxiliary_loss_clip": 0.01289617, "auxiliary_loss_mlp": 0.01096405, "balance_loss_clip": 1.07960439, "balance_loss_mlp": 1.05904448, "epoch": 0.026153614910566662, "flos": 17857874286720.0, "grad_norm": 2.306071945033866, "language_loss": 0.80187833, "learning_rate": 3.9116248076589305e-06, "loss": 0.82573849, "num_input_tokens_seen": 9135655, "step": 435, "time_per_iteration": 2.614440679550171 }, { "auxiliary_loss_clip": 0.01285396, "auxiliary_loss_mlp": 0.01092207, "balance_loss_clip": 1.07367229, "balance_loss_mlp": 1.05503798, "epoch": 0.02621373816323463, "flos": 20011113959040.0, "grad_norm": 3.0257040949539356, "language_loss": 0.86361396, "learning_rate": 3.913103228936546e-06, "loss": 0.88739002, "num_input_tokens_seen": 9153520, "step": 436, "time_per_iteration": 2.635033130645752 }, { "auxiliary_loss_clip": 0.01289558, "auxiliary_loss_mlp": 0.01096903, "balance_loss_clip": 1.07716811, "balance_loss_mlp": 1.06080687, "epoch": 0.0262738614159026, "flos": 19281193683840.0, "grad_norm": 2.4233286399217993, "language_loss": 0.74725163, "learning_rate": 3.914578263220868e-06, "loss": 0.77111626, "num_input_tokens_seen": 9170750, "step": 437, "time_per_iteration": 2.6614880561828613 }, { "auxiliary_loss_clip": 0.01286403, "auxiliary_loss_mlp": 0.01100399, "balance_loss_clip": 1.07628679, "balance_loss_mlp": 1.06220388, "epoch": 0.026333984668570568, "flos": 18807208790400.0, "grad_norm": 2.79370908187484, "language_loss": 0.9131338, "learning_rate": 3.916049925995316e-06, "loss": 0.93700182, "num_input_tokens_seen": 9188430, "step": 438, "time_per_iteration": 2.674877166748047 }, { "auxiliary_loss_clip": 0.01169678, "auxiliary_loss_mlp": 0.01072518, "balance_loss_clip": 1.0602653, "balance_loss_mlp": 1.06250465, "epoch": 0.02639410792123854, "flos": 64572020691840.0, "grad_norm": 0.8871275810137318, "language_loss": 0.62631273, "learning_rate": 3.917518232637377e-06, "loss": 0.64873469, "num_input_tokens_seen": 9255835, "step": 439, "time_per_iteration": 3.2527849674224854 }, { "auxiliary_loss_clip": 0.01296492, "auxiliary_loss_mlp": 0.01095184, "balance_loss_clip": 1.08175814, "balance_loss_mlp": 1.05758572, "epoch": 0.02645423117390651, "flos": 28473462921600.0, "grad_norm": 3.31985956061953, "language_loss": 0.75982475, "learning_rate": 3.918983198419573e-06, "loss": 0.78374153, "num_input_tokens_seen": 9276835, "step": 440, "time_per_iteration": 2.6770262718200684 }, { "auxiliary_loss_clip": 0.01286342, "auxiliary_loss_mlp": 0.01076505, "balance_loss_clip": 1.07652593, "balance_loss_mlp": 1.04048026, "epoch": 0.026514354426574478, "flos": 18551237495040.0, "grad_norm": 3.0236705091068283, "language_loss": 0.83197021, "learning_rate": 3.920444838510415e-06, "loss": 0.85559869, "num_input_tokens_seen": 9295075, "step": 441, "time_per_iteration": 2.591306209564209 }, { "auxiliary_loss_clip": 0.01291817, "auxiliary_loss_mlp": 0.01086154, "balance_loss_clip": 1.07703269, "balance_loss_mlp": 1.04829359, "epoch": 0.026574477679242446, "flos": 20667812359680.0, "grad_norm": 2.202684635319811, "language_loss": 0.78490162, "learning_rate": 3.92190316797534e-06, "loss": 0.80868137, "num_input_tokens_seen": 9314205, "step": 442, "time_per_iteration": 2.633054733276367 }, { "auxiliary_loss_clip": 0.0116251, "auxiliary_loss_mlp": 0.01015158, "balance_loss_clip": 1.05336332, "balance_loss_mlp": 1.0054301, "epoch": 0.026634600931910415, "flos": 57956125340160.0, "grad_norm": 0.9609264438471399, "language_loss": 0.64459753, "learning_rate": 3.92335820177765e-06, "loss": 0.66637421, "num_input_tokens_seen": 9367395, "step": 443, "time_per_iteration": 3.1241400241851807 }, { "auxiliary_loss_clip": 0.01291897, "auxiliary_loss_mlp": 0.01085882, "balance_loss_clip": 1.08147204, "balance_loss_mlp": 1.04906964, "epoch": 0.026694724184578387, "flos": 15815131827840.0, "grad_norm": 2.121488874389134, "language_loss": 0.82093638, "learning_rate": 3.924809954779425e-06, "loss": 0.84471416, "num_input_tokens_seen": 9385185, "step": 444, "time_per_iteration": 2.6202428340911865 }, { "auxiliary_loss_clip": 0.0129406, "auxiliary_loss_mlp": 0.01082041, "balance_loss_clip": 1.07940578, "balance_loss_mlp": 1.04263067, "epoch": 0.026754847437246355, "flos": 23440259612160.0, "grad_norm": 2.2213674770888607, "language_loss": 0.95689106, "learning_rate": 3.9262584417424425e-06, "loss": 0.98065209, "num_input_tokens_seen": 9403225, "step": 445, "time_per_iteration": 2.6071228981018066 }, { "auxiliary_loss_clip": 0.01289866, "auxiliary_loss_mlp": 0.01094053, "balance_loss_clip": 1.07953668, "balance_loss_mlp": 1.05492878, "epoch": 0.026814970689914324, "flos": 17341801632000.0, "grad_norm": 2.775359545549618, "language_loss": 0.91932094, "learning_rate": 3.9277036773290725e-06, "loss": 0.94316012, "num_input_tokens_seen": 9420540, "step": 446, "time_per_iteration": 2.5791916847229004 }, { "auxiliary_loss_clip": 0.01289847, "auxiliary_loss_mlp": 0.01088114, "balance_loss_clip": 1.08072042, "balance_loss_mlp": 1.05092025, "epoch": 0.026875093942582293, "flos": 17894718662400.0, "grad_norm": 2.0562763127679204, "language_loss": 0.79831308, "learning_rate": 3.92914567610317e-06, "loss": 0.82209271, "num_input_tokens_seen": 9438840, "step": 447, "time_per_iteration": 2.6420843601226807 }, { "auxiliary_loss_clip": 0.01289397, "auxiliary_loss_mlp": 0.01079607, "balance_loss_clip": 1.07901013, "balance_loss_mlp": 1.04446411, "epoch": 0.026935217195250265, "flos": 21723980889600.0, "grad_norm": 2.231264914467203, "language_loss": 0.86402845, "learning_rate": 3.930584452530952e-06, "loss": 0.8877185, "num_input_tokens_seen": 9457215, "step": 448, "time_per_iteration": 2.590277910232544 }, { "auxiliary_loss_clip": 0.01282455, "auxiliary_loss_mlp": 0.01091099, "balance_loss_clip": 1.07706833, "balance_loss_mlp": 1.05662322, "epoch": 0.026995340447918233, "flos": 23622685810560.0, "grad_norm": 1.941778256808524, "language_loss": 0.88581634, "learning_rate": 3.9320200209818755e-06, "loss": 0.90955186, "num_input_tokens_seen": 9475615, "step": 449, "time_per_iteration": 2.610065460205078 }, { "auxiliary_loss_clip": 0.01293472, "auxiliary_loss_mlp": 0.01085576, "balance_loss_clip": 1.07856452, "balance_loss_mlp": 1.04814398, "epoch": 0.027055463700586202, "flos": 17931275729280.0, "grad_norm": 2.199007921978797, "language_loss": 0.80395782, "learning_rate": 3.933452395729493e-06, "loss": 0.8277483, "num_input_tokens_seen": 9493975, "step": 450, "time_per_iteration": 2.637465238571167 }, { "auxiliary_loss_clip": 0.01284612, "auxiliary_loss_mlp": 0.0108001, "balance_loss_clip": 1.08025336, "balance_loss_mlp": 1.04384232, "epoch": 0.02711558695325417, "flos": 25118903859840.0, "grad_norm": 1.599374223212879, "language_loss": 0.81562543, "learning_rate": 3.934881590952304e-06, "loss": 0.83927161, "num_input_tokens_seen": 9514810, "step": 451, "time_per_iteration": 2.6506927013397217 }, { "auxiliary_loss_clip": 0.0128567, "auxiliary_loss_mlp": 0.01090719, "balance_loss_clip": 1.08126068, "balance_loss_mlp": 1.0533824, "epoch": 0.02717571020592214, "flos": 24239559006720.0, "grad_norm": 1.9677929562692107, "language_loss": 0.77019048, "learning_rate": 3.936307620734599e-06, "loss": 0.79395437, "num_input_tokens_seen": 9533635, "step": 452, "time_per_iteration": 2.5751442909240723 }, { "auxiliary_loss_clip": 0.01286865, "auxiliary_loss_mlp": 0.01088287, "balance_loss_clip": 1.08011293, "balance_loss_mlp": 1.05135596, "epoch": 0.02723583345859011, "flos": 25118939773440.0, "grad_norm": 1.7205362750177517, "language_loss": 0.72874546, "learning_rate": 3.937730499067294e-06, "loss": 0.75249696, "num_input_tokens_seen": 9555420, "step": 453, "time_per_iteration": 2.668083667755127 }, { "auxiliary_loss_clip": 0.01281405, "auxiliary_loss_mlp": 0.01083223, "balance_loss_clip": 1.07714963, "balance_loss_mlp": 1.04748416, "epoch": 0.02729595671125808, "flos": 42741597847680.0, "grad_norm": 1.8353680194819204, "language_loss": 0.82419729, "learning_rate": 3.939150239848748e-06, "loss": 0.84784359, "num_input_tokens_seen": 9578950, "step": 454, "time_per_iteration": 2.8580126762390137 }, { "auxiliary_loss_clip": 0.01285525, "auxiliary_loss_mlp": 0.01077241, "balance_loss_clip": 1.07935429, "balance_loss_mlp": 1.043648, "epoch": 0.02735607996392605, "flos": 21430985650560.0, "grad_norm": 1.985829769195046, "language_loss": 0.75404847, "learning_rate": 3.9405668568855866e-06, "loss": 0.77767611, "num_input_tokens_seen": 9598160, "step": 455, "time_per_iteration": 2.6593477725982666 }, { "auxiliary_loss_clip": 0.01282853, "auxiliary_loss_mlp": 0.01094959, "balance_loss_clip": 1.07477236, "balance_loss_mlp": 1.0597918, "epoch": 0.027416203216594017, "flos": 20851280052480.0, "grad_norm": 1.92483069519606, "language_loss": 0.80670613, "learning_rate": 3.941980363893499e-06, "loss": 0.83048427, "num_input_tokens_seen": 9616010, "step": 456, "time_per_iteration": 2.6798384189605713 }, { "auxiliary_loss_clip": 0.01280135, "auxiliary_loss_mlp": 0.01080319, "balance_loss_clip": 1.07714963, "balance_loss_mlp": 1.0435549, "epoch": 0.027476326469261986, "flos": 13224500242560.0, "grad_norm": 2.171481572134165, "language_loss": 0.81587321, "learning_rate": 3.9433907744980384e-06, "loss": 0.83947778, "num_input_tokens_seen": 9634000, "step": 457, "time_per_iteration": 5.62308406829834 }, { "auxiliary_loss_clip": 0.01283922, "auxiliary_loss_mlp": 0.01084055, "balance_loss_clip": 1.07603848, "balance_loss_mlp": 1.04891229, "epoch": 0.027536449721929958, "flos": 24024526237440.0, "grad_norm": 2.024184269172234, "language_loss": 0.94030929, "learning_rate": 3.944798102235412e-06, "loss": 0.96398914, "num_input_tokens_seen": 9653455, "step": 458, "time_per_iteration": 5.694372653961182 }, { "auxiliary_loss_clip": 0.01280807, "auxiliary_loss_mlp": 0.01091426, "balance_loss_clip": 1.07479525, "balance_loss_mlp": 1.05666471, "epoch": 0.027596572974597926, "flos": 13006055681280.0, "grad_norm": 2.356061876390436, "language_loss": 0.79279089, "learning_rate": 3.9462023605532545e-06, "loss": 0.81651318, "num_input_tokens_seen": 9669650, "step": 459, "time_per_iteration": 2.626948595046997 }, { "auxiliary_loss_clip": 0.01286253, "auxiliary_loss_mlp": 0.01081623, "balance_loss_clip": 1.08119941, "balance_loss_mlp": 1.04278445, "epoch": 0.027656696227265895, "flos": 26143076350080.0, "grad_norm": 2.0583603779546404, "language_loss": 0.83362132, "learning_rate": 3.947603562811407e-06, "loss": 0.85730016, "num_input_tokens_seen": 9691415, "step": 460, "time_per_iteration": 2.7191598415374756 }, { "auxiliary_loss_clip": 0.01158037, "auxiliary_loss_mlp": 0.01054463, "balance_loss_clip": 1.05032754, "balance_loss_mlp": 1.044402, "epoch": 0.027716819479933864, "flos": 60697222997760.0, "grad_norm": 1.612511499168885, "language_loss": 0.7351321, "learning_rate": 3.949001722282675e-06, "loss": 0.7572571, "num_input_tokens_seen": 9755605, "step": 461, "time_per_iteration": 3.210820436477661 }, { "auxiliary_loss_clip": 0.01284234, "auxiliary_loss_mlp": 0.01079832, "balance_loss_clip": 1.08432341, "balance_loss_mlp": 1.04700136, "epoch": 0.027776942732601832, "flos": 31211938886400.0, "grad_norm": 2.4500038571081073, "language_loss": 0.81596625, "learning_rate": 3.950396852153582e-06, "loss": 0.839607, "num_input_tokens_seen": 9776270, "step": 462, "time_per_iteration": 2.683197021484375 }, { "auxiliary_loss_clip": 0.01280414, "auxiliary_loss_mlp": 0.0107864, "balance_loss_clip": 1.07752454, "balance_loss_mlp": 1.0454762, "epoch": 0.027837065985269804, "flos": 22674644196480.0, "grad_norm": 2.258526594266715, "language_loss": 0.90062451, "learning_rate": 3.951788965525118e-06, "loss": 0.92421508, "num_input_tokens_seen": 9794465, "step": 463, "time_per_iteration": 2.641674757003784 }, { "auxiliary_loss_clip": 0.01151842, "auxiliary_loss_mlp": 0.01010002, "balance_loss_clip": 1.04755902, "balance_loss_mlp": 1.00027454, "epoch": 0.027897189237937773, "flos": 62182487399040.0, "grad_norm": 0.8962796480673014, "language_loss": 0.59058654, "learning_rate": 3.953178075413476e-06, "loss": 0.61220491, "num_input_tokens_seen": 9849685, "step": 464, "time_per_iteration": 3.1129612922668457 }, { "auxiliary_loss_clip": 0.01292933, "auxiliary_loss_mlp": 0.01100533, "balance_loss_clip": 1.08296049, "balance_loss_mlp": 1.06412649, "epoch": 0.02795731249060574, "flos": 24493160004480.0, "grad_norm": 2.3712654859298055, "language_loss": 0.81454253, "learning_rate": 3.954564194750784e-06, "loss": 0.83847719, "num_input_tokens_seen": 9869505, "step": 465, "time_per_iteration": 2.723144769668579 }, { "auxiliary_loss_clip": 0.01279938, "auxiliary_loss_mlp": 0.01092668, "balance_loss_clip": 1.07546401, "balance_loss_mlp": 1.05630863, "epoch": 0.02801743574327371, "flos": 23733003456000.0, "grad_norm": 1.9968224423519798, "language_loss": 0.78396618, "learning_rate": 3.955947336385828e-06, "loss": 0.80769229, "num_input_tokens_seen": 9890950, "step": 466, "time_per_iteration": 2.6278555393218994 }, { "auxiliary_loss_clip": 0.0127853, "auxiliary_loss_mlp": 0.01091802, "balance_loss_clip": 1.07703936, "balance_loss_mlp": 1.05661178, "epoch": 0.02807755899594168, "flos": 20629100476800.0, "grad_norm": 2.010021605622182, "language_loss": 0.87699366, "learning_rate": 3.957327513084761e-06, "loss": 0.90069699, "num_input_tokens_seen": 9911265, "step": 467, "time_per_iteration": 2.6687490940093994 }, { "auxiliary_loss_clip": 0.01285129, "auxiliary_loss_mlp": 0.01112935, "balance_loss_clip": 1.07874036, "balance_loss_mlp": 1.07576585, "epoch": 0.02813768224860965, "flos": 19244564789760.0, "grad_norm": 2.2302958424490416, "language_loss": 0.86091757, "learning_rate": 3.958704737531818e-06, "loss": 0.88489819, "num_input_tokens_seen": 9929025, "step": 468, "time_per_iteration": 2.5745644569396973 }, { "auxiliary_loss_clip": 0.01281128, "auxiliary_loss_mlp": 0.01085455, "balance_loss_clip": 1.07529211, "balance_loss_mlp": 1.04857147, "epoch": 0.02819780550127762, "flos": 20813968800000.0, "grad_norm": 2.1866562002509875, "language_loss": 0.91690558, "learning_rate": 3.9600790223300065e-06, "loss": 0.94057143, "num_input_tokens_seen": 9945190, "step": 469, "time_per_iteration": 2.610821008682251 }, { "auxiliary_loss_clip": 0.0127909, "auxiliary_loss_mlp": 0.0110095, "balance_loss_clip": 1.07675052, "balance_loss_mlp": 1.06482995, "epoch": 0.028257928753945588, "flos": 19974125928960.0, "grad_norm": 2.674428223667968, "language_loss": 0.81758964, "learning_rate": 3.96145038000181e-06, "loss": 0.84139001, "num_input_tokens_seen": 9962820, "step": 470, "time_per_iteration": 2.6004326343536377 }, { "auxiliary_loss_clip": 0.0128074, "auxiliary_loss_mlp": 0.01086643, "balance_loss_clip": 1.07482624, "balance_loss_mlp": 1.04947352, "epoch": 0.028318052006613557, "flos": 20484488321280.0, "grad_norm": 1.788793606991614, "language_loss": 0.93071401, "learning_rate": 3.962818822989861e-06, "loss": 0.95438784, "num_input_tokens_seen": 9982595, "step": 471, "time_per_iteration": 2.556288719177246 }, { "auxiliary_loss_clip": 0.01273697, "auxiliary_loss_mlp": 0.0110454, "balance_loss_clip": 1.07223165, "balance_loss_mlp": 1.06884849, "epoch": 0.02837817525928153, "flos": 28514832410880.0, "grad_norm": 1.8550872135639116, "language_loss": 0.7613501, "learning_rate": 3.964184363657625e-06, "loss": 0.78513247, "num_input_tokens_seen": 10004645, "step": 472, "time_per_iteration": 2.667804002761841 }, { "auxiliary_loss_clip": 0.01280341, "auxiliary_loss_mlp": 0.01090649, "balance_loss_clip": 1.07279634, "balance_loss_mlp": 1.05624473, "epoch": 0.028438298511949497, "flos": 18551668458240.0, "grad_norm": 1.9914661475951314, "language_loss": 0.93097353, "learning_rate": 3.965547014290071e-06, "loss": 0.95468336, "num_input_tokens_seen": 10022555, "step": 473, "time_per_iteration": 2.6402342319488525 }, { "auxiliary_loss_clip": 0.01287339, "auxiliary_loss_mlp": 0.01124194, "balance_loss_clip": 1.07773685, "balance_loss_mlp": 1.08979011, "epoch": 0.028498421764617466, "flos": 16910227722240.0, "grad_norm": 3.2560638787193237, "language_loss": 0.88488632, "learning_rate": 3.96690678709433e-06, "loss": 0.90900171, "num_input_tokens_seen": 10041025, "step": 474, "time_per_iteration": 2.5853888988494873 }, { "auxiliary_loss_clip": 0.0127783, "auxiliary_loss_mlp": 0.01093132, "balance_loss_clip": 1.07535374, "balance_loss_mlp": 1.05620146, "epoch": 0.028558545017285435, "flos": 27778699082880.0, "grad_norm": 3.1427023167402006, "language_loss": 0.78901398, "learning_rate": 3.968263694200355e-06, "loss": 0.81272364, "num_input_tokens_seen": 10060775, "step": 475, "time_per_iteration": 2.654519557952881 }, { "auxiliary_loss_clip": 0.01148107, "auxiliary_loss_mlp": 0.01095224, "balance_loss_clip": 1.04505777, "balance_loss_mlp": 1.08583021, "epoch": 0.028618668269953403, "flos": 65654367258240.0, "grad_norm": 0.9280065830162254, "language_loss": 0.66926932, "learning_rate": 3.969617747661569e-06, "loss": 0.6917026, "num_input_tokens_seen": 10120225, "step": 476, "time_per_iteration": 3.1292569637298584 }, { "auxiliary_loss_clip": 0.01279748, "auxiliary_loss_mlp": 0.01088794, "balance_loss_clip": 1.07638311, "balance_loss_mlp": 1.05188656, "epoch": 0.028678791522621375, "flos": 21937074324480.0, "grad_norm": 2.985672001195028, "language_loss": 0.83807188, "learning_rate": 3.970968959455509e-06, "loss": 0.86175728, "num_input_tokens_seen": 10137880, "step": 477, "time_per_iteration": 2.651493549346924 }, { "auxiliary_loss_clip": 0.01284956, "auxiliary_loss_mlp": 0.0108711, "balance_loss_clip": 1.07924342, "balance_loss_mlp": 1.05089426, "epoch": 0.028738914775289344, "flos": 24572128055040.0, "grad_norm": 2.1929055744411943, "language_loss": 0.8233152, "learning_rate": 3.97231734148446e-06, "loss": 0.84703588, "num_input_tokens_seen": 10156930, "step": 478, "time_per_iteration": 2.6986753940582275 }, { "auxiliary_loss_clip": 0.01277687, "auxiliary_loss_mlp": 0.01080644, "balance_loss_clip": 1.07448888, "balance_loss_mlp": 1.04500043, "epoch": 0.028799038027957313, "flos": 23257977068160.0, "grad_norm": 4.057107988717453, "language_loss": 0.81195259, "learning_rate": 3.973662905576082e-06, "loss": 0.83553594, "num_input_tokens_seen": 10176295, "step": 479, "time_per_iteration": 2.6321041584014893 }, { "auxiliary_loss_clip": 0.01273765, "auxiliary_loss_mlp": 0.01083313, "balance_loss_clip": 1.07335579, "balance_loss_mlp": 1.04552341, "epoch": 0.02885916128062528, "flos": 22164102236160.0, "grad_norm": 2.352225573775279, "language_loss": 0.7335608, "learning_rate": 3.975005663484038e-06, "loss": 0.75713164, "num_input_tokens_seen": 10195790, "step": 480, "time_per_iteration": 2.650696277618408 }, { "auxiliary_loss_clip": 0.01273107, "auxiliary_loss_mlp": 0.01075586, "balance_loss_clip": 1.07424879, "balance_loss_mlp": 1.04277968, "epoch": 0.02891928453329325, "flos": 22932842135040.0, "grad_norm": 1.867890428108999, "language_loss": 0.87560165, "learning_rate": 3.976345626888605e-06, "loss": 0.89908862, "num_input_tokens_seen": 10218405, "step": 481, "time_per_iteration": 2.6585533618927 }, { "auxiliary_loss_clip": 0.01142103, "auxiliary_loss_mlp": 0.01017301, "balance_loss_clip": 1.04286921, "balance_loss_mlp": 1.00895679, "epoch": 0.028979407785961222, "flos": 57432941792640.0, "grad_norm": 0.8486437303263991, "language_loss": 0.66030192, "learning_rate": 3.9776828073972864e-06, "loss": 0.68189597, "num_input_tokens_seen": 10271005, "step": 482, "time_per_iteration": 2.9788918495178223 }, { "auxiliary_loss_clip": 0.01287904, "auxiliary_loss_mlp": 0.01082416, "balance_loss_clip": 1.07739437, "balance_loss_mlp": 1.04868007, "epoch": 0.02903953103862919, "flos": 16722737706240.0, "grad_norm": 2.6473263724689873, "language_loss": 0.7899214, "learning_rate": 3.979017216545415e-06, "loss": 0.81362462, "num_input_tokens_seen": 10288405, "step": 483, "time_per_iteration": 2.5642752647399902 }, { "auxiliary_loss_clip": 0.01283775, "auxiliary_loss_mlp": 0.01097438, "balance_loss_clip": 1.07794189, "balance_loss_mlp": 1.06155562, "epoch": 0.02909965429129716, "flos": 16763640318720.0, "grad_norm": 2.6777328906555766, "language_loss": 0.75510043, "learning_rate": 3.980348865796749e-06, "loss": 0.77891254, "num_input_tokens_seen": 10306875, "step": 484, "time_per_iteration": 2.608337640762329 }, { "auxiliary_loss_clip": 0.0127962, "auxiliary_loss_mlp": 0.01081582, "balance_loss_clip": 1.07543373, "balance_loss_mlp": 1.04760778, "epoch": 0.029159777543965128, "flos": 19785343023360.0, "grad_norm": 2.3457282915841113, "language_loss": 0.8378315, "learning_rate": 3.9816777665440615e-06, "loss": 0.86144352, "num_input_tokens_seen": 10323965, "step": 485, "time_per_iteration": 2.591409921646118 }, { "auxiliary_loss_clip": 0.01282377, "auxiliary_loss_mlp": 0.01084922, "balance_loss_clip": 1.08029485, "balance_loss_mlp": 1.04956484, "epoch": 0.029219900796633096, "flos": 19642670202240.0, "grad_norm": 2.044831141886674, "language_loss": 0.84432101, "learning_rate": 3.983003930109732e-06, "loss": 0.86799401, "num_input_tokens_seen": 10342620, "step": 486, "time_per_iteration": 2.7101452350616455 }, { "auxiliary_loss_clip": 0.01276806, "auxiliary_loss_mlp": 0.01090739, "balance_loss_clip": 1.07363296, "balance_loss_mlp": 1.05476189, "epoch": 0.02928002404930107, "flos": 25885704424320.0, "grad_norm": 12.432525192672303, "language_loss": 0.88968349, "learning_rate": 3.984327367746315e-06, "loss": 0.91335887, "num_input_tokens_seen": 10364610, "step": 487, "time_per_iteration": 2.637910842895508 }, { "auxiliary_loss_clip": 0.01283084, "auxiliary_loss_mlp": 0.01069223, "balance_loss_clip": 1.07921362, "balance_loss_mlp": 1.03677416, "epoch": 0.029340147301969037, "flos": 20660234590080.0, "grad_norm": 2.566388301054309, "language_loss": 0.88581878, "learning_rate": 3.985648090637122e-06, "loss": 0.90934181, "num_input_tokens_seen": 10380910, "step": 488, "time_per_iteration": 2.6569244861602783 }, { "auxiliary_loss_clip": 0.01275613, "auxiliary_loss_mlp": 0.01081415, "balance_loss_clip": 1.07419777, "balance_loss_mlp": 1.04667735, "epoch": 0.029400270554637006, "flos": 24428018689920.0, "grad_norm": 2.0135021623582503, "language_loss": 0.88869834, "learning_rate": 3.986966109896785e-06, "loss": 0.91226858, "num_input_tokens_seen": 10400665, "step": 489, "time_per_iteration": 2.805555582046509 }, { "auxiliary_loss_clip": 0.01271096, "auxiliary_loss_mlp": 0.01077182, "balance_loss_clip": 1.0704807, "balance_loss_mlp": 1.04168141, "epoch": 0.029460393807304974, "flos": 20120892900480.0, "grad_norm": 2.807428314395572, "language_loss": 0.88554472, "learning_rate": 3.988281436571815e-06, "loss": 0.90902752, "num_input_tokens_seen": 10420150, "step": 490, "time_per_iteration": 2.612993001937866 }, { "auxiliary_loss_clip": 0.01276687, "auxiliary_loss_mlp": 0.01088031, "balance_loss_clip": 1.0729506, "balance_loss_mlp": 1.0536747, "epoch": 0.029520517059972943, "flos": 17675914965120.0, "grad_norm": 2.430337539839543, "language_loss": 0.91496718, "learning_rate": 3.989594081641164e-06, "loss": 0.93861437, "num_input_tokens_seen": 10438210, "step": 491, "time_per_iteration": 2.6203627586364746 }, { "auxiliary_loss_clip": 0.01266864, "auxiliary_loss_mlp": 0.01072939, "balance_loss_clip": 1.07131863, "balance_loss_mlp": 1.03984618, "epoch": 0.029580640312640915, "flos": 18953185662720.0, "grad_norm": 1.9753258841331502, "language_loss": 0.85654163, "learning_rate": 3.9909040560167675e-06, "loss": 0.87993968, "num_input_tokens_seen": 10455125, "step": 492, "time_per_iteration": 2.636378288269043 }, { "auxiliary_loss_clip": 0.01279009, "auxiliary_loss_mlp": 0.01100381, "balance_loss_clip": 1.07765996, "balance_loss_mlp": 1.06471384, "epoch": 0.029640763565308884, "flos": 18726121837440.0, "grad_norm": 4.076790847855052, "language_loss": 0.84615922, "learning_rate": 3.992211370544093e-06, "loss": 0.86995316, "num_input_tokens_seen": 10470990, "step": 493, "time_per_iteration": 2.6144914627075195 }, { "auxiliary_loss_clip": 0.01272514, "auxiliary_loss_mlp": 0.01074657, "balance_loss_clip": 1.07140934, "balance_loss_mlp": 1.04042029, "epoch": 0.029700886817976852, "flos": 20595308757120.0, "grad_norm": 1.8084917907335818, "language_loss": 0.8658669, "learning_rate": 3.99351603600268e-06, "loss": 0.88933873, "num_input_tokens_seen": 10490685, "step": 494, "time_per_iteration": 2.7063095569610596 }, { "auxiliary_loss_clip": 0.01281688, "auxiliary_loss_mlp": 0.01084428, "balance_loss_clip": 1.07739305, "balance_loss_mlp": 1.05279028, "epoch": 0.02976101007064482, "flos": 22236857233920.0, "grad_norm": 7.125038043922513, "language_loss": 0.86841047, "learning_rate": 3.994818063106668e-06, "loss": 0.8920716, "num_input_tokens_seen": 10509435, "step": 495, "time_per_iteration": 2.641700267791748 }, { "auxiliary_loss_clip": 0.01268945, "auxiliary_loss_mlp": 0.01078198, "balance_loss_clip": 1.07384837, "balance_loss_mlp": 1.04508162, "epoch": 0.029821133323312793, "flos": 23732644320000.0, "grad_norm": 2.201071528053665, "language_loss": 0.61988759, "learning_rate": 3.99611746250533e-06, "loss": 0.64335901, "num_input_tokens_seen": 10530050, "step": 496, "time_per_iteration": 2.6524407863616943 }, { "auxiliary_loss_clip": 0.01270994, "auxiliary_loss_mlp": 0.01089922, "balance_loss_clip": 1.07575428, "balance_loss_mlp": 1.05680561, "epoch": 0.02988125657598076, "flos": 22419498913920.0, "grad_norm": 1.7538974268426115, "language_loss": 0.88820887, "learning_rate": 3.997414244783595e-06, "loss": 0.91181797, "num_input_tokens_seen": 10551370, "step": 497, "time_per_iteration": 5.648245811462402 }, { "auxiliary_loss_clip": 0.01277289, "auxiliary_loss_mlp": 0.01079642, "balance_loss_clip": 1.07670021, "balance_loss_mlp": 1.04604888, "epoch": 0.02994137982864873, "flos": 13845108453120.0, "grad_norm": 2.8395997319333204, "language_loss": 0.85091698, "learning_rate": 3.998708420462557e-06, "loss": 0.87448633, "num_input_tokens_seen": 10569225, "step": 498, "time_per_iteration": 4.362173080444336 }, { "auxiliary_loss_clip": 0.0127249, "auxiliary_loss_mlp": 0.01078673, "balance_loss_clip": 1.07436109, "balance_loss_mlp": 1.04691589, "epoch": 0.0300015030813167, "flos": 23908354675200.0, "grad_norm": 3.2275044857926605, "language_loss": 0.77883017, "learning_rate": 4e-06, "loss": 0.80234182, "num_input_tokens_seen": 10586170, "step": 499, "time_per_iteration": 2.6029655933380127 }, { "auxiliary_loss_clip": 0.01272525, "auxiliary_loss_mlp": 0.01082339, "balance_loss_clip": 1.07433248, "balance_loss_mlp": 1.04905546, "epoch": 0.030061626333984667, "flos": 22016796560640.0, "grad_norm": 2.244229511477372, "language_loss": 0.82687509, "learning_rate": 3.9999999620799e-06, "loss": 0.85042375, "num_input_tokens_seen": 10606205, "step": 500, "time_per_iteration": 2.6293113231658936 }, { "auxiliary_loss_clip": 0.01266453, "auxiliary_loss_mlp": 0.0108458, "balance_loss_clip": 1.07100737, "balance_loss_mlp": 1.04922247, "epoch": 0.03012174958665264, "flos": 23039747988480.0, "grad_norm": 3.2569274145363356, "language_loss": 0.88086087, "learning_rate": 3.9999998483196e-06, "loss": 0.90437114, "num_input_tokens_seen": 10625995, "step": 501, "time_per_iteration": 2.601081132888794 }, { "auxiliary_loss_clip": 0.01273997, "auxiliary_loss_mlp": 0.01071746, "balance_loss_clip": 1.07361674, "balance_loss_mlp": 1.04025102, "epoch": 0.030181872839320608, "flos": 18953257489920.0, "grad_norm": 3.3627001763511855, "language_loss": 0.86654103, "learning_rate": 3.9999996587191065e-06, "loss": 0.88999844, "num_input_tokens_seen": 10644105, "step": 502, "time_per_iteration": 2.5507659912109375 }, { "auxiliary_loss_clip": 0.01270542, "auxiliary_loss_mlp": 0.01081534, "balance_loss_clip": 1.07475543, "balance_loss_mlp": 1.04827452, "epoch": 0.030241996091988577, "flos": 16728017005440.0, "grad_norm": 2.4572357458963876, "language_loss": 0.84281206, "learning_rate": 3.999999393278425e-06, "loss": 0.86633277, "num_input_tokens_seen": 10661090, "step": 503, "time_per_iteration": 2.618587017059326 }, { "auxiliary_loss_clip": 0.01262547, "auxiliary_loss_mlp": 0.01091143, "balance_loss_clip": 1.0710721, "balance_loss_mlp": 1.05781209, "epoch": 0.030302119344656545, "flos": 28621271387520.0, "grad_norm": 1.6994359255159197, "language_loss": 0.88137805, "learning_rate": 3.999999051997567e-06, "loss": 0.90491492, "num_input_tokens_seen": 10682380, "step": 504, "time_per_iteration": 2.6794183254241943 }, { "auxiliary_loss_clip": 0.01264601, "auxiliary_loss_mlp": 0.01086749, "balance_loss_clip": 1.07040262, "balance_loss_mlp": 1.0541091, "epoch": 0.030362242597324514, "flos": 15669334523520.0, "grad_norm": 2.074855698516145, "language_loss": 0.786093, "learning_rate": 3.9999986348765425e-06, "loss": 0.80960649, "num_input_tokens_seen": 10699925, "step": 505, "time_per_iteration": 2.564960479736328 }, { "auxiliary_loss_clip": 0.01134686, "auxiliary_loss_mlp": 0.010147, "balance_loss_clip": 1.03763247, "balance_loss_mlp": 1.00692737, "epoch": 0.030422365849992486, "flos": 72125973676800.0, "grad_norm": 0.9565689962416369, "language_loss": 0.54981297, "learning_rate": 3.999998141915371e-06, "loss": 0.57130682, "num_input_tokens_seen": 10766525, "step": 506, "time_per_iteration": 3.3345654010772705 }, { "auxiliary_loss_clip": 0.01266577, "auxiliary_loss_mlp": 0.01090299, "balance_loss_clip": 1.07119894, "balance_loss_mlp": 1.05687308, "epoch": 0.030482489102660455, "flos": 19427817000960.0, "grad_norm": 2.2738865373146684, "language_loss": 0.83377159, "learning_rate": 3.999997573114069e-06, "loss": 0.8573404, "num_input_tokens_seen": 10786725, "step": 507, "time_per_iteration": 2.645613670349121 }, { "auxiliary_loss_clip": 0.01269938, "auxiliary_loss_mlp": 0.01076205, "balance_loss_clip": 1.07151937, "balance_loss_mlp": 1.04344678, "epoch": 0.030542612355328423, "flos": 20375822701440.0, "grad_norm": 2.375369924968869, "language_loss": 0.88842839, "learning_rate": 3.999996928472659e-06, "loss": 0.91188985, "num_input_tokens_seen": 10805390, "step": 508, "time_per_iteration": 2.617283344268799 }, { "auxiliary_loss_clip": 0.01272148, "auxiliary_loss_mlp": 0.01067206, "balance_loss_clip": 1.07232118, "balance_loss_mlp": 1.03394616, "epoch": 0.030602735607996392, "flos": 34677354297600.0, "grad_norm": 6.964954749829821, "language_loss": 0.71807706, "learning_rate": 3.999996207991165e-06, "loss": 0.74147063, "num_input_tokens_seen": 10828030, "step": 509, "time_per_iteration": 2.7723498344421387 }, { "auxiliary_loss_clip": 0.01264594, "auxiliary_loss_mlp": 0.01074377, "balance_loss_clip": 1.07241154, "balance_loss_mlp": 1.04333544, "epoch": 0.03066285886066436, "flos": 23658668259840.0, "grad_norm": 1.9285974370038053, "language_loss": 0.82031929, "learning_rate": 3.999995411669614e-06, "loss": 0.84370899, "num_input_tokens_seen": 10845240, "step": 510, "time_per_iteration": 2.6254217624664307 }, { "auxiliary_loss_clip": 0.01268793, "auxiliary_loss_mlp": 0.01075379, "balance_loss_clip": 1.07532823, "balance_loss_mlp": 1.04252458, "epoch": 0.030722982113332332, "flos": 23002975440000.0, "grad_norm": 5.706057095430757, "language_loss": 0.83572316, "learning_rate": 3.999994539508036e-06, "loss": 0.85916495, "num_input_tokens_seen": 10864325, "step": 511, "time_per_iteration": 2.613457441329956 }, { "auxiliary_loss_clip": 0.01269742, "auxiliary_loss_mlp": 0.01081314, "balance_loss_clip": 1.07207167, "balance_loss_mlp": 1.0496521, "epoch": 0.0307831053660003, "flos": 24750855152640.0, "grad_norm": 2.025270681093948, "language_loss": 0.82109964, "learning_rate": 3.9999935915064655e-06, "loss": 0.84461015, "num_input_tokens_seen": 10883860, "step": 512, "time_per_iteration": 2.630404233932495 }, { "auxiliary_loss_clip": 0.01266054, "auxiliary_loss_mlp": 0.01084436, "balance_loss_clip": 1.07086158, "balance_loss_mlp": 1.05070007, "epoch": 0.03084322861866827, "flos": 26140885620480.0, "grad_norm": 2.500363981205655, "language_loss": 0.86933553, "learning_rate": 3.9999925676649374e-06, "loss": 0.89284045, "num_input_tokens_seen": 10904555, "step": 513, "time_per_iteration": 2.671926259994507 }, { "auxiliary_loss_clip": 0.01272542, "auxiliary_loss_mlp": 0.01080065, "balance_loss_clip": 1.07461214, "balance_loss_mlp": 1.04744935, "epoch": 0.03090335187133624, "flos": 18771298168320.0, "grad_norm": 1.704575426690477, "language_loss": 0.79124331, "learning_rate": 3.999991467983491e-06, "loss": 0.81476939, "num_input_tokens_seen": 10923700, "step": 514, "time_per_iteration": 2.6158573627471924 }, { "auxiliary_loss_clip": 0.01265821, "auxiliary_loss_mlp": 0.01067844, "balance_loss_clip": 1.07397485, "balance_loss_mlp": 1.03711247, "epoch": 0.030963475124004207, "flos": 23221886878080.0, "grad_norm": 2.729063628201222, "language_loss": 0.77758944, "learning_rate": 3.999990292462167e-06, "loss": 0.80092615, "num_input_tokens_seen": 10942730, "step": 515, "time_per_iteration": 2.636294364929199 }, { "auxiliary_loss_clip": 0.0126398, "auxiliary_loss_mlp": 0.01072575, "balance_loss_clip": 1.06835747, "balance_loss_mlp": 1.03874326, "epoch": 0.03102359837667218, "flos": 42525595411200.0, "grad_norm": 2.1228851207681503, "language_loss": 0.82452714, "learning_rate": 3.999989041101011e-06, "loss": 0.84789264, "num_input_tokens_seen": 10967120, "step": 516, "time_per_iteration": 2.8078057765960693 }, { "auxiliary_loss_clip": 0.01263726, "auxiliary_loss_mlp": 0.01073859, "balance_loss_clip": 1.0712111, "balance_loss_mlp": 1.04090929, "epoch": 0.031083721629340148, "flos": 21176953689600.0, "grad_norm": 1.9016724574566626, "language_loss": 0.79088318, "learning_rate": 3.999987713900071e-06, "loss": 0.81425899, "num_input_tokens_seen": 10986775, "step": 517, "time_per_iteration": 2.5935981273651123 }, { "auxiliary_loss_clip": 0.0125895, "auxiliary_loss_mlp": 0.0107836, "balance_loss_clip": 1.07049131, "balance_loss_mlp": 1.04629326, "epoch": 0.031143844882008116, "flos": 29716187713920.0, "grad_norm": 1.6829619528007147, "language_loss": 0.90798068, "learning_rate": 3.999986310859396e-06, "loss": 0.93135381, "num_input_tokens_seen": 11011360, "step": 518, "time_per_iteration": 2.6855509281158447 }, { "auxiliary_loss_clip": 0.01272237, "auxiliary_loss_mlp": 0.01097567, "balance_loss_clip": 1.07848859, "balance_loss_mlp": 1.06230497, "epoch": 0.031203968134676085, "flos": 23112467072640.0, "grad_norm": 1.8835331125391583, "language_loss": 0.86759162, "learning_rate": 3.999984831979039e-06, "loss": 0.89128959, "num_input_tokens_seen": 11030150, "step": 519, "time_per_iteration": 2.628380060195923 }, { "auxiliary_loss_clip": 0.01265864, "auxiliary_loss_mlp": 0.01086943, "balance_loss_clip": 1.06901193, "balance_loss_mlp": 1.05578136, "epoch": 0.03126409138734405, "flos": 20954379064320.0, "grad_norm": 3.8823628482318164, "language_loss": 0.87246573, "learning_rate": 3.999983277259057e-06, "loss": 0.89599377, "num_input_tokens_seen": 11049145, "step": 520, "time_per_iteration": 2.5850255489349365 }, { "auxiliary_loss_clip": 0.01269157, "auxiliary_loss_mlp": 0.01086266, "balance_loss_clip": 1.07231963, "balance_loss_mlp": 1.0528394, "epoch": 0.031324214640012026, "flos": 21650112570240.0, "grad_norm": 1.7050130216714323, "language_loss": 0.89274424, "learning_rate": 3.999981646699509e-06, "loss": 0.91629851, "num_input_tokens_seen": 11068835, "step": 521, "time_per_iteration": 2.6412506103515625 }, { "auxiliary_loss_clip": 0.01263772, "auxiliary_loss_mlp": 0.01082584, "balance_loss_clip": 1.0717473, "balance_loss_mlp": 1.04827595, "epoch": 0.03138433789267999, "flos": 23441337020160.0, "grad_norm": 2.085624200373119, "language_loss": 0.71452564, "learning_rate": 3.999979940300456e-06, "loss": 0.73798925, "num_input_tokens_seen": 11088980, "step": 522, "time_per_iteration": 2.6561174392700195 }, { "auxiliary_loss_clip": 0.01265725, "auxiliary_loss_mlp": 0.01082552, "balance_loss_clip": 1.06871116, "balance_loss_mlp": 1.05079484, "epoch": 0.03144446114534796, "flos": 18982164960000.0, "grad_norm": 4.223323698032832, "language_loss": 0.84758592, "learning_rate": 3.999978158061963e-06, "loss": 0.87106872, "num_input_tokens_seen": 11104300, "step": 523, "time_per_iteration": 2.608565330505371 }, { "auxiliary_loss_clip": 0.01271589, "auxiliary_loss_mlp": 0.01076253, "balance_loss_clip": 1.07193565, "balance_loss_mlp": 1.04296994, "epoch": 0.031504584398015935, "flos": 22637692080000.0, "grad_norm": 2.324094801199308, "language_loss": 0.89989722, "learning_rate": 3.999976299984099e-06, "loss": 0.92337573, "num_input_tokens_seen": 11123335, "step": 524, "time_per_iteration": 2.68269944190979 }, { "auxiliary_loss_clip": 0.01273471, "auxiliary_loss_mlp": 0.0108318, "balance_loss_clip": 1.07427168, "balance_loss_mlp": 1.04944324, "epoch": 0.0315647076506839, "flos": 25297056339840.0, "grad_norm": 2.4635323942475766, "language_loss": 0.80114233, "learning_rate": 3.999974366066933e-06, "loss": 0.82470882, "num_input_tokens_seen": 11140880, "step": 525, "time_per_iteration": 2.6396324634552 }, { "auxiliary_loss_clip": 0.01264716, "auxiliary_loss_mlp": 0.01080959, "balance_loss_clip": 1.0681529, "balance_loss_mlp": 1.04798603, "epoch": 0.03162483090335187, "flos": 16982839065600.0, "grad_norm": 2.3553733144031948, "language_loss": 0.81162, "learning_rate": 3.999972356310538e-06, "loss": 0.83507675, "num_input_tokens_seen": 11158710, "step": 526, "time_per_iteration": 2.6167168617248535 }, { "auxiliary_loss_clip": 0.01273987, "auxiliary_loss_mlp": 0.01072725, "balance_loss_clip": 1.07507181, "balance_loss_mlp": 1.03736734, "epoch": 0.03168495415601984, "flos": 18734489706240.0, "grad_norm": 1.9666844995001491, "language_loss": 0.81491739, "learning_rate": 3.999970270714991e-06, "loss": 0.83838451, "num_input_tokens_seen": 11177550, "step": 527, "time_per_iteration": 2.580310821533203 }, { "auxiliary_loss_clip": 0.01261155, "auxiliary_loss_mlp": 0.01080842, "balance_loss_clip": 1.06786597, "balance_loss_mlp": 1.04717755, "epoch": 0.03174507740868781, "flos": 21214875473280.0, "grad_norm": 1.9105688869262756, "language_loss": 0.93801636, "learning_rate": 3.999968109280371e-06, "loss": 0.96143627, "num_input_tokens_seen": 11196230, "step": 528, "time_per_iteration": 2.5901002883911133 }, { "auxiliary_loss_clip": 0.01263275, "auxiliary_loss_mlp": 0.01071724, "balance_loss_clip": 1.06776333, "balance_loss_mlp": 1.0387274, "epoch": 0.03180520066135578, "flos": 24787663614720.0, "grad_norm": 1.8924176613796981, "language_loss": 0.84130204, "learning_rate": 3.99996587200676e-06, "loss": 0.86465204, "num_input_tokens_seen": 11214935, "step": 529, "time_per_iteration": 2.593867063522339 }, { "auxiliary_loss_clip": 0.01266309, "auxiliary_loss_mlp": 0.01088988, "balance_loss_clip": 1.07501197, "balance_loss_mlp": 1.0563724, "epoch": 0.03186532391402375, "flos": 24864261367680.0, "grad_norm": 2.316883777672742, "language_loss": 0.90458709, "learning_rate": 3.999963558894243e-06, "loss": 0.92814004, "num_input_tokens_seen": 11235310, "step": 530, "time_per_iteration": 2.5994982719421387 }, { "auxiliary_loss_clip": 0.01261024, "auxiliary_loss_mlp": 0.0107627, "balance_loss_clip": 1.06481552, "balance_loss_mlp": 1.04188991, "epoch": 0.03192544716669172, "flos": 21215055041280.0, "grad_norm": 2.2744046769674324, "language_loss": 0.76334512, "learning_rate": 3.999961169942907e-06, "loss": 0.78671807, "num_input_tokens_seen": 11254425, "step": 531, "time_per_iteration": 2.618149757385254 }, { "auxiliary_loss_clip": 0.01260981, "auxiliary_loss_mlp": 0.01064937, "balance_loss_clip": 1.0669558, "balance_loss_mlp": 1.03143883, "epoch": 0.03198557041935969, "flos": 24353216616960.0, "grad_norm": 2.467757262816931, "language_loss": 0.90483695, "learning_rate": 3.999958705152843e-06, "loss": 0.92809618, "num_input_tokens_seen": 11274595, "step": 532, "time_per_iteration": 2.647947072982788 }, { "auxiliary_loss_clip": 0.01146464, "auxiliary_loss_mlp": 0.01012028, "balance_loss_clip": 1.04988623, "balance_loss_mlp": 1.00325394, "epoch": 0.032045693672027656, "flos": 61827367587840.0, "grad_norm": 1.9655071928838626, "language_loss": 0.57953775, "learning_rate": 3.9999561645241445e-06, "loss": 0.60112268, "num_input_tokens_seen": 11336705, "step": 533, "time_per_iteration": 3.2502808570861816 }, { "auxiliary_loss_clip": 0.01260941, "auxiliary_loss_mlp": 0.01084263, "balance_loss_clip": 1.06724441, "balance_loss_mlp": 1.0516715, "epoch": 0.03210581692469563, "flos": 28401174800640.0, "grad_norm": 1.7138682169725878, "language_loss": 0.86666048, "learning_rate": 3.999953548056907e-06, "loss": 0.89011252, "num_input_tokens_seen": 11356820, "step": 534, "time_per_iteration": 2.678739070892334 }, { "auxiliary_loss_clip": 0.01259554, "auxiliary_loss_mlp": 0.01066669, "balance_loss_clip": 1.06782031, "balance_loss_mlp": 1.03407741, "epoch": 0.03216594017736359, "flos": 24717709877760.0, "grad_norm": 2.12774196295415, "language_loss": 0.77627808, "learning_rate": 3.999950855751232e-06, "loss": 0.79954034, "num_input_tokens_seen": 11376645, "step": 535, "time_per_iteration": 2.7128217220306396 }, { "auxiliary_loss_clip": 0.01261708, "auxiliary_loss_mlp": 0.01081378, "balance_loss_clip": 1.06843078, "balance_loss_mlp": 1.0485003, "epoch": 0.032226063430031565, "flos": 31175453646720.0, "grad_norm": 3.9913279940153585, "language_loss": 0.80939913, "learning_rate": 3.999948087607219e-06, "loss": 0.83283001, "num_input_tokens_seen": 11397310, "step": 536, "time_per_iteration": 2.7490127086639404 }, { "auxiliary_loss_clip": 0.01262237, "auxiliary_loss_mlp": 0.01075987, "balance_loss_clip": 1.06839073, "balance_loss_mlp": 1.04167831, "epoch": 0.03228618668269954, "flos": 32198225506560.0, "grad_norm": 1.6888601787189168, "language_loss": 0.7009111, "learning_rate": 3.999945243624975e-06, "loss": 0.72429335, "num_input_tokens_seen": 11418475, "step": 537, "time_per_iteration": 5.5609166622161865 }, { "auxiliary_loss_clip": 0.0126357, "auxiliary_loss_mlp": 0.01084205, "balance_loss_clip": 1.07331729, "balance_loss_mlp": 1.05161297, "epoch": 0.0323463099353675, "flos": 22670154996480.0, "grad_norm": 2.146306428033486, "language_loss": 0.82684958, "learning_rate": 3.999942323804607e-06, "loss": 0.85032725, "num_input_tokens_seen": 11436630, "step": 538, "time_per_iteration": 2.5465030670166016 }, { "auxiliary_loss_clip": 0.01269537, "auxiliary_loss_mlp": 0.01078099, "balance_loss_clip": 1.06987572, "balance_loss_mlp": 1.04536414, "epoch": 0.032406433188035474, "flos": 26905172232960.0, "grad_norm": 1.8709064214989917, "language_loss": 0.79146457, "learning_rate": 3.999939328146225e-06, "loss": 0.81494099, "num_input_tokens_seen": 11457275, "step": 539, "time_per_iteration": 4.172123432159424 }, { "auxiliary_loss_clip": 0.0126143, "auxiliary_loss_mlp": 0.01069528, "balance_loss_clip": 1.06830835, "balance_loss_mlp": 1.03567231, "epoch": 0.03246655644070344, "flos": 31503928544640.0, "grad_norm": 35.59051030008172, "language_loss": 0.77379727, "learning_rate": 3.999936256649943e-06, "loss": 0.79710686, "num_input_tokens_seen": 11476925, "step": 540, "time_per_iteration": 2.5633046627044678 }, { "auxiliary_loss_clip": 0.01269863, "auxiliary_loss_mlp": 0.01073669, "balance_loss_clip": 1.07271969, "balance_loss_mlp": 1.04124355, "epoch": 0.03252667969337141, "flos": 23218331431680.0, "grad_norm": 2.0489065110302636, "language_loss": 0.85458571, "learning_rate": 3.999933109315878e-06, "loss": 0.878021, "num_input_tokens_seen": 11496830, "step": 541, "time_per_iteration": 2.6079938411712646 }, { "auxiliary_loss_clip": 0.01258504, "auxiliary_loss_mlp": 0.01082451, "balance_loss_clip": 1.06961954, "balance_loss_mlp": 1.04835749, "epoch": 0.032586802946039384, "flos": 14757454926720.0, "grad_norm": 2.674731240129174, "language_loss": 0.89234567, "learning_rate": 3.9999298861441496e-06, "loss": 0.91575521, "num_input_tokens_seen": 11515605, "step": 542, "time_per_iteration": 2.597036600112915 }, { "auxiliary_loss_clip": 0.0126351, "auxiliary_loss_mlp": 0.01081041, "balance_loss_clip": 1.06974792, "balance_loss_mlp": 1.04792452, "epoch": 0.03264692619870735, "flos": 24280677100800.0, "grad_norm": 2.2714121360014334, "language_loss": 0.71123677, "learning_rate": 3.999926587134879e-06, "loss": 0.73468232, "num_input_tokens_seen": 11536230, "step": 543, "time_per_iteration": 2.634601354598999 }, { "auxiliary_loss_clip": 0.01259994, "auxiliary_loss_mlp": 0.01088763, "balance_loss_clip": 1.06379187, "balance_loss_mlp": 1.05545604, "epoch": 0.03270704945137532, "flos": 22893160584960.0, "grad_norm": 4.777521083182084, "language_loss": 0.91540575, "learning_rate": 3.999923212288192e-06, "loss": 0.93889332, "num_input_tokens_seen": 11554715, "step": 544, "time_per_iteration": 2.6173009872436523 }, { "auxiliary_loss_clip": 0.01264485, "auxiliary_loss_mlp": 0.01085684, "balance_loss_clip": 1.06989884, "balance_loss_mlp": 1.05571437, "epoch": 0.032767172704043286, "flos": 18041018757120.0, "grad_norm": 2.6951315012120025, "language_loss": 0.65799558, "learning_rate": 3.999919761604216e-06, "loss": 0.68149722, "num_input_tokens_seen": 11571370, "step": 545, "time_per_iteration": 2.6500988006591797 }, { "auxiliary_loss_clip": 0.012623, "auxiliary_loss_mlp": 0.0107161, "balance_loss_clip": 1.06693912, "balance_loss_mlp": 1.0393517, "epoch": 0.03282729595671126, "flos": 22528739151360.0, "grad_norm": 2.2564766449723908, "language_loss": 0.92221987, "learning_rate": 3.999916235083083e-06, "loss": 0.94555902, "num_input_tokens_seen": 11588560, "step": 546, "time_per_iteration": 2.673250913619995 }, { "auxiliary_loss_clip": 0.01260258, "auxiliary_loss_mlp": 0.01077296, "balance_loss_clip": 1.06488204, "balance_loss_mlp": 1.04313052, "epoch": 0.03288741920937923, "flos": 20410620001920.0, "grad_norm": 2.1923718908590653, "language_loss": 0.81706661, "learning_rate": 3.999912632724925e-06, "loss": 0.84044212, "num_input_tokens_seen": 11605685, "step": 547, "time_per_iteration": 2.725198745727539 }, { "auxiliary_loss_clip": 0.0126227, "auxiliary_loss_mlp": 0.0107871, "balance_loss_clip": 1.06794477, "balance_loss_mlp": 1.04480648, "epoch": 0.032947542462047195, "flos": 20777986350720.0, "grad_norm": 1.730652582963277, "language_loss": 0.81227565, "learning_rate": 3.999908954529881e-06, "loss": 0.83568549, "num_input_tokens_seen": 11626290, "step": 548, "time_per_iteration": 2.714073419570923 }, { "auxiliary_loss_clip": 0.01264818, "auxiliary_loss_mlp": 0.01084154, "balance_loss_clip": 1.06963027, "balance_loss_mlp": 1.04870164, "epoch": 0.03300766571471517, "flos": 19901263190400.0, "grad_norm": 3.8540092911047603, "language_loss": 0.67460287, "learning_rate": 3.999905200498087e-06, "loss": 0.69809258, "num_input_tokens_seen": 11643950, "step": 549, "time_per_iteration": 2.6747171878814697 }, { "auxiliary_loss_clip": 0.0125805, "auxiliary_loss_mlp": 0.01076001, "balance_loss_clip": 1.06968856, "balance_loss_mlp": 1.04236054, "epoch": 0.03306778896738313, "flos": 17967760968960.0, "grad_norm": 1.933615596136007, "language_loss": 0.86379111, "learning_rate": 3.999901370629689e-06, "loss": 0.88713157, "num_input_tokens_seen": 11662560, "step": 550, "time_per_iteration": 2.553386926651001 }, { "auxiliary_loss_clip": 0.01264951, "auxiliary_loss_mlp": 0.01095377, "balance_loss_clip": 1.07279766, "balance_loss_mlp": 1.06142652, "epoch": 0.033127912220051105, "flos": 21653380707840.0, "grad_norm": 3.1958143211070977, "language_loss": 0.8127178, "learning_rate": 3.99989746492483e-06, "loss": 0.83632112, "num_input_tokens_seen": 11682265, "step": 551, "time_per_iteration": 2.6231682300567627 }, { "auxiliary_loss_clip": 0.01271579, "auxiliary_loss_mlp": 0.0108998, "balance_loss_clip": 1.07285261, "balance_loss_mlp": 1.05626702, "epoch": 0.03318803547271908, "flos": 30188376927360.0, "grad_norm": 2.9473143774727606, "language_loss": 0.86134821, "learning_rate": 3.999893483383658e-06, "loss": 0.88496381, "num_input_tokens_seen": 11699300, "step": 552, "time_per_iteration": 2.7002694606781006 }, { "auxiliary_loss_clip": 0.01267081, "auxiliary_loss_mlp": 0.01081671, "balance_loss_clip": 1.07191086, "balance_loss_mlp": 1.04650474, "epoch": 0.03324815872538704, "flos": 20376038183040.0, "grad_norm": 2.990469903058063, "language_loss": 0.9301765, "learning_rate": 3.999889426006326e-06, "loss": 0.95366406, "num_input_tokens_seen": 11716955, "step": 553, "time_per_iteration": 2.6629648208618164 }, { "auxiliary_loss_clip": 0.01262345, "auxiliary_loss_mlp": 0.01077186, "balance_loss_clip": 1.06925786, "balance_loss_mlp": 1.04149485, "epoch": 0.033308281978055014, "flos": 24494560634880.0, "grad_norm": 2.1924330874053166, "language_loss": 0.78881586, "learning_rate": 3.999885292792986e-06, "loss": 0.8122111, "num_input_tokens_seen": 11736130, "step": 554, "time_per_iteration": 2.668970823287964 }, { "auxiliary_loss_clip": 0.01258048, "auxiliary_loss_mlp": 0.0108557, "balance_loss_clip": 1.06745815, "balance_loss_mlp": 1.05045104, "epoch": 0.03336840523072298, "flos": 23400326666880.0, "grad_norm": 2.2144550089326938, "language_loss": 0.81971425, "learning_rate": 3.999881083743795e-06, "loss": 0.84315038, "num_input_tokens_seen": 11754425, "step": 555, "time_per_iteration": 2.610807418823242 }, { "auxiliary_loss_clip": 0.01264442, "auxiliary_loss_mlp": 0.0108339, "balance_loss_clip": 1.06914032, "balance_loss_mlp": 1.04805672, "epoch": 0.03342852848339095, "flos": 30550571717760.0, "grad_norm": 3.7821745066525487, "language_loss": 0.88661897, "learning_rate": 3.999876798858914e-06, "loss": 0.9100973, "num_input_tokens_seen": 11772845, "step": 556, "time_per_iteration": 2.6288907527923584 }, { "auxiliary_loss_clip": 0.01262553, "auxiliary_loss_mlp": 0.01084158, "balance_loss_clip": 1.06896496, "balance_loss_mlp": 1.04863358, "epoch": 0.03348865173605892, "flos": 22893304239360.0, "grad_norm": 1.974910128087634, "language_loss": 0.83708388, "learning_rate": 3.999872438138503e-06, "loss": 0.860551, "num_input_tokens_seen": 11792850, "step": 557, "time_per_iteration": 2.649401903152466 }, { "auxiliary_loss_clip": 0.01268198, "auxiliary_loss_mlp": 0.01069057, "balance_loss_clip": 1.07400489, "balance_loss_mlp": 1.03684711, "epoch": 0.03354877498872689, "flos": 17676022705920.0, "grad_norm": 3.176542206824637, "language_loss": 0.94202292, "learning_rate": 3.999868001582729e-06, "loss": 0.96539545, "num_input_tokens_seen": 11809670, "step": 558, "time_per_iteration": 2.550515651702881 }, { "auxiliary_loss_clip": 0.01258948, "auxiliary_loss_mlp": 0.01074291, "balance_loss_clip": 1.06591845, "balance_loss_mlp": 1.04036427, "epoch": 0.03360889824139486, "flos": 21652985658240.0, "grad_norm": 2.6619487077732384, "language_loss": 0.77115649, "learning_rate": 3.99986348919176e-06, "loss": 0.79448891, "num_input_tokens_seen": 11829665, "step": 559, "time_per_iteration": 2.729597330093384 }, { "auxiliary_loss_clip": 0.01261947, "auxiliary_loss_mlp": 0.01080822, "balance_loss_clip": 1.06835234, "balance_loss_mlp": 1.04882574, "epoch": 0.033669021494062826, "flos": 21795730306560.0, "grad_norm": 1.945022837871561, "language_loss": 0.87472397, "learning_rate": 3.9998589009657675e-06, "loss": 0.89815164, "num_input_tokens_seen": 11848190, "step": 560, "time_per_iteration": 2.6082279682159424 }, { "auxiliary_loss_clip": 0.01257198, "auxiliary_loss_mlp": 0.0107356, "balance_loss_clip": 1.06704283, "balance_loss_mlp": 1.04199314, "epoch": 0.0337291447467308, "flos": 21866222747520.0, "grad_norm": 2.4061219554407502, "language_loss": 0.81578708, "learning_rate": 3.999854236904925e-06, "loss": 0.83909464, "num_input_tokens_seen": 11864795, "step": 561, "time_per_iteration": 2.602193832397461 }, { "auxiliary_loss_clip": 0.01254722, "auxiliary_loss_mlp": 0.01076361, "balance_loss_clip": 1.06685936, "balance_loss_mlp": 1.04422247, "epoch": 0.03378926799939877, "flos": 24245951627520.0, "grad_norm": 1.683217504050761, "language_loss": 0.82320511, "learning_rate": 3.999849497009409e-06, "loss": 0.84651601, "num_input_tokens_seen": 11885275, "step": 562, "time_per_iteration": 2.675872564315796 }, { "auxiliary_loss_clip": 0.01262146, "auxiliary_loss_mlp": 0.01084212, "balance_loss_clip": 1.06894755, "balance_loss_mlp": 1.0508337, "epoch": 0.033849391252066735, "flos": 16507812677760.0, "grad_norm": 2.262509698135982, "language_loss": 0.84285647, "learning_rate": 3.999844681279401e-06, "loss": 0.86632001, "num_input_tokens_seen": 11903595, "step": 563, "time_per_iteration": 2.586944103240967 }, { "auxiliary_loss_clip": 0.01258135, "auxiliary_loss_mlp": 0.01083866, "balance_loss_clip": 1.0675565, "balance_loss_mlp": 1.05094075, "epoch": 0.03390951450473471, "flos": 15669298609920.0, "grad_norm": 2.115200912185494, "language_loss": 0.94438875, "learning_rate": 3.99983978971508e-06, "loss": 0.96780878, "num_input_tokens_seen": 11917815, "step": 564, "time_per_iteration": 2.5444440841674805 }, { "auxiliary_loss_clip": 0.01259509, "auxiliary_loss_mlp": 0.01073406, "balance_loss_clip": 1.06518865, "balance_loss_mlp": 1.03907406, "epoch": 0.03396963775740267, "flos": 22674787850880.0, "grad_norm": 2.6560391741906924, "language_loss": 0.94669235, "learning_rate": 3.999834822316635e-06, "loss": 0.97002149, "num_input_tokens_seen": 11936305, "step": 565, "time_per_iteration": 2.5614171028137207 }, { "auxiliary_loss_clip": 0.01150452, "auxiliary_loss_mlp": 0.01081579, "balance_loss_clip": 1.04835606, "balance_loss_mlp": 1.07499874, "epoch": 0.034029761010070644, "flos": 64392683063040.0, "grad_norm": 1.0610477485673708, "language_loss": 0.54800498, "learning_rate": 3.9998297790842535e-06, "loss": 0.57032537, "num_input_tokens_seen": 11998940, "step": 566, "time_per_iteration": 3.229137659072876 }, { "auxiliary_loss_clip": 0.0126129, "auxiliary_loss_mlp": 0.01073482, "balance_loss_clip": 1.06798041, "balance_loss_mlp": 1.03793335, "epoch": 0.034089884262738616, "flos": 25004204755200.0, "grad_norm": 3.1955261820278564, "language_loss": 0.76836932, "learning_rate": 3.999824660018126e-06, "loss": 0.79171705, "num_input_tokens_seen": 12018860, "step": 567, "time_per_iteration": 2.632741928100586 }, { "auxiliary_loss_clip": 0.01253596, "auxiliary_loss_mlp": 0.01083559, "balance_loss_clip": 1.06611466, "balance_loss_mlp": 1.05153918, "epoch": 0.03415000751540658, "flos": 28439096584320.0, "grad_norm": 2.115683621050472, "language_loss": 0.80834144, "learning_rate": 3.999819465118447e-06, "loss": 0.83171296, "num_input_tokens_seen": 12039675, "step": 568, "time_per_iteration": 2.7206337451934814 }, { "auxiliary_loss_clip": 0.01254921, "auxiliary_loss_mlp": 0.01082401, "balance_loss_clip": 1.06888509, "balance_loss_mlp": 1.04940367, "epoch": 0.034210130768074554, "flos": 21468727866240.0, "grad_norm": 1.891360159585894, "language_loss": 0.86560667, "learning_rate": 3.999814194385413e-06, "loss": 0.88897985, "num_input_tokens_seen": 12057680, "step": 569, "time_per_iteration": 2.7271673679351807 }, { "auxiliary_loss_clip": 0.01255135, "auxiliary_loss_mlp": 0.01082251, "balance_loss_clip": 1.06644094, "balance_loss_mlp": 1.04922962, "epoch": 0.03427025402074252, "flos": 18697501676160.0, "grad_norm": 1.6888504559193653, "language_loss": 0.95945716, "learning_rate": 3.9998088478192255e-06, "loss": 0.982831, "num_input_tokens_seen": 12076135, "step": 570, "time_per_iteration": 2.5918867588043213 }, { "auxiliary_loss_clip": 0.01255487, "auxiliary_loss_mlp": 0.0108066, "balance_loss_clip": 1.06228065, "balance_loss_mlp": 1.0435617, "epoch": 0.03433037727341049, "flos": 20849987162880.0, "grad_norm": 2.39132447086081, "language_loss": 0.7964232, "learning_rate": 3.9998034254200846e-06, "loss": 0.8197847, "num_input_tokens_seen": 12094785, "step": 571, "time_per_iteration": 2.590184450149536 }, { "auxiliary_loss_clip": 0.01256218, "auxiliary_loss_mlp": 0.01091484, "balance_loss_clip": 1.06740785, "balance_loss_mlp": 1.0565083, "epoch": 0.03439050052607846, "flos": 25410282986880.0, "grad_norm": 2.0738695690993, "language_loss": 0.80214274, "learning_rate": 3.999797927188199e-06, "loss": 0.82561976, "num_input_tokens_seen": 12114590, "step": 572, "time_per_iteration": 2.6862123012542725 }, { "auxiliary_loss_clip": 0.01263024, "auxiliary_loss_mlp": 0.01074173, "balance_loss_clip": 1.06995344, "balance_loss_mlp": 1.04098535, "epoch": 0.03445062377874643, "flos": 17640147997440.0, "grad_norm": 2.2324763929909284, "language_loss": 0.84548658, "learning_rate": 3.999792353123774e-06, "loss": 0.86885858, "num_input_tokens_seen": 12132390, "step": 573, "time_per_iteration": 2.78487229347229 }, { "auxiliary_loss_clip": 0.01256326, "auxiliary_loss_mlp": 0.01068789, "balance_loss_clip": 1.0644815, "balance_loss_mlp": 1.03781831, "epoch": 0.0345107470314144, "flos": 16764502245120.0, "grad_norm": 2.576428901855709, "language_loss": 0.76602584, "learning_rate": 3.999786703227023e-06, "loss": 0.78927696, "num_input_tokens_seen": 12149035, "step": 574, "time_per_iteration": 2.5697100162506104 }, { "auxiliary_loss_clip": 0.01255191, "auxiliary_loss_mlp": 0.0107671, "balance_loss_clip": 1.06581593, "balance_loss_mlp": 1.04502439, "epoch": 0.03457087028408237, "flos": 14684448533760.0, "grad_norm": 2.156110110571344, "language_loss": 0.83854586, "learning_rate": 3.9997809774981606e-06, "loss": 0.86186486, "num_input_tokens_seen": 12167530, "step": 575, "time_per_iteration": 2.596418619155884 }, { "auxiliary_loss_clip": 0.01249695, "auxiliary_loss_mlp": 0.01076053, "balance_loss_clip": 1.06684637, "balance_loss_mlp": 1.04334211, "epoch": 0.03463099353675034, "flos": 20011293527040.0, "grad_norm": 2.350120742735315, "language_loss": 0.83990753, "learning_rate": 3.9997751759374025e-06, "loss": 0.86316502, "num_input_tokens_seen": 12186340, "step": 576, "time_per_iteration": 5.821930646896362 }, { "auxiliary_loss_clip": 0.01257114, "auxiliary_loss_mlp": 0.01079503, "balance_loss_clip": 1.07237518, "balance_loss_mlp": 1.04817426, "epoch": 0.03469111678941831, "flos": 25301150490240.0, "grad_norm": 2.138457686407641, "language_loss": 0.85803086, "learning_rate": 3.99976929854497e-06, "loss": 0.88139701, "num_input_tokens_seen": 12204090, "step": 577, "time_per_iteration": 4.225277423858643 }, { "auxiliary_loss_clip": 0.01253845, "auxiliary_loss_mlp": 0.01080214, "balance_loss_clip": 1.06869018, "balance_loss_mlp": 1.04712176, "epoch": 0.034751240042086275, "flos": 23259413612160.0, "grad_norm": 4.535240156776142, "language_loss": 0.72226608, "learning_rate": 3.9997633453210845e-06, "loss": 0.74560666, "num_input_tokens_seen": 12224850, "step": 578, "time_per_iteration": 4.486239433288574 }, { "auxiliary_loss_clip": 0.01251871, "auxiliary_loss_mlp": 0.01080519, "balance_loss_clip": 1.06461096, "balance_loss_mlp": 1.04663968, "epoch": 0.03481136329475425, "flos": 23769237300480.0, "grad_norm": 1.9496379050984929, "language_loss": 0.77785492, "learning_rate": 3.999757316265973e-06, "loss": 0.80117887, "num_input_tokens_seen": 12244935, "step": 579, "time_per_iteration": 2.6706583499908447 }, { "auxiliary_loss_clip": 0.01251647, "auxiliary_loss_mlp": 0.01087497, "balance_loss_clip": 1.06656826, "balance_loss_mlp": 1.05435717, "epoch": 0.03487148654742222, "flos": 20157521794560.0, "grad_norm": 2.054973215074824, "language_loss": 0.86841297, "learning_rate": 3.999751211379863e-06, "loss": 0.8918044, "num_input_tokens_seen": 12262140, "step": 580, "time_per_iteration": 2.639146566390991 }, { "auxiliary_loss_clip": 0.01256528, "auxiliary_loss_mlp": 0.01069029, "balance_loss_clip": 1.06636667, "balance_loss_mlp": 1.0398469, "epoch": 0.034931609800090184, "flos": 15669585918720.0, "grad_norm": 2.205850105033732, "language_loss": 0.82570344, "learning_rate": 3.999745030662987e-06, "loss": 0.84895897, "num_input_tokens_seen": 12280930, "step": 581, "time_per_iteration": 2.6505649089813232 }, { "auxiliary_loss_clip": 0.01252942, "auxiliary_loss_mlp": 0.01072317, "balance_loss_clip": 1.06823969, "balance_loss_mlp": 1.04168022, "epoch": 0.034991733052758156, "flos": 16362374509440.0, "grad_norm": 2.1922492117358146, "language_loss": 0.7733047, "learning_rate": 3.99973877411558e-06, "loss": 0.79655731, "num_input_tokens_seen": 12299125, "step": 582, "time_per_iteration": 2.7323596477508545 }, { "auxiliary_loss_clip": 0.01250253, "auxiliary_loss_mlp": 0.01082356, "balance_loss_clip": 1.06794167, "balance_loss_mlp": 1.04861939, "epoch": 0.03505185630542612, "flos": 19387309438080.0, "grad_norm": 2.1536178016194327, "language_loss": 0.87679923, "learning_rate": 3.999732441737877e-06, "loss": 0.90012532, "num_input_tokens_seen": 12316905, "step": 583, "time_per_iteration": 2.6049294471740723 }, { "auxiliary_loss_clip": 0.01255473, "auxiliary_loss_mlp": 0.01092826, "balance_loss_clip": 1.06699181, "balance_loss_mlp": 1.06104505, "epoch": 0.03511197955809409, "flos": 21323828401920.0, "grad_norm": 3.7027110169592015, "language_loss": 0.81196821, "learning_rate": 3.99972603353012e-06, "loss": 0.83545119, "num_input_tokens_seen": 12335070, "step": 584, "time_per_iteration": 2.6011815071105957 }, { "auxiliary_loss_clip": 0.01251161, "auxiliary_loss_mlp": 0.01069463, "balance_loss_clip": 1.06472683, "balance_loss_mlp": 1.03832567, "epoch": 0.035172102810762065, "flos": 14136595320960.0, "grad_norm": 3.067717812226321, "language_loss": 0.92399198, "learning_rate": 3.999719549492551e-06, "loss": 0.94719815, "num_input_tokens_seen": 12350315, "step": 585, "time_per_iteration": 2.5592780113220215 }, { "auxiliary_loss_clip": 0.01251271, "auxiliary_loss_mlp": 0.01077423, "balance_loss_clip": 1.06562734, "balance_loss_mlp": 1.04552317, "epoch": 0.03523222606343003, "flos": 20296890564480.0, "grad_norm": 2.196660024103635, "language_loss": 0.87644351, "learning_rate": 3.9997129896254165e-06, "loss": 0.89973044, "num_input_tokens_seen": 12366030, "step": 586, "time_per_iteration": 2.5486221313476562 }, { "auxiliary_loss_clip": 0.01256485, "auxiliary_loss_mlp": 0.0108018, "balance_loss_clip": 1.06803596, "balance_loss_mlp": 1.04918551, "epoch": 0.035292349316098, "flos": 20375822701440.0, "grad_norm": 2.1222089199850878, "language_loss": 0.76079381, "learning_rate": 3.999706353928965e-06, "loss": 0.78416049, "num_input_tokens_seen": 12384895, "step": 587, "time_per_iteration": 2.5923714637756348 }, { "auxiliary_loss_clip": 0.01257125, "auxiliary_loss_mlp": 0.01068649, "balance_loss_clip": 1.06683922, "balance_loss_mlp": 1.03586686, "epoch": 0.03535247256876597, "flos": 21468871520640.0, "grad_norm": 2.212352192395094, "language_loss": 0.78601038, "learning_rate": 3.999699642403449e-06, "loss": 0.80926806, "num_input_tokens_seen": 12404980, "step": 588, "time_per_iteration": 2.579280138015747 }, { "auxiliary_loss_clip": 0.0125398, "auxiliary_loss_mlp": 0.0107827, "balance_loss_clip": 1.06582928, "balance_loss_mlp": 1.04367518, "epoch": 0.03541259582143394, "flos": 23623044946560.0, "grad_norm": 2.153589114745919, "language_loss": 0.94312829, "learning_rate": 3.99969285504912e-06, "loss": 0.96645081, "num_input_tokens_seen": 12423835, "step": 589, "time_per_iteration": 2.5964701175689697 }, { "auxiliary_loss_clip": 0.01256884, "auxiliary_loss_mlp": 0.01078108, "balance_loss_clip": 1.06697679, "balance_loss_mlp": 1.04666042, "epoch": 0.03547271907410191, "flos": 33726367768320.0, "grad_norm": 2.1162556876212695, "language_loss": 0.84116042, "learning_rate": 3.99968599186624e-06, "loss": 0.8645103, "num_input_tokens_seen": 12443135, "step": 590, "time_per_iteration": 2.746436357498169 }, { "auxiliary_loss_clip": 0.01249398, "auxiliary_loss_mlp": 0.01068452, "balance_loss_clip": 1.06658125, "balance_loss_mlp": 1.03893578, "epoch": 0.03553284232676988, "flos": 21142695093120.0, "grad_norm": 1.984522351394552, "language_loss": 0.8684091, "learning_rate": 3.999679052855065e-06, "loss": 0.89158762, "num_input_tokens_seen": 12462895, "step": 591, "time_per_iteration": 2.692303419113159 }, { "auxiliary_loss_clip": 0.01250641, "auxiliary_loss_mlp": 0.01082122, "balance_loss_clip": 1.06297326, "balance_loss_mlp": 1.04883862, "epoch": 0.03559296557943785, "flos": 20046593617920.0, "grad_norm": 2.0873185001780783, "language_loss": 0.83075488, "learning_rate": 3.999672038015861e-06, "loss": 0.85408247, "num_input_tokens_seen": 12481515, "step": 592, "time_per_iteration": 2.7822203636169434 }, { "auxiliary_loss_clip": 0.01146211, "auxiliary_loss_mlp": 0.01034159, "balance_loss_clip": 1.05013406, "balance_loss_mlp": 1.02676773, "epoch": 0.035653088832105814, "flos": 60334597244160.0, "grad_norm": 0.8804992705477848, "language_loss": 0.59754086, "learning_rate": 3.999664947348893e-06, "loss": 0.61934447, "num_input_tokens_seen": 12548220, "step": 593, "time_per_iteration": 3.274080276489258 }, { "auxiliary_loss_clip": 0.01249386, "auxiliary_loss_mlp": 0.0107742, "balance_loss_clip": 1.06737614, "balance_loss_mlp": 1.04473329, "epoch": 0.035713212084773786, "flos": 20113135562880.0, "grad_norm": 1.8086551314359374, "language_loss": 0.87077361, "learning_rate": 3.999657780854429e-06, "loss": 0.89404166, "num_input_tokens_seen": 12566105, "step": 594, "time_per_iteration": 2.682236671447754 }, { "auxiliary_loss_clip": 0.012487, "auxiliary_loss_mlp": 0.01082358, "balance_loss_clip": 1.06235993, "balance_loss_mlp": 1.05057716, "epoch": 0.03577333533744176, "flos": 26285785084800.0, "grad_norm": 5.516524335860627, "language_loss": 0.83920246, "learning_rate": 3.999650538532742e-06, "loss": 0.86251307, "num_input_tokens_seen": 12586680, "step": 595, "time_per_iteration": 2.773669481277466 }, { "auxiliary_loss_clip": 0.01248678, "auxiliary_loss_mlp": 0.01090544, "balance_loss_clip": 1.06579614, "balance_loss_mlp": 1.05850017, "epoch": 0.035833458590109724, "flos": 10889732211840.0, "grad_norm": 2.3448814752825204, "language_loss": 0.96041518, "learning_rate": 3.999643220384106e-06, "loss": 0.98380733, "num_input_tokens_seen": 12601605, "step": 596, "time_per_iteration": 2.6541590690612793 }, { "auxiliary_loss_clip": 0.01252662, "auxiliary_loss_mlp": 0.01081887, "balance_loss_clip": 1.0675534, "balance_loss_mlp": 1.05165553, "epoch": 0.035893581842777696, "flos": 22090198003200.0, "grad_norm": 2.4353221882859004, "language_loss": 0.82993281, "learning_rate": 3.999635826408799e-06, "loss": 0.85327828, "num_input_tokens_seen": 12620365, "step": 597, "time_per_iteration": 2.7023818492889404 }, { "auxiliary_loss_clip": 0.01247839, "auxiliary_loss_mlp": 0.01079829, "balance_loss_clip": 1.0668776, "balance_loss_mlp": 1.04766583, "epoch": 0.03595370509544566, "flos": 23038347358080.0, "grad_norm": 2.374757318483944, "language_loss": 0.81364304, "learning_rate": 3.999628356607101e-06, "loss": 0.83691972, "num_input_tokens_seen": 12641140, "step": 598, "time_per_iteration": 2.731229782104492 }, { "auxiliary_loss_clip": 0.01243692, "auxiliary_loss_mlp": 0.01077827, "balance_loss_clip": 1.0663228, "balance_loss_mlp": 1.04587913, "epoch": 0.03601382834811363, "flos": 20777734955520.0, "grad_norm": 1.817680341814684, "language_loss": 0.81172699, "learning_rate": 3.999620810979295e-06, "loss": 0.83494222, "num_input_tokens_seen": 12661080, "step": 599, "time_per_iteration": 2.710191011428833 }, { "auxiliary_loss_clip": 0.01250419, "auxiliary_loss_mlp": 0.01074577, "balance_loss_clip": 1.06356514, "balance_loss_mlp": 1.045228, "epoch": 0.036073951600781605, "flos": 23951627585280.0, "grad_norm": 2.3963649020429627, "language_loss": 0.8651731, "learning_rate": 3.999613189525668e-06, "loss": 0.88842309, "num_input_tokens_seen": 12678270, "step": 600, "time_per_iteration": 2.682262420654297 }, { "auxiliary_loss_clip": 0.01241882, "auxiliary_loss_mlp": 0.01084809, "balance_loss_clip": 1.05918193, "balance_loss_mlp": 1.05297971, "epoch": 0.03613407485344957, "flos": 18912283050240.0, "grad_norm": 2.0308947613075423, "language_loss": 0.82355881, "learning_rate": 3.999605492246508e-06, "loss": 0.84682572, "num_input_tokens_seen": 12697295, "step": 601, "time_per_iteration": 2.6570894718170166 }, { "auxiliary_loss_clip": 0.01240868, "auxiliary_loss_mlp": 0.010708, "balance_loss_clip": 1.06129336, "balance_loss_mlp": 1.03920949, "epoch": 0.03619419810611754, "flos": 23038526926080.0, "grad_norm": 2.3080142694085555, "language_loss": 0.7502507, "learning_rate": 3.999597719142107e-06, "loss": 0.77336735, "num_input_tokens_seen": 12716165, "step": 602, "time_per_iteration": 2.6434237957000732 }, { "auxiliary_loss_clip": 0.01239543, "auxiliary_loss_mlp": 0.01066859, "balance_loss_clip": 1.0604254, "balance_loss_mlp": 1.03562629, "epoch": 0.03625432135878551, "flos": 29457774293760.0, "grad_norm": 1.9681237382646195, "language_loss": 0.79599822, "learning_rate": 3.999589870212761e-06, "loss": 0.81906223, "num_input_tokens_seen": 12735475, "step": 603, "time_per_iteration": 2.7201666831970215 }, { "auxiliary_loss_clip": 0.01244834, "auxiliary_loss_mlp": 0.01071177, "balance_loss_clip": 1.06545615, "balance_loss_mlp": 1.04130292, "epoch": 0.03631444461145348, "flos": 23508525409920.0, "grad_norm": 1.8363641170913294, "language_loss": 0.86668456, "learning_rate": 3.9995819454587664e-06, "loss": 0.88984472, "num_input_tokens_seen": 12754540, "step": 604, "time_per_iteration": 2.60249924659729 }, { "auxiliary_loss_clip": 0.01248906, "auxiliary_loss_mlp": 0.01072985, "balance_loss_clip": 1.0674324, "balance_loss_mlp": 1.04010737, "epoch": 0.03637456786412145, "flos": 16618130323200.0, "grad_norm": 2.510130211393037, "language_loss": 0.80746496, "learning_rate": 3.999573944880424e-06, "loss": 0.83068383, "num_input_tokens_seen": 12773050, "step": 605, "time_per_iteration": 2.766684055328369 }, { "auxiliary_loss_clip": 0.01244274, "auxiliary_loss_mlp": 0.0107873, "balance_loss_clip": 1.0630821, "balance_loss_mlp": 1.04846251, "epoch": 0.03643469111678942, "flos": 15851832549120.0, "grad_norm": 2.2216143800596835, "language_loss": 0.85942292, "learning_rate": 3.9995658684780375e-06, "loss": 0.882653, "num_input_tokens_seen": 12791240, "step": 606, "time_per_iteration": 2.6133925914764404 }, { "auxiliary_loss_clip": 0.01247732, "auxiliary_loss_mlp": 0.01077404, "balance_loss_clip": 1.06413972, "balance_loss_mlp": 1.04588532, "epoch": 0.03649481436945739, "flos": 23620387340160.0, "grad_norm": 2.0684825764003394, "language_loss": 0.82179952, "learning_rate": 3.999557716251912e-06, "loss": 0.84505081, "num_input_tokens_seen": 12812245, "step": 607, "time_per_iteration": 2.6805856227874756 }, { "auxiliary_loss_clip": 0.01245394, "auxiliary_loss_mlp": 0.01073743, "balance_loss_clip": 1.06585169, "balance_loss_mlp": 1.04317796, "epoch": 0.036554937622125354, "flos": 21755581879680.0, "grad_norm": 2.3717179235904533, "language_loss": 0.83567071, "learning_rate": 3.999549488202358e-06, "loss": 0.8588621, "num_input_tokens_seen": 12831085, "step": 608, "time_per_iteration": 2.6593453884124756 }, { "auxiliary_loss_clip": 0.01251062, "auxiliary_loss_mlp": 0.01073705, "balance_loss_clip": 1.06682992, "balance_loss_mlp": 1.04006422, "epoch": 0.036615060874793326, "flos": 17819772935040.0, "grad_norm": 2.4795108668903305, "language_loss": 0.8201133, "learning_rate": 3.999541184329688e-06, "loss": 0.84336102, "num_input_tokens_seen": 12849115, "step": 609, "time_per_iteration": 2.6299383640289307 }, { "auxiliary_loss_clip": 0.01255655, "auxiliary_loss_mlp": 0.01091893, "balance_loss_clip": 1.07322037, "balance_loss_mlp": 1.06158984, "epoch": 0.0366751841274613, "flos": 26753808320640.0, "grad_norm": 1.992640540297191, "language_loss": 0.79448462, "learning_rate": 3.999532804634215e-06, "loss": 0.81796008, "num_input_tokens_seen": 12868005, "step": 610, "time_per_iteration": 2.65120530128479 }, { "auxiliary_loss_clip": 0.01254423, "auxiliary_loss_mlp": 0.01088228, "balance_loss_clip": 1.06914616, "balance_loss_mlp": 1.05656588, "epoch": 0.03673530738012926, "flos": 22196960202240.0, "grad_norm": 1.9328503999291824, "language_loss": 0.87282723, "learning_rate": 3.9995243491162575e-06, "loss": 0.89625371, "num_input_tokens_seen": 12886890, "step": 611, "time_per_iteration": 2.7398059368133545 }, { "auxiliary_loss_clip": 0.01248885, "auxiliary_loss_mlp": 0.01097673, "balance_loss_clip": 1.06917143, "balance_loss_mlp": 1.06651139, "epoch": 0.036795430632797235, "flos": 24681655601280.0, "grad_norm": 3.7435200854847266, "language_loss": 0.72589231, "learning_rate": 3.999515817776136e-06, "loss": 0.74935788, "num_input_tokens_seen": 12906130, "step": 612, "time_per_iteration": 2.700406551361084 }, { "auxiliary_loss_clip": 0.01249112, "auxiliary_loss_mlp": 0.01076924, "balance_loss_clip": 1.06581926, "balance_loss_mlp": 1.04480934, "epoch": 0.0368555538854652, "flos": 17748921358080.0, "grad_norm": 3.0863603820013434, "language_loss": 0.79110008, "learning_rate": 3.999507210614175e-06, "loss": 0.81436038, "num_input_tokens_seen": 12925260, "step": 613, "time_per_iteration": 2.630472183227539 }, { "auxiliary_loss_clip": 0.01242581, "auxiliary_loss_mlp": 0.01090278, "balance_loss_clip": 1.06378841, "balance_loss_mlp": 1.05961776, "epoch": 0.03691567713813317, "flos": 20594554571520.0, "grad_norm": 2.2015687298668336, "language_loss": 0.93885028, "learning_rate": 3.9994985276307e-06, "loss": 0.96217889, "num_input_tokens_seen": 12944590, "step": 614, "time_per_iteration": 2.6977972984313965 }, { "auxiliary_loss_clip": 0.01254503, "auxiliary_loss_mlp": 0.01081137, "balance_loss_clip": 1.07009673, "balance_loss_mlp": 1.04732919, "epoch": 0.036975800390801145, "flos": 33650380546560.0, "grad_norm": 3.0661216019279576, "language_loss": 0.72932875, "learning_rate": 3.999489768826041e-06, "loss": 0.75268513, "num_input_tokens_seen": 12964785, "step": 615, "time_per_iteration": 2.697291612625122 }, { "auxiliary_loss_clip": 0.01250213, "auxiliary_loss_mlp": 0.010716, "balance_loss_clip": 1.06649876, "balance_loss_mlp": 1.04015231, "epoch": 0.03703592364346911, "flos": 28293694329600.0, "grad_norm": 2.9941392641088695, "language_loss": 0.81630868, "learning_rate": 3.999480934200528e-06, "loss": 0.83952683, "num_input_tokens_seen": 12986705, "step": 616, "time_per_iteration": 4.1762495040893555 }, { "auxiliary_loss_clip": 0.0124999, "auxiliary_loss_mlp": 0.01076541, "balance_loss_clip": 1.06807041, "balance_loss_mlp": 1.0467627, "epoch": 0.03709604689613708, "flos": 31504215853440.0, "grad_norm": 2.320593216419041, "language_loss": 0.68178958, "learning_rate": 3.999472023754499e-06, "loss": 0.70505488, "num_input_tokens_seen": 13010560, "step": 617, "time_per_iteration": 4.224538564682007 }, { "auxiliary_loss_clip": 0.01254259, "auxiliary_loss_mlp": 0.010771, "balance_loss_clip": 1.07098567, "balance_loss_mlp": 1.04415071, "epoch": 0.03715617014880505, "flos": 19609381272960.0, "grad_norm": 2.245411088847763, "language_loss": 0.80595517, "learning_rate": 3.99946303748829e-06, "loss": 0.82926875, "num_input_tokens_seen": 13028935, "step": 618, "time_per_iteration": 4.200341463088989 }, { "auxiliary_loss_clip": 0.01257669, "auxiliary_loss_mlp": 0.01079294, "balance_loss_clip": 1.06808555, "balance_loss_mlp": 1.04605901, "epoch": 0.03721629340147302, "flos": 15924192497280.0, "grad_norm": 10.155035046705617, "language_loss": 0.91591841, "learning_rate": 3.999453975402242e-06, "loss": 0.93928802, "num_input_tokens_seen": 13046000, "step": 619, "time_per_iteration": 2.5787301063537598 }, { "auxiliary_loss_clip": 0.01251145, "auxiliary_loss_mlp": 0.01083548, "balance_loss_clip": 1.06999123, "balance_loss_mlp": 1.05181432, "epoch": 0.03727641665414099, "flos": 21104090951040.0, "grad_norm": 2.0803022158745406, "language_loss": 0.94071603, "learning_rate": 3.9994448374967e-06, "loss": 0.96406299, "num_input_tokens_seen": 13062995, "step": 620, "time_per_iteration": 2.5987205505371094 }, { "auxiliary_loss_clip": 0.01249568, "auxiliary_loss_mlp": 0.0108317, "balance_loss_clip": 1.06624317, "balance_loss_mlp": 1.0502919, "epoch": 0.037336539906808956, "flos": 24131683486080.0, "grad_norm": 1.7431896174296577, "language_loss": 0.77319217, "learning_rate": 3.999435623772008e-06, "loss": 0.79651952, "num_input_tokens_seen": 13084120, "step": 621, "time_per_iteration": 2.68758225440979 }, { "auxiliary_loss_clip": 0.01247252, "auxiliary_loss_mlp": 0.01071013, "balance_loss_clip": 1.06894088, "balance_loss_mlp": 1.03792048, "epoch": 0.03739666315947693, "flos": 22346384780160.0, "grad_norm": 2.3852872810563364, "language_loss": 0.86546707, "learning_rate": 3.999426334228518e-06, "loss": 0.88864976, "num_input_tokens_seen": 13100035, "step": 622, "time_per_iteration": 2.607121467590332 }, { "auxiliary_loss_clip": 0.012499, "auxiliary_loss_mlp": 0.01072461, "balance_loss_clip": 1.06715882, "balance_loss_mlp": 1.04048872, "epoch": 0.0374567864121449, "flos": 20449511452800.0, "grad_norm": 2.2621736327299766, "language_loss": 0.90008956, "learning_rate": 3.999416968866581e-06, "loss": 0.92331314, "num_input_tokens_seen": 13118070, "step": 623, "time_per_iteration": 2.6513512134552 }, { "auxiliary_loss_clip": 0.01251762, "auxiliary_loss_mlp": 0.01090534, "balance_loss_clip": 1.07006013, "balance_loss_mlp": 1.05844235, "epoch": 0.037516909664812866, "flos": 19208043636480.0, "grad_norm": 2.760597076727266, "language_loss": 0.84095174, "learning_rate": 3.999407527686551e-06, "loss": 0.8643747, "num_input_tokens_seen": 13136355, "step": 624, "time_per_iteration": 2.66623592376709 }, { "auxiliary_loss_clip": 0.01252431, "auxiliary_loss_mlp": 0.01076353, "balance_loss_clip": 1.06697702, "balance_loss_mlp": 1.04423809, "epoch": 0.03757703291748084, "flos": 35005218664320.0, "grad_norm": 4.259276014089895, "language_loss": 0.66778994, "learning_rate": 3.999398010688788e-06, "loss": 0.69107783, "num_input_tokens_seen": 13155435, "step": 625, "time_per_iteration": 2.7288877964019775 }, { "auxiliary_loss_clip": 0.01244959, "auxiliary_loss_mlp": 0.01076274, "balance_loss_clip": 1.06605244, "balance_loss_mlp": 1.042943, "epoch": 0.0376371561701488, "flos": 25483899911040.0, "grad_norm": 3.375450269409945, "language_loss": 0.77496696, "learning_rate": 3.999388417873652e-06, "loss": 0.79817927, "num_input_tokens_seen": 13174295, "step": 626, "time_per_iteration": 2.648942470550537 }, { "auxiliary_loss_clip": 0.01249107, "auxiliary_loss_mlp": 0.0108376, "balance_loss_clip": 1.06770003, "balance_loss_mlp": 1.05200303, "epoch": 0.037697279422816775, "flos": 18185630912640.0, "grad_norm": 2.0480468386724766, "language_loss": 0.81463408, "learning_rate": 3.999378749241506e-06, "loss": 0.83796275, "num_input_tokens_seen": 13192500, "step": 627, "time_per_iteration": 2.6209845542907715 }, { "auxiliary_loss_clip": 0.01254363, "auxiliary_loss_mlp": 0.01084942, "balance_loss_clip": 1.07041132, "balance_loss_mlp": 1.05215955, "epoch": 0.03775740267548475, "flos": 24644272521600.0, "grad_norm": 1.6934072791943036, "language_loss": 0.88809037, "learning_rate": 3.999369004792719e-06, "loss": 0.91148341, "num_input_tokens_seen": 13213470, "step": 628, "time_per_iteration": 2.7221415042877197 }, { "auxiliary_loss_clip": 0.01247303, "auxiliary_loss_mlp": 0.01080197, "balance_loss_clip": 1.0627017, "balance_loss_mlp": 1.04765344, "epoch": 0.03781752592815271, "flos": 21288205088640.0, "grad_norm": 2.536151380104699, "language_loss": 0.79840028, "learning_rate": 3.999359184527658e-06, "loss": 0.82167524, "num_input_tokens_seen": 13232365, "step": 629, "time_per_iteration": 2.6535024642944336 }, { "auxiliary_loss_clip": 0.01249218, "auxiliary_loss_mlp": 0.0106958, "balance_loss_clip": 1.06675959, "balance_loss_mlp": 1.03885961, "epoch": 0.037877649180820684, "flos": 22089623385600.0, "grad_norm": 1.6861994278356789, "language_loss": 0.76824844, "learning_rate": 3.999349288446696e-06, "loss": 0.79143643, "num_input_tokens_seen": 13251920, "step": 630, "time_per_iteration": 2.6175966262817383 }, { "auxiliary_loss_clip": 0.01254291, "auxiliary_loss_mlp": 0.01075963, "balance_loss_clip": 1.06833327, "balance_loss_mlp": 1.04504025, "epoch": 0.03793777243348865, "flos": 14501339976960.0, "grad_norm": 3.12435515576561, "language_loss": 0.91593724, "learning_rate": 3.99933931655021e-06, "loss": 0.93923974, "num_input_tokens_seen": 13267440, "step": 631, "time_per_iteration": 2.565293788909912 }, { "auxiliary_loss_clip": 0.01243525, "auxiliary_loss_mlp": 0.01087901, "balance_loss_clip": 1.06386209, "balance_loss_mlp": 1.05356884, "epoch": 0.03799789568615662, "flos": 21908418249600.0, "grad_norm": 1.6822536287963328, "language_loss": 0.92157543, "learning_rate": 3.999329268838575e-06, "loss": 0.94488978, "num_input_tokens_seen": 13287850, "step": 632, "time_per_iteration": 2.6235203742980957 }, { "auxiliary_loss_clip": 0.01248362, "auxiliary_loss_mlp": 0.01067296, "balance_loss_clip": 1.06696796, "balance_loss_mlp": 1.03613472, "epoch": 0.03805801893882459, "flos": 24827021942400.0, "grad_norm": 2.1097171792430456, "language_loss": 0.83139223, "learning_rate": 3.999319145312175e-06, "loss": 0.85454881, "num_input_tokens_seen": 13307760, "step": 633, "time_per_iteration": 2.6461985111236572 }, { "auxiliary_loss_clip": 0.01247735, "auxiliary_loss_mlp": 0.01079895, "balance_loss_clip": 1.06473529, "balance_loss_mlp": 1.04811358, "epoch": 0.03811814219149256, "flos": 30482952364800.0, "grad_norm": 1.599115294194595, "language_loss": 0.69883299, "learning_rate": 3.999308945971392e-06, "loss": 0.72210932, "num_input_tokens_seen": 13331230, "step": 634, "time_per_iteration": 2.709033727645874 }, { "auxiliary_loss_clip": 0.01133204, "auxiliary_loss_mlp": 0.01009504, "balance_loss_clip": 1.04124916, "balance_loss_mlp": 1.00249422, "epoch": 0.03817826544416053, "flos": 66992577379200.0, "grad_norm": 0.893126545279708, "language_loss": 0.61645919, "learning_rate": 3.999298670816614e-06, "loss": 0.63788629, "num_input_tokens_seen": 13394760, "step": 635, "time_per_iteration": 3.2099475860595703 }, { "auxiliary_loss_clip": 0.01244276, "auxiliary_loss_mlp": 0.01072984, "balance_loss_clip": 1.06475401, "balance_loss_mlp": 1.04129851, "epoch": 0.038238388696828496, "flos": 20485350247680.0, "grad_norm": 2.0563589539657205, "language_loss": 0.83629507, "learning_rate": 3.9992883198482294e-06, "loss": 0.85946769, "num_input_tokens_seen": 13412775, "step": 636, "time_per_iteration": 2.6278960704803467 }, { "auxiliary_loss_clip": 0.01248078, "auxiliary_loss_mlp": 0.01096471, "balance_loss_clip": 1.06714165, "balance_loss_mlp": 1.06530952, "epoch": 0.03829851194949647, "flos": 17965893461760.0, "grad_norm": 2.346379148367956, "language_loss": 0.79578567, "learning_rate": 3.999277893066632e-06, "loss": 0.81923115, "num_input_tokens_seen": 13427835, "step": 637, "time_per_iteration": 2.646414279937744 }, { "auxiliary_loss_clip": 0.01247939, "auxiliary_loss_mlp": 0.01088528, "balance_loss_clip": 1.06356907, "balance_loss_mlp": 1.0562222, "epoch": 0.03835863520216444, "flos": 22456522857600.0, "grad_norm": 1.9563283234999833, "language_loss": 0.83989692, "learning_rate": 3.999267390472215e-06, "loss": 0.86326158, "num_input_tokens_seen": 13447295, "step": 638, "time_per_iteration": 2.6416285037994385 }, { "auxiliary_loss_clip": 0.01253172, "auxiliary_loss_mlp": 0.01074704, "balance_loss_clip": 1.06563985, "balance_loss_mlp": 1.04163575, "epoch": 0.038418758454832405, "flos": 22164425458560.0, "grad_norm": 2.5596504471077224, "language_loss": 0.70109725, "learning_rate": 3.999256812065381e-06, "loss": 0.72437602, "num_input_tokens_seen": 13468455, "step": 639, "time_per_iteration": 2.610682487487793 }, { "auxiliary_loss_clip": 0.01248829, "auxiliary_loss_mlp": 0.01081808, "balance_loss_clip": 1.06618333, "balance_loss_mlp": 1.04790449, "epoch": 0.03847888170750038, "flos": 22747435107840.0, "grad_norm": 2.5791624605537082, "language_loss": 0.85322344, "learning_rate": 3.999246157846526e-06, "loss": 0.87652987, "num_input_tokens_seen": 13489085, "step": 640, "time_per_iteration": 2.700456380844116 }, { "auxiliary_loss_clip": 0.01252579, "auxiliary_loss_mlp": 0.01083722, "balance_loss_clip": 1.06751871, "balance_loss_mlp": 1.04934239, "epoch": 0.03853900496016834, "flos": 22711201263360.0, "grad_norm": 2.331268680461456, "language_loss": 0.82141805, "learning_rate": 3.9992354278160574e-06, "loss": 0.84478104, "num_input_tokens_seen": 13509120, "step": 641, "time_per_iteration": 2.6572046279907227 }, { "auxiliary_loss_clip": 0.0112759, "auxiliary_loss_mlp": 0.01008008, "balance_loss_clip": 1.03825259, "balance_loss_mlp": 1.00095105, "epoch": 0.038599128212836314, "flos": 70399136355840.0, "grad_norm": 0.9037629700551453, "language_loss": 0.65444964, "learning_rate": 3.999224621974381e-06, "loss": 0.67580563, "num_input_tokens_seen": 13562005, "step": 642, "time_per_iteration": 3.199925422668457 }, { "auxiliary_loss_clip": 0.01246698, "auxiliary_loss_mlp": 0.01064563, "balance_loss_clip": 1.0651319, "balance_loss_mlp": 1.03453398, "epoch": 0.03865925146550429, "flos": 23295144666240.0, "grad_norm": 1.9113268312481755, "language_loss": 0.79272145, "learning_rate": 3.999213740321906e-06, "loss": 0.81583405, "num_input_tokens_seen": 13582185, "step": 643, "time_per_iteration": 2.641437292098999 }, { "auxiliary_loss_clip": 0.01244786, "auxiliary_loss_mlp": 0.01076057, "balance_loss_clip": 1.06219232, "balance_loss_mlp": 1.04599261, "epoch": 0.03871937471817225, "flos": 21430446946560.0, "grad_norm": 2.2104774200729262, "language_loss": 0.8294487, "learning_rate": 3.999202782859046e-06, "loss": 0.85265714, "num_input_tokens_seen": 13599555, "step": 644, "time_per_iteration": 2.600558280944824 }, { "auxiliary_loss_clip": 0.01247273, "auxiliary_loss_mlp": 0.01074554, "balance_loss_clip": 1.06383467, "balance_loss_mlp": 1.04193854, "epoch": 0.038779497970840224, "flos": 34277309550720.0, "grad_norm": 1.994902925690418, "language_loss": 0.82286513, "learning_rate": 3.9991917495862165e-06, "loss": 0.8460834, "num_input_tokens_seen": 13621160, "step": 645, "time_per_iteration": 2.6751983165740967 }, { "auxiliary_loss_clip": 0.01248631, "auxiliary_loss_mlp": 0.01070807, "balance_loss_clip": 1.06525111, "balance_loss_mlp": 1.03890657, "epoch": 0.03883962122350819, "flos": 22748189293440.0, "grad_norm": 2.290384247239265, "language_loss": 0.81889713, "learning_rate": 3.9991806405038345e-06, "loss": 0.84209144, "num_input_tokens_seen": 13641915, "step": 646, "time_per_iteration": 2.6987667083740234 }, { "auxiliary_loss_clip": 0.01250204, "auxiliary_loss_mlp": 0.01078836, "balance_loss_clip": 1.06982899, "balance_loss_mlp": 1.04791331, "epoch": 0.03889974447617616, "flos": 21945837242880.0, "grad_norm": 1.9171219640425325, "language_loss": 0.82015383, "learning_rate": 3.999169455612323e-06, "loss": 0.84344423, "num_input_tokens_seen": 13661410, "step": 647, "time_per_iteration": 2.590102195739746 }, { "auxiliary_loss_clip": 0.0124696, "auxiliary_loss_mlp": 0.01072111, "balance_loss_clip": 1.06628954, "balance_loss_mlp": 1.04216528, "epoch": 0.03895986772884413, "flos": 31504826384640.0, "grad_norm": 1.9398424653049293, "language_loss": 0.84477997, "learning_rate": 3.999158194912106e-06, "loss": 0.86797059, "num_input_tokens_seen": 13681705, "step": 648, "time_per_iteration": 2.7516121864318848 }, { "auxiliary_loss_clip": 0.01244808, "auxiliary_loss_mlp": 0.0107293, "balance_loss_clip": 1.06524062, "balance_loss_mlp": 1.04210222, "epoch": 0.0390199909815121, "flos": 19901011795200.0, "grad_norm": 2.3870859420748136, "language_loss": 0.84254295, "learning_rate": 3.9991468584036086e-06, "loss": 0.86572027, "num_input_tokens_seen": 13700400, "step": 649, "time_per_iteration": 2.6116180419921875 }, { "auxiliary_loss_clip": 0.01246653, "auxiliary_loss_mlp": 0.01073574, "balance_loss_clip": 1.06560743, "balance_loss_mlp": 1.0416739, "epoch": 0.03908011423418007, "flos": 21612478095360.0, "grad_norm": 2.00775905451926, "language_loss": 0.79783499, "learning_rate": 3.999135446087263e-06, "loss": 0.82103723, "num_input_tokens_seen": 13720145, "step": 650, "time_per_iteration": 2.574939727783203 }, { "auxiliary_loss_clip": 0.01242721, "auxiliary_loss_mlp": 0.01077536, "balance_loss_clip": 1.06209707, "balance_loss_mlp": 1.04534984, "epoch": 0.039140237486848035, "flos": 18661411486080.0, "grad_norm": 2.334811800093409, "language_loss": 0.78698987, "learning_rate": 3.9991239579635e-06, "loss": 0.81019247, "num_input_tokens_seen": 13737500, "step": 651, "time_per_iteration": 2.5930917263031006 }, { "auxiliary_loss_clip": 0.0124425, "auxiliary_loss_mlp": 0.010838, "balance_loss_clip": 1.06317663, "balance_loss_mlp": 1.05087411, "epoch": 0.03920036073951601, "flos": 18661124177280.0, "grad_norm": 3.361008988618244, "language_loss": 0.87392938, "learning_rate": 3.999112394032757e-06, "loss": 0.89720988, "num_input_tokens_seen": 13754750, "step": 652, "time_per_iteration": 2.6072869300842285 }, { "auxiliary_loss_clip": 0.01239638, "auxiliary_loss_mlp": 0.01073938, "balance_loss_clip": 1.06362963, "balance_loss_mlp": 1.0434916, "epoch": 0.03926048399218398, "flos": 31354468053120.0, "grad_norm": 2.6218665998754904, "language_loss": 0.79297256, "learning_rate": 3.999100754295471e-06, "loss": 0.81610829, "num_input_tokens_seen": 13771990, "step": 653, "time_per_iteration": 2.626145362854004 }, { "auxiliary_loss_clip": 0.01250652, "auxiliary_loss_mlp": 0.01075546, "balance_loss_clip": 1.06496143, "balance_loss_mlp": 1.04374111, "epoch": 0.039320607244851945, "flos": 29603499770880.0, "grad_norm": 2.0720296605490094, "language_loss": 0.85909009, "learning_rate": 3.999089038752085e-06, "loss": 0.88235211, "num_input_tokens_seen": 13792750, "step": 654, "time_per_iteration": 2.6775124073028564 }, { "auxiliary_loss_clip": 0.01126661, "auxiliary_loss_mlp": 0.01016641, "balance_loss_clip": 1.03977203, "balance_loss_mlp": 1.01001298, "epoch": 0.03938073049751992, "flos": 66534609951360.0, "grad_norm": 0.7366259780501333, "language_loss": 0.4997997, "learning_rate": 3.999077247403041e-06, "loss": 0.52123272, "num_input_tokens_seen": 13858570, "step": 655, "time_per_iteration": 3.3006510734558105 }, { "auxiliary_loss_clip": 0.01241143, "auxiliary_loss_mlp": 0.01076374, "balance_loss_clip": 1.0658412, "balance_loss_mlp": 1.04680991, "epoch": 0.03944085375018788, "flos": 23367827836800.0, "grad_norm": 4.17474796245144, "language_loss": 0.80903178, "learning_rate": 3.9990653802487886e-06, "loss": 0.83220696, "num_input_tokens_seen": 13876335, "step": 656, "time_per_iteration": 4.228931427001953 }, { "auxiliary_loss_clip": 0.01251519, "auxiliary_loss_mlp": 0.01093573, "balance_loss_clip": 1.06740427, "balance_loss_mlp": 1.05802524, "epoch": 0.039500977002855854, "flos": 18548292579840.0, "grad_norm": 2.068956760077258, "language_loss": 0.76289558, "learning_rate": 3.999053437289776e-06, "loss": 0.7863465, "num_input_tokens_seen": 13892640, "step": 657, "time_per_iteration": 4.218473434448242 }, { "auxiliary_loss_clip": 0.0124824, "auxiliary_loss_mlp": 0.01076812, "balance_loss_clip": 1.06641233, "balance_loss_mlp": 1.04522133, "epoch": 0.039561100255523826, "flos": 25338174433920.0, "grad_norm": 2.07475431213476, "language_loss": 0.8179062, "learning_rate": 3.999041418526457e-06, "loss": 0.84115672, "num_input_tokens_seen": 13910085, "step": 658, "time_per_iteration": 2.671675682067871 }, { "auxiliary_loss_clip": 0.01242678, "auxiliary_loss_mlp": 0.01077963, "balance_loss_clip": 1.06347871, "balance_loss_mlp": 1.0454669, "epoch": 0.03962122350819179, "flos": 18219889509120.0, "grad_norm": 2.2444983110753625, "language_loss": 0.90790772, "learning_rate": 3.999029323959287e-06, "loss": 0.93111408, "num_input_tokens_seen": 13928800, "step": 659, "time_per_iteration": 4.2601988315582275 }, { "auxiliary_loss_clip": 0.01247633, "auxiliary_loss_mlp": 0.01073069, "balance_loss_clip": 1.06654835, "balance_loss_mlp": 1.04215825, "epoch": 0.03968134676085976, "flos": 20522230536960.0, "grad_norm": 2.2083626038373656, "language_loss": 0.79760063, "learning_rate": 3.999017153588724e-06, "loss": 0.82080764, "num_input_tokens_seen": 13948325, "step": 660, "time_per_iteration": 2.62716007232666 }, { "auxiliary_loss_clip": 0.01246027, "auxiliary_loss_mlp": 0.01077579, "balance_loss_clip": 1.0675652, "balance_loss_mlp": 1.0456785, "epoch": 0.03974147001352773, "flos": 22422587483520.0, "grad_norm": 1.6747851381362888, "language_loss": 0.81757367, "learning_rate": 3.999004907415231e-06, "loss": 0.8408097, "num_input_tokens_seen": 13969090, "step": 661, "time_per_iteration": 2.645423412322998 }, { "auxiliary_loss_clip": 0.01119895, "auxiliary_loss_mlp": 0.01007167, "balance_loss_clip": 1.03320217, "balance_loss_mlp": 1.00077713, "epoch": 0.0398015932661957, "flos": 71128769322240.0, "grad_norm": 0.9117564509831767, "language_loss": 0.69349593, "learning_rate": 3.998992585439272e-06, "loss": 0.71476656, "num_input_tokens_seen": 14037555, "step": 662, "time_per_iteration": 3.3032331466674805 }, { "auxiliary_loss_clip": 0.01249217, "auxiliary_loss_mlp": 0.01074722, "balance_loss_clip": 1.06995225, "balance_loss_mlp": 1.04322648, "epoch": 0.03986171651886367, "flos": 16800951571200.0, "grad_norm": 2.160679749799672, "language_loss": 0.82765651, "learning_rate": 3.998980187661314e-06, "loss": 0.85089582, "num_input_tokens_seen": 14055765, "step": 663, "time_per_iteration": 2.6217782497406006 }, { "auxiliary_loss_clip": 0.01252759, "auxiliary_loss_mlp": 0.01063705, "balance_loss_clip": 1.06966817, "balance_loss_mlp": 1.03254378, "epoch": 0.03992183977153164, "flos": 24535068197760.0, "grad_norm": 2.19374813563436, "language_loss": 0.87302262, "learning_rate": 3.998967714081826e-06, "loss": 0.89618725, "num_input_tokens_seen": 14074195, "step": 664, "time_per_iteration": 2.6729183197021484 }, { "auxiliary_loss_clip": 0.01241647, "auxiliary_loss_mlp": 0.0106515, "balance_loss_clip": 1.06656313, "balance_loss_mlp": 1.03346384, "epoch": 0.03998196302419961, "flos": 15595897167360.0, "grad_norm": 2.036983550581997, "language_loss": 0.84821391, "learning_rate": 3.998955164701281e-06, "loss": 0.87128186, "num_input_tokens_seen": 14090215, "step": 665, "time_per_iteration": 2.593832015991211 }, { "auxiliary_loss_clip": 0.012521, "auxiliary_loss_mlp": 0.01085682, "balance_loss_clip": 1.06867695, "balance_loss_mlp": 1.05223155, "epoch": 0.04004208627686758, "flos": 25305065072640.0, "grad_norm": 2.172699570421913, "language_loss": 0.81745672, "learning_rate": 3.998942539520158e-06, "loss": 0.8408345, "num_input_tokens_seen": 14112150, "step": 666, "time_per_iteration": 2.6743290424346924 }, { "auxiliary_loss_clip": 0.01241565, "auxiliary_loss_mlp": 0.01073617, "balance_loss_clip": 1.06443083, "balance_loss_mlp": 1.04007161, "epoch": 0.04010220952953555, "flos": 23475847011840.0, "grad_norm": 2.1003520396389828, "language_loss": 0.87117827, "learning_rate": 3.998929838538932e-06, "loss": 0.89433014, "num_input_tokens_seen": 14131475, "step": 667, "time_per_iteration": 2.6147067546844482 }, { "auxiliary_loss_clip": 0.0124275, "auxiliary_loss_mlp": 0.01071583, "balance_loss_clip": 1.07009172, "balance_loss_mlp": 1.04161382, "epoch": 0.04016233278220352, "flos": 18617025254400.0, "grad_norm": 2.331266403294307, "language_loss": 0.80641299, "learning_rate": 3.998917061758087e-06, "loss": 0.82955635, "num_input_tokens_seen": 14146165, "step": 668, "time_per_iteration": 2.6015820503234863 }, { "auxiliary_loss_clip": 0.01115034, "auxiliary_loss_mlp": 0.01008949, "balance_loss_clip": 1.02975297, "balance_loss_mlp": 1.00317907, "epoch": 0.040222456034871484, "flos": 70906194696960.0, "grad_norm": 0.7870483750596657, "language_loss": 0.60066259, "learning_rate": 3.998904209178107e-06, "loss": 0.62190247, "num_input_tokens_seen": 14215005, "step": 669, "time_per_iteration": 3.2993202209472656 }, { "auxiliary_loss_clip": 0.01242272, "auxiliary_loss_mlp": 0.01071485, "balance_loss_clip": 1.06408751, "balance_loss_mlp": 1.04120564, "epoch": 0.040282579287539456, "flos": 23764712186880.0, "grad_norm": 1.7022357666604506, "language_loss": 0.86290276, "learning_rate": 3.9988912807994785e-06, "loss": 0.88604033, "num_input_tokens_seen": 14235510, "step": 670, "time_per_iteration": 2.700657844543457 }, { "auxiliary_loss_clip": 0.01242087, "auxiliary_loss_mlp": 0.01080448, "balance_loss_clip": 1.06647801, "balance_loss_mlp": 1.05014467, "epoch": 0.04034270254020743, "flos": 18478518410880.0, "grad_norm": 1.8224152334464152, "language_loss": 0.75569212, "learning_rate": 3.998878276622692e-06, "loss": 0.77891749, "num_input_tokens_seen": 14254565, "step": 671, "time_per_iteration": 2.6698572635650635 }, { "auxiliary_loss_clip": 0.01248936, "auxiliary_loss_mlp": 0.01076667, "balance_loss_clip": 1.06943047, "balance_loss_mlp": 1.04605412, "epoch": 0.040402825792875394, "flos": 17201858244480.0, "grad_norm": 1.9730812981627939, "language_loss": 0.92416775, "learning_rate": 3.998865196648242e-06, "loss": 0.94742376, "num_input_tokens_seen": 14271885, "step": 672, "time_per_iteration": 2.567563533782959 }, { "auxiliary_loss_clip": 0.01245231, "auxiliary_loss_mlp": 0.010776, "balance_loss_clip": 1.0677104, "balance_loss_mlp": 1.04422188, "epoch": 0.040462949045543366, "flos": 19172168928000.0, "grad_norm": 1.800141829654062, "language_loss": 0.90174723, "learning_rate": 3.998852040876622e-06, "loss": 0.92497551, "num_input_tokens_seen": 14289670, "step": 673, "time_per_iteration": 2.547154426574707 }, { "auxiliary_loss_clip": 0.01239752, "auxiliary_loss_mlp": 0.01084248, "balance_loss_clip": 1.06466973, "balance_loss_mlp": 1.05184698, "epoch": 0.04052307229821133, "flos": 24019821555840.0, "grad_norm": 2.3989934860433486, "language_loss": 0.75016737, "learning_rate": 3.998838809308334e-06, "loss": 0.7734074, "num_input_tokens_seen": 14309285, "step": 674, "time_per_iteration": 2.681896924972534 }, { "auxiliary_loss_clip": 0.01249861, "auxiliary_loss_mlp": 0.01064308, "balance_loss_clip": 1.06744063, "balance_loss_mlp": 1.03334963, "epoch": 0.0405831955508793, "flos": 16436601964800.0, "grad_norm": 2.55613513039197, "language_loss": 0.78289407, "learning_rate": 3.9988255019438766e-06, "loss": 0.80603576, "num_input_tokens_seen": 14328300, "step": 675, "time_per_iteration": 2.6965043544769287 }, { "auxiliary_loss_clip": 0.01241749, "auxiliary_loss_mlp": 0.01079652, "balance_loss_clip": 1.06532836, "balance_loss_mlp": 1.04648817, "epoch": 0.040643318803547275, "flos": 24279922915200.0, "grad_norm": 2.047384767684118, "language_loss": 0.76844448, "learning_rate": 3.998812118783757e-06, "loss": 0.79165846, "num_input_tokens_seen": 14346395, "step": 676, "time_per_iteration": 2.6216623783111572 }, { "auxiliary_loss_clip": 0.01248147, "auxiliary_loss_mlp": 0.01079294, "balance_loss_clip": 1.06811619, "balance_loss_mlp": 1.04813254, "epoch": 0.04070344205621524, "flos": 17712076982400.0, "grad_norm": 2.318905665785744, "language_loss": 0.85139382, "learning_rate": 3.9987986598284804e-06, "loss": 0.8746683, "num_input_tokens_seen": 14364605, "step": 677, "time_per_iteration": 2.5663015842437744 }, { "auxiliary_loss_clip": 0.01240385, "auxiliary_loss_mlp": 0.01070741, "balance_loss_clip": 1.06558609, "balance_loss_mlp": 1.03901923, "epoch": 0.04076356530888321, "flos": 26177658168960.0, "grad_norm": 2.5041724349122645, "language_loss": 0.76572061, "learning_rate": 3.998785125078559e-06, "loss": 0.78883183, "num_input_tokens_seen": 14385265, "step": 678, "time_per_iteration": 2.624689817428589 }, { "auxiliary_loss_clip": 0.01240972, "auxiliary_loss_mlp": 0.01072606, "balance_loss_clip": 1.06374967, "balance_loss_mlp": 1.04242194, "epoch": 0.04082368856155118, "flos": 35773455772800.0, "grad_norm": 1.7096242150987748, "language_loss": 0.82139099, "learning_rate": 3.998771514534505e-06, "loss": 0.84452677, "num_input_tokens_seen": 14406090, "step": 679, "time_per_iteration": 2.7073023319244385 }, { "auxiliary_loss_clip": 0.01248879, "auxiliary_loss_mlp": 0.01064116, "balance_loss_clip": 1.07185793, "balance_loss_mlp": 1.0340035, "epoch": 0.04088381181421915, "flos": 28146640049280.0, "grad_norm": 1.963288262989073, "language_loss": 0.76260424, "learning_rate": 3.998757828196835e-06, "loss": 0.78573418, "num_input_tokens_seen": 14425130, "step": 680, "time_per_iteration": 2.6767218112945557 }, { "auxiliary_loss_clip": 0.01244441, "auxiliary_loss_mlp": 0.01071738, "balance_loss_clip": 1.06458521, "balance_loss_mlp": 1.03864551, "epoch": 0.04094393506688712, "flos": 27597673514880.0, "grad_norm": 1.713943858995997, "language_loss": 0.83089912, "learning_rate": 3.9987440660660685e-06, "loss": 0.85406095, "num_input_tokens_seen": 14447355, "step": 681, "time_per_iteration": 2.6386382579803467 }, { "auxiliary_loss_clip": 0.01244279, "auxiliary_loss_mlp": 0.01073303, "balance_loss_clip": 1.06438065, "balance_loss_mlp": 1.04127121, "epoch": 0.04100405831955509, "flos": 23112036109440.0, "grad_norm": 1.706698119772261, "language_loss": 0.71538687, "learning_rate": 3.998730228142726e-06, "loss": 0.7385627, "num_input_tokens_seen": 14466790, "step": 682, "time_per_iteration": 2.618792772293091 }, { "auxiliary_loss_clip": 0.01243156, "auxiliary_loss_mlp": 0.01078429, "balance_loss_clip": 1.06440282, "balance_loss_mlp": 1.04781592, "epoch": 0.04106418157222306, "flos": 20156731695360.0, "grad_norm": 1.6947476714586034, "language_loss": 0.72599399, "learning_rate": 3.998716314427333e-06, "loss": 0.74920982, "num_input_tokens_seen": 14485195, "step": 683, "time_per_iteration": 2.676133394241333 }, { "auxiliary_loss_clip": 0.01241071, "auxiliary_loss_mlp": 0.01079531, "balance_loss_clip": 1.07077932, "balance_loss_mlp": 1.04851258, "epoch": 0.041124304824891024, "flos": 17420697855360.0, "grad_norm": 2.098652785935233, "language_loss": 0.81419414, "learning_rate": 3.998702324920417e-06, "loss": 0.8374002, "num_input_tokens_seen": 14503370, "step": 684, "time_per_iteration": 2.6538476943969727 }, { "auxiliary_loss_clip": 0.01242791, "auxiliary_loss_mlp": 0.0107365, "balance_loss_clip": 1.06783867, "balance_loss_mlp": 1.04139185, "epoch": 0.041184428077558996, "flos": 25780163287680.0, "grad_norm": 1.5053911947555274, "language_loss": 0.90680599, "learning_rate": 3.9986882596225085e-06, "loss": 0.92997038, "num_input_tokens_seen": 14526415, "step": 685, "time_per_iteration": 2.6541450023651123 }, { "auxiliary_loss_clip": 0.01244219, "auxiliary_loss_mlp": 0.01072481, "balance_loss_clip": 1.06659365, "balance_loss_mlp": 1.04093838, "epoch": 0.04124455133022697, "flos": 22964766347520.0, "grad_norm": 2.2251875217653185, "language_loss": 0.87851977, "learning_rate": 3.998674118534141e-06, "loss": 0.90168673, "num_input_tokens_seen": 14546595, "step": 686, "time_per_iteration": 2.7298531532287598 }, { "auxiliary_loss_clip": 0.01247476, "auxiliary_loss_mlp": 0.01073385, "balance_loss_clip": 1.06586432, "balance_loss_mlp": 1.04224789, "epoch": 0.04130467458289493, "flos": 21289067015040.0, "grad_norm": 1.8582614005091855, "language_loss": 0.7152915, "learning_rate": 3.998659901655851e-06, "loss": 0.73850012, "num_input_tokens_seen": 14566590, "step": 687, "time_per_iteration": 2.6284232139587402 }, { "auxiliary_loss_clip": 0.01243582, "auxiliary_loss_mlp": 0.01076448, "balance_loss_clip": 1.06979251, "balance_loss_mlp": 1.04756403, "epoch": 0.041364797835562905, "flos": 19974233669760.0, "grad_norm": 2.596672934278983, "language_loss": 0.86028284, "learning_rate": 3.998645608988177e-06, "loss": 0.88348317, "num_input_tokens_seen": 14585965, "step": 688, "time_per_iteration": 2.522634506225586 }, { "auxiliary_loss_clip": 0.01241593, "auxiliary_loss_mlp": 0.01079647, "balance_loss_clip": 1.06802177, "balance_loss_mlp": 1.04908216, "epoch": 0.04142492108823087, "flos": 21906227520000.0, "grad_norm": 2.852238187591699, "language_loss": 0.83393514, "learning_rate": 3.998631240531661e-06, "loss": 0.85714757, "num_input_tokens_seen": 14606015, "step": 689, "time_per_iteration": 2.6140944957733154 }, { "auxiliary_loss_clip": 0.01238254, "auxiliary_loss_mlp": 0.01085009, "balance_loss_clip": 1.06293654, "balance_loss_mlp": 1.05463421, "epoch": 0.04148504434089884, "flos": 27639617621760.0, "grad_norm": 2.870474577544969, "language_loss": 0.68398476, "learning_rate": 3.998616796286848e-06, "loss": 0.70721734, "num_input_tokens_seen": 14629955, "step": 690, "time_per_iteration": 2.658987522125244 }, { "auxiliary_loss_clip": 0.01235903, "auxiliary_loss_mlp": 0.01075275, "balance_loss_clip": 1.0625304, "balance_loss_mlp": 1.04565191, "epoch": 0.041545167593566815, "flos": 20518387781760.0, "grad_norm": 1.634289561889102, "language_loss": 0.74927461, "learning_rate": 3.998602276254286e-06, "loss": 0.77238643, "num_input_tokens_seen": 14648000, "step": 691, "time_per_iteration": 2.599957227706909 }, { "auxiliary_loss_clip": 0.01239089, "auxiliary_loss_mlp": 0.01081705, "balance_loss_clip": 1.06458938, "balance_loss_mlp": 1.04978108, "epoch": 0.04160529084623478, "flos": 11868907939200.0, "grad_norm": 2.123432521314224, "language_loss": 0.84469771, "learning_rate": 3.998587680434526e-06, "loss": 0.86790562, "num_input_tokens_seen": 14662235, "step": 692, "time_per_iteration": 2.5748491287231445 }, { "auxiliary_loss_clip": 0.01242126, "auxiliary_loss_mlp": 0.01076613, "balance_loss_clip": 1.06274796, "balance_loss_mlp": 1.04409313, "epoch": 0.04166541409890275, "flos": 14828306503680.0, "grad_norm": 2.3463094595874665, "language_loss": 0.88948715, "learning_rate": 3.99857300882812e-06, "loss": 0.91267455, "num_input_tokens_seen": 14676065, "step": 693, "time_per_iteration": 2.569277286529541 }, { "auxiliary_loss_clip": 0.01245438, "auxiliary_loss_mlp": 0.01071471, "balance_loss_clip": 1.06845784, "balance_loss_mlp": 1.04123962, "epoch": 0.04172553735157072, "flos": 25808137004160.0, "grad_norm": 5.499777597079252, "language_loss": 0.81987685, "learning_rate": 3.998558261435626e-06, "loss": 0.84304595, "num_input_tokens_seen": 14694955, "step": 694, "time_per_iteration": 2.6798722743988037 }, { "auxiliary_loss_clip": 0.01242101, "auxiliary_loss_mlp": 0.01073692, "balance_loss_clip": 1.06179321, "balance_loss_mlp": 1.04303181, "epoch": 0.04178566060423869, "flos": 24279815174400.0, "grad_norm": 2.051302362473346, "language_loss": 0.83672506, "learning_rate": 3.9985434382576015e-06, "loss": 0.85988301, "num_input_tokens_seen": 14715510, "step": 695, "time_per_iteration": 2.684537649154663 }, { "auxiliary_loss_clip": 0.01242205, "auxiliary_loss_mlp": 0.01080004, "balance_loss_clip": 1.06535804, "balance_loss_mlp": 1.04822254, "epoch": 0.04184578385690666, "flos": 18222008411520.0, "grad_norm": 2.113561459264794, "language_loss": 0.84351176, "learning_rate": 3.99852853929461e-06, "loss": 0.86673379, "num_input_tokens_seen": 14731755, "step": 696, "time_per_iteration": 4.1141321659088135 }, { "auxiliary_loss_clip": 0.01238462, "auxiliary_loss_mlp": 0.01083207, "balance_loss_clip": 1.06265593, "balance_loss_mlp": 1.05099702, "epoch": 0.041905907109574626, "flos": 22776342577920.0, "grad_norm": 6.921460264787684, "language_loss": 0.93193012, "learning_rate": 3.998513564547216e-06, "loss": 0.95514685, "num_input_tokens_seen": 14750810, "step": 697, "time_per_iteration": 5.71666693687439 }, { "auxiliary_loss_clip": 0.01235964, "auxiliary_loss_mlp": 0.01074448, "balance_loss_clip": 1.06324339, "balance_loss_mlp": 1.04495573, "epoch": 0.0419660303622426, "flos": 20156947176960.0, "grad_norm": 2.1002029886241904, "language_loss": 0.83775562, "learning_rate": 3.998498514015987e-06, "loss": 0.86085975, "num_input_tokens_seen": 14768435, "step": 698, "time_per_iteration": 4.194530010223389 }, { "auxiliary_loss_clip": 0.01239177, "auxiliary_loss_mlp": 0.01093516, "balance_loss_clip": 1.06274605, "balance_loss_mlp": 1.06175828, "epoch": 0.042026153614910564, "flos": 23076376882560.0, "grad_norm": 2.1234669437327955, "language_loss": 0.91715962, "learning_rate": 3.998483387701495e-06, "loss": 0.94048655, "num_input_tokens_seen": 14786690, "step": 699, "time_per_iteration": 2.6399078369140625 }, { "auxiliary_loss_clip": 0.01113327, "auxiliary_loss_mlp": 0.0102038, "balance_loss_clip": 1.03020263, "balance_loss_mlp": 1.01403797, "epoch": 0.042086276867578536, "flos": 64495243370880.0, "grad_norm": 0.9035134571641164, "language_loss": 0.67873394, "learning_rate": 3.998468185604312e-06, "loss": 0.70007098, "num_input_tokens_seen": 14853840, "step": 700, "time_per_iteration": 3.192026376724243 }, { "auxiliary_loss_clip": 0.01246765, "auxiliary_loss_mlp": 0.01082955, "balance_loss_clip": 1.06717515, "balance_loss_mlp": 1.05017269, "epoch": 0.04214640012024651, "flos": 15487016065920.0, "grad_norm": 2.2754848646841888, "language_loss": 0.884673, "learning_rate": 3.998452907725016e-06, "loss": 0.90797025, "num_input_tokens_seen": 14869580, "step": 701, "time_per_iteration": 2.5790441036224365 }, { "auxiliary_loss_clip": 0.01242428, "auxiliary_loss_mlp": 0.01080259, "balance_loss_clip": 1.06793952, "balance_loss_mlp": 1.04833448, "epoch": 0.04220652337291447, "flos": 23877040993920.0, "grad_norm": 2.000128536077818, "language_loss": 0.67100394, "learning_rate": 3.998437554064184e-06, "loss": 0.69423079, "num_input_tokens_seen": 14891065, "step": 702, "time_per_iteration": 2.6247870922088623 }, { "auxiliary_loss_clip": 0.01107168, "auxiliary_loss_mlp": 0.01005563, "balance_loss_clip": 1.02512407, "balance_loss_mlp": 0.99922067, "epoch": 0.042266646625582445, "flos": 63795451628160.0, "grad_norm": 0.8439205282656718, "language_loss": 0.60756463, "learning_rate": 3.9984221246224006e-06, "loss": 0.62869191, "num_input_tokens_seen": 14954815, "step": 703, "time_per_iteration": 3.1991655826568604 }, { "auxiliary_loss_clip": 0.01107933, "auxiliary_loss_mlp": 0.01006502, "balance_loss_clip": 1.02562141, "balance_loss_mlp": 0.99973089, "epoch": 0.04232676987825041, "flos": 50018863345920.0, "grad_norm": 1.0471369072250156, "language_loss": 0.57677412, "learning_rate": 3.9984066194002494e-06, "loss": 0.59791845, "num_input_tokens_seen": 15003050, "step": 704, "time_per_iteration": 3.037705659866333 }, { "auxiliary_loss_clip": 0.01241513, "auxiliary_loss_mlp": 0.01072126, "balance_loss_clip": 1.06549489, "balance_loss_mlp": 1.0406549, "epoch": 0.04238689313091838, "flos": 21616105368960.0, "grad_norm": 2.9488804643242488, "language_loss": 0.87553984, "learning_rate": 3.998391038398319e-06, "loss": 0.89867628, "num_input_tokens_seen": 15021990, "step": 705, "time_per_iteration": 2.6233222484588623 }, { "auxiliary_loss_clip": 0.01230342, "auxiliary_loss_mlp": 0.0107194, "balance_loss_clip": 1.0605582, "balance_loss_mlp": 1.04204249, "epoch": 0.042447016383586354, "flos": 19135109070720.0, "grad_norm": 2.556815837902013, "language_loss": 0.71071029, "learning_rate": 3.998375381617201e-06, "loss": 0.73373306, "num_input_tokens_seen": 15040700, "step": 706, "time_per_iteration": 2.560434579849243 }, { "auxiliary_loss_clip": 0.0123412, "auxiliary_loss_mlp": 0.01070349, "balance_loss_clip": 1.06249404, "balance_loss_mlp": 1.03799582, "epoch": 0.04250713963625432, "flos": 24426007528320.0, "grad_norm": 2.0814078632624167, "language_loss": 0.93418455, "learning_rate": 3.9983596490574875e-06, "loss": 0.95722926, "num_input_tokens_seen": 15056725, "step": 707, "time_per_iteration": 2.6130473613739014 }, { "auxiliary_loss_clip": 0.01237541, "auxiliary_loss_mlp": 0.01067908, "balance_loss_clip": 1.05994225, "balance_loss_mlp": 1.03617477, "epoch": 0.04256726288892229, "flos": 30367391333760.0, "grad_norm": 2.424205580643553, "language_loss": 0.81514043, "learning_rate": 3.998343840719776e-06, "loss": 0.83819497, "num_input_tokens_seen": 15077550, "step": 708, "time_per_iteration": 2.656277894973755 }, { "auxiliary_loss_clip": 0.01243932, "auxiliary_loss_mlp": 0.0108167, "balance_loss_clip": 1.06461239, "balance_loss_mlp": 1.04934049, "epoch": 0.04262738614159026, "flos": 16362661818240.0, "grad_norm": 2.0883592727868145, "language_loss": 0.82027614, "learning_rate": 3.998327956604666e-06, "loss": 0.8435322, "num_input_tokens_seen": 15094955, "step": 709, "time_per_iteration": 2.5758891105651855 }, { "auxiliary_loss_clip": 0.01243538, "auxiliary_loss_mlp": 0.01071217, "balance_loss_clip": 1.06374872, "balance_loss_mlp": 1.03960264, "epoch": 0.04268750939425823, "flos": 20412379768320.0, "grad_norm": 2.7686525844665133, "language_loss": 0.8502059, "learning_rate": 3.99831199671276e-06, "loss": 0.87335348, "num_input_tokens_seen": 15113395, "step": 710, "time_per_iteration": 2.571559429168701 }, { "auxiliary_loss_clip": 0.0124498, "auxiliary_loss_mlp": 0.01072229, "balance_loss_clip": 1.06788397, "balance_loss_mlp": 1.04166365, "epoch": 0.0427476326469262, "flos": 20302959962880.0, "grad_norm": 7.911177124524585, "language_loss": 0.84914303, "learning_rate": 3.998295961044662e-06, "loss": 0.87231517, "num_input_tokens_seen": 15132920, "step": 711, "time_per_iteration": 2.569959878921509 }, { "auxiliary_loss_clip": 0.01237769, "auxiliary_loss_mlp": 0.01074338, "balance_loss_clip": 1.06188083, "balance_loss_mlp": 1.04229426, "epoch": 0.042807755899594166, "flos": 21650794928640.0, "grad_norm": 1.7189790042473796, "language_loss": 0.85439789, "learning_rate": 3.9982798496009804e-06, "loss": 0.87751901, "num_input_tokens_seen": 15153115, "step": 712, "time_per_iteration": 2.6200509071350098 }, { "auxiliary_loss_clip": 0.01242397, "auxiliary_loss_mlp": 0.01069523, "balance_loss_clip": 1.06085837, "balance_loss_mlp": 1.03989983, "epoch": 0.04286787915226214, "flos": 21435007973760.0, "grad_norm": 5.490507523621204, "language_loss": 0.91178697, "learning_rate": 3.998263662382328e-06, "loss": 0.93490618, "num_input_tokens_seen": 15172770, "step": 713, "time_per_iteration": 2.6353416442871094 }, { "auxiliary_loss_clip": 0.01104693, "auxiliary_loss_mlp": 0.01006514, "balance_loss_clip": 1.02325606, "balance_loss_mlp": 0.99955195, "epoch": 0.04292800240493011, "flos": 66397970615040.0, "grad_norm": 0.9310328114407391, "language_loss": 0.63725489, "learning_rate": 3.9982473993893165e-06, "loss": 0.65836698, "num_input_tokens_seen": 15240055, "step": 714, "time_per_iteration": 3.2544445991516113 }, { "auxiliary_loss_clip": 0.01239175, "auxiliary_loss_mlp": 0.01085992, "balance_loss_clip": 1.06602359, "balance_loss_mlp": 1.05552244, "epoch": 0.042988125657598075, "flos": 31650264552960.0, "grad_norm": 1.8449858143817996, "language_loss": 0.75010103, "learning_rate": 3.998231060622563e-06, "loss": 0.77335274, "num_input_tokens_seen": 15261585, "step": 715, "time_per_iteration": 2.7048466205596924 }, { "auxiliary_loss_clip": 0.01242734, "auxiliary_loss_mlp": 0.01074126, "balance_loss_clip": 1.0666225, "balance_loss_mlp": 1.04227352, "epoch": 0.04304824891026605, "flos": 33248468292480.0, "grad_norm": 1.9505519101092619, "language_loss": 0.72289199, "learning_rate": 3.998214646082688e-06, "loss": 0.74606061, "num_input_tokens_seen": 15281160, "step": 716, "time_per_iteration": 2.7807397842407227 }, { "auxiliary_loss_clip": 0.01104303, "auxiliary_loss_mlp": 0.01006894, "balance_loss_clip": 1.02277207, "balance_loss_mlp": 0.99997944, "epoch": 0.04310837216293401, "flos": 64064782782720.0, "grad_norm": 0.9245106661639481, "language_loss": 0.65587437, "learning_rate": 3.998198155770314e-06, "loss": 0.67698634, "num_input_tokens_seen": 15344505, "step": 717, "time_per_iteration": 3.250870943069458 }, { "auxiliary_loss_clip": 0.01103971, "auxiliary_loss_mlp": 0.01009587, "balance_loss_clip": 1.02238059, "balance_loss_mlp": 1.00267255, "epoch": 0.043168495415601985, "flos": 61343757849600.0, "grad_norm": 0.9849394627593366, "language_loss": 0.58785796, "learning_rate": 3.998181589686065e-06, "loss": 0.60899353, "num_input_tokens_seen": 15404050, "step": 718, "time_per_iteration": 3.0402464866638184 }, { "auxiliary_loss_clip": 0.0124025, "auxiliary_loss_mlp": 0.0107507, "balance_loss_clip": 1.06784248, "balance_loss_mlp": 1.0424546, "epoch": 0.04322861866826996, "flos": 20704261685760.0, "grad_norm": 1.9557310597444375, "language_loss": 0.91440111, "learning_rate": 3.99816494783057e-06, "loss": 0.9375543, "num_input_tokens_seen": 15424190, "step": 719, "time_per_iteration": 2.6500089168548584 }, { "auxiliary_loss_clip": 0.01235843, "auxiliary_loss_mlp": 0.01072906, "balance_loss_clip": 1.06020999, "balance_loss_mlp": 1.04296041, "epoch": 0.04328874192093792, "flos": 30373352991360.0, "grad_norm": 1.7057721639328365, "language_loss": 0.66461253, "learning_rate": 3.99814823020446e-06, "loss": 0.68770003, "num_input_tokens_seen": 15446500, "step": 720, "time_per_iteration": 2.673184871673584 }, { "auxiliary_loss_clip": 0.01234245, "auxiliary_loss_mlp": 0.01072069, "balance_loss_clip": 1.06111717, "balance_loss_mlp": 1.04131258, "epoch": 0.043348865173605894, "flos": 21944795748480.0, "grad_norm": 1.9491363249287763, "language_loss": 0.77460182, "learning_rate": 3.9981314368083684e-06, "loss": 0.79766488, "num_input_tokens_seen": 15465830, "step": 721, "time_per_iteration": 2.6695611476898193 }, { "auxiliary_loss_clip": 0.01241854, "auxiliary_loss_mlp": 0.01087169, "balance_loss_clip": 1.06622314, "balance_loss_mlp": 1.05719972, "epoch": 0.04340898842627386, "flos": 15264225959040.0, "grad_norm": 2.8383174670702718, "language_loss": 0.88298881, "learning_rate": 3.998114567642933e-06, "loss": 0.90627909, "num_input_tokens_seen": 15479985, "step": 722, "time_per_iteration": 2.661313533782959 }, { "auxiliary_loss_clip": 0.01244836, "auxiliary_loss_mlp": 0.01076885, "balance_loss_clip": 1.06665182, "balance_loss_mlp": 1.0480125, "epoch": 0.04346911167894183, "flos": 27965434913280.0, "grad_norm": 5.515838365549148, "language_loss": 0.84387141, "learning_rate": 3.998097622708792e-06, "loss": 0.86708868, "num_input_tokens_seen": 15501545, "step": 723, "time_per_iteration": 2.6447954177856445 }, { "auxiliary_loss_clip": 0.01245825, "auxiliary_loss_mlp": 0.01081354, "balance_loss_clip": 1.06723523, "balance_loss_mlp": 1.05019248, "epoch": 0.0435292349316098, "flos": 29242202820480.0, "grad_norm": 1.7852936089408447, "language_loss": 0.82789439, "learning_rate": 3.99808060200659e-06, "loss": 0.85116619, "num_input_tokens_seen": 15521725, "step": 724, "time_per_iteration": 2.676985263824463 }, { "auxiliary_loss_clip": 0.0124127, "auxiliary_loss_mlp": 0.01087491, "balance_loss_clip": 1.06535757, "balance_loss_mlp": 1.05609179, "epoch": 0.04358935818427777, "flos": 20558356640640.0, "grad_norm": 2.011685360503238, "language_loss": 0.79444051, "learning_rate": 3.998063505536971e-06, "loss": 0.81772816, "num_input_tokens_seen": 15540910, "step": 725, "time_per_iteration": 2.6241447925567627 }, { "auxiliary_loss_clip": 0.01251777, "auxiliary_loss_mlp": 0.01074923, "balance_loss_clip": 1.06783843, "balance_loss_mlp": 1.04309392, "epoch": 0.04364948143694574, "flos": 14464926564480.0, "grad_norm": 2.2160842755462817, "language_loss": 0.87175703, "learning_rate": 3.998046333300584e-06, "loss": 0.89502406, "num_input_tokens_seen": 15558640, "step": 726, "time_per_iteration": 2.555551052093506 }, { "auxiliary_loss_clip": 0.01100917, "auxiliary_loss_mlp": 0.01015411, "balance_loss_clip": 1.02171838, "balance_loss_mlp": 1.00947404, "epoch": 0.043709604689613706, "flos": 50067268922880.0, "grad_norm": 0.908981905466007, "language_loss": 0.55868411, "learning_rate": 3.998029085298079e-06, "loss": 0.5798474, "num_input_tokens_seen": 15612975, "step": 727, "time_per_iteration": 3.375901699066162 }, { "auxiliary_loss_clip": 0.01245647, "auxiliary_loss_mlp": 0.0108809, "balance_loss_clip": 1.06717396, "balance_loss_mlp": 1.05614173, "epoch": 0.04376972794228168, "flos": 13991588115840.0, "grad_norm": 2.282663852625415, "language_loss": 0.82326066, "learning_rate": 3.998011761530112e-06, "loss": 0.84659809, "num_input_tokens_seen": 15631070, "step": 728, "time_per_iteration": 2.605970621109009 }, { "auxiliary_loss_clip": 0.01237902, "auxiliary_loss_mlp": 0.01073495, "balance_loss_clip": 1.06600416, "balance_loss_mlp": 1.04321551, "epoch": 0.04382985119494965, "flos": 22009901149440.0, "grad_norm": 2.1303486954703152, "language_loss": 0.76890069, "learning_rate": 3.997994361997338e-06, "loss": 0.7920146, "num_input_tokens_seen": 15647825, "step": 729, "time_per_iteration": 2.652466297149658 }, { "auxiliary_loss_clip": 0.01243746, "auxiliary_loss_mlp": 0.01079207, "balance_loss_clip": 1.06438255, "balance_loss_mlp": 1.04859376, "epoch": 0.043889974447617615, "flos": 24206521472640.0, "grad_norm": 2.1385115795714107, "language_loss": 0.95153189, "learning_rate": 3.997976886700417e-06, "loss": 0.97476137, "num_input_tokens_seen": 15668260, "step": 730, "time_per_iteration": 2.734614133834839 }, { "auxiliary_loss_clip": 0.01238581, "auxiliary_loss_mlp": 0.01074727, "balance_loss_clip": 1.06093788, "balance_loss_mlp": 1.04315984, "epoch": 0.04395009770028559, "flos": 17274541415040.0, "grad_norm": 2.333073864238008, "language_loss": 0.88456279, "learning_rate": 3.997959335640013e-06, "loss": 0.90769589, "num_input_tokens_seen": 15685630, "step": 731, "time_per_iteration": 2.5912294387817383 }, { "auxiliary_loss_clip": 0.01242247, "auxiliary_loss_mlp": 0.01076563, "balance_loss_clip": 1.06636512, "balance_loss_mlp": 1.04757094, "epoch": 0.04401022095295355, "flos": 12310286261760.0, "grad_norm": 3.0398759554531254, "language_loss": 0.88683128, "learning_rate": 3.997941708816791e-06, "loss": 0.9100194, "num_input_tokens_seen": 15698645, "step": 732, "time_per_iteration": 2.5897367000579834 }, { "auxiliary_loss_clip": 0.01242736, "auxiliary_loss_mlp": 0.01087795, "balance_loss_clip": 1.06544232, "balance_loss_mlp": 1.05646718, "epoch": 0.044070344205621524, "flos": 20959658363520.0, "grad_norm": 2.304959545118842, "language_loss": 0.85829747, "learning_rate": 3.997924006231419e-06, "loss": 0.88160276, "num_input_tokens_seen": 15716775, "step": 733, "time_per_iteration": 2.650681972503662 }, { "auxiliary_loss_clip": 0.01246603, "auxiliary_loss_mlp": 0.01088724, "balance_loss_clip": 1.06722379, "balance_loss_mlp": 1.05544066, "epoch": 0.044130467458289496, "flos": 13845288021120.0, "grad_norm": 2.207780377909299, "language_loss": 0.91189414, "learning_rate": 3.9979062278845685e-06, "loss": 0.93524742, "num_input_tokens_seen": 15733320, "step": 734, "time_per_iteration": 2.5956180095672607 }, { "auxiliary_loss_clip": 0.01238395, "auxiliary_loss_mlp": 0.01067579, "balance_loss_clip": 1.06596422, "balance_loss_mlp": 1.03781235, "epoch": 0.04419059071095746, "flos": 28655063107200.0, "grad_norm": 1.9297536072777384, "language_loss": 0.77884138, "learning_rate": 3.9978883737769125e-06, "loss": 0.8019011, "num_input_tokens_seen": 15752705, "step": 735, "time_per_iteration": 2.603809118270874 }, { "auxiliary_loss_clip": 0.01234188, "auxiliary_loss_mlp": 0.01070499, "balance_loss_clip": 1.06063068, "balance_loss_mlp": 1.04091144, "epoch": 0.04425071396362543, "flos": 28183304856960.0, "grad_norm": 2.266122200005257, "language_loss": 0.8832593, "learning_rate": 3.9978704439091305e-06, "loss": 0.90630615, "num_input_tokens_seen": 15772800, "step": 736, "time_per_iteration": 5.841086149215698 }, { "auxiliary_loss_clip": 0.01235947, "auxiliary_loss_mlp": 0.01081098, "balance_loss_clip": 1.06597185, "balance_loss_mlp": 1.05165362, "epoch": 0.0443108372162934, "flos": 23658452778240.0, "grad_norm": 1.8984177574034653, "language_loss": 0.84481263, "learning_rate": 3.997852438281901e-06, "loss": 0.8679831, "num_input_tokens_seen": 15793665, "step": 737, "time_per_iteration": 4.1386003494262695 }, { "auxiliary_loss_clip": 0.01240863, "auxiliary_loss_mlp": 0.01072388, "balance_loss_clip": 1.0653491, "balance_loss_mlp": 1.03961766, "epoch": 0.04437096046896137, "flos": 33979861025280.0, "grad_norm": 2.2366199062134706, "language_loss": 0.84712577, "learning_rate": 3.997834356895906e-06, "loss": 0.87025833, "num_input_tokens_seen": 15813175, "step": 738, "time_per_iteration": 4.447159290313721 }, { "auxiliary_loss_clip": 0.01098733, "auxiliary_loss_mlp": 0.0102196, "balance_loss_clip": 1.02144337, "balance_loss_mlp": 1.01685739, "epoch": 0.04443108372162934, "flos": 67397506375680.0, "grad_norm": 0.8779518557387592, "language_loss": 0.59179878, "learning_rate": 3.9978161997518324e-06, "loss": 0.61300576, "num_input_tokens_seen": 15872050, "step": 739, "time_per_iteration": 3.0780396461486816 }, { "auxiliary_loss_clip": 0.012386, "auxiliary_loss_mlp": 0.01067387, "balance_loss_clip": 1.06604302, "balance_loss_mlp": 1.03717899, "epoch": 0.04449120697429731, "flos": 29752672953600.0, "grad_norm": 2.295102845773205, "language_loss": 0.91329807, "learning_rate": 3.997797966850369e-06, "loss": 0.93635798, "num_input_tokens_seen": 15891085, "step": 740, "time_per_iteration": 2.6687562465667725 }, { "auxiliary_loss_clip": 0.01243424, "auxiliary_loss_mlp": 0.01067832, "balance_loss_clip": 1.06807768, "balance_loss_mlp": 1.03929377, "epoch": 0.04455133022696528, "flos": 36502119072000.0, "grad_norm": 2.0543845689042484, "language_loss": 0.71875739, "learning_rate": 3.997779658192205e-06, "loss": 0.74186987, "num_input_tokens_seen": 15914225, "step": 741, "time_per_iteration": 2.707231283187866 }, { "auxiliary_loss_clip": 0.01233192, "auxiliary_loss_mlp": 0.01084138, "balance_loss_clip": 1.062482, "balance_loss_mlp": 1.05476475, "epoch": 0.044611453479633245, "flos": 28803661672320.0, "grad_norm": 1.7086571433899975, "language_loss": 0.88933527, "learning_rate": 3.997761273778037e-06, "loss": 0.91250861, "num_input_tokens_seen": 15934540, "step": 742, "time_per_iteration": 2.6647751331329346 }, { "auxiliary_loss_clip": 0.01237248, "auxiliary_loss_mlp": 0.0106534, "balance_loss_clip": 1.06481838, "balance_loss_mlp": 1.03367805, "epoch": 0.04467157673230122, "flos": 20010970304640.0, "grad_norm": 1.9055071619943689, "language_loss": 0.83840811, "learning_rate": 3.997742813608561e-06, "loss": 0.86143398, "num_input_tokens_seen": 15952560, "step": 743, "time_per_iteration": 2.697864055633545 }, { "auxiliary_loss_clip": 0.01239398, "auxiliary_loss_mlp": 0.01073846, "balance_loss_clip": 1.06395566, "balance_loss_mlp": 1.04373407, "epoch": 0.04473169998496919, "flos": 18004964480640.0, "grad_norm": 2.2041873634107696, "language_loss": 0.80026019, "learning_rate": 3.997724277684479e-06, "loss": 0.82339263, "num_input_tokens_seen": 15970620, "step": 744, "time_per_iteration": 2.6551101207733154 }, { "auxiliary_loss_clip": 0.01236158, "auxiliary_loss_mlp": 0.01076186, "balance_loss_clip": 1.06385589, "balance_loss_mlp": 1.04665816, "epoch": 0.044791823237637154, "flos": 20631722169600.0, "grad_norm": 2.139129927663487, "language_loss": 0.85502481, "learning_rate": 3.99770566600649e-06, "loss": 0.87814826, "num_input_tokens_seen": 15987325, "step": 745, "time_per_iteration": 2.6686010360717773 }, { "auxiliary_loss_clip": 0.01235001, "auxiliary_loss_mlp": 0.01066107, "balance_loss_clip": 1.06320596, "balance_loss_mlp": 1.03594685, "epoch": 0.04485194649030513, "flos": 31176171918720.0, "grad_norm": 1.8251828520192552, "language_loss": 0.69291008, "learning_rate": 3.997686978575302e-06, "loss": 0.71592116, "num_input_tokens_seen": 16008310, "step": 746, "time_per_iteration": 2.6782095432281494 }, { "auxiliary_loss_clip": 0.01244022, "auxiliary_loss_mlp": 0.01081644, "balance_loss_clip": 1.07012939, "balance_loss_mlp": 1.05000615, "epoch": 0.04491206974297309, "flos": 26143291831680.0, "grad_norm": 3.6053643469900982, "language_loss": 0.68531066, "learning_rate": 3.997668215391625e-06, "loss": 0.70856726, "num_input_tokens_seen": 16029620, "step": 747, "time_per_iteration": 2.6589114665985107 }, { "auxiliary_loss_clip": 0.0124018, "auxiliary_loss_mlp": 0.01083594, "balance_loss_clip": 1.0652504, "balance_loss_mlp": 1.05183625, "epoch": 0.044972192995641064, "flos": 20667668705280.0, "grad_norm": 1.8376208182131786, "language_loss": 0.66778374, "learning_rate": 3.997649376456168e-06, "loss": 0.69102144, "num_input_tokens_seen": 16049065, "step": 748, "time_per_iteration": 2.674691677093506 }, { "auxiliary_loss_clip": 0.01243343, "auxiliary_loss_mlp": 0.01085665, "balance_loss_clip": 1.07101417, "balance_loss_mlp": 1.05596995, "epoch": 0.045032316248309036, "flos": 16106834177280.0, "grad_norm": 2.4197486882062322, "language_loss": 0.76684916, "learning_rate": 3.997630461769647e-06, "loss": 0.7901392, "num_input_tokens_seen": 16066765, "step": 749, "time_per_iteration": 2.5940611362457275 }, { "auxiliary_loss_clip": 0.01243381, "auxiliary_loss_mlp": 0.01083303, "balance_loss_clip": 1.06892776, "balance_loss_mlp": 1.05338168, "epoch": 0.045092439500977, "flos": 17858843953920.0, "grad_norm": 1.926675828378473, "language_loss": 0.88739896, "learning_rate": 3.997611471332778e-06, "loss": 0.91066581, "num_input_tokens_seen": 16085980, "step": 750, "time_per_iteration": 2.551717758178711 }, { "auxiliary_loss_clip": 0.01238484, "auxiliary_loss_mlp": 0.01077419, "balance_loss_clip": 1.062783, "balance_loss_mlp": 1.04404092, "epoch": 0.04515256275364497, "flos": 24462815990400.0, "grad_norm": 3.4910287963746116, "language_loss": 0.74371743, "learning_rate": 3.9975924051462825e-06, "loss": 0.76687646, "num_input_tokens_seen": 16106260, "step": 751, "time_per_iteration": 2.6299028396606445 }, { "auxiliary_loss_clip": 0.0123577, "auxiliary_loss_mlp": 0.01078322, "balance_loss_clip": 1.06347609, "balance_loss_mlp": 1.04884171, "epoch": 0.04521268600631294, "flos": 20916385453440.0, "grad_norm": 3.3938056459605583, "language_loss": 0.69115144, "learning_rate": 3.997573263210883e-06, "loss": 0.71429229, "num_input_tokens_seen": 16123475, "step": 752, "time_per_iteration": 2.571223020553589 }, { "auxiliary_loss_clip": 0.01235899, "auxiliary_loss_mlp": 0.01060876, "balance_loss_clip": 1.0627141, "balance_loss_mlp": 1.03212225, "epoch": 0.04527280925898091, "flos": 13371374954880.0, "grad_norm": 2.69328062598792, "language_loss": 0.92126763, "learning_rate": 3.997554045527305e-06, "loss": 0.94423538, "num_input_tokens_seen": 16138335, "step": 753, "time_per_iteration": 2.6100237369537354 }, { "auxiliary_loss_clip": 0.01239023, "auxiliary_loss_mlp": 0.01080271, "balance_loss_clip": 1.06628633, "balance_loss_mlp": 1.05116034, "epoch": 0.04533293251164888, "flos": 23254565276160.0, "grad_norm": 4.138305317267875, "language_loss": 0.91373456, "learning_rate": 3.997534752096277e-06, "loss": 0.93692756, "num_input_tokens_seen": 16157110, "step": 754, "time_per_iteration": 2.642747402191162 }, { "auxiliary_loss_clip": 0.01229195, "auxiliary_loss_mlp": 0.01078016, "balance_loss_clip": 1.06402516, "balance_loss_mlp": 1.04725957, "epoch": 0.04539305576431685, "flos": 12422004537600.0, "grad_norm": 4.559941934311277, "language_loss": 0.78558046, "learning_rate": 3.997515382918531e-06, "loss": 0.80865264, "num_input_tokens_seen": 16174155, "step": 755, "time_per_iteration": 2.6316659450531006 }, { "auxiliary_loss_clip": 0.01240044, "auxiliary_loss_mlp": 0.01081048, "balance_loss_clip": 1.06624937, "balance_loss_mlp": 1.05099559, "epoch": 0.04545317901698482, "flos": 16070995382400.0, "grad_norm": 2.193539224658874, "language_loss": 0.78473848, "learning_rate": 3.9974959379948015e-06, "loss": 0.80794942, "num_input_tokens_seen": 16192240, "step": 756, "time_per_iteration": 2.6390748023986816 }, { "auxiliary_loss_clip": 0.01101224, "auxiliary_loss_mlp": 0.01013849, "balance_loss_clip": 1.02455997, "balance_loss_mlp": 1.0089612, "epoch": 0.045513302269652785, "flos": 66396139021440.0, "grad_norm": 0.8202876780471967, "language_loss": 0.62756521, "learning_rate": 3.997476417325827e-06, "loss": 0.64871597, "num_input_tokens_seen": 16255775, "step": 757, "time_per_iteration": 3.2393198013305664 }, { "auxiliary_loss_clip": 0.01235136, "auxiliary_loss_mlp": 0.01071767, "balance_loss_clip": 1.06455243, "balance_loss_mlp": 1.04346693, "epoch": 0.04557342552232076, "flos": 21471169991040.0, "grad_norm": 1.6528285304744148, "language_loss": 0.84211069, "learning_rate": 3.997456820912346e-06, "loss": 0.86517978, "num_input_tokens_seen": 16277015, "step": 758, "time_per_iteration": 2.6508655548095703 }, { "auxiliary_loss_clip": 0.01228461, "auxiliary_loss_mlp": 0.01067033, "balance_loss_clip": 1.05912399, "balance_loss_mlp": 1.0391618, "epoch": 0.04563354877498873, "flos": 23732680233600.0, "grad_norm": 2.695805662282291, "language_loss": 0.88150775, "learning_rate": 3.997437148755101e-06, "loss": 0.9044627, "num_input_tokens_seen": 16296005, "step": 759, "time_per_iteration": 2.7782890796661377 }, { "auxiliary_loss_clip": 0.01240589, "auxiliary_loss_mlp": 0.01078815, "balance_loss_clip": 1.06747675, "balance_loss_mlp": 1.04846466, "epoch": 0.045693672027656694, "flos": 25735741142400.0, "grad_norm": 2.392455009776849, "language_loss": 0.73440695, "learning_rate": 3.9974174008548405e-06, "loss": 0.75760102, "num_input_tokens_seen": 16315300, "step": 760, "time_per_iteration": 2.7138822078704834 }, { "auxiliary_loss_clip": 0.01240372, "auxiliary_loss_mlp": 0.01079791, "balance_loss_clip": 1.07095265, "balance_loss_mlp": 1.05162191, "epoch": 0.045753795280324666, "flos": 19719016560000.0, "grad_norm": 3.497321311688565, "language_loss": 0.81781888, "learning_rate": 3.9973975772123105e-06, "loss": 0.84102058, "num_input_tokens_seen": 16333820, "step": 761, "time_per_iteration": 2.631303310394287 }, { "auxiliary_loss_clip": 0.01231969, "auxiliary_loss_mlp": 0.01078623, "balance_loss_clip": 1.06324267, "balance_loss_mlp": 1.04922605, "epoch": 0.04581391853299264, "flos": 23255786338560.0, "grad_norm": 2.0632320043111965, "language_loss": 0.79811668, "learning_rate": 3.997377677828266e-06, "loss": 0.82122266, "num_input_tokens_seen": 16355290, "step": 762, "time_per_iteration": 2.646928071975708 }, { "auxiliary_loss_clip": 0.01093869, "auxiliary_loss_mlp": 0.01027943, "balance_loss_clip": 1.01857328, "balance_loss_mlp": 1.02288842, "epoch": 0.0458740417856606, "flos": 64231155601920.0, "grad_norm": 1.0128965743658471, "language_loss": 0.58723813, "learning_rate": 3.9973577027034585e-06, "loss": 0.60845619, "num_input_tokens_seen": 16415995, "step": 763, "time_per_iteration": 3.1712563037872314 }, { "auxiliary_loss_clip": 0.012343, "auxiliary_loss_mlp": 0.01082461, "balance_loss_clip": 1.06205368, "balance_loss_mlp": 1.0531354, "epoch": 0.045934165038328575, "flos": 20770121272320.0, "grad_norm": 4.978761831483118, "language_loss": 0.87544954, "learning_rate": 3.9973376518386475e-06, "loss": 0.89861715, "num_input_tokens_seen": 16433120, "step": 764, "time_per_iteration": 2.5985426902770996 }, { "auxiliary_loss_clip": 0.01236145, "auxiliary_loss_mlp": 0.01087868, "balance_loss_clip": 1.06553543, "balance_loss_mlp": 1.05854285, "epoch": 0.04599428829099654, "flos": 30262891691520.0, "grad_norm": 2.0894169515773067, "language_loss": 0.85966802, "learning_rate": 3.997317525234592e-06, "loss": 0.88290817, "num_input_tokens_seen": 16453360, "step": 765, "time_per_iteration": 2.6572606563568115 }, { "auxiliary_loss_clip": 0.01239644, "auxiliary_loss_mlp": 0.01077398, "balance_loss_clip": 1.06530261, "balance_loss_mlp": 1.04573584, "epoch": 0.04605441154366451, "flos": 23038921975680.0, "grad_norm": 2.628046285830335, "language_loss": 0.88265938, "learning_rate": 3.997297322892056e-06, "loss": 0.90582979, "num_input_tokens_seen": 16471160, "step": 766, "time_per_iteration": 2.673226833343506 }, { "auxiliary_loss_clip": 0.01235506, "auxiliary_loss_mlp": 0.0107998, "balance_loss_clip": 1.06371713, "balance_loss_mlp": 1.05115545, "epoch": 0.046114534796332485, "flos": 22017407091840.0, "grad_norm": 2.343908591401411, "language_loss": 0.84302223, "learning_rate": 3.997277044811806e-06, "loss": 0.86617708, "num_input_tokens_seen": 16488940, "step": 767, "time_per_iteration": 2.683429002761841 }, { "auxiliary_loss_clip": 0.01236229, "auxiliary_loss_mlp": 0.01067844, "balance_loss_clip": 1.06769753, "balance_loss_mlp": 1.03791094, "epoch": 0.04617465804900045, "flos": 29862380067840.0, "grad_norm": 1.9268984031305718, "language_loss": 0.8669976, "learning_rate": 3.99725669099461e-06, "loss": 0.89003831, "num_input_tokens_seen": 16509505, "step": 768, "time_per_iteration": 2.8125200271606445 }, { "auxiliary_loss_clip": 0.01234175, "auxiliary_loss_mlp": 0.01076069, "balance_loss_clip": 1.06150854, "balance_loss_mlp": 1.04738712, "epoch": 0.04623478130166842, "flos": 25630056351360.0, "grad_norm": 2.115272554881108, "language_loss": 0.75152099, "learning_rate": 3.9972362614412395e-06, "loss": 0.77462339, "num_input_tokens_seen": 16528840, "step": 769, "time_per_iteration": 2.7286128997802734 }, { "auxiliary_loss_clip": 0.01229956, "auxiliary_loss_mlp": 0.01072391, "balance_loss_clip": 1.06326365, "balance_loss_mlp": 1.04462695, "epoch": 0.04629490455433639, "flos": 20449080489600.0, "grad_norm": 1.8368669953292174, "language_loss": 0.86292851, "learning_rate": 3.997215756152471e-06, "loss": 0.885952, "num_input_tokens_seen": 16548335, "step": 770, "time_per_iteration": 2.68608021736145 }, { "auxiliary_loss_clip": 0.01239009, "auxiliary_loss_mlp": 0.01072125, "balance_loss_clip": 1.06274092, "balance_loss_mlp": 1.04284704, "epoch": 0.04635502780700436, "flos": 23148736830720.0, "grad_norm": 2.058802627607224, "language_loss": 0.86842889, "learning_rate": 3.99719517512908e-06, "loss": 0.89154023, "num_input_tokens_seen": 16567725, "step": 771, "time_per_iteration": 2.637509822845459 }, { "auxiliary_loss_clip": 0.01239449, "auxiliary_loss_mlp": 0.01079651, "balance_loss_clip": 1.06184912, "balance_loss_mlp": 1.04884768, "epoch": 0.04641515105967233, "flos": 23292020183040.0, "grad_norm": 1.87920888608735, "language_loss": 0.83691382, "learning_rate": 3.997174518371848e-06, "loss": 0.8601048, "num_input_tokens_seen": 16588175, "step": 772, "time_per_iteration": 2.745006561279297 }, { "auxiliary_loss_clip": 0.01236322, "auxiliary_loss_mlp": 0.0107061, "balance_loss_clip": 1.06672883, "balance_loss_mlp": 1.04220271, "epoch": 0.046475274312340296, "flos": 25115204759040.0, "grad_norm": 1.9655107083336736, "language_loss": 0.73639083, "learning_rate": 3.997153785881557e-06, "loss": 0.75946015, "num_input_tokens_seen": 16607735, "step": 773, "time_per_iteration": 2.869290828704834 }, { "auxiliary_loss_clip": 0.01231219, "auxiliary_loss_mlp": 0.01071681, "balance_loss_clip": 1.06529772, "balance_loss_mlp": 1.04054356, "epoch": 0.04653539756500827, "flos": 25264916645760.0, "grad_norm": 2.096431798380756, "language_loss": 0.78228974, "learning_rate": 3.997132977658996e-06, "loss": 0.80531871, "num_input_tokens_seen": 16627225, "step": 774, "time_per_iteration": 2.6967568397521973 }, { "auxiliary_loss_clip": 0.01230587, "auxiliary_loss_mlp": 0.01069519, "balance_loss_clip": 1.06347871, "balance_loss_mlp": 1.04131365, "epoch": 0.046595520817676234, "flos": 35404150089600.0, "grad_norm": 2.018140205527256, "language_loss": 0.73187691, "learning_rate": 3.997112093704952e-06, "loss": 0.75487792, "num_input_tokens_seen": 16647785, "step": 775, "time_per_iteration": 2.737140417098999 }, { "auxiliary_loss_clip": 0.01231996, "auxiliary_loss_mlp": 0.01066454, "balance_loss_clip": 1.06187618, "balance_loss_mlp": 1.03650832, "epoch": 0.046655644070344206, "flos": 18112516778880.0, "grad_norm": 1.668093168561758, "language_loss": 0.77180624, "learning_rate": 3.997091134020217e-06, "loss": 0.7947908, "num_input_tokens_seen": 16667555, "step": 776, "time_per_iteration": 4.154085159301758 }, { "auxiliary_loss_clip": 0.0122577, "auxiliary_loss_mlp": 0.01071334, "balance_loss_clip": 1.06031108, "balance_loss_mlp": 1.04352236, "epoch": 0.04671576732301218, "flos": 29205286617600.0, "grad_norm": 1.9054628166827923, "language_loss": 0.7087816, "learning_rate": 3.997070098605585e-06, "loss": 0.73175263, "num_input_tokens_seen": 16686875, "step": 777, "time_per_iteration": 4.176887512207031 }, { "auxiliary_loss_clip": 0.0122979, "auxiliary_loss_mlp": 0.01076806, "balance_loss_clip": 1.06275606, "balance_loss_mlp": 1.04705119, "epoch": 0.04677589057568014, "flos": 30478319510400.0, "grad_norm": 1.8083238359854679, "language_loss": 0.77069759, "learning_rate": 3.997048987461856e-06, "loss": 0.79376352, "num_input_tokens_seen": 16706420, "step": 778, "time_per_iteration": 5.943394422531128 }, { "auxiliary_loss_clip": 0.01227067, "auxiliary_loss_mlp": 0.01064982, "balance_loss_clip": 1.06043744, "balance_loss_mlp": 1.03563297, "epoch": 0.046836013828348115, "flos": 20557674282240.0, "grad_norm": 2.1737778598926463, "language_loss": 0.79181123, "learning_rate": 3.997027800589829e-06, "loss": 0.81473172, "num_input_tokens_seen": 16726390, "step": 779, "time_per_iteration": 2.611804485321045 }, { "auxiliary_loss_clip": 0.01219629, "auxiliary_loss_mlp": 0.01070238, "balance_loss_clip": 1.05842376, "balance_loss_mlp": 1.04271269, "epoch": 0.04689613708101608, "flos": 25447378757760.0, "grad_norm": 1.888854926622149, "language_loss": 0.77364886, "learning_rate": 3.997006537990308e-06, "loss": 0.79654753, "num_input_tokens_seen": 16748965, "step": 780, "time_per_iteration": 2.668239116668701 }, { "auxiliary_loss_clip": 0.012253, "auxiliary_loss_mlp": 0.01073321, "balance_loss_clip": 1.06098521, "balance_loss_mlp": 1.04605746, "epoch": 0.04695626033368405, "flos": 23001395241600.0, "grad_norm": 1.7616538282563206, "language_loss": 0.76700419, "learning_rate": 3.996985199664099e-06, "loss": 0.78999043, "num_input_tokens_seen": 16768620, "step": 781, "time_per_iteration": 2.5979926586151123 }, { "auxiliary_loss_clip": 0.01236637, "auxiliary_loss_mlp": 0.01077479, "balance_loss_clip": 1.0639379, "balance_loss_mlp": 1.04836786, "epoch": 0.047016383586352024, "flos": 29133357632640.0, "grad_norm": 3.0946494667490856, "language_loss": 0.73786414, "learning_rate": 3.99696378561201e-06, "loss": 0.76100528, "num_input_tokens_seen": 16789755, "step": 782, "time_per_iteration": 2.708855390548706 }, { "auxiliary_loss_clip": 0.0122968, "auxiliary_loss_mlp": 0.01069368, "balance_loss_clip": 1.06431556, "balance_loss_mlp": 1.04253423, "epoch": 0.04707650683901999, "flos": 14976330451200.0, "grad_norm": 2.1459158015790183, "language_loss": 0.80524659, "learning_rate": 3.996942295834855e-06, "loss": 0.82823706, "num_input_tokens_seen": 16807585, "step": 783, "time_per_iteration": 2.6355738639831543 }, { "auxiliary_loss_clip": 0.01222415, "auxiliary_loss_mlp": 0.01063155, "balance_loss_clip": 1.06221437, "balance_loss_mlp": 1.03663135, "epoch": 0.04713663009168796, "flos": 21651118151040.0, "grad_norm": 1.9084512066318515, "language_loss": 0.81687874, "learning_rate": 3.996920730333448e-06, "loss": 0.83973444, "num_input_tokens_seen": 16827220, "step": 784, "time_per_iteration": 2.64365291595459 }, { "auxiliary_loss_clip": 0.01226632, "auxiliary_loss_mlp": 0.01074549, "balance_loss_clip": 1.0582943, "balance_loss_mlp": 1.04719007, "epoch": 0.04719675334435593, "flos": 21325408600320.0, "grad_norm": 3.970707764370453, "language_loss": 0.80619848, "learning_rate": 3.996899089108607e-06, "loss": 0.82921028, "num_input_tokens_seen": 16846230, "step": 785, "time_per_iteration": 2.682971715927124 }, { "auxiliary_loss_clip": 0.01231621, "auxiliary_loss_mlp": 0.01063774, "balance_loss_clip": 1.06683421, "balance_loss_mlp": 1.03784585, "epoch": 0.0472568765970239, "flos": 17931383470080.0, "grad_norm": 2.074448818096939, "language_loss": 0.89784658, "learning_rate": 3.996877372161152e-06, "loss": 0.92080051, "num_input_tokens_seen": 16865325, "step": 786, "time_per_iteration": 2.6072235107421875 }, { "auxiliary_loss_clip": 0.01227201, "auxiliary_loss_mlp": 0.01069453, "balance_loss_clip": 1.05475712, "balance_loss_mlp": 1.03912568, "epoch": 0.04731699984969187, "flos": 18077324428800.0, "grad_norm": 6.783818284100465, "language_loss": 0.76794451, "learning_rate": 3.9968555794919065e-06, "loss": 0.79091108, "num_input_tokens_seen": 16882930, "step": 787, "time_per_iteration": 2.595069646835327 }, { "auxiliary_loss_clip": 0.01233526, "auxiliary_loss_mlp": 0.01070856, "balance_loss_clip": 1.06563127, "balance_loss_mlp": 1.04248405, "epoch": 0.047377123102359836, "flos": 23185078416000.0, "grad_norm": 2.309745026689568, "language_loss": 0.81301165, "learning_rate": 3.996833711101698e-06, "loss": 0.83605546, "num_input_tokens_seen": 16900710, "step": 788, "time_per_iteration": 2.633812427520752 }, { "auxiliary_loss_clip": 0.01225447, "auxiliary_loss_mlp": 0.01078934, "balance_loss_clip": 1.06370282, "balance_loss_mlp": 1.04934621, "epoch": 0.04743724635502781, "flos": 22747794243840.0, "grad_norm": 2.941245147417381, "language_loss": 0.84428835, "learning_rate": 3.996811766991355e-06, "loss": 0.86733222, "num_input_tokens_seen": 16919210, "step": 789, "time_per_iteration": 2.6711082458496094 }, { "auxiliary_loss_clip": 0.01230866, "auxiliary_loss_mlp": 0.01071483, "balance_loss_clip": 1.06367648, "balance_loss_mlp": 1.0441606, "epoch": 0.04749736960769577, "flos": 17238702620160.0, "grad_norm": 2.0289407228390615, "language_loss": 0.81787878, "learning_rate": 3.996789747161709e-06, "loss": 0.84090227, "num_input_tokens_seen": 16937125, "step": 790, "time_per_iteration": 2.6136717796325684 }, { "auxiliary_loss_clip": 0.01224033, "auxiliary_loss_mlp": 0.01064065, "balance_loss_clip": 1.05880189, "balance_loss_mlp": 1.03546715, "epoch": 0.047557492860363745, "flos": 40479261592320.0, "grad_norm": 2.9735437778568965, "language_loss": 0.88116109, "learning_rate": 3.996767651613597e-06, "loss": 0.90404207, "num_input_tokens_seen": 16958610, "step": 791, "time_per_iteration": 2.747586727142334 }, { "auxiliary_loss_clip": 0.01226267, "auxiliary_loss_mlp": 0.01066471, "balance_loss_clip": 1.06144643, "balance_loss_mlp": 1.03743124, "epoch": 0.04761761611303172, "flos": 18698004466560.0, "grad_norm": 2.1239226540804537, "language_loss": 0.90671498, "learning_rate": 3.996745480347854e-06, "loss": 0.92964232, "num_input_tokens_seen": 16977300, "step": 792, "time_per_iteration": 2.591477870941162 }, { "auxiliary_loss_clip": 0.01226882, "auxiliary_loss_mlp": 0.0107926, "balance_loss_clip": 1.05968022, "balance_loss_mlp": 1.05225897, "epoch": 0.04767773936569968, "flos": 20921987975040.0, "grad_norm": 1.9120988315570397, "language_loss": 0.73246223, "learning_rate": 3.996723233365324e-06, "loss": 0.75552362, "num_input_tokens_seen": 16994950, "step": 793, "time_per_iteration": 2.6319899559020996 }, { "auxiliary_loss_clip": 0.01231301, "auxiliary_loss_mlp": 0.01070716, "balance_loss_clip": 1.06213653, "balance_loss_mlp": 1.04146254, "epoch": 0.047737862618367655, "flos": 23732680233600.0, "grad_norm": 1.86347948201136, "language_loss": 0.86139679, "learning_rate": 3.996700910666847e-06, "loss": 0.88441694, "num_input_tokens_seen": 17014760, "step": 794, "time_per_iteration": 2.6835687160491943 }, { "auxiliary_loss_clip": 0.01228204, "auxiliary_loss_mlp": 0.01077895, "balance_loss_clip": 1.05969596, "balance_loss_mlp": 1.04935622, "epoch": 0.04779798587103562, "flos": 23695764030720.0, "grad_norm": 2.370166301863074, "language_loss": 0.69069195, "learning_rate": 3.996678512253272e-06, "loss": 0.71375293, "num_input_tokens_seen": 17032715, "step": 795, "time_per_iteration": 2.669261932373047 }, { "auxiliary_loss_clip": 0.01225748, "auxiliary_loss_mlp": 0.01076275, "balance_loss_clip": 1.06129098, "balance_loss_mlp": 1.04756904, "epoch": 0.04785810912370359, "flos": 23183641872000.0, "grad_norm": 1.744925212230271, "language_loss": 0.810256, "learning_rate": 3.996656038125449e-06, "loss": 0.83327615, "num_input_tokens_seen": 17052215, "step": 796, "time_per_iteration": 2.5800065994262695 }, { "auxiliary_loss_clip": 0.01228235, "auxiliary_loss_mlp": 0.01065433, "balance_loss_clip": 1.06224668, "balance_loss_mlp": 1.03638172, "epoch": 0.047918232376371564, "flos": 18040623707520.0, "grad_norm": 1.979164246440182, "language_loss": 0.8128069, "learning_rate": 3.996633488284228e-06, "loss": 0.83574355, "num_input_tokens_seen": 17069225, "step": 797, "time_per_iteration": 2.58878493309021 }, { "auxiliary_loss_clip": 0.01100259, "auxiliary_loss_mlp": 0.01007215, "balance_loss_clip": 1.02779806, "balance_loss_mlp": 1.00266171, "epoch": 0.04797835562903953, "flos": 62442588758400.0, "grad_norm": 0.912416075283383, "language_loss": 0.64532876, "learning_rate": 3.996610862730465e-06, "loss": 0.66640353, "num_input_tokens_seen": 17126680, "step": 798, "time_per_iteration": 3.0779380798339844 }, { "auxiliary_loss_clip": 0.01229665, "auxiliary_loss_mlp": 0.01068747, "balance_loss_clip": 1.05799031, "balance_loss_mlp": 1.04121017, "epoch": 0.0480384788817075, "flos": 21507296094720.0, "grad_norm": 2.0206600610723333, "language_loss": 0.91274291, "learning_rate": 3.996588161465018e-06, "loss": 0.935727, "num_input_tokens_seen": 17144835, "step": 799, "time_per_iteration": 2.660438299179077 }, { "auxiliary_loss_clip": 0.01230751, "auxiliary_loss_mlp": 0.010715, "balance_loss_clip": 1.06640434, "balance_loss_mlp": 1.04274678, "epoch": 0.048098602134375466, "flos": 21726710323200.0, "grad_norm": 2.0752654205923866, "language_loss": 0.86825287, "learning_rate": 3.996565384488748e-06, "loss": 0.89127541, "num_input_tokens_seen": 17165030, "step": 800, "time_per_iteration": 2.6700456142425537 }, { "auxiliary_loss_clip": 0.01229893, "auxiliary_loss_mlp": 0.01072058, "balance_loss_clip": 1.06186771, "balance_loss_mlp": 1.04618931, "epoch": 0.04815872538704344, "flos": 22931082368640.0, "grad_norm": 2.5310108886746976, "language_loss": 0.83949852, "learning_rate": 3.996542531802518e-06, "loss": 0.86251807, "num_input_tokens_seen": 17184895, "step": 801, "time_per_iteration": 2.7724695205688477 }, { "auxiliary_loss_clip": 0.01227846, "auxiliary_loss_mlp": 0.010756, "balance_loss_clip": 1.06226814, "balance_loss_mlp": 1.04847932, "epoch": 0.04821884863971141, "flos": 43174716042240.0, "grad_norm": 1.9607091513106172, "language_loss": 0.79818648, "learning_rate": 3.996519603407196e-06, "loss": 0.82122099, "num_input_tokens_seen": 17208225, "step": 802, "time_per_iteration": 2.861309766769409 }, { "auxiliary_loss_clip": 0.0122832, "auxiliary_loss_mlp": 0.01069086, "balance_loss_clip": 1.06392837, "balance_loss_mlp": 1.04278886, "epoch": 0.048278971892379376, "flos": 18620006083200.0, "grad_norm": 1.798745906633195, "language_loss": 0.86600745, "learning_rate": 3.996496599303649e-06, "loss": 0.88898146, "num_input_tokens_seen": 17226305, "step": 803, "time_per_iteration": 2.612684965133667 }, { "auxiliary_loss_clip": 0.01222438, "auxiliary_loss_mlp": 0.01063116, "balance_loss_clip": 1.06214345, "balance_loss_mlp": 1.03643703, "epoch": 0.04833909514504735, "flos": 20230061310720.0, "grad_norm": 5.958214069975319, "language_loss": 0.85139012, "learning_rate": 3.996473519492753e-06, "loss": 0.8742457, "num_input_tokens_seen": 17244545, "step": 804, "time_per_iteration": 2.596965789794922 }, { "auxiliary_loss_clip": 0.01225485, "auxiliary_loss_mlp": 0.0106948, "balance_loss_clip": 1.06206632, "balance_loss_mlp": 1.04222918, "epoch": 0.04839921839771532, "flos": 24645170361600.0, "grad_norm": 1.9492340448514227, "language_loss": 0.85939878, "learning_rate": 3.99645036397538e-06, "loss": 0.88234842, "num_input_tokens_seen": 17265730, "step": 805, "time_per_iteration": 2.6773781776428223 }, { "auxiliary_loss_clip": 0.01221339, "auxiliary_loss_mlp": 0.01071867, "balance_loss_clip": 1.05968738, "balance_loss_mlp": 1.04591477, "epoch": 0.048459341650383285, "flos": 24827452905600.0, "grad_norm": 1.8764849579047527, "language_loss": 0.68025368, "learning_rate": 3.9964271327524085e-06, "loss": 0.70318574, "num_input_tokens_seen": 17284820, "step": 806, "time_per_iteration": 2.6270596981048584 }, { "auxiliary_loss_clip": 0.01221043, "auxiliary_loss_mlp": 0.01060505, "balance_loss_clip": 1.06064904, "balance_loss_mlp": 1.03384972, "epoch": 0.04851946490305126, "flos": 22163204396160.0, "grad_norm": 8.586680684018, "language_loss": 0.76488906, "learning_rate": 3.9964038258247214e-06, "loss": 0.78770459, "num_input_tokens_seen": 17305085, "step": 807, "time_per_iteration": 2.6783089637756348 }, { "auxiliary_loss_clip": 0.01218859, "auxiliary_loss_mlp": 0.01068871, "balance_loss_clip": 1.05734789, "balance_loss_mlp": 1.04290676, "epoch": 0.04857958815571922, "flos": 19792022952960.0, "grad_norm": 2.4056749627509157, "language_loss": 0.86882269, "learning_rate": 3.9963804431932005e-06, "loss": 0.89170003, "num_input_tokens_seen": 17322715, "step": 808, "time_per_iteration": 2.6447641849517822 }, { "auxiliary_loss_clip": 0.01227529, "auxiliary_loss_mlp": 0.01069446, "balance_loss_clip": 1.06140316, "balance_loss_mlp": 1.0424329, "epoch": 0.048639711408387194, "flos": 18697968552960.0, "grad_norm": 2.6040733531164424, "language_loss": 0.89710444, "learning_rate": 3.996356984858732e-06, "loss": 0.92007422, "num_input_tokens_seen": 17341455, "step": 809, "time_per_iteration": 2.6679790019989014 }, { "auxiliary_loss_clip": 0.01226608, "auxiliary_loss_mlp": 0.01067211, "balance_loss_clip": 1.0643065, "balance_loss_mlp": 1.04060316, "epoch": 0.048699834661055166, "flos": 24863507182080.0, "grad_norm": 3.0721319202916324, "language_loss": 0.84918916, "learning_rate": 3.996333450822208e-06, "loss": 0.87212729, "num_input_tokens_seen": 17360765, "step": 810, "time_per_iteration": 2.696772575378418 }, { "auxiliary_loss_clip": 0.01227202, "auxiliary_loss_mlp": 0.01067343, "balance_loss_clip": 1.0622344, "balance_loss_mlp": 1.04049683, "epoch": 0.04875995791372313, "flos": 20704010290560.0, "grad_norm": 1.8136675943398954, "language_loss": 0.80799425, "learning_rate": 3.99630984108452e-06, "loss": 0.83093977, "num_input_tokens_seen": 17380625, "step": 811, "time_per_iteration": 2.653808355331421 }, { "auxiliary_loss_clip": 0.01217843, "auxiliary_loss_mlp": 0.01070621, "balance_loss_clip": 1.05928314, "balance_loss_mlp": 1.04466903, "epoch": 0.048820081166391104, "flos": 18588297352320.0, "grad_norm": 1.7193599003225197, "language_loss": 0.74634516, "learning_rate": 3.9962861556465615e-06, "loss": 0.76922977, "num_input_tokens_seen": 17399355, "step": 812, "time_per_iteration": 2.7274649143218994 }, { "auxiliary_loss_clip": 0.01222659, "auxiliary_loss_mlp": 0.01073562, "balance_loss_clip": 1.06445217, "balance_loss_mlp": 1.04862356, "epoch": 0.04888020441905907, "flos": 22707322594560.0, "grad_norm": 1.9311665765462733, "language_loss": 0.90124279, "learning_rate": 3.996262394509233e-06, "loss": 0.92420495, "num_input_tokens_seen": 17418240, "step": 813, "time_per_iteration": 2.654874801635742 }, { "auxiliary_loss_clip": 0.0122, "auxiliary_loss_mlp": 0.01057827, "balance_loss_clip": 1.06157589, "balance_loss_mlp": 1.03248262, "epoch": 0.04894032767172704, "flos": 22784351310720.0, "grad_norm": 1.9238840150723209, "language_loss": 0.74904704, "learning_rate": 3.9962385576734335e-06, "loss": 0.77182531, "num_input_tokens_seen": 17436250, "step": 814, "time_per_iteration": 2.7381603717803955 }, { "auxiliary_loss_clip": 0.01223782, "auxiliary_loss_mlp": 0.01069686, "balance_loss_clip": 1.06125045, "balance_loss_mlp": 1.04289961, "epoch": 0.04900045092439501, "flos": 25516147345920.0, "grad_norm": 2.1966001004582596, "language_loss": 0.83816808, "learning_rate": 3.9962146451400675e-06, "loss": 0.86110282, "num_input_tokens_seen": 17455750, "step": 815, "time_per_iteration": 2.7289621829986572 }, { "auxiliary_loss_clip": 0.01227011, "auxiliary_loss_mlp": 0.01060571, "balance_loss_clip": 1.06326818, "balance_loss_mlp": 1.0344646, "epoch": 0.04906057417706298, "flos": 25958136199680.0, "grad_norm": 2.3329994981275943, "language_loss": 0.90796101, "learning_rate": 3.996190656910043e-06, "loss": 0.93083686, "num_input_tokens_seen": 17474995, "step": 816, "time_per_iteration": 4.174290180206299 }, { "auxiliary_loss_clip": 0.01226278, "auxiliary_loss_mlp": 0.0105651, "balance_loss_clip": 1.06172895, "balance_loss_mlp": 1.03054583, "epoch": 0.04912069742973095, "flos": 18624638937600.0, "grad_norm": 2.2253098946667853, "language_loss": 0.79834002, "learning_rate": 3.996166592984268e-06, "loss": 0.82116789, "num_input_tokens_seen": 17493395, "step": 817, "time_per_iteration": 4.2819907665252686 }, { "auxiliary_loss_clip": 0.01222491, "auxiliary_loss_mlp": 0.01072358, "balance_loss_clip": 1.06228495, "balance_loss_mlp": 1.04563141, "epoch": 0.049180820682398915, "flos": 23699786353920.0, "grad_norm": 1.9292138186207266, "language_loss": 0.8532303, "learning_rate": 3.996142453363656e-06, "loss": 0.8761788, "num_input_tokens_seen": 17514565, "step": 818, "time_per_iteration": 7.687308073043823 }, { "auxiliary_loss_clip": 0.01228571, "auxiliary_loss_mlp": 0.01064433, "balance_loss_clip": 1.06170368, "balance_loss_mlp": 1.0369786, "epoch": 0.04924094393506689, "flos": 22420396753920.0, "grad_norm": 2.1064810754058407, "language_loss": 0.75623614, "learning_rate": 3.996118238049124e-06, "loss": 0.77916616, "num_input_tokens_seen": 17534590, "step": 819, "time_per_iteration": 2.5708072185516357 }, { "auxiliary_loss_clip": 0.01227988, "auxiliary_loss_mlp": 0.010616, "balance_loss_clip": 1.06580663, "balance_loss_mlp": 1.03785336, "epoch": 0.04930106718773486, "flos": 15738246766080.0, "grad_norm": 2.8685299631500487, "language_loss": 0.85082126, "learning_rate": 3.996093947041586e-06, "loss": 0.87371719, "num_input_tokens_seen": 17551900, "step": 820, "time_per_iteration": 2.695204973220825 }, { "auxiliary_loss_clip": 0.01224953, "auxiliary_loss_mlp": 0.01065985, "balance_loss_clip": 1.06082845, "balance_loss_mlp": 1.04037917, "epoch": 0.049361190440402825, "flos": 26250628648320.0, "grad_norm": 1.734636988660555, "language_loss": 0.90459162, "learning_rate": 3.996069580341966e-06, "loss": 0.92750102, "num_input_tokens_seen": 17571485, "step": 821, "time_per_iteration": 2.6284992694854736 }, { "auxiliary_loss_clip": 0.01222526, "auxiliary_loss_mlp": 0.01080357, "balance_loss_clip": 1.06015635, "balance_loss_mlp": 1.05485809, "epoch": 0.0494213136930708, "flos": 21252366293760.0, "grad_norm": 1.7915267676548876, "language_loss": 0.89795959, "learning_rate": 3.996045137951188e-06, "loss": 0.92098844, "num_input_tokens_seen": 17591410, "step": 822, "time_per_iteration": 2.6085855960845947 }, { "auxiliary_loss_clip": 0.0122571, "auxiliary_loss_mlp": 0.01062887, "balance_loss_clip": 1.0639379, "balance_loss_mlp": 1.03472972, "epoch": 0.04948143694573876, "flos": 27965506740480.0, "grad_norm": 2.28747155105076, "language_loss": 0.67558801, "learning_rate": 3.996020619870178e-06, "loss": 0.69847399, "num_input_tokens_seen": 17612010, "step": 823, "time_per_iteration": 2.644277572631836 }, { "auxiliary_loss_clip": 0.01099376, "auxiliary_loss_mlp": 0.0100741, "balance_loss_clip": 1.0267303, "balance_loss_mlp": 1.00266516, "epoch": 0.049541560198406734, "flos": 66180995533440.0, "grad_norm": 1.3456360586087317, "language_loss": 0.62254131, "learning_rate": 3.995996026099866e-06, "loss": 0.64360917, "num_input_tokens_seen": 17673430, "step": 824, "time_per_iteration": 3.230381488800049 }, { "auxiliary_loss_clip": 0.01228758, "auxiliary_loss_mlp": 0.01066541, "balance_loss_clip": 1.06346989, "balance_loss_mlp": 1.03909945, "epoch": 0.049601683451074706, "flos": 22892693708160.0, "grad_norm": 1.8854339538524305, "language_loss": 0.90479428, "learning_rate": 3.995971356641185e-06, "loss": 0.92774737, "num_input_tokens_seen": 17689545, "step": 825, "time_per_iteration": 2.58868670463562 }, { "auxiliary_loss_clip": 0.01227734, "auxiliary_loss_mlp": 0.01066527, "balance_loss_clip": 1.06315517, "balance_loss_mlp": 1.03844118, "epoch": 0.04966180670374267, "flos": 21433643256960.0, "grad_norm": 2.307419213246734, "language_loss": 0.66851091, "learning_rate": 3.9959466114950695e-06, "loss": 0.69145352, "num_input_tokens_seen": 17705965, "step": 826, "time_per_iteration": 2.59468412399292 }, { "auxiliary_loss_clip": 0.01230149, "auxiliary_loss_mlp": 0.01069061, "balance_loss_clip": 1.06421614, "balance_loss_mlp": 1.04216766, "epoch": 0.04972192995641064, "flos": 23107367341440.0, "grad_norm": 1.8316571551414482, "language_loss": 0.78298402, "learning_rate": 3.995921790662459e-06, "loss": 0.80597603, "num_input_tokens_seen": 17724580, "step": 827, "time_per_iteration": 2.7148005962371826 }, { "auxiliary_loss_clip": 0.01230507, "auxiliary_loss_mlp": 0.01079145, "balance_loss_clip": 1.06385946, "balance_loss_mlp": 1.05119085, "epoch": 0.04978205320907861, "flos": 40406147458560.0, "grad_norm": 1.6017511297862308, "language_loss": 0.78696525, "learning_rate": 3.995896894144294e-06, "loss": 0.81006181, "num_input_tokens_seen": 17747755, "step": 828, "time_per_iteration": 2.86991548538208 }, { "auxiliary_loss_clip": 0.0121958, "auxiliary_loss_mlp": 0.01059689, "balance_loss_clip": 1.05939984, "balance_loss_mlp": 1.03390431, "epoch": 0.04984217646174658, "flos": 25228539146880.0, "grad_norm": 2.48577103336206, "language_loss": 0.83530867, "learning_rate": 3.995871921941519e-06, "loss": 0.85810131, "num_input_tokens_seen": 17768550, "step": 829, "time_per_iteration": 2.655895948410034 }, { "auxiliary_loss_clip": 0.01226863, "auxiliary_loss_mlp": 0.01080723, "balance_loss_clip": 1.06109536, "balance_loss_mlp": 1.05068195, "epoch": 0.04990229971441455, "flos": 15959636242560.0, "grad_norm": 2.078538436430036, "language_loss": 0.74857247, "learning_rate": 3.99584687405508e-06, "loss": 0.77164829, "num_input_tokens_seen": 17786080, "step": 830, "time_per_iteration": 2.5820400714874268 }, { "auxiliary_loss_clip": 0.0122584, "auxiliary_loss_mlp": 0.01074077, "balance_loss_clip": 1.06154907, "balance_loss_mlp": 1.04667115, "epoch": 0.04996242296708252, "flos": 18405116968320.0, "grad_norm": 1.8327841960194244, "language_loss": 0.79279459, "learning_rate": 3.995821750485929e-06, "loss": 0.81579381, "num_input_tokens_seen": 17803635, "step": 831, "time_per_iteration": 2.5980231761932373 }, { "auxiliary_loss_clip": 0.01173206, "auxiliary_loss_mlp": 0.01072743, "balance_loss_clip": 1.0542444, "balance_loss_mlp": 1.04725623, "epoch": 0.05002254621975049, "flos": 17858053854720.0, "grad_norm": 3.034319898285603, "language_loss": 0.91497368, "learning_rate": 3.995796551235016e-06, "loss": 0.93743312, "num_input_tokens_seen": 17822190, "step": 832, "time_per_iteration": 2.7498815059661865 }, { "auxiliary_loss_clip": 0.01194428, "auxiliary_loss_mlp": 0.01081719, "balance_loss_clip": 1.05826366, "balance_loss_mlp": 1.05667353, "epoch": 0.050082669472418455, "flos": 45660273367680.0, "grad_norm": 1.887029338258115, "language_loss": 0.83167893, "learning_rate": 3.9957712763032974e-06, "loss": 0.85444039, "num_input_tokens_seen": 17846915, "step": 833, "time_per_iteration": 2.863208770751953 }, { "auxiliary_loss_clip": 0.01199525, "auxiliary_loss_mlp": 0.01061962, "balance_loss_clip": 1.05888343, "balance_loss_mlp": 1.03468657, "epoch": 0.05014279272508643, "flos": 37962067363200.0, "grad_norm": 2.8753922020214033, "language_loss": 0.82409853, "learning_rate": 3.995745925691733e-06, "loss": 0.84671336, "num_input_tokens_seen": 17867270, "step": 834, "time_per_iteration": 2.7868030071258545 }, { "auxiliary_loss_clip": 0.01216246, "auxiliary_loss_mlp": 0.01064427, "balance_loss_clip": 1.06272483, "balance_loss_mlp": 1.03672278, "epoch": 0.0502029159777544, "flos": 20996179516800.0, "grad_norm": 2.2306487397141646, "language_loss": 0.92186153, "learning_rate": 3.995720499401282e-06, "loss": 0.94466823, "num_input_tokens_seen": 17884880, "step": 835, "time_per_iteration": 2.6224496364593506 }, { "auxiliary_loss_clip": 0.01229494, "auxiliary_loss_mlp": 0.01074922, "balance_loss_clip": 1.06143415, "balance_loss_mlp": 1.0464313, "epoch": 0.050263039230422364, "flos": 15888066393600.0, "grad_norm": 2.196832783808158, "language_loss": 0.76143622, "learning_rate": 3.995694997432911e-06, "loss": 0.78448039, "num_input_tokens_seen": 17903695, "step": 836, "time_per_iteration": 2.5648462772369385 }, { "auxiliary_loss_clip": 0.01211162, "auxiliary_loss_mlp": 0.01075977, "balance_loss_clip": 1.06259084, "balance_loss_mlp": 1.04992962, "epoch": 0.050323162483090336, "flos": 23732752060800.0, "grad_norm": 2.100773352560791, "language_loss": 0.83627856, "learning_rate": 3.9956694197875855e-06, "loss": 0.85914999, "num_input_tokens_seen": 17920745, "step": 837, "time_per_iteration": 2.7420156002044678 }, { "auxiliary_loss_clip": 0.01198815, "auxiliary_loss_mlp": 0.0078439, "balance_loss_clip": 1.06345344, "balance_loss_mlp": 1.00053763, "epoch": 0.0503832857357583, "flos": 20266223328000.0, "grad_norm": 2.1353335821274477, "language_loss": 0.72857559, "learning_rate": 3.995643766466275e-06, "loss": 0.7484076, "num_input_tokens_seen": 17938220, "step": 838, "time_per_iteration": 2.679177761077881 }, { "auxiliary_loss_clip": 0.01189223, "auxiliary_loss_mlp": 0.01071526, "balance_loss_clip": 1.05415273, "balance_loss_mlp": 1.04510927, "epoch": 0.05044340898842627, "flos": 17785011548160.0, "grad_norm": 1.8138261016039334, "language_loss": 0.83462799, "learning_rate": 3.995618037469953e-06, "loss": 0.85723549, "num_input_tokens_seen": 17957325, "step": 839, "time_per_iteration": 2.69063663482666 }, { "auxiliary_loss_clip": 0.01220356, "auxiliary_loss_mlp": 0.01069331, "balance_loss_clip": 1.05991399, "balance_loss_mlp": 1.04411805, "epoch": 0.050503532241094246, "flos": 22966526113920.0, "grad_norm": 1.7513762525269907, "language_loss": 0.85775483, "learning_rate": 3.995592232799595e-06, "loss": 0.88065171, "num_input_tokens_seen": 17975875, "step": 840, "time_per_iteration": 2.6477303504943848 }, { "auxiliary_loss_clip": 0.01192112, "auxiliary_loss_mlp": 0.01064377, "balance_loss_clip": 1.05451894, "balance_loss_mlp": 1.036291, "epoch": 0.05056365549376221, "flos": 22776989022720.0, "grad_norm": 1.7956760046069329, "language_loss": 0.9457823, "learning_rate": 3.99556635245618e-06, "loss": 0.96834719, "num_input_tokens_seen": 17994340, "step": 841, "time_per_iteration": 2.8354220390319824 }, { "auxiliary_loss_clip": 0.0122473, "auxiliary_loss_mlp": 0.01070125, "balance_loss_clip": 1.06219172, "balance_loss_mlp": 1.04329097, "epoch": 0.05062377874643018, "flos": 30916968399360.0, "grad_norm": 2.3106044659054104, "language_loss": 0.77566791, "learning_rate": 3.995540396440688e-06, "loss": 0.79861641, "num_input_tokens_seen": 18015260, "step": 842, "time_per_iteration": 2.6909749507904053 }, { "auxiliary_loss_clip": 0.01214637, "auxiliary_loss_mlp": 0.01071033, "balance_loss_clip": 1.06270838, "balance_loss_mlp": 1.04391265, "epoch": 0.05068390199909815, "flos": 19647159402240.0, "grad_norm": 2.8849837971101864, "language_loss": 0.78126526, "learning_rate": 3.995514364754105e-06, "loss": 0.80412203, "num_input_tokens_seen": 18033960, "step": 843, "time_per_iteration": 2.6534156799316406 }, { "auxiliary_loss_clip": 0.01212948, "auxiliary_loss_mlp": 0.01063612, "balance_loss_clip": 1.06317043, "balance_loss_mlp": 1.03894806, "epoch": 0.05074402525176612, "flos": 37962103276800.0, "grad_norm": 1.9320015451631862, "language_loss": 0.83256191, "learning_rate": 3.995488257397417e-06, "loss": 0.85532749, "num_input_tokens_seen": 18056700, "step": 844, "time_per_iteration": 2.7682149410247803 }, { "auxiliary_loss_clip": 0.01216308, "auxiliary_loss_mlp": 0.01067162, "balance_loss_clip": 1.06307864, "balance_loss_mlp": 1.04138875, "epoch": 0.05080414850443409, "flos": 22054610603520.0, "grad_norm": 2.113957107027846, "language_loss": 0.77108061, "learning_rate": 3.995462074371614e-06, "loss": 0.79391527, "num_input_tokens_seen": 18075815, "step": 845, "time_per_iteration": 2.6720399856567383 }, { "auxiliary_loss_clip": 0.01206643, "auxiliary_loss_mlp": 0.01065522, "balance_loss_clip": 1.05881417, "balance_loss_mlp": 1.03885484, "epoch": 0.05086427175710206, "flos": 20225787592320.0, "grad_norm": 1.8497392628450484, "language_loss": 0.87773871, "learning_rate": 3.99543581567769e-06, "loss": 0.90046036, "num_input_tokens_seen": 18095095, "step": 846, "time_per_iteration": 2.696049690246582 }, { "auxiliary_loss_clip": 0.01206291, "auxiliary_loss_mlp": 0.01069231, "balance_loss_clip": 1.06204462, "balance_loss_mlp": 1.04330277, "epoch": 0.05092439500977003, "flos": 15159223526400.0, "grad_norm": 1.695550491545423, "language_loss": 0.87364423, "learning_rate": 3.9954094813166394e-06, "loss": 0.89639944, "num_input_tokens_seen": 18112675, "step": 847, "time_per_iteration": 2.666907548904419 }, { "auxiliary_loss_clip": 0.01175052, "auxiliary_loss_mlp": 0.01071976, "balance_loss_clip": 1.06267309, "balance_loss_mlp": 1.0447005, "epoch": 0.050984518262437994, "flos": 22055149307520.0, "grad_norm": 2.5687168450386637, "language_loss": 0.81878662, "learning_rate": 3.995383071289462e-06, "loss": 0.84125686, "num_input_tokens_seen": 18130745, "step": 848, "time_per_iteration": 2.782135486602783 }, { "auxiliary_loss_clip": 0.0122638, "auxiliary_loss_mlp": 0.01071388, "balance_loss_clip": 1.06619906, "balance_loss_mlp": 1.04544854, "epoch": 0.05104464151510597, "flos": 30225329043840.0, "grad_norm": 1.678404869397893, "language_loss": 0.87187904, "learning_rate": 3.995356585597158e-06, "loss": 0.89485669, "num_input_tokens_seen": 18152410, "step": 849, "time_per_iteration": 2.787992000579834 }, { "auxiliary_loss_clip": 0.01220251, "auxiliary_loss_mlp": 0.0106131, "balance_loss_clip": 1.06049275, "balance_loss_mlp": 1.03545308, "epoch": 0.05110476476777394, "flos": 18332900674560.0, "grad_norm": 2.125711462362114, "language_loss": 0.8315587, "learning_rate": 3.995330024240732e-06, "loss": 0.85437429, "num_input_tokens_seen": 18170870, "step": 850, "time_per_iteration": 2.6548752784729004 }, { "auxiliary_loss_clip": 0.01210598, "auxiliary_loss_mlp": 0.01063491, "balance_loss_clip": 1.06061506, "balance_loss_mlp": 1.0379566, "epoch": 0.051164888020441904, "flos": 37998732170880.0, "grad_norm": 2.2115645013354253, "language_loss": 0.65423882, "learning_rate": 3.995303387221192e-06, "loss": 0.67697972, "num_input_tokens_seen": 18191555, "step": 851, "time_per_iteration": 2.817197322845459 }, { "auxiliary_loss_clip": 0.0120566, "auxiliary_loss_mlp": 0.01075745, "balance_loss_clip": 1.05822444, "balance_loss_mlp": 1.04761147, "epoch": 0.051225011273109876, "flos": 23038634666880.0, "grad_norm": 2.3720786299251073, "language_loss": 0.83587611, "learning_rate": 3.995276674539547e-06, "loss": 0.8586902, "num_input_tokens_seen": 18208620, "step": 852, "time_per_iteration": 2.685727119445801 }, { "auxiliary_loss_clip": 0.01193575, "auxiliary_loss_mlp": 0.01074152, "balance_loss_clip": 1.05924761, "balance_loss_mlp": 1.04737723, "epoch": 0.05128513452577785, "flos": 18259822454400.0, "grad_norm": 2.1832763559951234, "language_loss": 0.80761266, "learning_rate": 3.995249886196811e-06, "loss": 0.8302899, "num_input_tokens_seen": 18226370, "step": 853, "time_per_iteration": 2.6078240871429443 }, { "auxiliary_loss_clip": 0.01222394, "auxiliary_loss_mlp": 0.01065268, "balance_loss_clip": 1.06223083, "balance_loss_mlp": 1.03780222, "epoch": 0.05134525777844581, "flos": 27198957571200.0, "grad_norm": 1.8511550328562763, "language_loss": 0.75617325, "learning_rate": 3.995223022193999e-06, "loss": 0.77904987, "num_input_tokens_seen": 18247075, "step": 854, "time_per_iteration": 2.633543014526367 }, { "auxiliary_loss_clip": 0.01202415, "auxiliary_loss_mlp": 0.01065973, "balance_loss_clip": 1.06141627, "balance_loss_mlp": 1.03828049, "epoch": 0.051405381031113785, "flos": 28362247436160.0, "grad_norm": 2.04057054323539, "language_loss": 0.81722355, "learning_rate": 3.99519608253213e-06, "loss": 0.83990741, "num_input_tokens_seen": 18265680, "step": 855, "time_per_iteration": 2.760880708694458 }, { "auxiliary_loss_clip": 0.01076712, "auxiliary_loss_mlp": 0.00762392, "balance_loss_clip": 1.0358243, "balance_loss_mlp": 1.00074518, "epoch": 0.05146550428378175, "flos": 65618169327360.0, "grad_norm": 0.9894594919315515, "language_loss": 0.65634769, "learning_rate": 3.995169067212227e-06, "loss": 0.67473871, "num_input_tokens_seen": 18327015, "step": 856, "time_per_iteration": 6.271182298660278 }, { "auxiliary_loss_clip": 0.01194232, "auxiliary_loss_mlp": 0.01056626, "balance_loss_clip": 1.05972147, "balance_loss_mlp": 1.02994716, "epoch": 0.05152562753644972, "flos": 22054861998720.0, "grad_norm": 1.8001295724347575, "language_loss": 0.77139348, "learning_rate": 3.9951419762353116e-06, "loss": 0.79390204, "num_input_tokens_seen": 18345235, "step": 857, "time_per_iteration": 4.905239582061768 }, { "auxiliary_loss_clip": 0.01183581, "auxiliary_loss_mlp": 0.01059685, "balance_loss_clip": 1.05640614, "balance_loss_mlp": 1.03291047, "epoch": 0.051585750789117694, "flos": 18509544783360.0, "grad_norm": 2.111656321737554, "language_loss": 0.89194518, "learning_rate": 3.995114809602412e-06, "loss": 0.91437781, "num_input_tokens_seen": 18362350, "step": 858, "time_per_iteration": 2.7349045276641846 }, { "auxiliary_loss_clip": 0.01196113, "auxiliary_loss_mlp": 0.01060739, "balance_loss_clip": 1.06114125, "balance_loss_mlp": 1.03398848, "epoch": 0.05164587404178566, "flos": 23730238108800.0, "grad_norm": 2.030377637624243, "language_loss": 0.75684321, "learning_rate": 3.9950875673145605e-06, "loss": 0.77941179, "num_input_tokens_seen": 18383390, "step": 859, "time_per_iteration": 2.7611751556396484 }, { "auxiliary_loss_clip": 0.01186313, "auxiliary_loss_mlp": 0.0107269, "balance_loss_clip": 1.05708003, "balance_loss_mlp": 1.04354358, "epoch": 0.05170599729445363, "flos": 16252882876800.0, "grad_norm": 2.134655488493178, "language_loss": 0.91122925, "learning_rate": 3.995060249372788e-06, "loss": 0.93381929, "num_input_tokens_seen": 18399220, "step": 860, "time_per_iteration": 2.666740894317627 }, { "auxiliary_loss_clip": 0.0122488, "auxiliary_loss_mlp": 0.01060586, "balance_loss_clip": 1.06531346, "balance_loss_mlp": 1.03536153, "epoch": 0.0517661205471216, "flos": 23985922095360.0, "grad_norm": 1.7954568874114027, "language_loss": 0.82378531, "learning_rate": 3.99503285577813e-06, "loss": 0.84663993, "num_input_tokens_seen": 18419005, "step": 861, "time_per_iteration": 2.6337814331054688 }, { "auxiliary_loss_clip": 0.01198486, "auxiliary_loss_mlp": 0.01060236, "balance_loss_clip": 1.06147969, "balance_loss_mlp": 1.03437924, "epoch": 0.05182624379978957, "flos": 29277718392960.0, "grad_norm": 2.5785699637959776, "language_loss": 0.78664875, "learning_rate": 3.995005386531627e-06, "loss": 0.80923599, "num_input_tokens_seen": 18440550, "step": 862, "time_per_iteration": 2.7570109367370605 }, { "auxiliary_loss_clip": 0.01189664, "auxiliary_loss_mlp": 0.01070327, "balance_loss_clip": 1.058797, "balance_loss_mlp": 1.04547238, "epoch": 0.05188636705245754, "flos": 24170826332160.0, "grad_norm": 1.7880881456146414, "language_loss": 0.89090264, "learning_rate": 3.9949778416343195e-06, "loss": 0.91350257, "num_input_tokens_seen": 18461950, "step": 863, "time_per_iteration": 2.7118866443634033 }, { "auxiliary_loss_clip": 0.01201772, "auxiliary_loss_mlp": 0.01064316, "balance_loss_clip": 1.06488204, "balance_loss_mlp": 1.0369451, "epoch": 0.051946490305125506, "flos": 26760703731840.0, "grad_norm": 2.081656150811602, "language_loss": 0.76119763, "learning_rate": 3.9949502210872525e-06, "loss": 0.78385854, "num_input_tokens_seen": 18480555, "step": 864, "time_per_iteration": 2.6946637630462646 }, { "auxiliary_loss_clip": 0.01186585, "auxiliary_loss_mlp": 0.01067959, "balance_loss_clip": 1.05559874, "balance_loss_mlp": 1.04046965, "epoch": 0.05200661355779348, "flos": 21502519585920.0, "grad_norm": 1.9374308734697678, "language_loss": 0.7908361, "learning_rate": 3.994922524891474e-06, "loss": 0.81338149, "num_input_tokens_seen": 18499645, "step": 865, "time_per_iteration": 2.7700579166412354 }, { "auxiliary_loss_clip": 0.01210067, "auxiliary_loss_mlp": 0.01067568, "balance_loss_clip": 1.06164694, "balance_loss_mlp": 1.04152083, "epoch": 0.05206673681046144, "flos": 18114492026880.0, "grad_norm": 2.269489500676155, "language_loss": 0.85860598, "learning_rate": 3.994894753048032e-06, "loss": 0.88138229, "num_input_tokens_seen": 18516810, "step": 866, "time_per_iteration": 2.659614086151123 }, { "auxiliary_loss_clip": 0.01186536, "auxiliary_loss_mlp": 0.01070465, "balance_loss_clip": 1.06327558, "balance_loss_mlp": 1.04371393, "epoch": 0.052126860063129415, "flos": 17524191916800.0, "grad_norm": 2.1733876112564565, "language_loss": 0.87495244, "learning_rate": 3.9948669055579815e-06, "loss": 0.89752245, "num_input_tokens_seen": 18532510, "step": 867, "time_per_iteration": 2.740238904953003 }, { "auxiliary_loss_clip": 0.01167585, "auxiliary_loss_mlp": 0.01078445, "balance_loss_clip": 1.05696058, "balance_loss_mlp": 1.05437636, "epoch": 0.05218698331579739, "flos": 32598054771840.0, "grad_norm": 1.8498678854952728, "language_loss": 0.63917863, "learning_rate": 3.9948389824223785e-06, "loss": 0.66163892, "num_input_tokens_seen": 18557380, "step": 868, "time_per_iteration": 2.9310383796691895 }, { "auxiliary_loss_clip": 0.01225135, "auxiliary_loss_mlp": 0.01069894, "balance_loss_clip": 1.06287289, "balance_loss_mlp": 1.04173636, "epoch": 0.05224710656846535, "flos": 22127293774080.0, "grad_norm": 2.742912036955754, "language_loss": 0.83379138, "learning_rate": 3.994810983642281e-06, "loss": 0.85674161, "num_input_tokens_seen": 18575720, "step": 869, "time_per_iteration": 2.6453137397766113 }, { "auxiliary_loss_clip": 0.01216406, "auxiliary_loss_mlp": 0.01056401, "balance_loss_clip": 1.0645746, "balance_loss_mlp": 1.03053236, "epoch": 0.052307229821133325, "flos": 11145092976000.0, "grad_norm": 2.188953802542244, "language_loss": 0.87822217, "learning_rate": 3.994782909218751e-06, "loss": 0.90095031, "num_input_tokens_seen": 18592185, "step": 870, "time_per_iteration": 2.7044875621795654 }, { "auxiliary_loss_clip": 0.01226316, "auxiliary_loss_mlp": 0.01064746, "balance_loss_clip": 1.06603277, "balance_loss_mlp": 1.03965199, "epoch": 0.05236735307380129, "flos": 19128070005120.0, "grad_norm": 1.975067156516721, "language_loss": 0.80651748, "learning_rate": 3.994754759152854e-06, "loss": 0.82942802, "num_input_tokens_seen": 18609560, "step": 871, "time_per_iteration": 2.6892175674438477 }, { "auxiliary_loss_clip": 0.0119502, "auxiliary_loss_mlp": 0.01064309, "balance_loss_clip": 1.0650804, "balance_loss_mlp": 1.0396452, "epoch": 0.05242747632646926, "flos": 20960663944320.0, "grad_norm": 1.7402390708810018, "language_loss": 0.81330585, "learning_rate": 3.994726533445656e-06, "loss": 0.83589917, "num_input_tokens_seen": 18629405, "step": 872, "time_per_iteration": 2.8044185638427734 }, { "auxiliary_loss_clip": 0.0107835, "auxiliary_loss_mlp": 0.01020667, "balance_loss_clip": 1.03168392, "balance_loss_mlp": 1.01515913, "epoch": 0.052487599579137234, "flos": 65020542842880.0, "grad_norm": 0.883483589670371, "language_loss": 0.61589074, "learning_rate": 3.9946982320982274e-06, "loss": 0.63688087, "num_input_tokens_seen": 18681480, "step": 873, "time_per_iteration": 3.1711297035217285 }, { "auxiliary_loss_clip": 0.01197438, "auxiliary_loss_mlp": 0.01056818, "balance_loss_clip": 1.06202292, "balance_loss_mlp": 1.03120041, "epoch": 0.0525477228318052, "flos": 23288859786240.0, "grad_norm": 2.1995328011281488, "language_loss": 0.88965189, "learning_rate": 3.994669855111643e-06, "loss": 0.91219449, "num_input_tokens_seen": 18700390, "step": 874, "time_per_iteration": 2.8240153789520264 }, { "auxiliary_loss_clip": 0.01197247, "auxiliary_loss_mlp": 0.01063458, "balance_loss_clip": 1.0614326, "balance_loss_mlp": 1.03682709, "epoch": 0.05260784608447317, "flos": 32230221546240.0, "grad_norm": 1.858649685360537, "language_loss": 0.74537963, "learning_rate": 3.994641402486977e-06, "loss": 0.76798666, "num_input_tokens_seen": 18721280, "step": 875, "time_per_iteration": 2.9111931324005127 }, { "auxiliary_loss_clip": 0.01206205, "auxiliary_loss_mlp": 0.01058912, "balance_loss_clip": 1.06306934, "balance_loss_mlp": 1.03210175, "epoch": 0.052667969337141136, "flos": 24463211040000.0, "grad_norm": 1.7697857141051123, "language_loss": 0.92843151, "learning_rate": 3.99461287422531e-06, "loss": 0.95108265, "num_input_tokens_seen": 18741545, "step": 876, "time_per_iteration": 2.800252676010132 }, { "auxiliary_loss_clip": 0.01100151, "auxiliary_loss_mlp": 0.01006341, "balance_loss_clip": 1.02669787, "balance_loss_mlp": 1.0020256, "epoch": 0.05272809258980911, "flos": 57784329567360.0, "grad_norm": 0.8383495859932864, "language_loss": 0.62929404, "learning_rate": 3.994584270327722e-06, "loss": 0.65035897, "num_input_tokens_seen": 18801400, "step": 877, "time_per_iteration": 3.2090368270874023 }, { "auxiliary_loss_clip": 0.01200578, "auxiliary_loss_mlp": 0.0106702, "balance_loss_clip": 1.06150424, "balance_loss_mlp": 1.03931606, "epoch": 0.05278821584247708, "flos": 17420805596160.0, "grad_norm": 2.042786693643985, "language_loss": 0.85383844, "learning_rate": 3.994555590795299e-06, "loss": 0.87651443, "num_input_tokens_seen": 18819670, "step": 878, "time_per_iteration": 2.823835849761963 }, { "auxiliary_loss_clip": 0.0122514, "auxiliary_loss_mlp": 0.01061117, "balance_loss_clip": 1.0635035, "balance_loss_mlp": 1.03551078, "epoch": 0.052848339095145046, "flos": 26137258346880.0, "grad_norm": 1.7462717669338121, "language_loss": 0.83076209, "learning_rate": 3.9945268356291275e-06, "loss": 0.8536247, "num_input_tokens_seen": 18840580, "step": 879, "time_per_iteration": 2.743673086166382 }, { "auxiliary_loss_clip": 0.0119139, "auxiliary_loss_mlp": 0.01066471, "balance_loss_clip": 1.06152987, "balance_loss_mlp": 1.04013824, "epoch": 0.05290846234781302, "flos": 16472081623680.0, "grad_norm": 1.9601789563010765, "language_loss": 0.84284604, "learning_rate": 3.9944980048302985e-06, "loss": 0.86542469, "num_input_tokens_seen": 18859295, "step": 880, "time_per_iteration": 2.7560529708862305 }, { "auxiliary_loss_clip": 0.01184956, "auxiliary_loss_mlp": 0.01065063, "balance_loss_clip": 1.05969453, "balance_loss_mlp": 1.03887296, "epoch": 0.05296858560048098, "flos": 19865173000320.0, "grad_norm": 2.4477328752698564, "language_loss": 0.86870736, "learning_rate": 3.994469098399906e-06, "loss": 0.89120758, "num_input_tokens_seen": 18877485, "step": 881, "time_per_iteration": 2.855395555496216 }, { "auxiliary_loss_clip": 0.01207858, "auxiliary_loss_mlp": 0.01070235, "balance_loss_clip": 1.05984437, "balance_loss_mlp": 1.04238808, "epoch": 0.053028708853148955, "flos": 24388588535040.0, "grad_norm": 1.7611192020675561, "language_loss": 0.87967896, "learning_rate": 3.994440116339046e-06, "loss": 0.90245986, "num_input_tokens_seen": 18898275, "step": 882, "time_per_iteration": 2.8480119705200195 }, { "auxiliary_loss_clip": 0.01224906, "auxiliary_loss_mlp": 0.01057944, "balance_loss_clip": 1.06268644, "balance_loss_mlp": 1.03059733, "epoch": 0.05308883210581693, "flos": 36393166143360.0, "grad_norm": 2.3555018967788635, "language_loss": 0.69469339, "learning_rate": 3.994411058648816e-06, "loss": 0.71752191, "num_input_tokens_seen": 18920665, "step": 883, "time_per_iteration": 2.8808236122131348 }, { "auxiliary_loss_clip": 0.01166777, "auxiliary_loss_mlp": 0.01063991, "balance_loss_clip": 1.05333591, "balance_loss_mlp": 1.03855157, "epoch": 0.05314895535848489, "flos": 22855095146880.0, "grad_norm": 2.039016812023355, "language_loss": 0.76100993, "learning_rate": 3.994381925330319e-06, "loss": 0.78331757, "num_input_tokens_seen": 18939835, "step": 884, "time_per_iteration": 2.8462212085723877 }, { "auxiliary_loss_clip": 0.01172569, "auxiliary_loss_mlp": 0.01066856, "balance_loss_clip": 1.06269383, "balance_loss_mlp": 1.04147613, "epoch": 0.053209078611152864, "flos": 12860330204160.0, "grad_norm": 1.9865896222141148, "language_loss": 0.86195529, "learning_rate": 3.994352716384659e-06, "loss": 0.88434947, "num_input_tokens_seen": 18958405, "step": 885, "time_per_iteration": 2.7825753688812256 }, { "auxiliary_loss_clip": 0.0118405, "auxiliary_loss_mlp": 0.01068976, "balance_loss_clip": 1.05229151, "balance_loss_mlp": 1.04203486, "epoch": 0.05326920186382083, "flos": 12164596698240.0, "grad_norm": 2.608647457747672, "language_loss": 0.85971159, "learning_rate": 3.994323431812945e-06, "loss": 0.88224185, "num_input_tokens_seen": 18975445, "step": 886, "time_per_iteration": 2.7393639087677 }, { "auxiliary_loss_clip": 0.0117343, "auxiliary_loss_mlp": 0.01065966, "balance_loss_clip": 1.05620933, "balance_loss_mlp": 1.03879774, "epoch": 0.0533293251164888, "flos": 22704485420160.0, "grad_norm": 2.040002880698432, "language_loss": 0.8961553, "learning_rate": 3.994294071616286e-06, "loss": 0.91854936, "num_input_tokens_seen": 18991930, "step": 887, "time_per_iteration": 2.8606581687927246 }, { "auxiliary_loss_clip": 0.01144444, "auxiliary_loss_mlp": 0.01072438, "balance_loss_clip": 1.04453194, "balance_loss_mlp": 1.04411352, "epoch": 0.053389448369156774, "flos": 26940939200640.0, "grad_norm": 2.062562868466936, "language_loss": 0.74852538, "learning_rate": 3.994264635795796e-06, "loss": 0.77069414, "num_input_tokens_seen": 19009790, "step": 888, "time_per_iteration": 2.8675312995910645 }, { "auxiliary_loss_clip": 0.01164085, "auxiliary_loss_mlp": 0.01072324, "balance_loss_clip": 1.05659473, "balance_loss_mlp": 1.04525173, "epoch": 0.05344957162182474, "flos": 25556331686400.0, "grad_norm": 1.7884280759117637, "language_loss": 0.88440782, "learning_rate": 3.994235124352592e-06, "loss": 0.9067719, "num_input_tokens_seen": 19030170, "step": 889, "time_per_iteration": 2.9419636726379395 }, { "auxiliary_loss_clip": 0.0121577, "auxiliary_loss_mlp": 0.0105125, "balance_loss_clip": 1.06085157, "balance_loss_mlp": 1.02607334, "epoch": 0.05350969487449271, "flos": 19719591177600.0, "grad_norm": 1.9333059575084248, "language_loss": 0.88386381, "learning_rate": 3.994205537287791e-06, "loss": 0.90653402, "num_input_tokens_seen": 19048075, "step": 890, "time_per_iteration": 2.7030327320098877 }, { "auxiliary_loss_clip": 0.01195034, "auxiliary_loss_mlp": 0.01069003, "balance_loss_clip": 1.05835462, "balance_loss_mlp": 1.04450595, "epoch": 0.053569818127160676, "flos": 27016351804800.0, "grad_norm": 2.435204176890571, "language_loss": 0.93450797, "learning_rate": 3.994175874602517e-06, "loss": 0.95714831, "num_input_tokens_seen": 19067465, "step": 891, "time_per_iteration": 2.81527042388916 }, { "auxiliary_loss_clip": 0.01190797, "auxiliary_loss_mlp": 0.01066955, "balance_loss_clip": 1.05605483, "balance_loss_mlp": 1.03909576, "epoch": 0.05362994137982865, "flos": 13188338225280.0, "grad_norm": 2.3400199158693087, "language_loss": 0.71625131, "learning_rate": 3.994146136297893e-06, "loss": 0.73882878, "num_input_tokens_seen": 19085505, "step": 892, "time_per_iteration": 2.825984239578247 }, { "auxiliary_loss_clip": 0.01191313, "auxiliary_loss_mlp": 0.0078394, "balance_loss_clip": 1.05727172, "balance_loss_mlp": 1.00024366, "epoch": 0.05369006463249662, "flos": 28658008022400.0, "grad_norm": 1.6058100223173828, "language_loss": 0.82331586, "learning_rate": 3.994116322375049e-06, "loss": 0.84306836, "num_input_tokens_seen": 19104360, "step": 893, "time_per_iteration": 2.8618266582489014 }, { "auxiliary_loss_clip": 0.01192677, "auxiliary_loss_mlp": 0.01063531, "balance_loss_clip": 1.0572021, "balance_loss_mlp": 1.03850877, "epoch": 0.053750187885164585, "flos": 28913153304960.0, "grad_norm": 2.0228714136718122, "language_loss": 0.82052565, "learning_rate": 3.994086432835114e-06, "loss": 0.84308773, "num_input_tokens_seen": 19124680, "step": 894, "time_per_iteration": 2.8347885608673096 }, { "auxiliary_loss_clip": 0.0120111, "auxiliary_loss_mlp": 0.01065233, "balance_loss_clip": 1.0570271, "balance_loss_mlp": 1.03997254, "epoch": 0.05381031113783256, "flos": 15158828476800.0, "grad_norm": 2.260594705980758, "language_loss": 0.76133072, "learning_rate": 3.994056467679221e-06, "loss": 0.78399414, "num_input_tokens_seen": 19142895, "step": 895, "time_per_iteration": 2.7288858890533447 }, { "auxiliary_loss_clip": 0.01200143, "auxiliary_loss_mlp": 0.01060588, "balance_loss_clip": 1.06422663, "balance_loss_mlp": 1.03547084, "epoch": 0.05387043439050053, "flos": 21835232288640.0, "grad_norm": 2.0450623179174974, "language_loss": 0.86767507, "learning_rate": 3.9940264269085065e-06, "loss": 0.89028239, "num_input_tokens_seen": 19163125, "step": 896, "time_per_iteration": 4.404265642166138 }, { "auxiliary_loss_clip": 0.0122203, "auxiliary_loss_mlp": 0.00782931, "balance_loss_clip": 1.06062579, "balance_loss_mlp": 1.0002867, "epoch": 0.053930557643168495, "flos": 17310308382720.0, "grad_norm": 3.0866230440609805, "language_loss": 0.8797363, "learning_rate": 3.9939963105241115e-06, "loss": 0.89978594, "num_input_tokens_seen": 19179385, "step": 897, "time_per_iteration": 4.843130588531494 }, { "auxiliary_loss_clip": 0.01201639, "auxiliary_loss_mlp": 0.01063724, "balance_loss_clip": 1.05896854, "balance_loss_mlp": 1.03658032, "epoch": 0.05399068089583647, "flos": 17348481561600.0, "grad_norm": 1.8270040910241792, "language_loss": 0.90170419, "learning_rate": 3.993966118527175e-06, "loss": 0.92435783, "num_input_tokens_seen": 19198725, "step": 898, "time_per_iteration": 2.695235252380371 }, { "auxiliary_loss_clip": 0.01200189, "auxiliary_loss_mlp": 0.01076438, "balance_loss_clip": 1.05787873, "balance_loss_mlp": 1.05105805, "epoch": 0.05405080414850443, "flos": 17486952491520.0, "grad_norm": 2.793625116693953, "language_loss": 0.91544139, "learning_rate": 3.993935850918845e-06, "loss": 0.93820769, "num_input_tokens_seen": 19212380, "step": 899, "time_per_iteration": 2.7509548664093018 }, { "auxiliary_loss_clip": 0.01186479, "auxiliary_loss_mlp": 0.01068594, "balance_loss_clip": 1.05614042, "balance_loss_mlp": 1.04154527, "epoch": 0.054110927401172404, "flos": 24496787278080.0, "grad_norm": 1.983572968760697, "language_loss": 0.75742769, "learning_rate": 3.9939055077002665e-06, "loss": 0.77997845, "num_input_tokens_seen": 19232235, "step": 900, "time_per_iteration": 2.771371364593506 }, { "auxiliary_loss_clip": 0.01211506, "auxiliary_loss_mlp": 0.01058176, "balance_loss_clip": 1.05839145, "balance_loss_mlp": 1.03401244, "epoch": 0.054171050653840376, "flos": 22930040874240.0, "grad_norm": 2.192527627735503, "language_loss": 0.74331856, "learning_rate": 3.993875088872592e-06, "loss": 0.76601535, "num_input_tokens_seen": 19251460, "step": 901, "time_per_iteration": 2.859912157058716 }, { "auxiliary_loss_clip": 0.01177502, "auxiliary_loss_mlp": 0.01065445, "balance_loss_clip": 1.0569309, "balance_loss_mlp": 1.04166329, "epoch": 0.05423117390650834, "flos": 12933192942720.0, "grad_norm": 2.352700712836257, "language_loss": 0.85287452, "learning_rate": 3.9938445944369745e-06, "loss": 0.87530404, "num_input_tokens_seen": 19269060, "step": 902, "time_per_iteration": 2.7940642833709717 }, { "auxiliary_loss_clip": 0.01161069, "auxiliary_loss_mlp": 0.01066664, "balance_loss_clip": 1.04903233, "balance_loss_mlp": 1.04112983, "epoch": 0.05429129715917631, "flos": 19901335017600.0, "grad_norm": 1.9620711230312637, "language_loss": 0.86385572, "learning_rate": 3.993814024394569e-06, "loss": 0.88613302, "num_input_tokens_seen": 19288620, "step": 903, "time_per_iteration": 2.9258980751037598 }, { "auxiliary_loss_clip": 0.0121005, "auxiliary_loss_mlp": 0.01059616, "balance_loss_clip": 1.06094384, "balance_loss_mlp": 1.03534508, "epoch": 0.05435142041184428, "flos": 16908611610240.0, "grad_norm": 2.175127974944855, "language_loss": 0.74927866, "learning_rate": 3.993783378746537e-06, "loss": 0.7719754, "num_input_tokens_seen": 19306615, "step": 904, "time_per_iteration": 2.7239954471588135 }, { "auxiliary_loss_clip": 0.01208402, "auxiliary_loss_mlp": 0.01067543, "balance_loss_clip": 1.06052148, "balance_loss_mlp": 1.04325962, "epoch": 0.05441154366451225, "flos": 23948323534080.0, "grad_norm": 2.5191963984804535, "language_loss": 0.85946918, "learning_rate": 3.993752657494039e-06, "loss": 0.88222867, "num_input_tokens_seen": 19321680, "step": 905, "time_per_iteration": 2.693896532058716 }, { "auxiliary_loss_clip": 0.01198232, "auxiliary_loss_mlp": 0.01078072, "balance_loss_clip": 1.06483209, "balance_loss_mlp": 1.05400348, "epoch": 0.05447166691718022, "flos": 19975382904960.0, "grad_norm": 1.7753581401878566, "language_loss": 0.74413162, "learning_rate": 3.993721860638241e-06, "loss": 0.7668947, "num_input_tokens_seen": 19339760, "step": 906, "time_per_iteration": 2.6679019927978516 }, { "auxiliary_loss_clip": 0.01192373, "auxiliary_loss_mlp": 0.01064381, "balance_loss_clip": 1.05954027, "balance_loss_mlp": 1.0397284, "epoch": 0.05453179016984819, "flos": 24936513575040.0, "grad_norm": 2.3037248114268896, "language_loss": 0.87340188, "learning_rate": 3.993690988180309e-06, "loss": 0.89596951, "num_input_tokens_seen": 19359585, "step": 907, "time_per_iteration": 2.7363240718841553 }, { "auxiliary_loss_clip": 0.01205519, "auxiliary_loss_mlp": 0.01068463, "balance_loss_clip": 1.0616293, "balance_loss_mlp": 1.04332149, "epoch": 0.05459191342251616, "flos": 18115102558080.0, "grad_norm": 1.6666873589767146, "language_loss": 0.86928803, "learning_rate": 3.9936600401214165e-06, "loss": 0.89202785, "num_input_tokens_seen": 19378590, "step": 908, "time_per_iteration": 2.6266026496887207 }, { "auxiliary_loss_clip": 0.01198848, "auxiliary_loss_mlp": 0.01067336, "balance_loss_clip": 1.05974221, "balance_loss_mlp": 1.04107404, "epoch": 0.054652036675184125, "flos": 19208295031680.0, "grad_norm": 2.1282794409977215, "language_loss": 0.89792144, "learning_rate": 3.9936290164627345e-06, "loss": 0.92058325, "num_input_tokens_seen": 19397910, "step": 909, "time_per_iteration": 2.7163166999816895 }, { "auxiliary_loss_clip": 0.01200393, "auxiliary_loss_mlp": 0.01073374, "balance_loss_clip": 1.06157839, "balance_loss_mlp": 1.04742169, "epoch": 0.0547121599278521, "flos": 16325745615360.0, "grad_norm": 2.095924869989121, "language_loss": 0.70949811, "learning_rate": 3.99359791720544e-06, "loss": 0.73223579, "num_input_tokens_seen": 19415950, "step": 910, "time_per_iteration": 2.6697354316711426 }, { "auxiliary_loss_clip": 0.01187784, "auxiliary_loss_mlp": 0.01054671, "balance_loss_clip": 1.05651259, "balance_loss_mlp": 1.02975583, "epoch": 0.05477228318052007, "flos": 20339014239360.0, "grad_norm": 1.6633724338567386, "language_loss": 0.83651805, "learning_rate": 3.993566742350714e-06, "loss": 0.85894263, "num_input_tokens_seen": 19435275, "step": 911, "time_per_iteration": 2.692798137664795 }, { "auxiliary_loss_clip": 0.01187113, "auxiliary_loss_mlp": 0.01073028, "balance_loss_clip": 1.05334687, "balance_loss_mlp": 1.04719508, "epoch": 0.054832406433188034, "flos": 21973092687360.0, "grad_norm": 2.283907419545301, "language_loss": 0.76320881, "learning_rate": 3.993535491899736e-06, "loss": 0.78581023, "num_input_tokens_seen": 19452090, "step": 912, "time_per_iteration": 2.6653189659118652 }, { "auxiliary_loss_clip": 0.01186313, "auxiliary_loss_mlp": 0.01051652, "balance_loss_clip": 1.05707574, "balance_loss_mlp": 1.0271548, "epoch": 0.054892529685856006, "flos": 16398931576320.0, "grad_norm": 2.366460016615147, "language_loss": 0.82826668, "learning_rate": 3.993504165853694e-06, "loss": 0.85064626, "num_input_tokens_seen": 19470865, "step": 913, "time_per_iteration": 2.6826348304748535 }, { "auxiliary_loss_clip": 0.01194515, "auxiliary_loss_mlp": 0.01060483, "balance_loss_clip": 1.0581125, "balance_loss_mlp": 1.03651023, "epoch": 0.05495265293852397, "flos": 23912341084800.0, "grad_norm": 3.3338391252510586, "language_loss": 0.8373239, "learning_rate": 3.993472764213772e-06, "loss": 0.85987389, "num_input_tokens_seen": 19492145, "step": 914, "time_per_iteration": 2.7358829975128174 }, { "auxiliary_loss_clip": 0.0120705, "auxiliary_loss_mlp": 0.0078227, "balance_loss_clip": 1.06039774, "balance_loss_mlp": 1.00027478, "epoch": 0.055012776191191944, "flos": 23586954756480.0, "grad_norm": 2.520244909384168, "language_loss": 0.90146536, "learning_rate": 3.9934412869811655e-06, "loss": 0.92135859, "num_input_tokens_seen": 19511015, "step": 915, "time_per_iteration": 2.9398341178894043 }, { "auxiliary_loss_clip": 0.01201461, "auxiliary_loss_mlp": 0.01059252, "balance_loss_clip": 1.06274199, "balance_loss_mlp": 1.03558862, "epoch": 0.055072899443859916, "flos": 17528501548800.0, "grad_norm": 2.182721785653499, "language_loss": 0.89710975, "learning_rate": 3.993409734157064e-06, "loss": 0.91971689, "num_input_tokens_seen": 19529040, "step": 916, "time_per_iteration": 2.7210159301757812 }, { "auxiliary_loss_clip": 0.01175226, "auxiliary_loss_mlp": 0.01066073, "balance_loss_clip": 1.05741024, "balance_loss_mlp": 1.04103947, "epoch": 0.05513302269652788, "flos": 21687172427520.0, "grad_norm": 1.7899379897310368, "language_loss": 0.8016991, "learning_rate": 3.993378105742666e-06, "loss": 0.82411212, "num_input_tokens_seen": 19549540, "step": 917, "time_per_iteration": 2.7923104763031006 }, { "auxiliary_loss_clip": 0.01139072, "auxiliary_loss_mlp": 0.0105947, "balance_loss_clip": 1.05135942, "balance_loss_mlp": 1.03414989, "epoch": 0.05519314594919585, "flos": 21613340021760.0, "grad_norm": 2.106744179667805, "language_loss": 0.79437333, "learning_rate": 3.9933464017391705e-06, "loss": 0.81635869, "num_input_tokens_seen": 19567570, "step": 918, "time_per_iteration": 2.8051092624664307 }, { "auxiliary_loss_clip": 0.01196947, "auxiliary_loss_mlp": 0.01055679, "balance_loss_clip": 1.05616307, "balance_loss_mlp": 1.03166997, "epoch": 0.05525326920186382, "flos": 21798567480960.0, "grad_norm": 2.454030193031321, "language_loss": 0.89019686, "learning_rate": 3.99331462214778e-06, "loss": 0.91272312, "num_input_tokens_seen": 19585330, "step": 919, "time_per_iteration": 2.6846773624420166 }, { "auxiliary_loss_clip": 0.01213326, "auxiliary_loss_mlp": 0.01069349, "balance_loss_clip": 1.05950904, "balance_loss_mlp": 1.04417229, "epoch": 0.05531339245453179, "flos": 28439635288320.0, "grad_norm": 2.246354931091656, "language_loss": 0.8746047, "learning_rate": 3.993282766969699e-06, "loss": 0.89743137, "num_input_tokens_seen": 19604970, "step": 920, "time_per_iteration": 2.6699845790863037 }, { "auxiliary_loss_clip": 0.01190424, "auxiliary_loss_mlp": 0.0106036, "balance_loss_clip": 1.06023288, "balance_loss_mlp": 1.03657782, "epoch": 0.05537351570719976, "flos": 37375143131520.0, "grad_norm": 1.975714125194334, "language_loss": 0.6568011, "learning_rate": 3.993250836206136e-06, "loss": 0.67930895, "num_input_tokens_seen": 19626235, "step": 921, "time_per_iteration": 2.833644390106201 }, { "auxiliary_loss_clip": 0.01209678, "auxiliary_loss_mlp": 0.01065483, "balance_loss_clip": 1.06060767, "balance_loss_mlp": 1.03874445, "epoch": 0.05543363895986773, "flos": 20084479488000.0, "grad_norm": 1.7242493696651606, "language_loss": 0.71861136, "learning_rate": 3.993218829858301e-06, "loss": 0.74136293, "num_input_tokens_seen": 19644305, "step": 922, "time_per_iteration": 2.6168808937072754 }, { "auxiliary_loss_clip": 0.01187138, "auxiliary_loss_mlp": 0.01067213, "balance_loss_clip": 1.05423355, "balance_loss_mlp": 1.04223895, "epoch": 0.0554937622125357, "flos": 24533200690560.0, "grad_norm": 2.6848185900705412, "language_loss": 0.82304025, "learning_rate": 3.993186747927408e-06, "loss": 0.8455838, "num_input_tokens_seen": 19662130, "step": 923, "time_per_iteration": 2.7298316955566406 }, { "auxiliary_loss_clip": 0.01202941, "auxiliary_loss_mlp": 0.01064106, "balance_loss_clip": 1.05725455, "balance_loss_mlp": 1.03933442, "epoch": 0.055553885465203665, "flos": 14320063013760.0, "grad_norm": 1.9334372940525173, "language_loss": 0.78759122, "learning_rate": 3.993154590414675e-06, "loss": 0.81026167, "num_input_tokens_seen": 19680715, "step": 924, "time_per_iteration": 2.6869630813598633 }, { "auxiliary_loss_clip": 0.0116422, "auxiliary_loss_mlp": 0.01053758, "balance_loss_clip": 1.05395627, "balance_loss_mlp": 1.02844954, "epoch": 0.05561400871787164, "flos": 27381132374400.0, "grad_norm": 2.005203138116014, "language_loss": 1.02005315, "learning_rate": 3.993122357321319e-06, "loss": 1.04223299, "num_input_tokens_seen": 19700535, "step": 925, "time_per_iteration": 2.716089963912964 }, { "auxiliary_loss_clip": 0.01163201, "auxiliary_loss_mlp": 0.01052104, "balance_loss_clip": 1.05070591, "balance_loss_mlp": 1.02739179, "epoch": 0.05567413197053961, "flos": 23221096778880.0, "grad_norm": 2.0106641835017482, "language_loss": 0.80939209, "learning_rate": 3.993090048648564e-06, "loss": 0.83154511, "num_input_tokens_seen": 19718825, "step": 926, "time_per_iteration": 2.895803451538086 }, { "auxiliary_loss_clip": 0.01207515, "auxiliary_loss_mlp": 0.01068168, "balance_loss_clip": 1.05892682, "balance_loss_mlp": 1.0419066, "epoch": 0.055734255223207574, "flos": 25264952559360.0, "grad_norm": 2.9732625845644045, "language_loss": 0.73220479, "learning_rate": 3.993057664397634e-06, "loss": 0.75496161, "num_input_tokens_seen": 19739080, "step": 927, "time_per_iteration": 2.677725076675415 }, { "auxiliary_loss_clip": 0.01101002, "auxiliary_loss_mlp": 0.01015011, "balance_loss_clip": 1.02922702, "balance_loss_mlp": 1.01014709, "epoch": 0.055794378475875546, "flos": 66503116702080.0, "grad_norm": 0.8406874373244947, "language_loss": 0.59841412, "learning_rate": 3.9930252045697585e-06, "loss": 0.61957431, "num_input_tokens_seen": 19802960, "step": 928, "time_per_iteration": 3.187382221221924 }, { "auxiliary_loss_clip": 0.01202438, "auxiliary_loss_mlp": 0.01065066, "balance_loss_clip": 1.05921853, "balance_loss_mlp": 1.04070008, "epoch": 0.05585450172854351, "flos": 25337635729920.0, "grad_norm": 2.0668361967965994, "language_loss": 0.95411372, "learning_rate": 3.992992669166168e-06, "loss": 0.97678876, "num_input_tokens_seen": 19822765, "step": 929, "time_per_iteration": 2.6930506229400635 }, { "auxiliary_loss_clip": 0.01171806, "auxiliary_loss_mlp": 0.01068051, "balance_loss_clip": 1.05343258, "balance_loss_mlp": 1.04101443, "epoch": 0.05591462498121148, "flos": 33911738881920.0, "grad_norm": 2.1442452677256627, "language_loss": 0.71756601, "learning_rate": 3.992960058188094e-06, "loss": 0.7399646, "num_input_tokens_seen": 19843590, "step": 930, "time_per_iteration": 2.803219795227051 }, { "auxiliary_loss_clip": 0.01188277, "auxiliary_loss_mlp": 0.01058888, "balance_loss_clip": 1.05783677, "balance_loss_mlp": 1.03377056, "epoch": 0.055974748233879455, "flos": 17930880679680.0, "grad_norm": 2.381261552273062, "language_loss": 0.85279298, "learning_rate": 3.992927371636776e-06, "loss": 0.87526459, "num_input_tokens_seen": 19860230, "step": 931, "time_per_iteration": 2.6215872764587402 }, { "auxiliary_loss_clip": 0.01203533, "auxiliary_loss_mlp": 0.00783076, "balance_loss_clip": 1.05677414, "balance_loss_mlp": 1.00025761, "epoch": 0.05603487148654742, "flos": 24021976371840.0, "grad_norm": 2.2861197477099973, "language_loss": 0.83645165, "learning_rate": 3.9928946095134525e-06, "loss": 0.85631776, "num_input_tokens_seen": 19880795, "step": 932, "time_per_iteration": 2.664062261581421 }, { "auxiliary_loss_clip": 0.01200637, "auxiliary_loss_mlp": 0.0107041, "balance_loss_clip": 1.05897784, "balance_loss_mlp": 1.04407716, "epoch": 0.05609499473921539, "flos": 17307758517120.0, "grad_norm": 1.8036739452122519, "language_loss": 0.73694205, "learning_rate": 3.992861771819365e-06, "loss": 0.7596525, "num_input_tokens_seen": 19897960, "step": 933, "time_per_iteration": 2.631620168685913 }, { "auxiliary_loss_clip": 0.01153445, "auxiliary_loss_mlp": 0.01076903, "balance_loss_clip": 1.04885209, "balance_loss_mlp": 1.05060577, "epoch": 0.05615511799188336, "flos": 20994742972800.0, "grad_norm": 2.385249039382274, "language_loss": 0.86660421, "learning_rate": 3.99282885855576e-06, "loss": 0.88890779, "num_input_tokens_seen": 19913315, "step": 934, "time_per_iteration": 2.7739439010620117 }, { "auxiliary_loss_clip": 0.01164295, "auxiliary_loss_mlp": 0.0108083, "balance_loss_clip": 1.05509257, "balance_loss_mlp": 1.0557723, "epoch": 0.05621524124455133, "flos": 17273535834240.0, "grad_norm": 2.2740258482680433, "language_loss": 0.80388415, "learning_rate": 3.992795869723885e-06, "loss": 0.82633543, "num_input_tokens_seen": 19928790, "step": 935, "time_per_iteration": 5.93512487411499 }, { "auxiliary_loss_clip": 0.01093927, "auxiliary_loss_mlp": 0.01019701, "balance_loss_clip": 1.02288604, "balance_loss_mlp": 1.01540911, "epoch": 0.0562753644972193, "flos": 58719370458240.0, "grad_norm": 0.820561718243334, "language_loss": 0.69191676, "learning_rate": 3.99276280532499e-06, "loss": 0.71305299, "num_input_tokens_seen": 19988785, "step": 936, "time_per_iteration": 4.862478733062744 }, { "auxiliary_loss_clip": 0.01213648, "auxiliary_loss_mlp": 0.01068507, "balance_loss_clip": 1.05806684, "balance_loss_mlp": 1.04429567, "epoch": 0.05633548774988727, "flos": 17457039440640.0, "grad_norm": 1.9573264311231433, "language_loss": 0.7572521, "learning_rate": 3.992729665360331e-06, "loss": 0.78007358, "num_input_tokens_seen": 20007685, "step": 937, "time_per_iteration": 4.219425916671753 }, { "auxiliary_loss_clip": 0.01085529, "auxiliary_loss_mlp": 0.01013805, "balance_loss_clip": 1.02476001, "balance_loss_mlp": 1.00944233, "epoch": 0.05639561100255524, "flos": 70654928083200.0, "grad_norm": 0.9053055994078011, "language_loss": 0.64309287, "learning_rate": 3.992696449831162e-06, "loss": 0.66408622, "num_input_tokens_seen": 20072750, "step": 938, "time_per_iteration": 3.1298794746398926 }, { "auxiliary_loss_clip": 0.01171203, "auxiliary_loss_mlp": 0.01068815, "balance_loss_clip": 1.05175185, "balance_loss_mlp": 1.0426966, "epoch": 0.056455734255223204, "flos": 20485996692480.0, "grad_norm": 2.7427540631348832, "language_loss": 0.79751205, "learning_rate": 3.992663158738745e-06, "loss": 0.8199122, "num_input_tokens_seen": 20089070, "step": 939, "time_per_iteration": 2.6863484382629395 }, { "auxiliary_loss_clip": 0.01175528, "auxiliary_loss_mlp": 0.01068297, "balance_loss_clip": 1.0509069, "balance_loss_mlp": 1.04338217, "epoch": 0.056515857507891176, "flos": 22053569109120.0, "grad_norm": 1.8374791395473227, "language_loss": 0.73919088, "learning_rate": 3.992629792084341e-06, "loss": 0.76162916, "num_input_tokens_seen": 20108790, "step": 940, "time_per_iteration": 2.7111120223999023 }, { "auxiliary_loss_clip": 0.01198483, "auxiliary_loss_mlp": 0.01058511, "balance_loss_clip": 1.05900669, "balance_loss_mlp": 1.03252339, "epoch": 0.05657598076055915, "flos": 24025316336640.0, "grad_norm": 2.2993716569389813, "language_loss": 0.70622003, "learning_rate": 3.992596349869216e-06, "loss": 0.72878999, "num_input_tokens_seen": 20128455, "step": 941, "time_per_iteration": 2.657594680786133 }, { "auxiliary_loss_clip": 0.01135396, "auxiliary_loss_mlp": 0.01059543, "balance_loss_clip": 1.04961574, "balance_loss_mlp": 1.03382993, "epoch": 0.05663610401322711, "flos": 20480609652480.0, "grad_norm": 2.0678542992190847, "language_loss": 0.80921417, "learning_rate": 3.992562832094637e-06, "loss": 0.83116359, "num_input_tokens_seen": 20145775, "step": 942, "time_per_iteration": 2.7379891872406006 }, { "auxiliary_loss_clip": 0.01186767, "auxiliary_loss_mlp": 0.01062055, "balance_loss_clip": 1.05228579, "balance_loss_mlp": 1.03554332, "epoch": 0.056696227265895086, "flos": 21069042255360.0, "grad_norm": 2.245249922529115, "language_loss": 0.88858449, "learning_rate": 3.9925292387618755e-06, "loss": 0.91107273, "num_input_tokens_seen": 20164315, "step": 943, "time_per_iteration": 2.6502583026885986 }, { "auxiliary_loss_clip": 0.01199122, "auxiliary_loss_mlp": 0.0105963, "balance_loss_clip": 1.05991781, "balance_loss_mlp": 1.03534663, "epoch": 0.05675635051856306, "flos": 17821317219840.0, "grad_norm": 2.5514256959015995, "language_loss": 0.74771839, "learning_rate": 3.992495569872206e-06, "loss": 0.77030593, "num_input_tokens_seen": 20182760, "step": 944, "time_per_iteration": 2.676079034805298 }, { "auxiliary_loss_clip": 0.01204502, "auxiliary_loss_mlp": 0.01064591, "balance_loss_clip": 1.05980551, "balance_loss_mlp": 1.04085672, "epoch": 0.05681647377123102, "flos": 23114945111040.0, "grad_norm": 1.5959266123312272, "language_loss": 0.79406166, "learning_rate": 3.992461825426906e-06, "loss": 0.81675267, "num_input_tokens_seen": 20203830, "step": 945, "time_per_iteration": 2.734299421310425 }, { "auxiliary_loss_clip": 0.01195984, "auxiliary_loss_mlp": 0.0105672, "balance_loss_clip": 1.05686593, "balance_loss_mlp": 1.03156662, "epoch": 0.056876597023898995, "flos": 16070528505600.0, "grad_norm": 2.5637081249861824, "language_loss": 0.82651746, "learning_rate": 3.992428005427252e-06, "loss": 0.84904456, "num_input_tokens_seen": 20220365, "step": 946, "time_per_iteration": 2.6636929512023926 }, { "auxiliary_loss_clip": 0.0122014, "auxiliary_loss_mlp": 0.01061449, "balance_loss_clip": 1.06224144, "balance_loss_mlp": 1.03524721, "epoch": 0.05693672027656696, "flos": 16835641130880.0, "grad_norm": 1.8433174156507384, "language_loss": 0.79031301, "learning_rate": 3.992394109874529e-06, "loss": 0.81312895, "num_input_tokens_seen": 20238640, "step": 947, "time_per_iteration": 2.623671293258667 }, { "auxiliary_loss_clip": 0.0117587, "auxiliary_loss_mlp": 0.01061489, "balance_loss_clip": 1.05605412, "balance_loss_mlp": 1.03569245, "epoch": 0.05699684352923493, "flos": 21389113370880.0, "grad_norm": 6.8661947111986725, "language_loss": 0.85425055, "learning_rate": 3.9923601387700225e-06, "loss": 0.87662411, "num_input_tokens_seen": 20251025, "step": 948, "time_per_iteration": 2.7410409450531006 }, { "auxiliary_loss_clip": 0.01214005, "auxiliary_loss_mlp": 0.01063231, "balance_loss_clip": 1.05969238, "balance_loss_mlp": 1.03598022, "epoch": 0.057056966781902904, "flos": 15560309767680.0, "grad_norm": 3.649211317819821, "language_loss": 0.87346625, "learning_rate": 3.992326092115019e-06, "loss": 0.89623863, "num_input_tokens_seen": 20269775, "step": 949, "time_per_iteration": 2.6893157958984375 }, { "auxiliary_loss_clip": 0.01194543, "auxiliary_loss_mlp": 0.0106695, "balance_loss_clip": 1.05799937, "balance_loss_mlp": 1.04266715, "epoch": 0.05711709003457087, "flos": 19937856170880.0, "grad_norm": 1.8324883776363103, "language_loss": 0.7874645, "learning_rate": 3.992291969910811e-06, "loss": 0.8100794, "num_input_tokens_seen": 20287715, "step": 950, "time_per_iteration": 2.623924732208252 }, { "auxiliary_loss_clip": 0.01180518, "auxiliary_loss_mlp": 0.01068771, "balance_loss_clip": 1.05322623, "balance_loss_mlp": 1.04384422, "epoch": 0.05717721328723884, "flos": 30332701774080.0, "grad_norm": 3.8045132244795816, "language_loss": 0.82477522, "learning_rate": 3.992257772158691e-06, "loss": 0.8472681, "num_input_tokens_seen": 20307070, "step": 951, "time_per_iteration": 2.697479724884033 }, { "auxiliary_loss_clip": 0.01167302, "auxiliary_loss_mlp": 0.01061039, "balance_loss_clip": 1.04906607, "balance_loss_mlp": 1.03375173, "epoch": 0.05723733653990681, "flos": 23654358627840.0, "grad_norm": 2.4180383362968634, "language_loss": 0.86899263, "learning_rate": 3.992223498859958e-06, "loss": 0.89127606, "num_input_tokens_seen": 20324945, "step": 952, "time_per_iteration": 2.707716226577759 }, { "auxiliary_loss_clip": 0.01191405, "auxiliary_loss_mlp": 0.01064705, "balance_loss_clip": 1.05511189, "balance_loss_mlp": 1.03630924, "epoch": 0.05729745979257478, "flos": 22055759838720.0, "grad_norm": 2.195434645270168, "language_loss": 0.79087842, "learning_rate": 3.9921891500159084e-06, "loss": 0.81343949, "num_input_tokens_seen": 20346135, "step": 953, "time_per_iteration": 2.671255588531494 }, { "auxiliary_loss_clip": 0.01190026, "auxiliary_loss_mlp": 0.01066447, "balance_loss_clip": 1.05984342, "balance_loss_mlp": 1.04056656, "epoch": 0.05735758304524275, "flos": 19604353368960.0, "grad_norm": 2.2066085695914466, "language_loss": 0.86644447, "learning_rate": 3.992154725627848e-06, "loss": 0.88900924, "num_input_tokens_seen": 20364450, "step": 954, "time_per_iteration": 2.671657085418701 }, { "auxiliary_loss_clip": 0.01210569, "auxiliary_loss_mlp": 0.01062619, "balance_loss_clip": 1.06119955, "balance_loss_mlp": 1.03723955, "epoch": 0.057417706297910716, "flos": 19099018880640.0, "grad_norm": 2.2872795023766113, "language_loss": 0.88071024, "learning_rate": 3.9921202256970804e-06, "loss": 0.90344214, "num_input_tokens_seen": 20383500, "step": 955, "time_per_iteration": 2.69960880279541 }, { "auxiliary_loss_clip": 0.01179864, "auxiliary_loss_mlp": 0.01068889, "balance_loss_clip": 1.0523231, "balance_loss_mlp": 1.04209054, "epoch": 0.05747782955057869, "flos": 16654507822080.0, "grad_norm": 1.9113555723128555, "language_loss": 0.89160776, "learning_rate": 3.992085650224914e-06, "loss": 0.91409534, "num_input_tokens_seen": 20400295, "step": 956, "time_per_iteration": 2.667868137359619 }, { "auxiliary_loss_clip": 0.01167867, "auxiliary_loss_mlp": 0.01060669, "balance_loss_clip": 1.05720079, "balance_loss_mlp": 1.03450251, "epoch": 0.05753795280324665, "flos": 14502058248960.0, "grad_norm": 3.2877973901728095, "language_loss": 0.75473189, "learning_rate": 3.99205099921266e-06, "loss": 0.77701724, "num_input_tokens_seen": 20419085, "step": 957, "time_per_iteration": 2.6938796043395996 }, { "auxiliary_loss_clip": 0.0117627, "auxiliary_loss_mlp": 0.01072849, "balance_loss_clip": 1.05432248, "balance_loss_mlp": 1.0448705, "epoch": 0.057598076055914625, "flos": 18076318848000.0, "grad_norm": 2.0004055711005257, "language_loss": 0.79582155, "learning_rate": 3.992016272661633e-06, "loss": 0.81831264, "num_input_tokens_seen": 20437465, "step": 958, "time_per_iteration": 2.6933834552764893 }, { "auxiliary_loss_clip": 0.01186244, "auxiliary_loss_mlp": 0.01059908, "balance_loss_clip": 1.05851364, "balance_loss_mlp": 1.03572011, "epoch": 0.0576581993085826, "flos": 22124600254080.0, "grad_norm": 2.669863855173802, "language_loss": 0.8840394, "learning_rate": 3.99198147057315e-06, "loss": 0.906501, "num_input_tokens_seen": 20456235, "step": 959, "time_per_iteration": 2.7094578742980957 }, { "auxiliary_loss_clip": 0.01169479, "auxiliary_loss_mlp": 0.01063656, "balance_loss_clip": 1.05511999, "balance_loss_mlp": 1.03881276, "epoch": 0.05771832256125056, "flos": 33181746779520.0, "grad_norm": 2.0960373333994764, "language_loss": 0.78850955, "learning_rate": 3.991946592948529e-06, "loss": 0.8108409, "num_input_tokens_seen": 20476825, "step": 960, "time_per_iteration": 2.822922945022583 }, { "auxiliary_loss_clip": 0.0113413, "auxiliary_loss_mlp": 0.01067189, "balance_loss_clip": 1.05177355, "balance_loss_mlp": 1.04020023, "epoch": 0.057778445813918534, "flos": 24170143973760.0, "grad_norm": 2.063464892179025, "language_loss": 0.92986894, "learning_rate": 3.991911639789094e-06, "loss": 0.95188212, "num_input_tokens_seen": 20496965, "step": 961, "time_per_iteration": 2.793952226638794 }, { "auxiliary_loss_clip": 0.01182535, "auxiliary_loss_mlp": 0.0106764, "balance_loss_clip": 1.0554297, "balance_loss_mlp": 1.04091299, "epoch": 0.0578385690665865, "flos": 29643037666560.0, "grad_norm": 2.0649993155313067, "language_loss": 0.68164188, "learning_rate": 3.991876611096169e-06, "loss": 0.70414358, "num_input_tokens_seen": 20518035, "step": 962, "time_per_iteration": 2.8396694660186768 }, { "auxiliary_loss_clip": 0.01159524, "auxiliary_loss_mlp": 0.01073851, "balance_loss_clip": 1.05128908, "balance_loss_mlp": 1.04909074, "epoch": 0.05789869231925447, "flos": 20885430908160.0, "grad_norm": 2.2685465488517074, "language_loss": 0.8848027, "learning_rate": 3.991841506871084e-06, "loss": 0.90713644, "num_input_tokens_seen": 20534740, "step": 963, "time_per_iteration": 2.7077019214630127 }, { "auxiliary_loss_clip": 0.01183778, "auxiliary_loss_mlp": 0.01061251, "balance_loss_clip": 1.06018209, "balance_loss_mlp": 1.03516829, "epoch": 0.057958815571922444, "flos": 26031106679040.0, "grad_norm": 2.392959969035536, "language_loss": 0.85288298, "learning_rate": 3.99180632711517e-06, "loss": 0.87533331, "num_input_tokens_seen": 20553485, "step": 964, "time_per_iteration": 2.7218217849731445 }, { "auxiliary_loss_clip": 0.01188683, "auxiliary_loss_mlp": 0.01069422, "balance_loss_clip": 1.05959499, "balance_loss_mlp": 1.04325557, "epoch": 0.05801893882459041, "flos": 18077683564800.0, "grad_norm": 3.087349735715565, "language_loss": 0.78159416, "learning_rate": 3.99177107182976e-06, "loss": 0.80417526, "num_input_tokens_seen": 20572155, "step": 965, "time_per_iteration": 2.6902661323547363 }, { "auxiliary_loss_clip": 0.01156531, "auxiliary_loss_mlp": 0.0107109, "balance_loss_clip": 1.0523715, "balance_loss_mlp": 1.04462528, "epoch": 0.05807906207725838, "flos": 17748885444480.0, "grad_norm": 1.9742288518319486, "language_loss": 0.81403655, "learning_rate": 3.99173574101619e-06, "loss": 0.83631277, "num_input_tokens_seen": 20590395, "step": 966, "time_per_iteration": 2.7423267364501953 }, { "auxiliary_loss_clip": 0.01198908, "auxiliary_loss_mlp": 0.01065021, "balance_loss_clip": 1.058887, "balance_loss_mlp": 1.04113197, "epoch": 0.058139185329926346, "flos": 18040372312320.0, "grad_norm": 1.8776530142118544, "language_loss": 0.76480806, "learning_rate": 3.9917003346758035e-06, "loss": 0.78744727, "num_input_tokens_seen": 20608435, "step": 967, "time_per_iteration": 2.642885446548462 }, { "auxiliary_loss_clip": 0.01084339, "auxiliary_loss_mlp": 0.0103139, "balance_loss_clip": 1.02675521, "balance_loss_mlp": 1.0269078, "epoch": 0.05819930858259432, "flos": 62363297485440.0, "grad_norm": 0.985564929959949, "language_loss": 0.57357776, "learning_rate": 3.991664852809939e-06, "loss": 0.59473509, "num_input_tokens_seen": 20668575, "step": 968, "time_per_iteration": 3.1017024517059326 }, { "auxiliary_loss_clip": 0.01188824, "auxiliary_loss_mlp": 0.01057715, "balance_loss_clip": 1.05784404, "balance_loss_mlp": 1.03147697, "epoch": 0.05825943183526229, "flos": 19135360465920.0, "grad_norm": 2.1276337565108485, "language_loss": 0.82286429, "learning_rate": 3.991629295419945e-06, "loss": 0.84532964, "num_input_tokens_seen": 20687355, "step": 969, "time_per_iteration": 2.669055461883545 }, { "auxiliary_loss_clip": 0.01206272, "auxiliary_loss_mlp": 0.00782724, "balance_loss_clip": 1.06255269, "balance_loss_mlp": 1.00024962, "epoch": 0.058319555087930255, "flos": 29022465369600.0, "grad_norm": 7.916507288074279, "language_loss": 0.7803669, "learning_rate": 3.991593662507167e-06, "loss": 0.80025685, "num_input_tokens_seen": 20705710, "step": 970, "time_per_iteration": 2.733030080795288 }, { "auxiliary_loss_clip": 0.01181452, "auxiliary_loss_mlp": 0.01064945, "balance_loss_clip": 1.05691695, "balance_loss_mlp": 1.03887415, "epoch": 0.05837967834059823, "flos": 18879999701760.0, "grad_norm": 3.163102883752813, "language_loss": 0.92229038, "learning_rate": 3.991557954072958e-06, "loss": 0.94475436, "num_input_tokens_seen": 20722405, "step": 971, "time_per_iteration": 2.730377435684204 }, { "auxiliary_loss_clip": 0.01180948, "auxiliary_loss_mlp": 0.01062613, "balance_loss_clip": 1.05320477, "balance_loss_mlp": 1.03722143, "epoch": 0.05843980159326619, "flos": 25703062744320.0, "grad_norm": 1.700187330091603, "language_loss": 0.85959208, "learning_rate": 3.991522170118673e-06, "loss": 0.88202775, "num_input_tokens_seen": 20741480, "step": 972, "time_per_iteration": 2.687185049057007 }, { "auxiliary_loss_clip": 0.0116993, "auxiliary_loss_mlp": 0.01079713, "balance_loss_clip": 1.05714142, "balance_loss_mlp": 1.05601454, "epoch": 0.058499924845934165, "flos": 25552129795200.0, "grad_norm": 2.00599255988541, "language_loss": 0.87503272, "learning_rate": 3.991486310645667e-06, "loss": 0.89752913, "num_input_tokens_seen": 20759685, "step": 973, "time_per_iteration": 2.7166664600372314 }, { "auxiliary_loss_clip": 0.01206524, "auxiliary_loss_mlp": 0.00784111, "balance_loss_clip": 1.06111121, "balance_loss_mlp": 1.00026989, "epoch": 0.05856004809860214, "flos": 16436171001600.0, "grad_norm": 1.879365930358842, "language_loss": 0.74800295, "learning_rate": 3.991450375655301e-06, "loss": 0.76790935, "num_input_tokens_seen": 20778180, "step": 974, "time_per_iteration": 2.713594675064087 }, { "auxiliary_loss_clip": 0.01197101, "auxiliary_loss_mlp": 0.00782207, "balance_loss_clip": 1.059551, "balance_loss_mlp": 1.00025892, "epoch": 0.0586201713512701, "flos": 39458824116480.0, "grad_norm": 1.5923993506380014, "language_loss": 0.76874506, "learning_rate": 3.991414365148936e-06, "loss": 0.78853816, "num_input_tokens_seen": 20802705, "step": 975, "time_per_iteration": 7.600914716720581 }, { "auxiliary_loss_clip": 0.01215491, "auxiliary_loss_mlp": 0.01069506, "balance_loss_clip": 1.06030774, "balance_loss_mlp": 1.0444721, "epoch": 0.058680294603938074, "flos": 23365170230400.0, "grad_norm": 3.6132976830219734, "language_loss": 0.76748288, "learning_rate": 3.99137827912794e-06, "loss": 0.79033279, "num_input_tokens_seen": 20822540, "step": 976, "time_per_iteration": 4.324799537658691 }, { "auxiliary_loss_clip": 0.01176132, "auxiliary_loss_mlp": 0.01077003, "balance_loss_clip": 1.05271626, "balance_loss_mlp": 1.04963279, "epoch": 0.05874041785660604, "flos": 32232017226240.0, "grad_norm": 1.943198757110789, "language_loss": 0.87343585, "learning_rate": 3.991342117593679e-06, "loss": 0.89596725, "num_input_tokens_seen": 20844175, "step": 977, "time_per_iteration": 2.7742488384246826 }, { "auxiliary_loss_clip": 0.01187161, "auxiliary_loss_mlp": 0.01067914, "balance_loss_clip": 1.06209528, "balance_loss_mlp": 1.04231977, "epoch": 0.05880054110927401, "flos": 22310043194880.0, "grad_norm": 1.718987046197629, "language_loss": 0.7969116, "learning_rate": 3.991305880547527e-06, "loss": 0.81946236, "num_input_tokens_seen": 20864730, "step": 978, "time_per_iteration": 2.733372926712036 }, { "auxiliary_loss_clip": 0.01136264, "auxiliary_loss_mlp": 0.01076585, "balance_loss_clip": 1.05591321, "balance_loss_mlp": 1.04927468, "epoch": 0.05886066436194198, "flos": 27380450016000.0, "grad_norm": 1.8692877257975375, "language_loss": 0.80665666, "learning_rate": 3.991269567990855e-06, "loss": 0.82878518, "num_input_tokens_seen": 20885200, "step": 979, "time_per_iteration": 3.2624220848083496 }, { "auxiliary_loss_clip": 0.01074686, "auxiliary_loss_mlp": 0.01029701, "balance_loss_clip": 1.02640033, "balance_loss_mlp": 1.02495658, "epoch": 0.05892078761460995, "flos": 59584493525760.0, "grad_norm": 0.9436493040005753, "language_loss": 0.59004962, "learning_rate": 3.9912331799250415e-06, "loss": 0.6110934, "num_input_tokens_seen": 20940325, "step": 980, "time_per_iteration": 3.4688587188720703 }, { "auxiliary_loss_clip": 0.01211665, "auxiliary_loss_mlp": 0.01078603, "balance_loss_clip": 1.06178868, "balance_loss_mlp": 1.05242431, "epoch": 0.05898091086727792, "flos": 15414081500160.0, "grad_norm": 2.2770545408130514, "language_loss": 0.86436182, "learning_rate": 3.9911967163514665e-06, "loss": 0.88726455, "num_input_tokens_seen": 20958220, "step": 981, "time_per_iteration": 2.5824644565582275 }, { "auxiliary_loss_clip": 0.01190085, "auxiliary_loss_mlp": 0.0106921, "balance_loss_clip": 1.05943286, "balance_loss_mlp": 1.04629803, "epoch": 0.059041034119945886, "flos": 23655328295040.0, "grad_norm": 2.1333982175691855, "language_loss": 0.79293346, "learning_rate": 3.991160177271513e-06, "loss": 0.81552643, "num_input_tokens_seen": 20978920, "step": 982, "time_per_iteration": 2.68428897857666 }, { "auxiliary_loss_clip": 0.01192274, "auxiliary_loss_mlp": 0.01068234, "balance_loss_clip": 1.05926657, "balance_loss_mlp": 1.04356933, "epoch": 0.05910115737261386, "flos": 24754087376640.0, "grad_norm": 2.319627739094249, "language_loss": 0.84413779, "learning_rate": 3.9911235626865654e-06, "loss": 0.86674285, "num_input_tokens_seen": 20999490, "step": 983, "time_per_iteration": 2.7006261348724365 }, { "auxiliary_loss_clip": 0.0120015, "auxiliary_loss_mlp": 0.01072669, "balance_loss_clip": 1.05969584, "balance_loss_mlp": 1.04799283, "epoch": 0.05916128062528183, "flos": 11728749070080.0, "grad_norm": 1.8014395118859294, "language_loss": 0.84510243, "learning_rate": 3.9910868725980125e-06, "loss": 0.86783063, "num_input_tokens_seen": 21017865, "step": 984, "time_per_iteration": 2.640246868133545 }, { "auxiliary_loss_clip": 0.01188594, "auxiliary_loss_mlp": 0.01055296, "balance_loss_clip": 1.05650342, "balance_loss_mlp": 1.03171611, "epoch": 0.059221403877949795, "flos": 21902995296000.0, "grad_norm": 2.473231587287368, "language_loss": 0.77611595, "learning_rate": 3.9910501070072465e-06, "loss": 0.7985549, "num_input_tokens_seen": 21035900, "step": 985, "time_per_iteration": 2.626371383666992 }, { "auxiliary_loss_clip": 0.01150113, "auxiliary_loss_mlp": 0.01060814, "balance_loss_clip": 1.05341148, "balance_loss_mlp": 1.03542209, "epoch": 0.05928152713061777, "flos": 20514580940160.0, "grad_norm": 1.9082382068459252, "language_loss": 0.90593231, "learning_rate": 3.991013265915661e-06, "loss": 0.92804158, "num_input_tokens_seen": 21053235, "step": 986, "time_per_iteration": 2.7834935188293457 }, { "auxiliary_loss_clip": 0.01200704, "auxiliary_loss_mlp": 0.01061312, "balance_loss_clip": 1.05555892, "balance_loss_mlp": 1.03425193, "epoch": 0.05934165038328574, "flos": 24495135252480.0, "grad_norm": 2.216017383423336, "language_loss": 0.75688565, "learning_rate": 3.9909763493246525e-06, "loss": 0.77950585, "num_input_tokens_seen": 21073090, "step": 987, "time_per_iteration": 2.6669981479644775 }, { "auxiliary_loss_clip": 0.01203558, "auxiliary_loss_mlp": 0.01057756, "balance_loss_clip": 1.06134868, "balance_loss_mlp": 1.03331852, "epoch": 0.059401773635953704, "flos": 38728041914880.0, "grad_norm": 2.2869993581633827, "language_loss": 0.71867943, "learning_rate": 3.990939357235621e-06, "loss": 0.7412926, "num_input_tokens_seen": 21094895, "step": 988, "time_per_iteration": 2.805851697921753 }, { "auxiliary_loss_clip": 0.0105006, "auxiliary_loss_mlp": 0.0101134, "balance_loss_clip": 1.02230322, "balance_loss_mlp": 1.00688171, "epoch": 0.059461896888621676, "flos": 58023565125120.0, "grad_norm": 0.9416454944601763, "language_loss": 0.7124939, "learning_rate": 3.99090228964997e-06, "loss": 0.73310792, "num_input_tokens_seen": 21147555, "step": 989, "time_per_iteration": 3.100306749343872 }, { "auxiliary_loss_clip": 0.0117797, "auxiliary_loss_mlp": 0.01072264, "balance_loss_clip": 1.05793095, "balance_loss_mlp": 1.04389191, "epoch": 0.05952202014128964, "flos": 22127760650880.0, "grad_norm": 2.0167260155417113, "language_loss": 0.78245646, "learning_rate": 3.990865146569105e-06, "loss": 0.80495882, "num_input_tokens_seen": 21167845, "step": 990, "time_per_iteration": 2.8133904933929443 }, { "auxiliary_loss_clip": 0.01198295, "auxiliary_loss_mlp": 0.01053485, "balance_loss_clip": 1.06166339, "balance_loss_mlp": 1.02761686, "epoch": 0.059582143393957614, "flos": 20445776438400.0, "grad_norm": 2.2411623387553727, "language_loss": 0.86522102, "learning_rate": 3.990827927994434e-06, "loss": 0.88773882, "num_input_tokens_seen": 21185085, "step": 991, "time_per_iteration": 2.6964831352233887 }, { "auxiliary_loss_clip": 0.0121783, "auxiliary_loss_mlp": 0.01064707, "balance_loss_clip": 1.0613625, "balance_loss_mlp": 1.03943431, "epoch": 0.059642266646625586, "flos": 20594877793920.0, "grad_norm": 1.8566945591898132, "language_loss": 0.76738375, "learning_rate": 3.9907906339273674e-06, "loss": 0.79020917, "num_input_tokens_seen": 21204230, "step": 992, "time_per_iteration": 2.646942377090454 }, { "auxiliary_loss_clip": 0.01146457, "auxiliary_loss_mlp": 0.01062309, "balance_loss_clip": 1.05571234, "balance_loss_mlp": 1.03834832, "epoch": 0.05970238989929355, "flos": 19352655792000.0, "grad_norm": 2.3469050968731233, "language_loss": 0.75117075, "learning_rate": 3.9907532643693215e-06, "loss": 0.77325845, "num_input_tokens_seen": 21222655, "step": 993, "time_per_iteration": 2.7642974853515625 }, { "auxiliary_loss_clip": 0.01157785, "auxiliary_loss_mlp": 0.01075532, "balance_loss_clip": 1.05397618, "balance_loss_mlp": 1.04774487, "epoch": 0.05976251315196152, "flos": 30264040926720.0, "grad_norm": 2.725207959052886, "language_loss": 0.79177904, "learning_rate": 3.990715819321712e-06, "loss": 0.81411219, "num_input_tokens_seen": 21242310, "step": 994, "time_per_iteration": 2.8414714336395264 }, { "auxiliary_loss_clip": 0.01214724, "auxiliary_loss_mlp": 0.01079016, "balance_loss_clip": 1.06264019, "balance_loss_mlp": 1.05361295, "epoch": 0.05982263640462949, "flos": 23185150243200.0, "grad_norm": 2.8097993094234983, "language_loss": 0.79917169, "learning_rate": 3.99067829878596e-06, "loss": 0.82210916, "num_input_tokens_seen": 21261410, "step": 995, "time_per_iteration": 2.6524364948272705 }, { "auxiliary_loss_clip": 0.0116696, "auxiliary_loss_mlp": 0.01068218, "balance_loss_clip": 1.05704355, "balance_loss_mlp": 1.04208767, "epoch": 0.05988275965729746, "flos": 27850879463040.0, "grad_norm": 1.902030256537741, "language_loss": 0.87013257, "learning_rate": 3.990640702763487e-06, "loss": 0.89248431, "num_input_tokens_seen": 21280080, "step": 996, "time_per_iteration": 2.7431676387786865 }, { "auxiliary_loss_clip": 0.01177854, "auxiliary_loss_mlp": 0.01081123, "balance_loss_clip": 1.05684328, "balance_loss_mlp": 1.05055761, "epoch": 0.05994288290996543, "flos": 24680003575680.0, "grad_norm": 2.971039758986745, "language_loss": 0.87273014, "learning_rate": 3.990603031255718e-06, "loss": 0.89531994, "num_input_tokens_seen": 21296765, "step": 997, "time_per_iteration": 2.748448371887207 }, { "auxiliary_loss_clip": 0.01069915, "auxiliary_loss_mlp": 0.01014417, "balance_loss_clip": 1.02303648, "balance_loss_mlp": 1.00972033, "epoch": 0.0600030061626334, "flos": 69929568835200.0, "grad_norm": 1.0091092068179202, "language_loss": 0.75381488, "learning_rate": 3.990565284264083e-06, "loss": 0.7746582, "num_input_tokens_seen": 21363345, "step": 998, "time_per_iteration": 3.2950518131256104 }, { "auxiliary_loss_clip": 0.01170062, "auxiliary_loss_mlp": 0.01065521, "balance_loss_clip": 1.05893683, "balance_loss_mlp": 1.03893745, "epoch": 0.06006312941530137, "flos": 26540140268160.0, "grad_norm": 1.8197691299520968, "language_loss": 0.76053095, "learning_rate": 3.990527461790013e-06, "loss": 0.7828868, "num_input_tokens_seen": 21385290, "step": 999, "time_per_iteration": 2.733802556991577 }, { "auxiliary_loss_clip": 0.01197834, "auxiliary_loss_mlp": 0.01059542, "balance_loss_clip": 1.05646563, "balance_loss_mlp": 1.03339899, "epoch": 0.060123252667969335, "flos": 27344000689920.0, "grad_norm": 2.5948629341774874, "language_loss": 0.82992184, "learning_rate": 3.990489563834943e-06, "loss": 0.85249555, "num_input_tokens_seen": 21407625, "step": 1000, "time_per_iteration": 2.710981845855713 }, { "auxiliary_loss_clip": 0.0118571, "auxiliary_loss_mlp": 0.01062188, "balance_loss_clip": 1.05856955, "balance_loss_mlp": 1.03480577, "epoch": 0.06018337592063731, "flos": 27016710940800.0, "grad_norm": 2.111409807940472, "language_loss": 0.85820085, "learning_rate": 3.990451590400309e-06, "loss": 0.88067985, "num_input_tokens_seen": 21426835, "step": 1001, "time_per_iteration": 2.73445463180542 }, { "auxiliary_loss_clip": 0.01191917, "auxiliary_loss_mlp": 0.01062059, "balance_loss_clip": 1.06167853, "balance_loss_mlp": 1.03719211, "epoch": 0.06024349917330528, "flos": 25592960580480.0, "grad_norm": 1.8359711451165206, "language_loss": 0.74128318, "learning_rate": 3.990413541487551e-06, "loss": 0.76382297, "num_input_tokens_seen": 21444920, "step": 1002, "time_per_iteration": 2.8861100673675537 }, { "auxiliary_loss_clip": 0.01214316, "auxiliary_loss_mlp": 0.01062589, "balance_loss_clip": 1.06316125, "balance_loss_mlp": 1.03737664, "epoch": 0.060303622425973244, "flos": 26133271937280.0, "grad_norm": 2.1835040648243997, "language_loss": 0.75520515, "learning_rate": 3.990375417098112e-06, "loss": 0.77797419, "num_input_tokens_seen": 21463555, "step": 1003, "time_per_iteration": 2.632889747619629 }, { "auxiliary_loss_clip": 0.01187709, "auxiliary_loss_mlp": 0.01064806, "balance_loss_clip": 1.05934548, "balance_loss_mlp": 1.03928304, "epoch": 0.060363745678641216, "flos": 20377187418240.0, "grad_norm": 2.3150099602993155, "language_loss": 0.70349169, "learning_rate": 3.990337217233437e-06, "loss": 0.72601682, "num_input_tokens_seen": 21481990, "step": 1004, "time_per_iteration": 2.6947617530822754 }, { "auxiliary_loss_clip": 0.01212815, "auxiliary_loss_mlp": 0.01077454, "balance_loss_clip": 1.06629324, "balance_loss_mlp": 1.05168116, "epoch": 0.06042386893130918, "flos": 17749172753280.0, "grad_norm": 2.276868338025253, "language_loss": 0.83444524, "learning_rate": 3.990298941894976e-06, "loss": 0.85734791, "num_input_tokens_seen": 21500385, "step": 1005, "time_per_iteration": 2.581683397293091 }, { "auxiliary_loss_clip": 0.01077621, "auxiliary_loss_mlp": 0.01004707, "balance_loss_clip": 1.02541244, "balance_loss_mlp": 1.00029612, "epoch": 0.06048399218397715, "flos": 68538496872960.0, "grad_norm": 0.903813421793838, "language_loss": 0.59018111, "learning_rate": 3.9902605910841794e-06, "loss": 0.61100447, "num_input_tokens_seen": 21561040, "step": 1006, "time_per_iteration": 3.222104787826538 }, { "auxiliary_loss_clip": 0.01183553, "auxiliary_loss_mlp": 0.01059038, "balance_loss_clip": 1.05334234, "balance_loss_mlp": 1.03284812, "epoch": 0.060544115436645125, "flos": 23258515772160.0, "grad_norm": 2.1584333764290853, "language_loss": 0.74229443, "learning_rate": 3.990222164802503e-06, "loss": 0.76472032, "num_input_tokens_seen": 21580655, "step": 1007, "time_per_iteration": 2.7130653858184814 }, { "auxiliary_loss_clip": 0.0119408, "auxiliary_loss_mlp": 0.01060431, "balance_loss_clip": 1.06008601, "balance_loss_mlp": 1.03493261, "epoch": 0.06060423868931309, "flos": 23878441624320.0, "grad_norm": 1.7956876298455304, "language_loss": 0.8081426, "learning_rate": 3.9901836630514006e-06, "loss": 0.8306877, "num_input_tokens_seen": 21599650, "step": 1008, "time_per_iteration": 2.7151994705200195 }, { "auxiliary_loss_clip": 0.01175291, "auxiliary_loss_mlp": 0.01056357, "balance_loss_clip": 1.05982351, "balance_loss_mlp": 1.0305717, "epoch": 0.06066436194198106, "flos": 18728061171840.0, "grad_norm": 2.3069524306559837, "language_loss": 0.78198558, "learning_rate": 3.990145085832335e-06, "loss": 0.8043021, "num_input_tokens_seen": 21617550, "step": 1009, "time_per_iteration": 2.7313599586486816 }, { "auxiliary_loss_clip": 0.01194621, "auxiliary_loss_mlp": 0.01061233, "balance_loss_clip": 1.06150866, "balance_loss_mlp": 1.03726041, "epoch": 0.06072448519464903, "flos": 24640465680000.0, "grad_norm": 1.7452257697216769, "language_loss": 0.93148172, "learning_rate": 3.990106433146769e-06, "loss": 0.95404023, "num_input_tokens_seen": 21635865, "step": 1010, "time_per_iteration": 2.7233662605285645 }, { "auxiliary_loss_clip": 0.01148246, "auxiliary_loss_mlp": 0.00784144, "balance_loss_clip": 1.05304599, "balance_loss_mlp": 1.00029802, "epoch": 0.060784608447317, "flos": 17378825575680.0, "grad_norm": 2.9999367504779517, "language_loss": 0.72022474, "learning_rate": 3.9900677049961665e-06, "loss": 0.73954868, "num_input_tokens_seen": 21653945, "step": 1011, "time_per_iteration": 2.804858446121216 }, { "auxiliary_loss_clip": 0.01194231, "auxiliary_loss_mlp": 0.01077344, "balance_loss_clip": 1.05968046, "balance_loss_mlp": 1.04868615, "epoch": 0.06084473169998497, "flos": 23692208584320.0, "grad_norm": 1.9573218215833301, "language_loss": 0.87526691, "learning_rate": 3.990028901381999e-06, "loss": 0.89798272, "num_input_tokens_seen": 21671230, "step": 1012, "time_per_iteration": 2.6466245651245117 }, { "auxiliary_loss_clip": 0.01184459, "auxiliary_loss_mlp": 0.01064264, "balance_loss_clip": 1.05652905, "balance_loss_mlp": 1.03838325, "epoch": 0.06090485495265294, "flos": 23546339452800.0, "grad_norm": 1.9062230938156723, "language_loss": 0.76947677, "learning_rate": 3.989990022305734e-06, "loss": 0.79196405, "num_input_tokens_seen": 21691155, "step": 1013, "time_per_iteration": 4.297588586807251 }, { "auxiliary_loss_clip": 0.01207383, "auxiliary_loss_mlp": 0.00783488, "balance_loss_clip": 1.06573224, "balance_loss_mlp": 1.00034499, "epoch": 0.06096497820532091, "flos": 20339301548160.0, "grad_norm": 2.441711811862119, "language_loss": 0.86151874, "learning_rate": 3.98995106776885e-06, "loss": 0.88142747, "num_input_tokens_seen": 21707405, "step": 1014, "time_per_iteration": 4.301488637924194 }, { "auxiliary_loss_clip": 0.0121503, "auxiliary_loss_mlp": 0.01072817, "balance_loss_clip": 1.06605387, "balance_loss_mlp": 1.04508948, "epoch": 0.061025101457988874, "flos": 26939035779840.0, "grad_norm": 2.4309754772209184, "language_loss": 0.73197287, "learning_rate": 3.98991203777282e-06, "loss": 0.75485134, "num_input_tokens_seen": 21728090, "step": 1015, "time_per_iteration": 4.384514808654785 }, { "auxiliary_loss_clip": 0.01187374, "auxiliary_loss_mlp": 0.01068593, "balance_loss_clip": 1.06084347, "balance_loss_mlp": 1.04228365, "epoch": 0.061085224710656846, "flos": 25375054723200.0, "grad_norm": 1.5896529502124837, "language_loss": 0.79109907, "learning_rate": 3.9898729323191275e-06, "loss": 0.81365877, "num_input_tokens_seen": 21747950, "step": 1016, "time_per_iteration": 4.3249351978302 }, { "auxiliary_loss_clip": 0.01173015, "auxiliary_loss_mlp": 0.0105746, "balance_loss_clip": 1.06036103, "balance_loss_mlp": 1.03249741, "epoch": 0.06114534796332482, "flos": 24824759385600.0, "grad_norm": 1.6772682648410928, "language_loss": 0.76014191, "learning_rate": 3.989833751409254e-06, "loss": 0.78244662, "num_input_tokens_seen": 21767900, "step": 1017, "time_per_iteration": 2.7983243465423584 }, { "auxiliary_loss_clip": 0.01188817, "auxiliary_loss_mlp": 0.01074603, "balance_loss_clip": 1.06584609, "balance_loss_mlp": 1.0483532, "epoch": 0.061205471215992784, "flos": 20631434860800.0, "grad_norm": 2.001716657382839, "language_loss": 0.85798436, "learning_rate": 3.989794495044685e-06, "loss": 0.88061857, "num_input_tokens_seen": 21787375, "step": 1018, "time_per_iteration": 2.702399253845215 }, { "auxiliary_loss_clip": 0.01174344, "auxiliary_loss_mlp": 0.01069438, "balance_loss_clip": 1.06325769, "balance_loss_mlp": 1.04231787, "epoch": 0.061265594468660756, "flos": 16508351381760.0, "grad_norm": 2.9546929267460813, "language_loss": 0.76985347, "learning_rate": 3.989755163226909e-06, "loss": 0.79229128, "num_input_tokens_seen": 21806275, "step": 1019, "time_per_iteration": 2.780104875564575 }, { "auxiliary_loss_clip": 0.01160861, "auxiliary_loss_mlp": 0.0106141, "balance_loss_clip": 1.05355084, "balance_loss_mlp": 1.03511262, "epoch": 0.06132571772132872, "flos": 26246211275520.0, "grad_norm": 2.1848809329980288, "language_loss": 0.84122044, "learning_rate": 3.989715755957418e-06, "loss": 0.86344314, "num_input_tokens_seen": 21826430, "step": 1020, "time_per_iteration": 2.785963535308838 }, { "auxiliary_loss_clip": 0.01198473, "auxiliary_loss_mlp": 0.01063342, "balance_loss_clip": 1.06365371, "balance_loss_mlp": 1.03604269, "epoch": 0.06138584097399669, "flos": 37414788768000.0, "grad_norm": 1.933053672026977, "language_loss": 0.79114467, "learning_rate": 3.989676273237705e-06, "loss": 0.81376278, "num_input_tokens_seen": 21847800, "step": 1021, "time_per_iteration": 2.7968955039978027 }, { "auxiliary_loss_clip": 0.01189659, "auxiliary_loss_mlp": 0.01064044, "balance_loss_clip": 1.06159925, "balance_loss_mlp": 1.04114437, "epoch": 0.061445964226664665, "flos": 17420661941760.0, "grad_norm": 2.089525934673828, "language_loss": 0.87768298, "learning_rate": 3.9896367150692705e-06, "loss": 0.90022004, "num_input_tokens_seen": 21863385, "step": 1022, "time_per_iteration": 2.70906138420105 }, { "auxiliary_loss_clip": 0.01198737, "auxiliary_loss_mlp": 0.0106635, "balance_loss_clip": 1.06627858, "balance_loss_mlp": 1.04079151, "epoch": 0.06150608747933263, "flos": 22600021691520.0, "grad_norm": 1.7284486983379121, "language_loss": 0.82892007, "learning_rate": 3.989597081453611e-06, "loss": 0.85157096, "num_input_tokens_seen": 21881880, "step": 1023, "time_per_iteration": 2.71539568901062 }, { "auxiliary_loss_clip": 0.01100664, "auxiliary_loss_mlp": 0.01010751, "balance_loss_clip": 1.03727341, "balance_loss_mlp": 1.00614953, "epoch": 0.0615662107320006, "flos": 56741482005120.0, "grad_norm": 0.8894752517384502, "language_loss": 0.6505782, "learning_rate": 3.989557372392231e-06, "loss": 0.67169237, "num_input_tokens_seen": 21940550, "step": 1024, "time_per_iteration": 3.175217628479004 }, { "auxiliary_loss_clip": 0.01167458, "auxiliary_loss_mlp": 0.01073669, "balance_loss_clip": 1.05906856, "balance_loss_mlp": 1.04553604, "epoch": 0.06162633398466857, "flos": 22564793427840.0, "grad_norm": 2.320347485789288, "language_loss": 0.88069236, "learning_rate": 3.989517587886636e-06, "loss": 0.90310359, "num_input_tokens_seen": 21958390, "step": 1025, "time_per_iteration": 2.690725564956665 }, { "auxiliary_loss_clip": 0.01197064, "auxiliary_loss_mlp": 0.01066504, "balance_loss_clip": 1.06452, "balance_loss_mlp": 1.04173219, "epoch": 0.06168645723733654, "flos": 25593104234880.0, "grad_norm": 2.5217294712155414, "language_loss": 0.84536898, "learning_rate": 3.989477727938335e-06, "loss": 0.86800468, "num_input_tokens_seen": 21978625, "step": 1026, "time_per_iteration": 2.7420806884765625 }, { "auxiliary_loss_clip": 0.01160797, "auxiliary_loss_mlp": 0.0107525, "balance_loss_clip": 1.05669701, "balance_loss_mlp": 1.04934609, "epoch": 0.06174658049000451, "flos": 15997917162240.0, "grad_norm": 2.354014397396182, "language_loss": 0.8228389, "learning_rate": 3.989437792548839e-06, "loss": 0.84519935, "num_input_tokens_seen": 21996035, "step": 1027, "time_per_iteration": 2.6683874130249023 }, { "auxiliary_loss_clip": 0.01160199, "auxiliary_loss_mlp": 0.01067253, "balance_loss_clip": 1.06181073, "balance_loss_mlp": 1.04232645, "epoch": 0.06180670374267248, "flos": 11285970117120.0, "grad_norm": 4.43492107605727, "language_loss": 0.83898664, "learning_rate": 3.989397781719663e-06, "loss": 0.86126107, "num_input_tokens_seen": 22011625, "step": 1028, "time_per_iteration": 2.705387592315674 }, { "auxiliary_loss_clip": 0.0106503, "auxiliary_loss_mlp": 0.01008074, "balance_loss_clip": 1.02410197, "balance_loss_mlp": 1.00347257, "epoch": 0.06186682699534045, "flos": 65130142216320.0, "grad_norm": 0.9383255649985517, "language_loss": 0.604738, "learning_rate": 3.989357695452323e-06, "loss": 0.62546903, "num_input_tokens_seen": 22066035, "step": 1029, "time_per_iteration": 3.0268616676330566 }, { "auxiliary_loss_clip": 0.01176182, "auxiliary_loss_mlp": 0.01074173, "balance_loss_clip": 1.05641246, "balance_loss_mlp": 1.04737473, "epoch": 0.061926950248008414, "flos": 21105742976640.0, "grad_norm": 4.246634693563946, "language_loss": 0.82589179, "learning_rate": 3.98931753374834e-06, "loss": 0.84839535, "num_input_tokens_seen": 22085015, "step": 1030, "time_per_iteration": 2.7035892009735107 }, { "auxiliary_loss_clip": 0.0122298, "auxiliary_loss_mlp": 0.01077745, "balance_loss_clip": 1.06850278, "balance_loss_mlp": 1.05185235, "epoch": 0.061987073500676386, "flos": 17748454481280.0, "grad_norm": 2.585240230669548, "language_loss": 0.79576576, "learning_rate": 3.989277296609237e-06, "loss": 0.81877303, "num_input_tokens_seen": 22102775, "step": 1031, "time_per_iteration": 2.60622501373291 }, { "auxiliary_loss_clip": 0.01188957, "auxiliary_loss_mlp": 0.01076754, "balance_loss_clip": 1.06396544, "balance_loss_mlp": 1.04982424, "epoch": 0.06204719675334436, "flos": 21836237869440.0, "grad_norm": 1.8815476991595563, "language_loss": 0.77384412, "learning_rate": 3.98923698403654e-06, "loss": 0.79650116, "num_input_tokens_seen": 22121680, "step": 1032, "time_per_iteration": 2.6753971576690674 }, { "auxiliary_loss_clip": 0.01198757, "auxiliary_loss_mlp": 0.01074736, "balance_loss_clip": 1.05916619, "balance_loss_mlp": 1.04848623, "epoch": 0.06210732000601232, "flos": 19353697286400.0, "grad_norm": 3.147941025479245, "language_loss": 0.89323574, "learning_rate": 3.989196596031776e-06, "loss": 0.91597068, "num_input_tokens_seen": 22138155, "step": 1033, "time_per_iteration": 2.7313079833984375 }, { "auxiliary_loss_clip": 0.01209161, "auxiliary_loss_mlp": 0.01066082, "balance_loss_clip": 1.06214237, "balance_loss_mlp": 1.04119134, "epoch": 0.062167443258680295, "flos": 24749382695040.0, "grad_norm": 2.1035343880884145, "language_loss": 0.8455385, "learning_rate": 3.989156132596479e-06, "loss": 0.8682909, "num_input_tokens_seen": 22157420, "step": 1034, "time_per_iteration": 2.7541439533233643 }, { "auxiliary_loss_clip": 0.01180042, "auxiliary_loss_mlp": 0.01057312, "balance_loss_clip": 1.05896068, "balance_loss_mlp": 1.03155136, "epoch": 0.06222756651134827, "flos": 34458478773120.0, "grad_norm": 1.8983498110529735, "language_loss": 0.8082794, "learning_rate": 3.989115593732182e-06, "loss": 0.83065289, "num_input_tokens_seen": 22178620, "step": 1035, "time_per_iteration": 2.7965424060821533 }, { "auxiliary_loss_clip": 0.01158806, "auxiliary_loss_mlp": 0.01072478, "balance_loss_clip": 1.05936599, "balance_loss_mlp": 1.04432034, "epoch": 0.06228768976401623, "flos": 25666469763840.0, "grad_norm": 2.145216314952277, "language_loss": 0.78365827, "learning_rate": 3.989074979440421e-06, "loss": 0.80597103, "num_input_tokens_seen": 22197125, "step": 1036, "time_per_iteration": 2.7858450412750244 }, { "auxiliary_loss_clip": 0.01192097, "auxiliary_loss_mlp": 0.01071382, "balance_loss_clip": 1.05977845, "balance_loss_mlp": 1.04663444, "epoch": 0.062347813016684205, "flos": 25295619795840.0, "grad_norm": 1.9535870339716077, "language_loss": 0.86544567, "learning_rate": 3.989034289722739e-06, "loss": 0.88808048, "num_input_tokens_seen": 22217575, "step": 1037, "time_per_iteration": 2.685373306274414 }, { "auxiliary_loss_clip": 0.01197778, "auxiliary_loss_mlp": 0.01057095, "balance_loss_clip": 1.06127763, "balance_loss_mlp": 1.02966499, "epoch": 0.06240793626935217, "flos": 26907039740160.0, "grad_norm": 2.697396725345887, "language_loss": 0.8067717, "learning_rate": 3.988993524580676e-06, "loss": 0.82932043, "num_input_tokens_seen": 22236840, "step": 1038, "time_per_iteration": 2.7305831909179688 }, { "auxiliary_loss_clip": 0.01145896, "auxiliary_loss_mlp": 0.01072721, "balance_loss_clip": 1.05226004, "balance_loss_mlp": 1.04330015, "epoch": 0.06246805952202014, "flos": 21615782146560.0, "grad_norm": 1.8888526922505675, "language_loss": 0.85465872, "learning_rate": 3.98895268401578e-06, "loss": 0.87684488, "num_input_tokens_seen": 22256465, "step": 1039, "time_per_iteration": 2.7351109981536865 }, { "auxiliary_loss_clip": 0.01188545, "auxiliary_loss_mlp": 0.01070323, "balance_loss_clip": 1.05834138, "balance_loss_mlp": 1.04472923, "epoch": 0.0625281827746881, "flos": 19311896833920.0, "grad_norm": 2.217895985816133, "language_loss": 0.81172895, "learning_rate": 3.9889117680296e-06, "loss": 0.83431756, "num_input_tokens_seen": 22274025, "step": 1040, "time_per_iteration": 2.6532907485961914 }, { "auxiliary_loss_clip": 0.0121654, "auxiliary_loss_mlp": 0.0106312, "balance_loss_clip": 1.06718016, "balance_loss_mlp": 1.03808582, "epoch": 0.06258830602735609, "flos": 27745769289600.0, "grad_norm": 2.1960038080149817, "language_loss": 0.69304991, "learning_rate": 3.988870776623685e-06, "loss": 0.71584648, "num_input_tokens_seen": 22292245, "step": 1041, "time_per_iteration": 2.6445486545562744 }, { "auxiliary_loss_clip": 0.01214659, "auxiliary_loss_mlp": 0.01057975, "balance_loss_clip": 1.06247008, "balance_loss_mlp": 1.03182077, "epoch": 0.06264842928002405, "flos": 23222605150080.0, "grad_norm": 2.7326158002445, "language_loss": 0.81187552, "learning_rate": 3.9888297097995905e-06, "loss": 0.83460188, "num_input_tokens_seen": 22311455, "step": 1042, "time_per_iteration": 2.6111559867858887 }, { "auxiliary_loss_clip": 0.01211653, "auxiliary_loss_mlp": 0.01052676, "balance_loss_clip": 1.06253886, "balance_loss_mlp": 1.02871442, "epoch": 0.06270855253269202, "flos": 38399495189760.0, "grad_norm": 1.7165873820424848, "language_loss": 0.76349056, "learning_rate": 3.988788567558874e-06, "loss": 0.78613389, "num_input_tokens_seen": 22333750, "step": 1043, "time_per_iteration": 2.761768341064453 }, { "auxiliary_loss_clip": 0.0118944, "auxiliary_loss_mlp": 0.01063189, "balance_loss_clip": 1.06111181, "balance_loss_mlp": 1.03912091, "epoch": 0.06276867578535998, "flos": 22453542028800.0, "grad_norm": 8.34017761542712, "language_loss": 0.92031956, "learning_rate": 3.988747349903097e-06, "loss": 0.94284582, "num_input_tokens_seen": 22351940, "step": 1044, "time_per_iteration": 2.636179208755493 }, { "auxiliary_loss_clip": 0.01192566, "auxiliary_loss_mlp": 0.01070128, "balance_loss_clip": 1.05862689, "balance_loss_mlp": 1.0456785, "epoch": 0.06282879903802796, "flos": 22930435923840.0, "grad_norm": 2.3486674311430944, "language_loss": 0.85913992, "learning_rate": 3.988706056833821e-06, "loss": 0.88176692, "num_input_tokens_seen": 22372085, "step": 1045, "time_per_iteration": 2.7749502658843994 }, { "auxiliary_loss_clip": 0.01179197, "auxiliary_loss_mlp": 0.01065176, "balance_loss_clip": 1.05804443, "balance_loss_mlp": 1.04053521, "epoch": 0.06288892229069593, "flos": 34819237019520.0, "grad_norm": 1.9846122850853416, "language_loss": 0.7796576, "learning_rate": 3.9886646883526125e-06, "loss": 0.80210131, "num_input_tokens_seen": 22392020, "step": 1046, "time_per_iteration": 2.803135871887207 }, { "auxiliary_loss_clip": 0.01197344, "auxiliary_loss_mlp": 0.01069269, "balance_loss_clip": 1.06361508, "balance_loss_mlp": 1.04558206, "epoch": 0.06294904554336389, "flos": 19427134642560.0, "grad_norm": 2.174325060947129, "language_loss": 0.77326387, "learning_rate": 3.988623244461039e-06, "loss": 0.79592997, "num_input_tokens_seen": 22411180, "step": 1047, "time_per_iteration": 2.647446632385254 }, { "auxiliary_loss_clip": 0.01200907, "auxiliary_loss_mlp": 0.0105799, "balance_loss_clip": 1.06238222, "balance_loss_mlp": 1.03314662, "epoch": 0.06300916879603187, "flos": 40661867358720.0, "grad_norm": 2.4899372640825046, "language_loss": 0.77190751, "learning_rate": 3.988581725160672e-06, "loss": 0.79449654, "num_input_tokens_seen": 22435105, "step": 1048, "time_per_iteration": 2.8167293071746826 }, { "auxiliary_loss_clip": 0.0118184, "auxiliary_loss_mlp": 0.01064361, "balance_loss_clip": 1.0613215, "balance_loss_mlp": 1.03914821, "epoch": 0.06306929204869983, "flos": 23804142341760.0, "grad_norm": 4.606540291271834, "language_loss": 0.77258086, "learning_rate": 3.988540130453087e-06, "loss": 0.79504287, "num_input_tokens_seen": 22452710, "step": 1049, "time_per_iteration": 2.6908538341522217 }, { "auxiliary_loss_clip": 0.01194538, "auxiliary_loss_mlp": 0.0105682, "balance_loss_clip": 1.06043661, "balance_loss_mlp": 1.03290701, "epoch": 0.0631294153013678, "flos": 18915802583040.0, "grad_norm": 2.515998307474139, "language_loss": 0.83302009, "learning_rate": 3.988498460339862e-06, "loss": 0.85553372, "num_input_tokens_seen": 22470175, "step": 1050, "time_per_iteration": 2.62186861038208 }, { "auxiliary_loss_clip": 0.01210654, "auxiliary_loss_mlp": 0.01062894, "balance_loss_clip": 1.06468701, "balance_loss_mlp": 1.04008913, "epoch": 0.06318953855403578, "flos": 24280174310400.0, "grad_norm": 5.5478202090132065, "language_loss": 0.76564771, "learning_rate": 3.988456714822575e-06, "loss": 0.78838319, "num_input_tokens_seen": 22490020, "step": 1051, "time_per_iteration": 2.732269525527954 }, { "auxiliary_loss_clip": 0.01188416, "auxiliary_loss_mlp": 0.01069443, "balance_loss_clip": 1.06340146, "balance_loss_mlp": 1.04492211, "epoch": 0.06324966180670374, "flos": 22528918719360.0, "grad_norm": 1.9993900469270787, "language_loss": 0.80410004, "learning_rate": 3.98841489390281e-06, "loss": 0.82667863, "num_input_tokens_seen": 22509685, "step": 1052, "time_per_iteration": 2.7683873176574707 }, { "auxiliary_loss_clip": 0.01211333, "auxiliary_loss_mlp": 0.01058255, "balance_loss_clip": 1.06324601, "balance_loss_mlp": 1.03468728, "epoch": 0.06330978505937171, "flos": 15778107884160.0, "grad_norm": 2.370007457349547, "language_loss": 0.77433288, "learning_rate": 3.988372997582155e-06, "loss": 0.79702866, "num_input_tokens_seen": 22527905, "step": 1053, "time_per_iteration": 5.757168531417847 }, { "auxiliary_loss_clip": 0.01190721, "auxiliary_loss_mlp": 0.00780448, "balance_loss_clip": 1.06378174, "balance_loss_mlp": 1.00028598, "epoch": 0.06336990831203967, "flos": 21471098163840.0, "grad_norm": 3.085258828985267, "language_loss": 0.84931248, "learning_rate": 3.988331025862195e-06, "loss": 0.86902416, "num_input_tokens_seen": 22546335, "step": 1054, "time_per_iteration": 2.7733829021453857 }, { "auxiliary_loss_clip": 0.01172281, "auxiliary_loss_mlp": 0.01061232, "balance_loss_clip": 1.05722666, "balance_loss_mlp": 1.03753328, "epoch": 0.06343003156470765, "flos": 18478877546880.0, "grad_norm": 2.0168531459993435, "language_loss": 0.85884213, "learning_rate": 3.9882889787445225e-06, "loss": 0.88117731, "num_input_tokens_seen": 22563885, "step": 1055, "time_per_iteration": 4.490305185317993 }, { "auxiliary_loss_clip": 0.01164237, "auxiliary_loss_mlp": 0.01069785, "balance_loss_clip": 1.05727792, "balance_loss_mlp": 1.04534709, "epoch": 0.06349015481737562, "flos": 25154886309120.0, "grad_norm": 2.4509218988768, "language_loss": 0.8113938, "learning_rate": 3.988246856230734e-06, "loss": 0.83373404, "num_input_tokens_seen": 22583035, "step": 1056, "time_per_iteration": 5.345282793045044 }, { "auxiliary_loss_clip": 0.01144181, "auxiliary_loss_mlp": 0.01061125, "balance_loss_clip": 1.04991364, "balance_loss_mlp": 1.03449368, "epoch": 0.06355027807004358, "flos": 26871775562880.0, "grad_norm": 2.2117272688527128, "language_loss": 0.81083393, "learning_rate": 3.988204658322426e-06, "loss": 0.83288693, "num_input_tokens_seen": 22605055, "step": 1057, "time_per_iteration": 2.866757392883301 }, { "auxiliary_loss_clip": 0.01139076, "auxiliary_loss_mlp": 0.01061742, "balance_loss_clip": 1.04970908, "balance_loss_mlp": 1.03918755, "epoch": 0.06361040132271156, "flos": 21396691140480.0, "grad_norm": 1.9636971972870172, "language_loss": 0.83353591, "learning_rate": 3.988162385021196e-06, "loss": 0.85554409, "num_input_tokens_seen": 22623760, "step": 1058, "time_per_iteration": 2.767024278640747 }, { "auxiliary_loss_clip": 0.0117752, "auxiliary_loss_mlp": 0.01059639, "balance_loss_clip": 1.0576936, "balance_loss_mlp": 1.03408027, "epoch": 0.06367052457537953, "flos": 25733765894400.0, "grad_norm": 2.137077300251244, "language_loss": 0.87556928, "learning_rate": 3.988120036328651e-06, "loss": 0.89794087, "num_input_tokens_seen": 22643000, "step": 1059, "time_per_iteration": 2.794734239578247 }, { "auxiliary_loss_clip": 0.01169658, "auxiliary_loss_mlp": 0.01063463, "balance_loss_clip": 1.06196678, "balance_loss_mlp": 1.0383693, "epoch": 0.0637306478280475, "flos": 17631420992640.0, "grad_norm": 2.543966627588717, "language_loss": 0.91561133, "learning_rate": 3.988077612246394e-06, "loss": 0.93794256, "num_input_tokens_seen": 22660460, "step": 1060, "time_per_iteration": 2.8223626613616943 }, { "auxiliary_loss_clip": 0.01173933, "auxiliary_loss_mlp": 0.01065151, "balance_loss_clip": 1.05715585, "balance_loss_mlp": 1.03981876, "epoch": 0.06379077108071547, "flos": 13662610427520.0, "grad_norm": 1.9401711052692647, "language_loss": 0.87242293, "learning_rate": 3.988035112776035e-06, "loss": 0.89481378, "num_input_tokens_seen": 22679270, "step": 1061, "time_per_iteration": 2.7783865928649902 }, { "auxiliary_loss_clip": 0.01190039, "auxiliary_loss_mlp": 0.01059971, "balance_loss_clip": 1.05976009, "balance_loss_mlp": 1.03388786, "epoch": 0.06385089433338344, "flos": 28478849961600.0, "grad_norm": 5.360593029379932, "language_loss": 0.77407908, "learning_rate": 3.987992537919185e-06, "loss": 0.79657912, "num_input_tokens_seen": 22699330, "step": 1062, "time_per_iteration": 2.872587203979492 }, { "auxiliary_loss_clip": 0.01172912, "auxiliary_loss_mlp": 0.01061175, "balance_loss_clip": 1.05884075, "balance_loss_mlp": 1.03798842, "epoch": 0.0639110175860514, "flos": 24311057028480.0, "grad_norm": 2.2658654128491436, "language_loss": 0.86522883, "learning_rate": 3.987949887677459e-06, "loss": 0.88756967, "num_input_tokens_seen": 22717945, "step": 1063, "time_per_iteration": 2.7915029525756836 }, { "auxiliary_loss_clip": 0.01207773, "auxiliary_loss_mlp": 0.01062698, "balance_loss_clip": 1.05969334, "balance_loss_mlp": 1.03846335, "epoch": 0.06397114083871938, "flos": 22090772620800.0, "grad_norm": 2.302236346678267, "language_loss": 0.79908657, "learning_rate": 3.9879071620524744e-06, "loss": 0.82179129, "num_input_tokens_seen": 22736790, "step": 1064, "time_per_iteration": 2.6880991458892822 }, { "auxiliary_loss_clip": 0.01198826, "auxiliary_loss_mlp": 0.01066465, "balance_loss_clip": 1.0603801, "balance_loss_mlp": 1.04149103, "epoch": 0.06403126409138735, "flos": 19572824206080.0, "grad_norm": 3.1552731138796215, "language_loss": 0.84327948, "learning_rate": 3.987864361045851e-06, "loss": 0.8659324, "num_input_tokens_seen": 22754745, "step": 1065, "time_per_iteration": 2.6956398487091064 }, { "auxiliary_loss_clip": 0.01168098, "auxiliary_loss_mlp": 0.01054905, "balance_loss_clip": 1.0597136, "balance_loss_mlp": 1.03162324, "epoch": 0.06409138734405531, "flos": 40807413267840.0, "grad_norm": 1.52830872536012, "language_loss": 0.68177885, "learning_rate": 3.987821484659211e-06, "loss": 0.70400894, "num_input_tokens_seen": 22776780, "step": 1066, "time_per_iteration": 2.9867773056030273 }, { "auxiliary_loss_clip": 0.01214184, "auxiliary_loss_mlp": 0.01070649, "balance_loss_clip": 1.06780005, "balance_loss_mlp": 1.04609215, "epoch": 0.06415151059672328, "flos": 20441610460800.0, "grad_norm": 1.8546001537284342, "language_loss": 0.90349269, "learning_rate": 3.987778532894181e-06, "loss": 0.926341, "num_input_tokens_seen": 22793915, "step": 1067, "time_per_iteration": 2.685896873474121 }, { "auxiliary_loss_clip": 0.01188134, "auxiliary_loss_mlp": 0.01063022, "balance_loss_clip": 1.0623709, "balance_loss_mlp": 1.03969264, "epoch": 0.06421163384939126, "flos": 18072045129600.0, "grad_norm": 2.189788428445167, "language_loss": 0.83437371, "learning_rate": 3.987735505752391e-06, "loss": 0.85688531, "num_input_tokens_seen": 22812670, "step": 1068, "time_per_iteration": 2.851602554321289 }, { "auxiliary_loss_clip": 0.01178972, "auxiliary_loss_mlp": 0.01057745, "balance_loss_clip": 1.05909026, "balance_loss_mlp": 1.03426039, "epoch": 0.06427175710205922, "flos": 25119442563840.0, "grad_norm": 3.045176948020938, "language_loss": 0.89311272, "learning_rate": 3.987692403235471e-06, "loss": 0.9154799, "num_input_tokens_seen": 22832440, "step": 1069, "time_per_iteration": 2.7825255393981934 }, { "auxiliary_loss_clip": 0.01185672, "auxiliary_loss_mlp": 0.01071834, "balance_loss_clip": 1.06158304, "balance_loss_mlp": 1.04663301, "epoch": 0.06433188035472719, "flos": 17380549428480.0, "grad_norm": 2.7038488706194808, "language_loss": 0.95759481, "learning_rate": 3.987649225345056e-06, "loss": 0.98016989, "num_input_tokens_seen": 22845495, "step": 1070, "time_per_iteration": 2.715296506881714 }, { "auxiliary_loss_clip": 0.01140792, "auxiliary_loss_mlp": 0.01056718, "balance_loss_clip": 1.05607581, "balance_loss_mlp": 1.03027749, "epoch": 0.06439200360739517, "flos": 23546267625600.0, "grad_norm": 1.630790580283393, "language_loss": 0.8811003, "learning_rate": 3.987605972082782e-06, "loss": 0.90307534, "num_input_tokens_seen": 22865390, "step": 1071, "time_per_iteration": 2.8445394039154053 }, { "auxiliary_loss_clip": 0.01155172, "auxiliary_loss_mlp": 0.01054986, "balance_loss_clip": 1.05483651, "balance_loss_mlp": 1.03102481, "epoch": 0.06445212686006313, "flos": 21979772616960.0, "grad_norm": 1.8349443396730127, "language_loss": 0.76116478, "learning_rate": 3.987562643450292e-06, "loss": 0.78326637, "num_input_tokens_seen": 22885495, "step": 1072, "time_per_iteration": 2.8330819606781006 }, { "auxiliary_loss_clip": 0.01172997, "auxiliary_loss_mlp": 0.01070104, "balance_loss_clip": 1.05975842, "balance_loss_mlp": 1.04362798, "epoch": 0.0645122501127311, "flos": 25921291824000.0, "grad_norm": 2.724283900767911, "language_loss": 0.80849886, "learning_rate": 3.987519239449226e-06, "loss": 0.83092993, "num_input_tokens_seen": 22904845, "step": 1073, "time_per_iteration": 2.748286247253418 }, { "auxiliary_loss_clip": 0.01194712, "auxiliary_loss_mlp": 0.01062452, "balance_loss_clip": 1.06345201, "balance_loss_mlp": 1.03825283, "epoch": 0.06457237336539907, "flos": 25626034028160.0, "grad_norm": 5.0538746884234245, "language_loss": 0.80282539, "learning_rate": 3.987475760081233e-06, "loss": 0.82539707, "num_input_tokens_seen": 22925940, "step": 1074, "time_per_iteration": 2.7482337951660156 }, { "auxiliary_loss_clip": 0.01173366, "auxiliary_loss_mlp": 0.01057774, "balance_loss_clip": 1.05920076, "balance_loss_mlp": 1.03256142, "epoch": 0.06463249661806704, "flos": 19463979018240.0, "grad_norm": 2.0209756517373707, "language_loss": 0.79249811, "learning_rate": 3.987432205347958e-06, "loss": 0.8148095, "num_input_tokens_seen": 22944375, "step": 1075, "time_per_iteration": 2.6937224864959717 }, { "auxiliary_loss_clip": 0.01171297, "auxiliary_loss_mlp": 0.01063569, "balance_loss_clip": 1.05735481, "balance_loss_mlp": 1.04025126, "epoch": 0.064692619870735, "flos": 24498044254080.0, "grad_norm": 2.9028991302223357, "language_loss": 0.88208115, "learning_rate": 3.987388575251055e-06, "loss": 0.90442967, "num_input_tokens_seen": 22959145, "step": 1076, "time_per_iteration": 2.878103256225586 }, { "auxiliary_loss_clip": 0.01192915, "auxiliary_loss_mlp": 0.01052877, "balance_loss_clip": 1.06164443, "balance_loss_mlp": 1.0288558, "epoch": 0.06475274312340297, "flos": 17018677860480.0, "grad_norm": 2.225760792581628, "language_loss": 0.80876106, "learning_rate": 3.98734486979218e-06, "loss": 0.83121902, "num_input_tokens_seen": 22978100, "step": 1077, "time_per_iteration": 2.7221076488494873 }, { "auxiliary_loss_clip": 0.01200466, "auxiliary_loss_mlp": 0.01064019, "balance_loss_clip": 1.0656153, "balance_loss_mlp": 1.03866291, "epoch": 0.06481286637607095, "flos": 24572379450240.0, "grad_norm": 2.256787147683815, "language_loss": 0.91727465, "learning_rate": 3.987301088972986e-06, "loss": 0.93991947, "num_input_tokens_seen": 22997285, "step": 1078, "time_per_iteration": 2.862365484237671 }, { "auxiliary_loss_clip": 0.0122435, "auxiliary_loss_mlp": 0.01060225, "balance_loss_clip": 1.06826639, "balance_loss_mlp": 1.03552508, "epoch": 0.06487298962873891, "flos": 21105635235840.0, "grad_norm": 2.080056711608912, "language_loss": 0.78349572, "learning_rate": 3.987257232795137e-06, "loss": 0.80634147, "num_input_tokens_seen": 23016285, "step": 1079, "time_per_iteration": 2.6435368061065674 }, { "auxiliary_loss_clip": 0.01156927, "auxiliary_loss_mlp": 0.01063794, "balance_loss_clip": 1.05512071, "balance_loss_mlp": 1.03899896, "epoch": 0.06493311288140688, "flos": 24608182331520.0, "grad_norm": 2.274862403364013, "language_loss": 0.68702769, "learning_rate": 3.987213301260294e-06, "loss": 0.70923495, "num_input_tokens_seen": 23036420, "step": 1080, "time_per_iteration": 2.7782626152038574 }, { "auxiliary_loss_clip": 0.01175684, "auxiliary_loss_mlp": 0.01062351, "balance_loss_clip": 1.06640029, "balance_loss_mlp": 1.03610086, "epoch": 0.06499323613407486, "flos": 25337994865920.0, "grad_norm": 1.886196453243775, "language_loss": 0.72291583, "learning_rate": 3.987169294370123e-06, "loss": 0.74529618, "num_input_tokens_seen": 23056945, "step": 1081, "time_per_iteration": 2.7983880043029785 }, { "auxiliary_loss_clip": 0.01139671, "auxiliary_loss_mlp": 0.01066686, "balance_loss_clip": 1.0504055, "balance_loss_mlp": 1.04076982, "epoch": 0.06505335938674282, "flos": 20375714960640.0, "grad_norm": 3.3093934650613566, "language_loss": 0.84059012, "learning_rate": 3.987125212126294e-06, "loss": 0.86265367, "num_input_tokens_seen": 23074940, "step": 1082, "time_per_iteration": 2.8351900577545166 }, { "auxiliary_loss_clip": 0.01204185, "auxiliary_loss_mlp": 0.01063692, "balance_loss_clip": 1.06306195, "balance_loss_mlp": 1.03809738, "epoch": 0.06511348263941079, "flos": 25337923038720.0, "grad_norm": 2.894360492506304, "language_loss": 0.82550305, "learning_rate": 3.987081054530478e-06, "loss": 0.84818184, "num_input_tokens_seen": 23093420, "step": 1083, "time_per_iteration": 2.866729974746704 }, { "auxiliary_loss_clip": 0.01168245, "auxiliary_loss_mlp": 0.01062938, "balance_loss_clip": 1.06021011, "balance_loss_mlp": 1.03655696, "epoch": 0.06517360589207877, "flos": 20332801186560.0, "grad_norm": 2.468736383036802, "language_loss": 0.79289383, "learning_rate": 3.987036821584348e-06, "loss": 0.81520569, "num_input_tokens_seen": 23111550, "step": 1084, "time_per_iteration": 2.816601276397705 }, { "auxiliary_loss_clip": 0.01174068, "auxiliary_loss_mlp": 0.0106167, "balance_loss_clip": 1.05854714, "balance_loss_mlp": 1.03667152, "epoch": 0.06523372914474673, "flos": 31681650061440.0, "grad_norm": 2.571590277205686, "language_loss": 0.66443276, "learning_rate": 3.986992513289584e-06, "loss": 0.68679011, "num_input_tokens_seen": 23130335, "step": 1085, "time_per_iteration": 2.8260092735290527 }, { "auxiliary_loss_clip": 0.01170818, "auxiliary_loss_mlp": 0.01062435, "balance_loss_clip": 1.0600934, "balance_loss_mlp": 1.03833067, "epoch": 0.0652938523974147, "flos": 20778165918720.0, "grad_norm": 2.0478791529086977, "language_loss": 0.76548934, "learning_rate": 3.9869481296478645e-06, "loss": 0.78782183, "num_input_tokens_seen": 23152380, "step": 1086, "time_per_iteration": 2.7937023639678955 }, { "auxiliary_loss_clip": 0.01198609, "auxiliary_loss_mlp": 0.01059288, "balance_loss_clip": 1.06335294, "balance_loss_mlp": 1.03519547, "epoch": 0.06535397565008266, "flos": 16690993061760.0, "grad_norm": 2.1629448601391017, "language_loss": 0.85109925, "learning_rate": 3.986903670660872e-06, "loss": 0.87367821, "num_input_tokens_seen": 23171630, "step": 1087, "time_per_iteration": 2.7510013580322266 }, { "auxiliary_loss_clip": 0.01184978, "auxiliary_loss_mlp": 0.01059017, "balance_loss_clip": 1.06293821, "balance_loss_mlp": 1.03510392, "epoch": 0.06541409890275064, "flos": 26868220116480.0, "grad_norm": 1.7886353193129139, "language_loss": 0.77776635, "learning_rate": 3.9868591363302945e-06, "loss": 0.80020636, "num_input_tokens_seen": 23192520, "step": 1088, "time_per_iteration": 2.7792751789093018 }, { "auxiliary_loss_clip": 0.01192707, "auxiliary_loss_mlp": 0.01067634, "balance_loss_clip": 1.06569457, "balance_loss_mlp": 1.04498422, "epoch": 0.06547422215541861, "flos": 20521620005760.0, "grad_norm": 3.0334087154373375, "language_loss": 0.71050513, "learning_rate": 3.9868145266578186e-06, "loss": 0.73310852, "num_input_tokens_seen": 23210710, "step": 1089, "time_per_iteration": 2.8832852840423584 }, { "auxiliary_loss_clip": 0.01173663, "auxiliary_loss_mlp": 0.00781529, "balance_loss_clip": 1.06159782, "balance_loss_mlp": 1.00019014, "epoch": 0.06553434540808657, "flos": 22016616992640.0, "grad_norm": 2.02973275746688, "language_loss": 0.85650897, "learning_rate": 3.9867698416451366e-06, "loss": 0.87606084, "num_input_tokens_seen": 23230305, "step": 1090, "time_per_iteration": 2.7933149337768555 }, { "auxiliary_loss_clip": 0.01214666, "auxiliary_loss_mlp": 0.0105885, "balance_loss_clip": 1.06735325, "balance_loss_mlp": 1.03460288, "epoch": 0.06559446866075455, "flos": 24608649208320.0, "grad_norm": 2.137212216289862, "language_loss": 0.71829313, "learning_rate": 3.9867250812939434e-06, "loss": 0.74102825, "num_input_tokens_seen": 23249015, "step": 1091, "time_per_iteration": 2.646592855453491 }, { "auxiliary_loss_clip": 0.01121055, "auxiliary_loss_mlp": 0.0106405, "balance_loss_clip": 1.05242276, "balance_loss_mlp": 1.03961205, "epoch": 0.06565459191342252, "flos": 24274679529600.0, "grad_norm": 2.2773849385721956, "language_loss": 0.82839823, "learning_rate": 3.986680245605936e-06, "loss": 0.85024923, "num_input_tokens_seen": 23265105, "step": 1092, "time_per_iteration": 4.799649715423584 }, { "auxiliary_loss_clip": 0.01215092, "auxiliary_loss_mlp": 0.01059151, "balance_loss_clip": 1.0640471, "balance_loss_mlp": 1.03352082, "epoch": 0.06571471516609048, "flos": 24787124910720.0, "grad_norm": 2.268968080418226, "language_loss": 0.71134168, "learning_rate": 3.986635334582814e-06, "loss": 0.73408413, "num_input_tokens_seen": 23283950, "step": 1093, "time_per_iteration": 5.3356239795684814 }, { "auxiliary_loss_clip": 0.01190682, "auxiliary_loss_mlp": 0.01064498, "balance_loss_clip": 1.06751943, "balance_loss_mlp": 1.0392611, "epoch": 0.06577483841875846, "flos": 26214071581440.0, "grad_norm": 3.829837904337144, "language_loss": 0.87996346, "learning_rate": 3.986590348226282e-06, "loss": 0.90251523, "num_input_tokens_seen": 23305005, "step": 1094, "time_per_iteration": 2.853489637374878 }, { "auxiliary_loss_clip": 0.01192742, "auxiliary_loss_mlp": 0.01065068, "balance_loss_clip": 1.06367433, "balance_loss_mlp": 1.03843689, "epoch": 0.06583496167142643, "flos": 25080802508160.0, "grad_norm": 1.6736216436017588, "language_loss": 0.81483954, "learning_rate": 3.986545286538044e-06, "loss": 0.8374176, "num_input_tokens_seen": 23323220, "step": 1095, "time_per_iteration": 5.1613922119140625 }, { "auxiliary_loss_clip": 0.01166049, "auxiliary_loss_mlp": 0.01058945, "balance_loss_clip": 1.06295943, "balance_loss_mlp": 1.03598547, "epoch": 0.06589508492409439, "flos": 25629804956160.0, "grad_norm": 2.0200125290673068, "language_loss": 0.69789279, "learning_rate": 3.986500149519811e-06, "loss": 0.72014272, "num_input_tokens_seen": 23342235, "step": 1096, "time_per_iteration": 2.804025173187256 }, { "auxiliary_loss_clip": 0.01201939, "auxiliary_loss_mlp": 0.01070786, "balance_loss_clip": 1.06405246, "balance_loss_mlp": 1.04614568, "epoch": 0.06595520817676236, "flos": 23621249266560.0, "grad_norm": 1.7011375462517908, "language_loss": 0.77430046, "learning_rate": 3.986454937173292e-06, "loss": 0.79702777, "num_input_tokens_seen": 23363680, "step": 1097, "time_per_iteration": 2.7658958435058594 }, { "auxiliary_loss_clip": 0.01215996, "auxiliary_loss_mlp": 0.01063445, "balance_loss_clip": 1.06707537, "balance_loss_mlp": 1.03959155, "epoch": 0.06601533142943034, "flos": 33801708545280.0, "grad_norm": 1.8316558452843608, "language_loss": 0.78217584, "learning_rate": 3.986409649500203e-06, "loss": 0.80497026, "num_input_tokens_seen": 23385590, "step": 1098, "time_per_iteration": 2.865684747695923 }, { "auxiliary_loss_clip": 0.01197349, "auxiliary_loss_mlp": 0.01069192, "balance_loss_clip": 1.06328607, "balance_loss_mlp": 1.04443276, "epoch": 0.0660754546820983, "flos": 20259184262400.0, "grad_norm": 1.9237510259783663, "language_loss": 0.81525648, "learning_rate": 3.986364286502261e-06, "loss": 0.83792192, "num_input_tokens_seen": 23402945, "step": 1099, "time_per_iteration": 2.690377950668335 }, { "auxiliary_loss_clip": 0.01179995, "auxiliary_loss_mlp": 0.0105819, "balance_loss_clip": 1.0578239, "balance_loss_mlp": 1.03428841, "epoch": 0.06613557793476627, "flos": 19354164163200.0, "grad_norm": 1.9906927310803755, "language_loss": 0.82793295, "learning_rate": 3.986318848181186e-06, "loss": 0.8503148, "num_input_tokens_seen": 23421410, "step": 1100, "time_per_iteration": 2.7613909244537354 }, { "auxiliary_loss_clip": 0.01191263, "auxiliary_loss_mlp": 0.0105903, "balance_loss_clip": 1.06985724, "balance_loss_mlp": 1.03529549, "epoch": 0.06619570118743424, "flos": 13772568936960.0, "grad_norm": 2.079994286400427, "language_loss": 0.73502243, "learning_rate": 3.986273334538702e-06, "loss": 0.75752538, "num_input_tokens_seen": 23438870, "step": 1101, "time_per_iteration": 2.7795870304107666 }, { "auxiliary_loss_clip": 0.01199256, "auxiliary_loss_mlp": 0.01061171, "balance_loss_clip": 1.06278944, "balance_loss_mlp": 1.03773487, "epoch": 0.06625582444010221, "flos": 17857874286720.0, "grad_norm": 2.875757629612747, "language_loss": 0.85861301, "learning_rate": 3.986227745576533e-06, "loss": 0.88121736, "num_input_tokens_seen": 23456975, "step": 1102, "time_per_iteration": 2.737269401550293 }, { "auxiliary_loss_clip": 0.01191982, "auxiliary_loss_mlp": 0.01058639, "balance_loss_clip": 1.06898165, "balance_loss_mlp": 1.03410578, "epoch": 0.06631594769277017, "flos": 11838707579520.0, "grad_norm": 2.8924251757501778, "language_loss": 0.81655926, "learning_rate": 3.98618208129641e-06, "loss": 0.83906543, "num_input_tokens_seen": 23473440, "step": 1103, "time_per_iteration": 2.9345293045043945 }, { "auxiliary_loss_clip": 0.01203522, "auxiliary_loss_mlp": 0.00780451, "balance_loss_clip": 1.06721628, "balance_loss_mlp": 1.00042021, "epoch": 0.06637607094543815, "flos": 19793351756160.0, "grad_norm": 5.176370819061919, "language_loss": 0.81749105, "learning_rate": 3.986136341700063e-06, "loss": 0.83733076, "num_input_tokens_seen": 23493880, "step": 1104, "time_per_iteration": 2.753657102584839 }, { "auxiliary_loss_clip": 0.0116508, "auxiliary_loss_mlp": 0.01050687, "balance_loss_clip": 1.0576005, "balance_loss_mlp": 1.02608228, "epoch": 0.06643619419810612, "flos": 25485659677440.0, "grad_norm": 1.5448539486038575, "language_loss": 0.80422902, "learning_rate": 3.986090526789227e-06, "loss": 0.82638663, "num_input_tokens_seen": 23514920, "step": 1105, "time_per_iteration": 2.8904521465301514 }, { "auxiliary_loss_clip": 0.01179397, "auxiliary_loss_mlp": 0.0106197, "balance_loss_clip": 1.06348729, "balance_loss_mlp": 1.0391891, "epoch": 0.06649631745077408, "flos": 16946533393920.0, "grad_norm": 2.7426455725749896, "language_loss": 0.96762037, "learning_rate": 3.986044636565639e-06, "loss": 0.99003398, "num_input_tokens_seen": 23531635, "step": 1106, "time_per_iteration": 2.890073299407959 }, { "auxiliary_loss_clip": 0.01198065, "auxiliary_loss_mlp": 0.01059975, "balance_loss_clip": 1.06069684, "balance_loss_mlp": 1.03511953, "epoch": 0.06655644070344206, "flos": 17858592558720.0, "grad_norm": 1.9297768479693453, "language_loss": 0.82528949, "learning_rate": 3.985998671031039e-06, "loss": 0.84786987, "num_input_tokens_seen": 23551020, "step": 1107, "time_per_iteration": 2.778857469558716 }, { "auxiliary_loss_clip": 0.01104176, "auxiliary_loss_mlp": 0.01010935, "balance_loss_clip": 1.04708242, "balance_loss_mlp": 1.0072155, "epoch": 0.06661656395611003, "flos": 61419350021760.0, "grad_norm": 0.7967940032222198, "language_loss": 0.56789279, "learning_rate": 3.9859526301871705e-06, "loss": 0.58904392, "num_input_tokens_seen": 23610675, "step": 1108, "time_per_iteration": 3.2717819213867188 }, { "auxiliary_loss_clip": 0.0118327, "auxiliary_loss_mlp": 0.01062625, "balance_loss_clip": 1.05651307, "balance_loss_mlp": 1.0376507, "epoch": 0.066676687208778, "flos": 20662856282880.0, "grad_norm": 2.682842555407744, "language_loss": 0.7287578, "learning_rate": 3.9859065140357795e-06, "loss": 0.75121677, "num_input_tokens_seen": 23628710, "step": 1109, "time_per_iteration": 2.829623222351074 }, { "auxiliary_loss_clip": 0.01148971, "auxiliary_loss_mlp": 0.01071895, "balance_loss_clip": 1.05459642, "balance_loss_mlp": 1.04714715, "epoch": 0.06673681046144596, "flos": 20923280864640.0, "grad_norm": 1.7914435942805436, "language_loss": 0.78140426, "learning_rate": 3.985860322578614e-06, "loss": 0.80361295, "num_input_tokens_seen": 23649160, "step": 1110, "time_per_iteration": 2.892786741256714 }, { "auxiliary_loss_clip": 0.01153553, "auxiliary_loss_mlp": 0.0106147, "balance_loss_clip": 1.05590594, "balance_loss_mlp": 1.03700781, "epoch": 0.06679693371411394, "flos": 31065818359680.0, "grad_norm": 2.5260725451831805, "language_loss": 0.71425366, "learning_rate": 3.985814055817427e-06, "loss": 0.73640382, "num_input_tokens_seen": 23671995, "step": 1111, "time_per_iteration": 2.9349052906036377 }, { "auxiliary_loss_clip": 0.01170538, "auxiliary_loss_mlp": 0.01066103, "balance_loss_clip": 1.05776191, "balance_loss_mlp": 1.04199934, "epoch": 0.0668570569667819, "flos": 21726135705600.0, "grad_norm": 1.8396663794990693, "language_loss": 0.78767776, "learning_rate": 3.985767713753971e-06, "loss": 0.81004417, "num_input_tokens_seen": 23690705, "step": 1112, "time_per_iteration": 2.8676345348358154 }, { "auxiliary_loss_clip": 0.01153291, "auxiliary_loss_mlp": 0.01065421, "balance_loss_clip": 1.05340791, "balance_loss_mlp": 1.04163861, "epoch": 0.06691718021944987, "flos": 22747255539840.0, "grad_norm": 2.071048188460824, "language_loss": 0.78481978, "learning_rate": 3.985721296390005e-06, "loss": 0.80700684, "num_input_tokens_seen": 23709990, "step": 1113, "time_per_iteration": 2.8688411712646484 }, { "auxiliary_loss_clip": 0.0114872, "auxiliary_loss_mlp": 0.01057074, "balance_loss_clip": 1.05157375, "balance_loss_mlp": 1.03376842, "epoch": 0.06697730347211785, "flos": 16545626720640.0, "grad_norm": 1.7560007918285245, "language_loss": 0.82399213, "learning_rate": 3.985674803727289e-06, "loss": 0.84605002, "num_input_tokens_seen": 23728485, "step": 1114, "time_per_iteration": 2.832458019256592 }, { "auxiliary_loss_clip": 0.01075626, "auxiliary_loss_mlp": 0.01006906, "balance_loss_clip": 1.04995251, "balance_loss_mlp": 1.00271022, "epoch": 0.06703742672478581, "flos": 59782326658560.0, "grad_norm": 0.8370646888074905, "language_loss": 0.58147323, "learning_rate": 3.985628235767584e-06, "loss": 0.60229862, "num_input_tokens_seen": 23786650, "step": 1115, "time_per_iteration": 3.550837755203247 }, { "auxiliary_loss_clip": 0.01177193, "auxiliary_loss_mlp": 0.01059174, "balance_loss_clip": 1.05986214, "balance_loss_mlp": 1.03381801, "epoch": 0.06709754997745378, "flos": 16800197385600.0, "grad_norm": 2.8944873563712235, "language_loss": 0.91280693, "learning_rate": 3.985581592512658e-06, "loss": 0.93517065, "num_input_tokens_seen": 23802555, "step": 1116, "time_per_iteration": 2.994608163833618 }, { "auxiliary_loss_clip": 0.01169376, "auxiliary_loss_mlp": 0.0078227, "balance_loss_clip": 1.05839634, "balance_loss_mlp": 1.00045347, "epoch": 0.06715767323012176, "flos": 22123917895680.0, "grad_norm": 1.9249158333763592, "language_loss": 0.87154609, "learning_rate": 3.985534873964279e-06, "loss": 0.89106256, "num_input_tokens_seen": 23822945, "step": 1117, "time_per_iteration": 2.794400453567505 }, { "auxiliary_loss_clip": 0.01095782, "auxiliary_loss_mlp": 0.01003785, "balance_loss_clip": 1.0387876, "balance_loss_mlp": 0.99963647, "epoch": 0.06721779648278972, "flos": 66618100137600.0, "grad_norm": 0.8644388721740246, "language_loss": 0.5981611, "learning_rate": 3.985488080124218e-06, "loss": 0.61915678, "num_input_tokens_seen": 23874075, "step": 1118, "time_per_iteration": 3.1695809364318848 }, { "auxiliary_loss_clip": 0.01178972, "auxiliary_loss_mlp": 0.01051993, "balance_loss_clip": 1.05301392, "balance_loss_mlp": 1.02780545, "epoch": 0.06727791973545769, "flos": 22382474970240.0, "grad_norm": 3.6923711141076447, "language_loss": 0.83045954, "learning_rate": 3.985441210994251e-06, "loss": 0.85276914, "num_input_tokens_seen": 23889720, "step": 1119, "time_per_iteration": 2.7538814544677734 }, { "auxiliary_loss_clip": 0.01182384, "auxiliary_loss_mlp": 0.01058422, "balance_loss_clip": 1.06102347, "balance_loss_mlp": 1.03566504, "epoch": 0.06733804298812565, "flos": 24280210224000.0, "grad_norm": 4.541743494234462, "language_loss": 0.8451674, "learning_rate": 3.9853942665761545e-06, "loss": 0.86757541, "num_input_tokens_seen": 23909385, "step": 1120, "time_per_iteration": 2.76581072807312 }, { "auxiliary_loss_clip": 0.0121565, "auxiliary_loss_mlp": 0.01064916, "balance_loss_clip": 1.06757379, "balance_loss_mlp": 1.04028773, "epoch": 0.06739816624079363, "flos": 15918230839680.0, "grad_norm": 2.503866645162978, "language_loss": 0.78722781, "learning_rate": 3.985347246871708e-06, "loss": 0.81003344, "num_input_tokens_seen": 23926830, "step": 1121, "time_per_iteration": 2.651175022125244 }, { "auxiliary_loss_clip": 0.01080914, "auxiliary_loss_mlp": 0.01011889, "balance_loss_clip": 1.03108025, "balance_loss_mlp": 1.00802636, "epoch": 0.0674582894934616, "flos": 71398567353600.0, "grad_norm": 0.7540288133642103, "language_loss": 0.58320796, "learning_rate": 3.985300151882694e-06, "loss": 0.60413599, "num_input_tokens_seen": 23992640, "step": 1122, "time_per_iteration": 3.3794541358947754 }, { "auxiliary_loss_clip": 0.01145486, "auxiliary_loss_mlp": 0.01066136, "balance_loss_clip": 1.05581403, "balance_loss_mlp": 1.04167438, "epoch": 0.06751841274612956, "flos": 25264952559360.0, "grad_norm": 2.3361170394687076, "language_loss": 0.71965349, "learning_rate": 3.985252981610901e-06, "loss": 0.74176967, "num_input_tokens_seen": 24011135, "step": 1123, "time_per_iteration": 2.8049354553222656 }, { "auxiliary_loss_clip": 0.01144994, "auxiliary_loss_mlp": 0.01064196, "balance_loss_clip": 1.05373979, "balance_loss_mlp": 1.03612232, "epoch": 0.06757853599879754, "flos": 23802741711360.0, "grad_norm": 1.7380479869896208, "language_loss": 0.78987843, "learning_rate": 3.985205736058114e-06, "loss": 0.81197035, "num_input_tokens_seen": 24030695, "step": 1124, "time_per_iteration": 2.8595056533813477 }, { "auxiliary_loss_clip": 0.01189686, "auxiliary_loss_mlp": 0.01055169, "balance_loss_clip": 1.05663013, "balance_loss_mlp": 1.03200674, "epoch": 0.0676386592514655, "flos": 21033742164480.0, "grad_norm": 3.1450673626590793, "language_loss": 0.70999855, "learning_rate": 3.985158415226128e-06, "loss": 0.73244709, "num_input_tokens_seen": 24050680, "step": 1125, "time_per_iteration": 2.726163625717163 }, { "auxiliary_loss_clip": 0.01165518, "auxiliary_loss_mlp": 0.01068918, "balance_loss_clip": 1.05826426, "balance_loss_mlp": 1.04290628, "epoch": 0.06769878250413347, "flos": 25556331686400.0, "grad_norm": 3.340323364887528, "language_loss": 0.81440383, "learning_rate": 3.985111019116736e-06, "loss": 0.83674812, "num_input_tokens_seen": 24067205, "step": 1126, "time_per_iteration": 2.7356598377227783 }, { "auxiliary_loss_clip": 0.0107201, "auxiliary_loss_mlp": 0.01004999, "balance_loss_clip": 1.0293622, "balance_loss_mlp": 1.00092208, "epoch": 0.06775890575680145, "flos": 70655251305600.0, "grad_norm": 0.77802311726495, "language_loss": 0.59720373, "learning_rate": 3.985063547731735e-06, "loss": 0.6179738, "num_input_tokens_seen": 24131320, "step": 1127, "time_per_iteration": 3.2627320289611816 }, { "auxiliary_loss_clip": 0.01206438, "auxiliary_loss_mlp": 0.01055509, "balance_loss_clip": 1.06308687, "balance_loss_mlp": 1.03189397, "epoch": 0.06781902900946941, "flos": 24235500769920.0, "grad_norm": 2.2535941175889054, "language_loss": 0.81097019, "learning_rate": 3.985016001072925e-06, "loss": 0.83358967, "num_input_tokens_seen": 24149930, "step": 1128, "time_per_iteration": 2.6652371883392334 }, { "auxiliary_loss_clip": 0.01158345, "auxiliary_loss_mlp": 0.01052658, "balance_loss_clip": 1.05360484, "balance_loss_mlp": 1.02804112, "epoch": 0.06787915226213738, "flos": 22417523665920.0, "grad_norm": 2.24200367657907, "language_loss": 0.75559127, "learning_rate": 3.984968379142109e-06, "loss": 0.77770138, "num_input_tokens_seen": 24169590, "step": 1129, "time_per_iteration": 2.7023732662200928 }, { "auxiliary_loss_clip": 0.01117595, "auxiliary_loss_mlp": 0.01053995, "balance_loss_clip": 1.04627228, "balance_loss_mlp": 1.03006983, "epoch": 0.06793927551480534, "flos": 37706922080640.0, "grad_norm": 1.890559803272908, "language_loss": 0.71710479, "learning_rate": 3.984920681941094e-06, "loss": 0.73882067, "num_input_tokens_seen": 24189965, "step": 1130, "time_per_iteration": 3.0757689476013184 }, { "auxiliary_loss_clip": 0.01158117, "auxiliary_loss_mlp": 0.010592, "balance_loss_clip": 1.05734515, "balance_loss_mlp": 1.03481019, "epoch": 0.06799939876747332, "flos": 20631398947200.0, "grad_norm": 2.24421862356218, "language_loss": 0.80776262, "learning_rate": 3.984872909471688e-06, "loss": 0.82993579, "num_input_tokens_seen": 24208045, "step": 1131, "time_per_iteration": 5.00832724571228 }, { "auxiliary_loss_clip": 0.01195331, "auxiliary_loss_mlp": 0.01070142, "balance_loss_clip": 1.06155944, "balance_loss_mlp": 1.04614532, "epoch": 0.06805952202014129, "flos": 14864755829760.0, "grad_norm": 2.0533244923502463, "language_loss": 0.80371779, "learning_rate": 3.984825061735701e-06, "loss": 0.8263725, "num_input_tokens_seen": 24223805, "step": 1132, "time_per_iteration": 4.487931251525879 }, { "auxiliary_loss_clip": 0.01170581, "auxiliary_loss_mlp": 0.01061867, "balance_loss_clip": 1.05438542, "balance_loss_mlp": 1.03756022, "epoch": 0.06811964527280925, "flos": 48909434947200.0, "grad_norm": 1.7182324226465766, "language_loss": 0.6341064, "learning_rate": 3.9847771387349495e-06, "loss": 0.65643084, "num_input_tokens_seen": 24249475, "step": 1133, "time_per_iteration": 4.48089337348938 }, { "auxiliary_loss_clip": 0.01125599, "auxiliary_loss_mlp": 0.01055984, "balance_loss_clip": 1.04700482, "balance_loss_mlp": 1.02973366, "epoch": 0.06817976852547723, "flos": 15377273038080.0, "grad_norm": 1.9264963116598819, "language_loss": 0.74771935, "learning_rate": 3.9847291404712506e-06, "loss": 0.76953518, "num_input_tokens_seen": 24267980, "step": 1134, "time_per_iteration": 5.287277936935425 }, { "auxiliary_loss_clip": 0.01169269, "auxiliary_loss_mlp": 0.00782536, "balance_loss_clip": 1.05878353, "balance_loss_mlp": 1.00042605, "epoch": 0.0682398917781452, "flos": 20155690200960.0, "grad_norm": 2.151108605399924, "language_loss": 0.86871451, "learning_rate": 3.984681066946423e-06, "loss": 0.88823259, "num_input_tokens_seen": 24286805, "step": 1135, "time_per_iteration": 2.8024110794067383 }, { "auxiliary_loss_clip": 0.0117656, "auxiliary_loss_mlp": 0.007818, "balance_loss_clip": 1.0543226, "balance_loss_mlp": 1.00046515, "epoch": 0.06830001503081316, "flos": 23440618748160.0, "grad_norm": 2.521942237810997, "language_loss": 0.78131735, "learning_rate": 3.984632918162291e-06, "loss": 0.80090094, "num_input_tokens_seen": 24305855, "step": 1136, "time_per_iteration": 2.7595040798187256 }, { "auxiliary_loss_clip": 0.01185832, "auxiliary_loss_mlp": 0.01063587, "balance_loss_clip": 1.05952621, "balance_loss_mlp": 1.03868449, "epoch": 0.06836013828348114, "flos": 34349813153280.0, "grad_norm": 2.275643110468061, "language_loss": 0.83968467, "learning_rate": 3.984584694120679e-06, "loss": 0.86217892, "num_input_tokens_seen": 24326535, "step": 1137, "time_per_iteration": 2.7738285064697266 }, { "auxiliary_loss_clip": 0.01153105, "auxiliary_loss_mlp": 0.01059471, "balance_loss_clip": 1.05239427, "balance_loss_mlp": 1.0348897, "epoch": 0.06842026153614911, "flos": 23148844571520.0, "grad_norm": 2.068206081593879, "language_loss": 0.788486, "learning_rate": 3.984536394823418e-06, "loss": 0.81061178, "num_input_tokens_seen": 24345810, "step": 1138, "time_per_iteration": 2.804537296295166 }, { "auxiliary_loss_clip": 0.01209658, "auxiliary_loss_mlp": 0.01058353, "balance_loss_clip": 1.06288362, "balance_loss_mlp": 1.03415346, "epoch": 0.06848038478881707, "flos": 24608972430720.0, "grad_norm": 2.3335265924104096, "language_loss": 0.85507643, "learning_rate": 3.984488020272336e-06, "loss": 0.87775654, "num_input_tokens_seen": 24366095, "step": 1139, "time_per_iteration": 2.746884822845459 }, { "auxiliary_loss_clip": 0.01153855, "auxiliary_loss_mlp": 0.01063721, "balance_loss_clip": 1.05325532, "balance_loss_mlp": 1.03679228, "epoch": 0.06854050804148504, "flos": 40880994278400.0, "grad_norm": 1.9254794009430078, "language_loss": 0.74899161, "learning_rate": 3.984439570469271e-06, "loss": 0.7711674, "num_input_tokens_seen": 24388665, "step": 1140, "time_per_iteration": 2.938143253326416 }, { "auxiliary_loss_clip": 0.01186218, "auxiliary_loss_mlp": 0.00782227, "balance_loss_clip": 1.06101704, "balance_loss_mlp": 1.00036597, "epoch": 0.06860063129415302, "flos": 31686354743040.0, "grad_norm": 2.1250887020504767, "language_loss": 0.68258876, "learning_rate": 3.9843910454160574e-06, "loss": 0.70227319, "num_input_tokens_seen": 24407705, "step": 1141, "time_per_iteration": 2.8180530071258545 }, { "auxiliary_loss_clip": 0.01197117, "auxiliary_loss_mlp": 0.01067748, "balance_loss_clip": 1.05978489, "balance_loss_mlp": 1.04266596, "epoch": 0.06866075454682098, "flos": 26542007775360.0, "grad_norm": 1.8460768582410394, "language_loss": 0.78959155, "learning_rate": 3.984342445114538e-06, "loss": 0.81224018, "num_input_tokens_seen": 24428390, "step": 1142, "time_per_iteration": 2.712876558303833 }, { "auxiliary_loss_clip": 0.01186915, "auxiliary_loss_mlp": 0.01060882, "balance_loss_clip": 1.06245089, "balance_loss_mlp": 1.03702831, "epoch": 0.06872087779948895, "flos": 29789768724480.0, "grad_norm": 1.7867268614306446, "language_loss": 0.68287402, "learning_rate": 3.984293769566553e-06, "loss": 0.70535195, "num_input_tokens_seen": 24450810, "step": 1143, "time_per_iteration": 2.752659320831299 }, { "auxiliary_loss_clip": 0.01177843, "auxiliary_loss_mlp": 0.01059894, "balance_loss_clip": 1.05798244, "balance_loss_mlp": 1.03773308, "epoch": 0.06878100105215693, "flos": 26941118768640.0, "grad_norm": 1.7582250309313294, "language_loss": 0.74307454, "learning_rate": 3.98424501877395e-06, "loss": 0.76545191, "num_input_tokens_seen": 24469965, "step": 1144, "time_per_iteration": 2.6448662281036377 }, { "auxiliary_loss_clip": 0.01189197, "auxiliary_loss_mlp": 0.0106544, "balance_loss_clip": 1.0565474, "balance_loss_mlp": 1.04039407, "epoch": 0.06884112430482489, "flos": 10670748946560.0, "grad_norm": 2.699041414372958, "language_loss": 0.91755033, "learning_rate": 3.984196192738577e-06, "loss": 0.94009674, "num_input_tokens_seen": 24486370, "step": 1145, "time_per_iteration": 2.6621482372283936 }, { "auxiliary_loss_clip": 0.01212189, "auxiliary_loss_mlp": 0.0106819, "balance_loss_clip": 1.06225932, "balance_loss_mlp": 1.04258406, "epoch": 0.06890124755749286, "flos": 20193647898240.0, "grad_norm": 2.2014676012481487, "language_loss": 0.81726635, "learning_rate": 3.984147291462285e-06, "loss": 0.84007025, "num_input_tokens_seen": 24503780, "step": 1146, "time_per_iteration": 2.623964548110962 }, { "auxiliary_loss_clip": 0.01204602, "auxiliary_loss_mlp": 0.01065301, "balance_loss_clip": 1.06215203, "balance_loss_mlp": 1.04191244, "epoch": 0.06896137081016084, "flos": 20449224144000.0, "grad_norm": 2.1265245828428108, "language_loss": 0.84968954, "learning_rate": 3.98409831494693e-06, "loss": 0.8723886, "num_input_tokens_seen": 24522320, "step": 1147, "time_per_iteration": 2.5898265838623047 }, { "auxiliary_loss_clip": 0.01156886, "auxiliary_loss_mlp": 0.01064453, "balance_loss_clip": 1.05563867, "balance_loss_mlp": 1.03949046, "epoch": 0.0690214940628288, "flos": 18368703555840.0, "grad_norm": 1.7557033260323716, "language_loss": 0.86094105, "learning_rate": 3.984049263194367e-06, "loss": 0.88315445, "num_input_tokens_seen": 24540445, "step": 1148, "time_per_iteration": 2.748782157897949 }, { "auxiliary_loss_clip": 0.01173365, "auxiliary_loss_mlp": 0.01060047, "balance_loss_clip": 1.05569541, "balance_loss_mlp": 1.03370178, "epoch": 0.06908161731549677, "flos": 20558033418240.0, "grad_norm": 2.322434023005448, "language_loss": 0.69602191, "learning_rate": 3.9840001362064575e-06, "loss": 0.71835601, "num_input_tokens_seen": 24557105, "step": 1149, "time_per_iteration": 2.741854429244995 }, { "auxiliary_loss_clip": 0.01207871, "auxiliary_loss_mlp": 0.01051245, "balance_loss_clip": 1.06034219, "balance_loss_mlp": 1.02692604, "epoch": 0.06914174056816474, "flos": 27563666313600.0, "grad_norm": 1.9440351937259064, "language_loss": 0.8374452, "learning_rate": 3.983950933985064e-06, "loss": 0.86003637, "num_input_tokens_seen": 24578240, "step": 1150, "time_per_iteration": 2.6919586658477783 }, { "auxiliary_loss_clip": 0.01181406, "auxiliary_loss_mlp": 0.01058015, "balance_loss_clip": 1.06063652, "balance_loss_mlp": 1.03380394, "epoch": 0.06920186382083271, "flos": 15304015249920.0, "grad_norm": 4.11905785776886, "language_loss": 0.81464434, "learning_rate": 3.983901656532052e-06, "loss": 0.83703858, "num_input_tokens_seen": 24593585, "step": 1151, "time_per_iteration": 2.7979934215545654 }, { "auxiliary_loss_clip": 0.01206831, "auxiliary_loss_mlp": 0.01058184, "balance_loss_clip": 1.06409955, "balance_loss_mlp": 1.03434169, "epoch": 0.06926198707350067, "flos": 25191227894400.0, "grad_norm": 2.0324362571668724, "language_loss": 0.85408235, "learning_rate": 3.983852303849291e-06, "loss": 0.87673247, "num_input_tokens_seen": 24613110, "step": 1152, "time_per_iteration": 2.686021089553833 }, { "auxiliary_loss_clip": 0.01190935, "auxiliary_loss_mlp": 0.01062076, "balance_loss_clip": 1.06250155, "balance_loss_mlp": 1.03866374, "epoch": 0.06932211032616864, "flos": 13256137146240.0, "grad_norm": 2.182544196511779, "language_loss": 0.90594423, "learning_rate": 3.983802875938651e-06, "loss": 0.92847437, "num_input_tokens_seen": 24628795, "step": 1153, "time_per_iteration": 2.58366060256958 }, { "auxiliary_loss_clip": 0.01169877, "auxiliary_loss_mlp": 0.01055253, "balance_loss_clip": 1.05681062, "balance_loss_mlp": 1.03088629, "epoch": 0.06938223357883662, "flos": 24827381078400.0, "grad_norm": 2.1214794624630846, "language_loss": 0.81526846, "learning_rate": 3.983753372802008e-06, "loss": 0.83751976, "num_input_tokens_seen": 24645480, "step": 1154, "time_per_iteration": 2.696794271469116 }, { "auxiliary_loss_clip": 0.01188774, "auxiliary_loss_mlp": 0.01066335, "balance_loss_clip": 1.0691216, "balance_loss_mlp": 1.04200506, "epoch": 0.06944235683150458, "flos": 27267977554560.0, "grad_norm": 2.102018399986892, "language_loss": 0.75022292, "learning_rate": 3.983703794441237e-06, "loss": 0.77277398, "num_input_tokens_seen": 24664630, "step": 1155, "time_per_iteration": 2.7718143463134766 }, { "auxiliary_loss_clip": 0.01180696, "auxiliary_loss_mlp": 0.00782152, "balance_loss_clip": 1.05586052, "balance_loss_mlp": 1.00041056, "epoch": 0.06950248008417255, "flos": 25808065176960.0, "grad_norm": 1.7459449483933205, "language_loss": 0.7110405, "learning_rate": 3.98365414085822e-06, "loss": 0.73066902, "num_input_tokens_seen": 24684210, "step": 1156, "time_per_iteration": 2.7014200687408447 }, { "auxiliary_loss_clip": 0.01179101, "auxiliary_loss_mlp": 0.00782674, "balance_loss_clip": 1.0593586, "balance_loss_mlp": 1.00037348, "epoch": 0.06956260333684053, "flos": 22271546793600.0, "grad_norm": 2.067241397655847, "language_loss": 0.74882817, "learning_rate": 3.98360441205484e-06, "loss": 0.76844591, "num_input_tokens_seen": 24702490, "step": 1157, "time_per_iteration": 2.7571897506713867 }, { "auxiliary_loss_clip": 0.01178249, "auxiliary_loss_mlp": 0.01061737, "balance_loss_clip": 1.05653787, "balance_loss_mlp": 1.03697729, "epoch": 0.0696227265895085, "flos": 29681390413440.0, "grad_norm": 1.9827644507913538, "language_loss": 0.7165724, "learning_rate": 3.983554608032982e-06, "loss": 0.73897225, "num_input_tokens_seen": 24724340, "step": 1158, "time_per_iteration": 2.839745044708252 }, { "auxiliary_loss_clip": 0.01207855, "auxiliary_loss_mlp": 0.01058558, "balance_loss_clip": 1.0605582, "balance_loss_mlp": 1.03370285, "epoch": 0.06968284984217646, "flos": 25523545547520.0, "grad_norm": 1.9692207215605615, "language_loss": 0.79595017, "learning_rate": 3.983504728794533e-06, "loss": 0.8186143, "num_input_tokens_seen": 24745550, "step": 1159, "time_per_iteration": 2.7535817623138428 }, { "auxiliary_loss_clip": 0.01212717, "auxiliary_loss_mlp": 0.01068535, "balance_loss_clip": 1.06535673, "balance_loss_mlp": 1.04094958, "epoch": 0.06974297309484444, "flos": 20698192287360.0, "grad_norm": 3.5530789367722373, "language_loss": 0.80517769, "learning_rate": 3.983454774341387e-06, "loss": 0.82799017, "num_input_tokens_seen": 24762575, "step": 1160, "time_per_iteration": 2.7455785274505615 }, { "auxiliary_loss_clip": 0.0119075, "auxiliary_loss_mlp": 0.01057887, "balance_loss_clip": 1.05680609, "balance_loss_mlp": 1.03294837, "epoch": 0.0698030963475124, "flos": 26505199313280.0, "grad_norm": 1.6303409062485206, "language_loss": 0.7607069, "learning_rate": 3.983404744675437e-06, "loss": 0.78319323, "num_input_tokens_seen": 24782605, "step": 1161, "time_per_iteration": 2.773775100708008 }, { "auxiliary_loss_clip": 0.01175787, "auxiliary_loss_mlp": 0.01062083, "balance_loss_clip": 1.05773759, "balance_loss_mlp": 1.03673923, "epoch": 0.06986321960018037, "flos": 23040430346880.0, "grad_norm": 1.6605796421434038, "language_loss": 0.82758528, "learning_rate": 3.9833546397985794e-06, "loss": 0.84996402, "num_input_tokens_seen": 24802910, "step": 1162, "time_per_iteration": 2.7426044940948486 }, { "auxiliary_loss_clip": 0.01182513, "auxiliary_loss_mlp": 0.01058124, "balance_loss_clip": 1.05717576, "balance_loss_mlp": 1.03092098, "epoch": 0.06992334285284833, "flos": 28584822061440.0, "grad_norm": 1.9523155091610094, "language_loss": 0.79563475, "learning_rate": 3.983304459712716e-06, "loss": 0.81804121, "num_input_tokens_seen": 24823305, "step": 1163, "time_per_iteration": 2.720947742462158 }, { "auxiliary_loss_clip": 0.01190519, "auxiliary_loss_mlp": 0.01063375, "balance_loss_clip": 1.05861616, "balance_loss_mlp": 1.03722012, "epoch": 0.06998346610551631, "flos": 20595344670720.0, "grad_norm": 2.213365660843382, "language_loss": 0.79187214, "learning_rate": 3.983254204419749e-06, "loss": 0.81441104, "num_input_tokens_seen": 24842155, "step": 1164, "time_per_iteration": 2.6554183959960938 }, { "auxiliary_loss_clip": 0.01143916, "auxiliary_loss_mlp": 0.01067459, "balance_loss_clip": 1.05240798, "balance_loss_mlp": 1.03875315, "epoch": 0.07004358935818428, "flos": 22528810978560.0, "grad_norm": 1.421930435008642, "language_loss": 0.72855628, "learning_rate": 3.983203873921583e-06, "loss": 0.75067008, "num_input_tokens_seen": 24862080, "step": 1165, "time_per_iteration": 2.753063440322876 }, { "auxiliary_loss_clip": 0.01183824, "auxiliary_loss_mlp": 0.01059612, "balance_loss_clip": 1.06135893, "balance_loss_mlp": 1.03522193, "epoch": 0.07010371261085224, "flos": 28949997680640.0, "grad_norm": 2.453348821242437, "language_loss": 0.81136239, "learning_rate": 3.983153468220128e-06, "loss": 0.83379674, "num_input_tokens_seen": 24886165, "step": 1166, "time_per_iteration": 2.802016496658325 }, { "auxiliary_loss_clip": 0.011718, "auxiliary_loss_mlp": 0.01053529, "balance_loss_clip": 1.05450797, "balance_loss_mlp": 1.02754176, "epoch": 0.07016383586352022, "flos": 23659171050240.0, "grad_norm": 2.457667377154448, "language_loss": 0.84640259, "learning_rate": 3.983102987317295e-06, "loss": 0.86865586, "num_input_tokens_seen": 24905775, "step": 1167, "time_per_iteration": 2.7066097259521484 }, { "auxiliary_loss_clip": 0.01193446, "auxiliary_loss_mlp": 0.01064209, "balance_loss_clip": 1.06136739, "balance_loss_mlp": 1.03887713, "epoch": 0.07022395911618819, "flos": 19792130693760.0, "grad_norm": 2.6158204436543, "language_loss": 0.89524722, "learning_rate": 3.983052431214997e-06, "loss": 0.91782373, "num_input_tokens_seen": 24924295, "step": 1168, "time_per_iteration": 2.6258392333984375 }, { "auxiliary_loss_clip": 0.01190821, "auxiliary_loss_mlp": 0.01065905, "balance_loss_clip": 1.06090224, "balance_loss_mlp": 1.03705645, "epoch": 0.07028408236885615, "flos": 21689147675520.0, "grad_norm": 2.6445150319591035, "language_loss": 0.89008862, "learning_rate": 3.983001799915153e-06, "loss": 0.91265589, "num_input_tokens_seen": 24943210, "step": 1169, "time_per_iteration": 2.6858527660369873 }, { "auxiliary_loss_clip": 0.01211063, "auxiliary_loss_mlp": 0.01065533, "balance_loss_clip": 1.06400895, "balance_loss_mlp": 1.03950977, "epoch": 0.07034420562152413, "flos": 25630271832960.0, "grad_norm": 1.9672897290124218, "language_loss": 0.83834457, "learning_rate": 3.982951093419681e-06, "loss": 0.86111057, "num_input_tokens_seen": 24960360, "step": 1170, "time_per_iteration": 2.6278069019317627 }, { "auxiliary_loss_clip": 0.01180333, "auxiliary_loss_mlp": 0.00782328, "balance_loss_clip": 1.0613637, "balance_loss_mlp": 1.00041986, "epoch": 0.0704043288741921, "flos": 20810449267200.0, "grad_norm": 1.8542795171503503, "language_loss": 0.75687242, "learning_rate": 3.982900311730506e-06, "loss": 0.77649903, "num_input_tokens_seen": 24978290, "step": 1171, "time_per_iteration": 5.806530475616455 }, { "auxiliary_loss_clip": 0.01179645, "auxiliary_loss_mlp": 0.0106394, "balance_loss_clip": 1.06133175, "balance_loss_mlp": 1.03919196, "epoch": 0.07046445212686006, "flos": 25593176062080.0, "grad_norm": 2.482864122539831, "language_loss": 0.88865125, "learning_rate": 3.9828494548495514e-06, "loss": 0.91108704, "num_input_tokens_seen": 24997055, "step": 1172, "time_per_iteration": 4.371561288833618 }, { "auxiliary_loss_clip": 0.01197698, "auxiliary_loss_mlp": 0.01054991, "balance_loss_clip": 1.06532764, "balance_loss_mlp": 1.02858603, "epoch": 0.07052457537952803, "flos": 25556978131200.0, "grad_norm": 1.6816354314161714, "language_loss": 0.82075119, "learning_rate": 3.982798522778748e-06, "loss": 0.84327805, "num_input_tokens_seen": 25017490, "step": 1173, "time_per_iteration": 4.611542463302612 }, { "auxiliary_loss_clip": 0.01200886, "auxiliary_loss_mlp": 0.01060851, "balance_loss_clip": 1.06317592, "balance_loss_mlp": 1.03503036, "epoch": 0.070584698632196, "flos": 17968515154560.0, "grad_norm": 2.007232853627583, "language_loss": 0.82071686, "learning_rate": 3.9827475155200245e-06, "loss": 0.8433342, "num_input_tokens_seen": 25035660, "step": 1174, "time_per_iteration": 2.6334969997406006 }, { "auxiliary_loss_clip": 0.01180907, "auxiliary_loss_mlp": 0.01059972, "balance_loss_clip": 1.05857778, "balance_loss_mlp": 1.03473568, "epoch": 0.07064482188486397, "flos": 25370888745600.0, "grad_norm": 2.09222115072597, "language_loss": 0.85013211, "learning_rate": 3.982696433075317e-06, "loss": 0.87254095, "num_input_tokens_seen": 25054785, "step": 1175, "time_per_iteration": 2.861591339111328 }, { "auxiliary_loss_clip": 0.01196955, "auxiliary_loss_mlp": 0.01069941, "balance_loss_clip": 1.06447482, "balance_loss_mlp": 1.04605186, "epoch": 0.07070494513753194, "flos": 24899848767360.0, "grad_norm": 1.7270820646539309, "language_loss": 0.83103871, "learning_rate": 3.982645275446563e-06, "loss": 0.85370767, "num_input_tokens_seen": 25075180, "step": 1176, "time_per_iteration": 2.754521608352661 }, { "auxiliary_loss_clip": 0.01152261, "auxiliary_loss_mlp": 0.01062154, "balance_loss_clip": 1.05370057, "balance_loss_mlp": 1.0352838, "epoch": 0.07076506839019991, "flos": 22338447874560.0, "grad_norm": 3.4939498355716996, "language_loss": 0.74409902, "learning_rate": 3.982594042635701e-06, "loss": 0.7662431, "num_input_tokens_seen": 25093035, "step": 1177, "time_per_iteration": 2.692426919937134 }, { "auxiliary_loss_clip": 0.01188551, "auxiliary_loss_mlp": 0.0106394, "balance_loss_clip": 1.06080353, "balance_loss_mlp": 1.03801203, "epoch": 0.07082519164286788, "flos": 18660800954880.0, "grad_norm": 1.8240190288677762, "language_loss": 0.85965598, "learning_rate": 3.982542734644673e-06, "loss": 0.88218087, "num_input_tokens_seen": 25112520, "step": 1178, "time_per_iteration": 2.7197048664093018 }, { "auxiliary_loss_clip": 0.01082521, "auxiliary_loss_mlp": 0.01013999, "balance_loss_clip": 1.03661168, "balance_loss_mlp": 1.01023197, "epoch": 0.07088531489553584, "flos": 63654107610240.0, "grad_norm": 0.8453670789764802, "language_loss": 0.63256603, "learning_rate": 3.982491351475427e-06, "loss": 0.65353125, "num_input_tokens_seen": 25177760, "step": 1179, "time_per_iteration": 3.3419978618621826 }, { "auxiliary_loss_clip": 0.01211274, "auxiliary_loss_mlp": 0.01073372, "balance_loss_clip": 1.06935215, "balance_loss_mlp": 1.04858887, "epoch": 0.07094543814820382, "flos": 21572688804480.0, "grad_norm": 3.2714198066984177, "language_loss": 0.83388901, "learning_rate": 3.98243989312991e-06, "loss": 0.85673553, "num_input_tokens_seen": 25195260, "step": 1180, "time_per_iteration": 2.631992816925049 }, { "auxiliary_loss_clip": 0.01182661, "auxiliary_loss_mlp": 0.01071326, "balance_loss_clip": 1.06119037, "balance_loss_mlp": 1.04624391, "epoch": 0.07100556140087179, "flos": 22089946608000.0, "grad_norm": 2.0409456536886386, "language_loss": 0.88649988, "learning_rate": 3.982388359610074e-06, "loss": 0.90903974, "num_input_tokens_seen": 25212740, "step": 1181, "time_per_iteration": 2.696789264678955 }, { "auxiliary_loss_clip": 0.01180377, "auxiliary_loss_mlp": 0.01070036, "balance_loss_clip": 1.06187141, "balance_loss_mlp": 1.04516935, "epoch": 0.07106568465353975, "flos": 47922286400640.0, "grad_norm": 1.8294049229574356, "language_loss": 0.83244783, "learning_rate": 3.9823367509178725e-06, "loss": 0.85495198, "num_input_tokens_seen": 25236420, "step": 1182, "time_per_iteration": 2.9415605068206787 }, { "auxiliary_loss_clip": 0.01193669, "auxiliary_loss_mlp": 0.01067019, "balance_loss_clip": 1.0641923, "balance_loss_mlp": 1.04150808, "epoch": 0.07112580790620772, "flos": 23440798316160.0, "grad_norm": 3.5892595189310903, "language_loss": 0.79067838, "learning_rate": 3.982285067055262e-06, "loss": 0.81328523, "num_input_tokens_seen": 25255120, "step": 1183, "time_per_iteration": 2.7284862995147705 }, { "auxiliary_loss_clip": 0.01211976, "auxiliary_loss_mlp": 0.01064792, "balance_loss_clip": 1.06126475, "balance_loss_mlp": 1.03866172, "epoch": 0.0711859311588757, "flos": 31868888682240.0, "grad_norm": 2.5463322111759354, "language_loss": 0.788867, "learning_rate": 3.982233308024204e-06, "loss": 0.81163466, "num_input_tokens_seen": 25275150, "step": 1184, "time_per_iteration": 2.7531635761260986 }, { "auxiliary_loss_clip": 0.01152059, "auxiliary_loss_mlp": 0.01062006, "balance_loss_clip": 1.05961919, "balance_loss_mlp": 1.03752065, "epoch": 0.07124605441154366, "flos": 19610315026560.0, "grad_norm": 1.904751850318294, "language_loss": 0.76806915, "learning_rate": 3.98218147382666e-06, "loss": 0.79020983, "num_input_tokens_seen": 25293680, "step": 1185, "time_per_iteration": 2.732539176940918 }, { "auxiliary_loss_clip": 0.01208288, "auxiliary_loss_mlp": 0.01073792, "balance_loss_clip": 1.06328642, "balance_loss_mlp": 1.04903185, "epoch": 0.07130617766421163, "flos": 14684448533760.0, "grad_norm": 2.1301142092644696, "language_loss": 0.65472758, "learning_rate": 3.982129564464596e-06, "loss": 0.67754835, "num_input_tokens_seen": 25310050, "step": 1186, "time_per_iteration": 2.757812261581421 }, { "auxiliary_loss_clip": 0.01195497, "auxiliary_loss_mlp": 0.01057322, "balance_loss_clip": 1.06479859, "balance_loss_mlp": 1.03274107, "epoch": 0.07136630091687961, "flos": 26067915141120.0, "grad_norm": 2.1671481434625894, "language_loss": 0.69743419, "learning_rate": 3.98207757993998e-06, "loss": 0.71996236, "num_input_tokens_seen": 25331020, "step": 1187, "time_per_iteration": 2.746615409851074 }, { "auxiliary_loss_clip": 0.01151827, "auxiliary_loss_mlp": 0.01067347, "balance_loss_clip": 1.05412316, "balance_loss_mlp": 1.04367232, "epoch": 0.07142642416954757, "flos": 15669190869120.0, "grad_norm": 2.8037131445876597, "language_loss": 0.7861973, "learning_rate": 3.9820255202547845e-06, "loss": 0.80838895, "num_input_tokens_seen": 25347875, "step": 1188, "time_per_iteration": 2.738281726837158 }, { "auxiliary_loss_clip": 0.01203626, "auxiliary_loss_mlp": 0.01059966, "balance_loss_clip": 1.06304908, "balance_loss_mlp": 1.03530121, "epoch": 0.07148654742221554, "flos": 19755322231680.0, "grad_norm": 1.8909260147246576, "language_loss": 0.84754103, "learning_rate": 3.981973385410981e-06, "loss": 0.87017697, "num_input_tokens_seen": 25366715, "step": 1189, "time_per_iteration": 2.5770246982574463 }, { "auxiliary_loss_clip": 0.01173135, "auxiliary_loss_mlp": 0.0078213, "balance_loss_clip": 1.06234396, "balance_loss_mlp": 1.00041807, "epoch": 0.07154667067488352, "flos": 23471824688640.0, "grad_norm": 5.212083930118342, "language_loss": 0.76932275, "learning_rate": 3.9819211754105494e-06, "loss": 0.78887534, "num_input_tokens_seen": 25385450, "step": 1190, "time_per_iteration": 2.7057712078094482 }, { "auxiliary_loss_clip": 0.01208346, "auxiliary_loss_mlp": 0.01074705, "balance_loss_clip": 1.06283545, "balance_loss_mlp": 1.04751348, "epoch": 0.07160679392755148, "flos": 18332936588160.0, "grad_norm": 2.5312098602102084, "language_loss": 0.75201792, "learning_rate": 3.981868890255468e-06, "loss": 0.7748484, "num_input_tokens_seen": 25403940, "step": 1191, "time_per_iteration": 2.6071674823760986 }, { "auxiliary_loss_clip": 0.01162268, "auxiliary_loss_mlp": 0.01063437, "balance_loss_clip": 1.0519917, "balance_loss_mlp": 1.03649545, "epoch": 0.07166691718021945, "flos": 17747017937280.0, "grad_norm": 2.470839013019174, "language_loss": 0.74334443, "learning_rate": 3.981816529947719e-06, "loss": 0.76560152, "num_input_tokens_seen": 25420410, "step": 1192, "time_per_iteration": 2.661078453063965 }, { "auxiliary_loss_clip": 0.01202036, "auxiliary_loss_mlp": 0.01054727, "balance_loss_clip": 1.05904579, "balance_loss_mlp": 1.03099298, "epoch": 0.07172704043288743, "flos": 22451925916800.0, "grad_norm": 2.443309122344248, "language_loss": 0.78010541, "learning_rate": 3.9817640944892896e-06, "loss": 0.8026731, "num_input_tokens_seen": 25439415, "step": 1193, "time_per_iteration": 2.5603158473968506 }, { "auxiliary_loss_clip": 0.01186747, "auxiliary_loss_mlp": 0.01059465, "balance_loss_clip": 1.06358278, "balance_loss_mlp": 1.03319085, "epoch": 0.07178716368555539, "flos": 23222210100480.0, "grad_norm": 2.1011663585924585, "language_loss": 0.85497916, "learning_rate": 3.981711583882166e-06, "loss": 0.87744129, "num_input_tokens_seen": 25458715, "step": 1194, "time_per_iteration": 2.6819851398468018 }, { "auxiliary_loss_clip": 0.01184191, "auxiliary_loss_mlp": 0.01067737, "balance_loss_clip": 1.05706751, "balance_loss_mlp": 1.04135609, "epoch": 0.07184728693822336, "flos": 25150828072320.0, "grad_norm": 2.0205668140023185, "language_loss": 0.8183766, "learning_rate": 3.981658998128341e-06, "loss": 0.84089589, "num_input_tokens_seen": 25477985, "step": 1195, "time_per_iteration": 2.6646647453308105 }, { "auxiliary_loss_clip": 0.01165951, "auxiliary_loss_mlp": 0.01063438, "balance_loss_clip": 1.0578239, "balance_loss_mlp": 1.03976321, "epoch": 0.07190741019089132, "flos": 22711237176960.0, "grad_norm": 2.161995064372768, "language_loss": 0.80093575, "learning_rate": 3.981606337229808e-06, "loss": 0.82322967, "num_input_tokens_seen": 25497110, "step": 1196, "time_per_iteration": 2.7217979431152344 }, { "auxiliary_loss_clip": 0.01176131, "auxiliary_loss_mlp": 0.00784114, "balance_loss_clip": 1.06106043, "balance_loss_mlp": 1.00034249, "epoch": 0.0719675334435593, "flos": 29349791032320.0, "grad_norm": 2.5905261146074263, "language_loss": 0.71339291, "learning_rate": 3.9815536011885655e-06, "loss": 0.73299539, "num_input_tokens_seen": 25516555, "step": 1197, "time_per_iteration": 2.7931766510009766 }, { "auxiliary_loss_clip": 0.01157444, "auxiliary_loss_mlp": 0.01055247, "balance_loss_clip": 1.06130266, "balance_loss_mlp": 1.03074968, "epoch": 0.07202765669622727, "flos": 17639788861440.0, "grad_norm": 3.074283933156949, "language_loss": 0.85951984, "learning_rate": 3.98150079000661e-06, "loss": 0.88164675, "num_input_tokens_seen": 25533895, "step": 1198, "time_per_iteration": 2.7241532802581787 }, { "auxiliary_loss_clip": 0.01160083, "auxiliary_loss_mlp": 0.0106501, "balance_loss_clip": 1.0597434, "balance_loss_mlp": 1.03944004, "epoch": 0.07208777994889523, "flos": 21434038306560.0, "grad_norm": 2.052617638295489, "language_loss": 0.83840948, "learning_rate": 3.981447903685947e-06, "loss": 0.86066043, "num_input_tokens_seen": 25554195, "step": 1199, "time_per_iteration": 2.71362566947937 }, { "auxiliary_loss_clip": 0.01212755, "auxiliary_loss_mlp": 0.01060557, "balance_loss_clip": 1.06877887, "balance_loss_mlp": 1.03709614, "epoch": 0.07214790320156321, "flos": 26940867373440.0, "grad_norm": 3.1601590133124837, "language_loss": 0.7623595, "learning_rate": 3.981394942228581e-06, "loss": 0.78509259, "num_input_tokens_seen": 25574155, "step": 1200, "time_per_iteration": 2.6913061141967773 }, { "auxiliary_loss_clip": 0.0119008, "auxiliary_loss_mlp": 0.010701, "balance_loss_clip": 1.06442261, "balance_loss_mlp": 1.04487491, "epoch": 0.07220802645423118, "flos": 23879949995520.0, "grad_norm": 2.2017873087036226, "language_loss": 0.83013475, "learning_rate": 3.98134190563652e-06, "loss": 0.85273659, "num_input_tokens_seen": 25592735, "step": 1201, "time_per_iteration": 2.6983115673065186 }, { "auxiliary_loss_clip": 0.01196941, "auxiliary_loss_mlp": 0.01065672, "balance_loss_clip": 1.06197119, "balance_loss_mlp": 1.03952968, "epoch": 0.07226814970689914, "flos": 19243631036160.0, "grad_norm": 20.835065187143087, "language_loss": 0.68601412, "learning_rate": 3.981288793911775e-06, "loss": 0.70864022, "num_input_tokens_seen": 25611510, "step": 1202, "time_per_iteration": 2.691742420196533 }, { "auxiliary_loss_clip": 0.01182684, "auxiliary_loss_mlp": 0.00782201, "balance_loss_clip": 1.06256962, "balance_loss_mlp": 1.00038218, "epoch": 0.07232827295956712, "flos": 19172025273600.0, "grad_norm": 1.9661831136137597, "language_loss": 0.87487721, "learning_rate": 3.98123560705636e-06, "loss": 0.89452606, "num_input_tokens_seen": 25629560, "step": 1203, "time_per_iteration": 2.7832019329071045 }, { "auxiliary_loss_clip": 0.01154778, "auxiliary_loss_mlp": 0.01065748, "balance_loss_clip": 1.05210066, "balance_loss_mlp": 1.04065442, "epoch": 0.07238839621223508, "flos": 17639752947840.0, "grad_norm": 1.731721557525142, "language_loss": 0.78053147, "learning_rate": 3.981182345072293e-06, "loss": 0.80273676, "num_input_tokens_seen": 25648330, "step": 1204, "time_per_iteration": 2.7754547595977783 }, { "auxiliary_loss_clip": 0.01191832, "auxiliary_loss_mlp": 0.01065794, "balance_loss_clip": 1.06211591, "balance_loss_mlp": 1.04084373, "epoch": 0.07244851946490305, "flos": 28292401440000.0, "grad_norm": 1.5043252978087258, "language_loss": 0.82094097, "learning_rate": 3.981129007961593e-06, "loss": 0.84351724, "num_input_tokens_seen": 25669470, "step": 1205, "time_per_iteration": 2.680457353591919 }, { "auxiliary_loss_clip": 0.01180244, "auxiliary_loss_mlp": 0.00782807, "balance_loss_clip": 1.06221068, "balance_loss_mlp": 1.00036049, "epoch": 0.07250864271757101, "flos": 22564829341440.0, "grad_norm": 1.6438962430217685, "language_loss": 0.76715982, "learning_rate": 3.981075595726283e-06, "loss": 0.78679025, "num_input_tokens_seen": 25690470, "step": 1206, "time_per_iteration": 2.7028439044952393 }, { "auxiliary_loss_clip": 0.01188223, "auxiliary_loss_mlp": 0.01059861, "balance_loss_clip": 1.06262684, "balance_loss_mlp": 1.03442228, "epoch": 0.072568765970239, "flos": 21762405463680.0, "grad_norm": 1.9378198243304647, "language_loss": 0.77272987, "learning_rate": 3.981022108368387e-06, "loss": 0.79521072, "num_input_tokens_seen": 25709205, "step": 1207, "time_per_iteration": 2.779289960861206 }, { "auxiliary_loss_clip": 0.01185538, "auxiliary_loss_mlp": 0.01053693, "balance_loss_clip": 1.05844951, "balance_loss_mlp": 1.03062558, "epoch": 0.07262888922290696, "flos": 25519702792320.0, "grad_norm": 1.8716528383816402, "language_loss": 0.79480875, "learning_rate": 3.9809685458899345e-06, "loss": 0.81720108, "num_input_tokens_seen": 25728485, "step": 1208, "time_per_iteration": 2.682965040206909 }, { "auxiliary_loss_clip": 0.01184899, "auxiliary_loss_mlp": 0.01054862, "balance_loss_clip": 1.05801737, "balance_loss_mlp": 1.03198612, "epoch": 0.07268901247557492, "flos": 21246548290560.0, "grad_norm": 2.5612886109689765, "language_loss": 0.78537548, "learning_rate": 3.980914908292955e-06, "loss": 0.80777311, "num_input_tokens_seen": 25747730, "step": 1209, "time_per_iteration": 2.6582658290863037 }, { "auxiliary_loss_clip": 0.01191905, "auxiliary_loss_mlp": 0.01067741, "balance_loss_clip": 1.05931175, "balance_loss_mlp": 1.04408956, "epoch": 0.0727491357282429, "flos": 25479302970240.0, "grad_norm": 2.351303434522043, "language_loss": 0.80920583, "learning_rate": 3.980861195579486e-06, "loss": 0.83180225, "num_input_tokens_seen": 25768050, "step": 1210, "time_per_iteration": 4.241993427276611 }, { "auxiliary_loss_clip": 0.0117493, "auxiliary_loss_mlp": 0.01063711, "balance_loss_clip": 1.06087565, "balance_loss_mlp": 1.03891551, "epoch": 0.07280925898091087, "flos": 24462169545600.0, "grad_norm": 1.875347829314158, "language_loss": 0.84302205, "learning_rate": 3.98080740775156e-06, "loss": 0.86540848, "num_input_tokens_seen": 25787985, "step": 1211, "time_per_iteration": 4.289919853210449 }, { "auxiliary_loss_clip": 0.01162055, "auxiliary_loss_mlp": 0.01060218, "balance_loss_clip": 1.05356658, "balance_loss_mlp": 1.03629231, "epoch": 0.07286938223357883, "flos": 18288191220480.0, "grad_norm": 2.991110515222773, "language_loss": 0.90684664, "learning_rate": 3.98075354481122e-06, "loss": 0.92906934, "num_input_tokens_seen": 25803620, "step": 1212, "time_per_iteration": 2.660780906677246 }, { "auxiliary_loss_clip": 0.01202443, "auxiliary_loss_mlp": 0.01058817, "balance_loss_clip": 1.0623759, "balance_loss_mlp": 1.03490353, "epoch": 0.07292950548624681, "flos": 21214803646080.0, "grad_norm": 1.7918815842724805, "language_loss": 0.72358596, "learning_rate": 3.9806996067605055e-06, "loss": 0.74619853, "num_input_tokens_seen": 25823315, "step": 1213, "time_per_iteration": 4.303524017333984 }, { "auxiliary_loss_clip": 0.01153662, "auxiliary_loss_mlp": 0.01055706, "balance_loss_clip": 1.05658662, "balance_loss_mlp": 1.03089869, "epoch": 0.07298962873891478, "flos": 24642009964800.0, "grad_norm": 1.8655932637344164, "language_loss": 0.84356117, "learning_rate": 3.980645593601465e-06, "loss": 0.86565483, "num_input_tokens_seen": 25842605, "step": 1214, "time_per_iteration": 2.7505569458007812 }, { "auxiliary_loss_clip": 0.01208881, "auxiliary_loss_mlp": 0.01062075, "balance_loss_clip": 1.06484771, "balance_loss_mlp": 1.03723145, "epoch": 0.07304975199158274, "flos": 27052765217280.0, "grad_norm": 2.025651344907852, "language_loss": 0.84113681, "learning_rate": 3.980591505336144e-06, "loss": 0.86384636, "num_input_tokens_seen": 25863030, "step": 1215, "time_per_iteration": 2.7235965728759766 }, { "auxiliary_loss_clip": 0.01149957, "auxiliary_loss_mlp": 0.01062992, "balance_loss_clip": 1.05138278, "balance_loss_mlp": 1.03744531, "epoch": 0.07310987524425071, "flos": 33549544091520.0, "grad_norm": 1.9312816725096997, "language_loss": 0.80926049, "learning_rate": 3.980537341966595e-06, "loss": 0.83139002, "num_input_tokens_seen": 25888015, "step": 1216, "time_per_iteration": 2.9129130840301514 }, { "auxiliary_loss_clip": 0.01167944, "auxiliary_loss_mlp": 0.01060276, "balance_loss_clip": 1.05619049, "balance_loss_mlp": 1.03680408, "epoch": 0.07316999849691869, "flos": 28110944908800.0, "grad_norm": 3.2846247291101975, "language_loss": 0.75949144, "learning_rate": 3.980483103494872e-06, "loss": 0.78177369, "num_input_tokens_seen": 25908660, "step": 1217, "time_per_iteration": 2.7106521129608154 }, { "auxiliary_loss_clip": 0.01169026, "auxiliary_loss_mlp": 0.01056631, "balance_loss_clip": 1.06182647, "balance_loss_mlp": 1.03477991, "epoch": 0.07323012174958665, "flos": 14392602529920.0, "grad_norm": 1.9658490798069863, "language_loss": 0.86455309, "learning_rate": 3.98042878992303e-06, "loss": 0.88680959, "num_input_tokens_seen": 25927215, "step": 1218, "time_per_iteration": 2.5911786556243896 }, { "auxiliary_loss_clip": 0.01192266, "auxiliary_loss_mlp": 0.0106258, "balance_loss_clip": 1.06015348, "balance_loss_mlp": 1.03916681, "epoch": 0.07329024500225462, "flos": 21616428591360.0, "grad_norm": 2.2310702082820675, "language_loss": 0.86782354, "learning_rate": 3.9803744012531305e-06, "loss": 0.89037204, "num_input_tokens_seen": 25945500, "step": 1219, "time_per_iteration": 2.608562707901001 }, { "auxiliary_loss_clip": 0.01201545, "auxiliary_loss_mlp": 0.01058282, "balance_loss_clip": 1.06024373, "balance_loss_mlp": 1.03539419, "epoch": 0.0733503682549226, "flos": 13224141106560.0, "grad_norm": 2.095886373367052, "language_loss": 0.84608674, "learning_rate": 3.980319937487235e-06, "loss": 0.86868501, "num_input_tokens_seen": 25963105, "step": 1220, "time_per_iteration": 2.469189405441284 }, { "auxiliary_loss_clip": 0.01158855, "auxiliary_loss_mlp": 0.01063399, "balance_loss_clip": 1.05358922, "balance_loss_mlp": 1.03942597, "epoch": 0.07341049150759056, "flos": 20886975192960.0, "grad_norm": 2.648884311755534, "language_loss": 0.77114344, "learning_rate": 3.98026539862741e-06, "loss": 0.79336596, "num_input_tokens_seen": 25981690, "step": 1221, "time_per_iteration": 2.671762466430664 }, { "auxiliary_loss_clip": 0.01158201, "auxiliary_loss_mlp": 0.01064916, "balance_loss_clip": 1.05726743, "balance_loss_mlp": 1.04082406, "epoch": 0.07347061476025853, "flos": 15413614623360.0, "grad_norm": 2.5357389392469942, "language_loss": 0.91631913, "learning_rate": 3.980210784675722e-06, "loss": 0.93855029, "num_input_tokens_seen": 25999890, "step": 1222, "time_per_iteration": 2.6973063945770264 }, { "auxiliary_loss_clip": 0.01135907, "auxiliary_loss_mlp": 0.01064872, "balance_loss_clip": 1.05333126, "balance_loss_mlp": 1.04169726, "epoch": 0.0735307380129265, "flos": 11108859131520.0, "grad_norm": 2.8024324299253047, "language_loss": 0.90976465, "learning_rate": 3.980156095634242e-06, "loss": 0.93177247, "num_input_tokens_seen": 26016445, "step": 1223, "time_per_iteration": 2.8141093254089355 }, { "auxiliary_loss_clip": 0.01202875, "auxiliary_loss_mlp": 0.01077185, "balance_loss_clip": 1.06232905, "balance_loss_mlp": 1.05341494, "epoch": 0.07359086126559447, "flos": 23732392924800.0, "grad_norm": 1.9348534518871447, "language_loss": 0.82161939, "learning_rate": 3.980101331505045e-06, "loss": 0.84442002, "num_input_tokens_seen": 26036080, "step": 1224, "time_per_iteration": 2.640432119369507 }, { "auxiliary_loss_clip": 0.01200329, "auxiliary_loss_mlp": 0.01057586, "balance_loss_clip": 1.05987597, "balance_loss_mlp": 1.03229022, "epoch": 0.07365098451826244, "flos": 20993270515200.0, "grad_norm": 2.31744406237409, "language_loss": 0.83194047, "learning_rate": 3.9800464922902076e-06, "loss": 0.85451961, "num_input_tokens_seen": 26055805, "step": 1225, "time_per_iteration": 2.6159210205078125 }, { "auxiliary_loss_clip": 0.01170115, "auxiliary_loss_mlp": 0.01056068, "balance_loss_clip": 1.05743551, "balance_loss_mlp": 1.03190422, "epoch": 0.0737111077709304, "flos": 19933582452480.0, "grad_norm": 2.2959030425986544, "language_loss": 0.90388274, "learning_rate": 3.979991577991808e-06, "loss": 0.9261446, "num_input_tokens_seen": 26073905, "step": 1226, "time_per_iteration": 2.6527435779571533 }, { "auxiliary_loss_clip": 0.01207799, "auxiliary_loss_mlp": 0.0104599, "balance_loss_clip": 1.05913424, "balance_loss_mlp": 1.02080154, "epoch": 0.07377123102359838, "flos": 16581537342720.0, "grad_norm": 2.579592162134606, "language_loss": 0.76626784, "learning_rate": 3.97993658861193e-06, "loss": 0.78880572, "num_input_tokens_seen": 26091700, "step": 1227, "time_per_iteration": 2.596151351928711 }, { "auxiliary_loss_clip": 0.0118909, "auxiliary_loss_mlp": 0.01053386, "balance_loss_clip": 1.06296694, "balance_loss_mlp": 1.02954459, "epoch": 0.07383135427626634, "flos": 28328563457280.0, "grad_norm": 7.788838200212175, "language_loss": 0.8555491, "learning_rate": 3.9798815241526575e-06, "loss": 0.87797379, "num_input_tokens_seen": 26114105, "step": 1228, "time_per_iteration": 2.6955716609954834 }, { "auxiliary_loss_clip": 0.01191175, "auxiliary_loss_mlp": 0.01062669, "balance_loss_clip": 1.05897212, "balance_loss_mlp": 1.03860044, "epoch": 0.07389147752893431, "flos": 20047168235520.0, "grad_norm": 2.2575099517148898, "language_loss": 0.79598552, "learning_rate": 3.97982638461608e-06, "loss": 0.818524, "num_input_tokens_seen": 26131165, "step": 1229, "time_per_iteration": 2.6544861793518066 }, { "auxiliary_loss_clip": 0.01192886, "auxiliary_loss_mlp": 0.00782044, "balance_loss_clip": 1.05966699, "balance_loss_mlp": 1.00032902, "epoch": 0.07395160078160229, "flos": 18114132890880.0, "grad_norm": 2.2881874382496377, "language_loss": 0.78209347, "learning_rate": 3.979771170004287e-06, "loss": 0.80184281, "num_input_tokens_seen": 26150040, "step": 1230, "time_per_iteration": 2.6001133918762207 }, { "auxiliary_loss_clip": 0.0120142, "auxiliary_loss_mlp": 0.01052342, "balance_loss_clip": 1.06209648, "balance_loss_mlp": 1.02739108, "epoch": 0.07401172403427025, "flos": 23586918842880.0, "grad_norm": 2.038847041772147, "language_loss": 0.8136946, "learning_rate": 3.979715880319372e-06, "loss": 0.83623219, "num_input_tokens_seen": 26169380, "step": 1231, "time_per_iteration": 2.6364073753356934 }, { "auxiliary_loss_clip": 0.01179975, "auxiliary_loss_mlp": 0.01070917, "balance_loss_clip": 1.05690873, "balance_loss_mlp": 1.04599047, "epoch": 0.07407184728693822, "flos": 26359904799360.0, "grad_norm": 2.096832924731062, "language_loss": 0.95204866, "learning_rate": 3.979660515563434e-06, "loss": 0.97455758, "num_input_tokens_seen": 26189420, "step": 1232, "time_per_iteration": 2.7929203510284424 }, { "auxiliary_loss_clip": 0.01187282, "auxiliary_loss_mlp": 0.01059661, "balance_loss_clip": 1.06202245, "balance_loss_mlp": 1.03733301, "epoch": 0.0741319705396062, "flos": 22200443821440.0, "grad_norm": 1.7778448126368063, "language_loss": 0.80695188, "learning_rate": 3.979605075738569e-06, "loss": 0.82942128, "num_input_tokens_seen": 26209300, "step": 1233, "time_per_iteration": 2.7945051193237305 }, { "auxiliary_loss_clip": 0.01209245, "auxiliary_loss_mlp": 0.0106207, "balance_loss_clip": 1.06238747, "balance_loss_mlp": 1.03602231, "epoch": 0.07419209379227416, "flos": 39200482523520.0, "grad_norm": 2.136728864247421, "language_loss": 0.70708907, "learning_rate": 3.979549560846883e-06, "loss": 0.72980225, "num_input_tokens_seen": 26228110, "step": 1234, "time_per_iteration": 2.9646782875061035 }, { "auxiliary_loss_clip": 0.01167486, "auxiliary_loss_mlp": 0.01068879, "balance_loss_clip": 1.0542618, "balance_loss_mlp": 1.04265285, "epoch": 0.07425221704494213, "flos": 22781657790720.0, "grad_norm": 1.7921102377369336, "language_loss": 0.76852918, "learning_rate": 3.979493970890478e-06, "loss": 0.79089284, "num_input_tokens_seen": 26247020, "step": 1235, "time_per_iteration": 2.820577621459961 }, { "auxiliary_loss_clip": 0.01198028, "auxiliary_loss_mlp": 0.01055883, "balance_loss_clip": 1.05918813, "balance_loss_mlp": 1.0321244, "epoch": 0.0743123402976101, "flos": 22272983337600.0, "grad_norm": 2.3018318065058097, "language_loss": 0.82748145, "learning_rate": 3.979438305871464e-06, "loss": 0.85002053, "num_input_tokens_seen": 26265750, "step": 1236, "time_per_iteration": 2.6302287578582764 }, { "auxiliary_loss_clip": 0.01154783, "auxiliary_loss_mlp": 0.00782014, "balance_loss_clip": 1.05519629, "balance_loss_mlp": 1.00039148, "epoch": 0.07437246355027807, "flos": 29315029645440.0, "grad_norm": 1.7985383717833268, "language_loss": 0.7595011, "learning_rate": 3.979382565791951e-06, "loss": 0.77886909, "num_input_tokens_seen": 26287905, "step": 1237, "time_per_iteration": 2.721931219100952 }, { "auxiliary_loss_clip": 0.01135551, "auxiliary_loss_mlp": 0.00783311, "balance_loss_clip": 1.0505693, "balance_loss_mlp": 1.00031757, "epoch": 0.07443258680294604, "flos": 31944732249600.0, "grad_norm": 1.6915170784810407, "language_loss": 0.77458763, "learning_rate": 3.979326750654053e-06, "loss": 0.79377621, "num_input_tokens_seen": 26311795, "step": 1238, "time_per_iteration": 2.831620931625366 }, { "auxiliary_loss_clip": 0.01177529, "auxiliary_loss_mlp": 0.01057762, "balance_loss_clip": 1.05673254, "balance_loss_mlp": 1.03311002, "epoch": 0.074492710055614, "flos": 22675290641280.0, "grad_norm": 1.9053364150897723, "language_loss": 0.867737, "learning_rate": 3.9792708604598854e-06, "loss": 0.89008987, "num_input_tokens_seen": 26330330, "step": 1239, "time_per_iteration": 2.6697263717651367 }, { "auxiliary_loss_clip": 0.01159844, "auxiliary_loss_mlp": 0.01050954, "balance_loss_clip": 1.05222142, "balance_loss_mlp": 1.02532458, "epoch": 0.07455283330828198, "flos": 21284901037440.0, "grad_norm": 26.978042105238785, "language_loss": 0.89356089, "learning_rate": 3.979214895211569e-06, "loss": 0.91566885, "num_input_tokens_seen": 26348865, "step": 1240, "time_per_iteration": 2.846013069152832 }, { "auxiliary_loss_clip": 0.01174117, "auxiliary_loss_mlp": 0.01063539, "balance_loss_clip": 1.05857158, "balance_loss_mlp": 1.03713393, "epoch": 0.07461295656094995, "flos": 24388408967040.0, "grad_norm": 1.9346624045484253, "language_loss": 0.88873678, "learning_rate": 3.979158854911225e-06, "loss": 0.91111326, "num_input_tokens_seen": 26368210, "step": 1241, "time_per_iteration": 2.6926562786102295 }, { "auxiliary_loss_clip": 0.01079637, "auxiliary_loss_mlp": 0.01009562, "balance_loss_clip": 1.03489435, "balance_loss_mlp": 1.00405502, "epoch": 0.07467307981361791, "flos": 62109660574080.0, "grad_norm": 0.8973011136706247, "language_loss": 0.63067901, "learning_rate": 3.979102739560979e-06, "loss": 0.65157104, "num_input_tokens_seen": 26424890, "step": 1242, "time_per_iteration": 3.298609972000122 }, { "auxiliary_loss_clip": 0.01164269, "auxiliary_loss_mlp": 0.01068833, "balance_loss_clip": 1.05246222, "balance_loss_mlp": 1.03819644, "epoch": 0.07473320306628589, "flos": 24863148046080.0, "grad_norm": 3.87499965477456, "language_loss": 0.62926078, "learning_rate": 3.9790465491629595e-06, "loss": 0.65159178, "num_input_tokens_seen": 26446405, "step": 1243, "time_per_iteration": 2.7774572372436523 }, { "auxiliary_loss_clip": 0.01188864, "auxiliary_loss_mlp": 0.01059918, "balance_loss_clip": 1.05716145, "balance_loss_mlp": 1.03499091, "epoch": 0.07479332631895386, "flos": 24897442556160.0, "grad_norm": 1.6252135866538246, "language_loss": 0.76259589, "learning_rate": 3.978990283719296e-06, "loss": 0.78508377, "num_input_tokens_seen": 26466070, "step": 1244, "time_per_iteration": 2.714459180831909 }, { "auxiliary_loss_clip": 0.01184345, "auxiliary_loss_mlp": 0.00783076, "balance_loss_clip": 1.0611167, "balance_loss_mlp": 1.00038469, "epoch": 0.07485344957162182, "flos": 17815247821440.0, "grad_norm": 5.636002853507256, "language_loss": 0.69419599, "learning_rate": 3.978933943232123e-06, "loss": 0.71387023, "num_input_tokens_seen": 26479350, "step": 1245, "time_per_iteration": 2.640895366668701 }, { "auxiliary_loss_clip": 0.01203955, "auxiliary_loss_mlp": 0.01062684, "balance_loss_clip": 1.06098139, "balance_loss_mlp": 1.0372088, "epoch": 0.0749135728242898, "flos": 25010202326400.0, "grad_norm": 2.5525245798098757, "language_loss": 0.88635457, "learning_rate": 3.978877527703576e-06, "loss": 0.90902102, "num_input_tokens_seen": 26498255, "step": 1246, "time_per_iteration": 2.747765302658081 }, { "auxiliary_loss_clip": 0.01212369, "auxiliary_loss_mlp": 0.01077452, "balance_loss_clip": 1.06102896, "balance_loss_mlp": 1.049402, "epoch": 0.07497369607695777, "flos": 17822071405440.0, "grad_norm": 2.675073323546491, "language_loss": 0.8825295, "learning_rate": 3.9788210371357945e-06, "loss": 0.90542769, "num_input_tokens_seen": 26515375, "step": 1247, "time_per_iteration": 2.6810224056243896 }, { "auxiliary_loss_clip": 0.0118495, "auxiliary_loss_mlp": 0.01069489, "balance_loss_clip": 1.06058884, "balance_loss_mlp": 1.04383492, "epoch": 0.07503381932962573, "flos": 15121086261120.0, "grad_norm": 2.620559853720615, "language_loss": 0.64849806, "learning_rate": 3.978764471530921e-06, "loss": 0.67104244, "num_input_tokens_seen": 26533595, "step": 1248, "time_per_iteration": 2.706862449645996 }, { "auxiliary_loss_clip": 0.01181878, "auxiliary_loss_mlp": 0.00782677, "balance_loss_clip": 1.0575974, "balance_loss_mlp": 1.0004611, "epoch": 0.0750939425822937, "flos": 12816734071680.0, "grad_norm": 2.872208543000993, "language_loss": 0.74216163, "learning_rate": 3.978707830891102e-06, "loss": 0.7618072, "num_input_tokens_seen": 26549405, "step": 1249, "time_per_iteration": 4.309665679931641 }, { "auxiliary_loss_clip": 0.01168375, "auxiliary_loss_mlp": 0.01079691, "balance_loss_clip": 1.0579834, "balance_loss_mlp": 1.05296445, "epoch": 0.07515406583496168, "flos": 24206844695040.0, "grad_norm": 2.679176110316805, "language_loss": 0.82353318, "learning_rate": 3.978651115218482e-06, "loss": 0.84601378, "num_input_tokens_seen": 26567200, "step": 1250, "time_per_iteration": 4.367432594299316 }, { "auxiliary_loss_clip": 0.011507, "auxiliary_loss_mlp": 0.01064103, "balance_loss_clip": 1.05736125, "balance_loss_mlp": 1.0380677, "epoch": 0.07521418908762964, "flos": 26688164215680.0, "grad_norm": 2.015636709873133, "language_loss": 0.6679548, "learning_rate": 3.978594324515215e-06, "loss": 0.69010288, "num_input_tokens_seen": 26586190, "step": 1251, "time_per_iteration": 4.339111089706421 }, { "auxiliary_loss_clip": 0.01061099, "auxiliary_loss_mlp": 0.01007289, "balance_loss_clip": 1.02992618, "balance_loss_mlp": 1.00314093, "epoch": 0.0752743123402976, "flos": 59095140589440.0, "grad_norm": 0.9014655793512963, "language_loss": 0.7038399, "learning_rate": 3.9785374587834515e-06, "loss": 0.72452378, "num_input_tokens_seen": 26650710, "step": 1252, "time_per_iteration": 4.984445333480835 }, { "auxiliary_loss_clip": 0.0120348, "auxiliary_loss_mlp": 0.01071343, "balance_loss_clip": 1.06016684, "balance_loss_mlp": 1.04651129, "epoch": 0.07533443559296558, "flos": 23477032160640.0, "grad_norm": 2.2789224049077226, "language_loss": 0.79936707, "learning_rate": 3.97848051802535e-06, "loss": 0.82211524, "num_input_tokens_seen": 26669000, "step": 1253, "time_per_iteration": 2.613696575164795 }, { "auxiliary_loss_clip": 0.01165402, "auxiliary_loss_mlp": 0.01062493, "balance_loss_clip": 1.05703712, "balance_loss_mlp": 1.03758967, "epoch": 0.07539455884563355, "flos": 20879110114560.0, "grad_norm": 3.1057458778243263, "language_loss": 0.93360364, "learning_rate": 3.978423502243069e-06, "loss": 0.95588255, "num_input_tokens_seen": 26683075, "step": 1254, "time_per_iteration": 2.7332606315612793 }, { "auxiliary_loss_clip": 0.011733, "auxiliary_loss_mlp": 0.01064454, "balance_loss_clip": 1.06050682, "balance_loss_mlp": 1.03958726, "epoch": 0.07545468209830151, "flos": 27672906551040.0, "grad_norm": 2.090631066181037, "language_loss": 0.88087487, "learning_rate": 3.97836641143877e-06, "loss": 0.90325236, "num_input_tokens_seen": 26701875, "step": 1255, "time_per_iteration": 2.713636875152588 }, { "auxiliary_loss_clip": 0.01202338, "auxiliary_loss_mlp": 0.01071467, "balance_loss_clip": 1.06138325, "balance_loss_mlp": 1.04531264, "epoch": 0.0755148053509695, "flos": 14136990370560.0, "grad_norm": 1.9772348994273161, "language_loss": 0.79305708, "learning_rate": 3.978309245614618e-06, "loss": 0.81579506, "num_input_tokens_seen": 26719050, "step": 1256, "time_per_iteration": 2.688812255859375 }, { "auxiliary_loss_clip": 0.01064506, "auxiliary_loss_mlp": 0.01008663, "balance_loss_clip": 1.0281384, "balance_loss_mlp": 1.0043, "epoch": 0.07557492860363746, "flos": 58235257929600.0, "grad_norm": 0.7721513084275832, "language_loss": 0.58031851, "learning_rate": 3.9782520047727825e-06, "loss": 0.6010502, "num_input_tokens_seen": 26780650, "step": 1257, "time_per_iteration": 3.290971517562866 }, { "auxiliary_loss_clip": 0.01154091, "auxiliary_loss_mlp": 0.01065293, "balance_loss_clip": 1.06175375, "balance_loss_mlp": 1.04035461, "epoch": 0.07563505185630542, "flos": 24644380262400.0, "grad_norm": 2.5700283098608026, "language_loss": 0.90029764, "learning_rate": 3.978194688915432e-06, "loss": 0.92249143, "num_input_tokens_seen": 26798725, "step": 1258, "time_per_iteration": 2.800297975540161 }, { "auxiliary_loss_clip": 0.01169581, "auxiliary_loss_mlp": 0.01064585, "balance_loss_clip": 1.06184185, "balance_loss_mlp": 1.03797793, "epoch": 0.07569517510897339, "flos": 15522998515200.0, "grad_norm": 2.1868972302346377, "language_loss": 0.81404132, "learning_rate": 3.978137298044741e-06, "loss": 0.83638299, "num_input_tokens_seen": 26817005, "step": 1259, "time_per_iteration": 2.767717123031616 }, { "auxiliary_loss_clip": 0.01194891, "auxiliary_loss_mlp": 0.01062022, "balance_loss_clip": 1.06317782, "balance_loss_mlp": 1.03766739, "epoch": 0.07575529836164137, "flos": 22928532503040.0, "grad_norm": 1.8876128491153832, "language_loss": 0.7609086, "learning_rate": 3.978079832162885e-06, "loss": 0.78347778, "num_input_tokens_seen": 26836655, "step": 1260, "time_per_iteration": 2.859339714050293 }, { "auxiliary_loss_clip": 0.01160098, "auxiliary_loss_mlp": 0.01068568, "balance_loss_clip": 1.05432057, "balance_loss_mlp": 1.04222322, "epoch": 0.07581542161430933, "flos": 19500428344320.0, "grad_norm": 1.7028037437197219, "language_loss": 0.84734851, "learning_rate": 3.978022291272044e-06, "loss": 0.86963522, "num_input_tokens_seen": 26854925, "step": 1261, "time_per_iteration": 2.773087978363037 }, { "auxiliary_loss_clip": 0.01212087, "auxiliary_loss_mlp": 0.0106726, "balance_loss_clip": 1.06821966, "balance_loss_mlp": 1.04273915, "epoch": 0.0758755448669773, "flos": 24973465691520.0, "grad_norm": 1.8668314773439494, "language_loss": 0.82578814, "learning_rate": 3.977964675374399e-06, "loss": 0.84858155, "num_input_tokens_seen": 26876170, "step": 1262, "time_per_iteration": 2.681764841079712 }, { "auxiliary_loss_clip": 0.01206367, "auxiliary_loss_mlp": 0.0106285, "balance_loss_clip": 1.06333947, "balance_loss_mlp": 1.03685009, "epoch": 0.07593566811964528, "flos": 22747973811840.0, "grad_norm": 2.501362251414687, "language_loss": 0.82448232, "learning_rate": 3.977906984472136e-06, "loss": 0.84717447, "num_input_tokens_seen": 26895005, "step": 1263, "time_per_iteration": 2.6262786388397217 }, { "auxiliary_loss_clip": 0.01166059, "auxiliary_loss_mlp": 0.01068738, "balance_loss_clip": 1.06484997, "balance_loss_mlp": 1.04334641, "epoch": 0.07599579137231324, "flos": 23112395245440.0, "grad_norm": 2.171520639750579, "language_loss": 0.76149648, "learning_rate": 3.977849218567442e-06, "loss": 0.78384447, "num_input_tokens_seen": 26913930, "step": 1264, "time_per_iteration": 2.7735466957092285 }, { "auxiliary_loss_clip": 0.01181777, "auxiliary_loss_mlp": 0.01061673, "balance_loss_clip": 1.06183577, "balance_loss_mlp": 1.03704381, "epoch": 0.07605591462498121, "flos": 14502058248960.0, "grad_norm": 2.252731793921747, "language_loss": 0.80919051, "learning_rate": 3.977791377662507e-06, "loss": 0.83162498, "num_input_tokens_seen": 26931485, "step": 1265, "time_per_iteration": 2.6076793670654297 }, { "auxiliary_loss_clip": 0.01143593, "auxiliary_loss_mlp": 0.01068856, "balance_loss_clip": 1.05383801, "balance_loss_mlp": 1.0411638, "epoch": 0.07611603787764919, "flos": 23514199758720.0, "grad_norm": 2.117217065332582, "language_loss": 0.65244937, "learning_rate": 3.977733461759524e-06, "loss": 0.67457378, "num_input_tokens_seen": 26951670, "step": 1266, "time_per_iteration": 2.714848041534424 }, { "auxiliary_loss_clip": 0.0116364, "auxiliary_loss_mlp": 0.01066982, "balance_loss_clip": 1.05869627, "balance_loss_mlp": 1.04194832, "epoch": 0.07617616113031715, "flos": 21507188353920.0, "grad_norm": 2.0157381540709416, "language_loss": 0.79570109, "learning_rate": 3.977675470860691e-06, "loss": 0.81800735, "num_input_tokens_seen": 26970335, "step": 1267, "time_per_iteration": 2.692220687866211 }, { "auxiliary_loss_clip": 0.01186526, "auxiliary_loss_mlp": 0.01060572, "balance_loss_clip": 1.06368709, "balance_loss_mlp": 1.03644359, "epoch": 0.07623628438298512, "flos": 14573161221120.0, "grad_norm": 2.573855585409162, "language_loss": 0.72936547, "learning_rate": 3.977617404968205e-06, "loss": 0.75183642, "num_input_tokens_seen": 26986025, "step": 1268, "time_per_iteration": 2.666487216949463 }, { "auxiliary_loss_clip": 0.01189272, "auxiliary_loss_mlp": 0.01056943, "balance_loss_clip": 1.05925119, "balance_loss_mlp": 1.03146791, "epoch": 0.07629640763565308, "flos": 14720395069440.0, "grad_norm": 2.3531002902867018, "language_loss": 0.82087409, "learning_rate": 3.977559264084269e-06, "loss": 0.84333622, "num_input_tokens_seen": 27004045, "step": 1269, "time_per_iteration": 2.6196024417877197 }, { "auxiliary_loss_clip": 0.01198264, "auxiliary_loss_mlp": 0.01062408, "balance_loss_clip": 1.06528163, "balance_loss_mlp": 1.03656352, "epoch": 0.07635653088832106, "flos": 14902929008640.0, "grad_norm": 2.6660741307472424, "language_loss": 0.88614184, "learning_rate": 3.977501048211088e-06, "loss": 0.90874851, "num_input_tokens_seen": 27022070, "step": 1270, "time_per_iteration": 2.6423919200897217 }, { "auxiliary_loss_clip": 0.01195764, "auxiliary_loss_mlp": 0.01062092, "balance_loss_clip": 1.06443572, "balance_loss_mlp": 1.0371294, "epoch": 0.07641665414098903, "flos": 26651571235200.0, "grad_norm": 2.486841045046768, "language_loss": 0.7104162, "learning_rate": 3.977442757350869e-06, "loss": 0.73299474, "num_input_tokens_seen": 27041755, "step": 1271, "time_per_iteration": 2.6679437160491943 }, { "auxiliary_loss_clip": 0.01157818, "auxiliary_loss_mlp": 0.01068131, "balance_loss_clip": 1.05973268, "balance_loss_mlp": 1.04282308, "epoch": 0.07647677739365699, "flos": 25192808092800.0, "grad_norm": 1.5691807400142836, "language_loss": 0.82570392, "learning_rate": 3.977384391505823e-06, "loss": 0.84796339, "num_input_tokens_seen": 27061540, "step": 1272, "time_per_iteration": 2.7613680362701416 }, { "auxiliary_loss_clip": 0.01176176, "auxiliary_loss_mlp": 0.00782751, "balance_loss_clip": 1.05822372, "balance_loss_mlp": 1.00051665, "epoch": 0.07653690064632497, "flos": 20558141159040.0, "grad_norm": 1.811509476700225, "language_loss": 0.79854733, "learning_rate": 3.977325950678162e-06, "loss": 0.81813657, "num_input_tokens_seen": 27081395, "step": 1273, "time_per_iteration": 2.696317434310913 }, { "auxiliary_loss_clip": 0.01185133, "auxiliary_loss_mlp": 0.01064308, "balance_loss_clip": 1.06556833, "balance_loss_mlp": 1.03910685, "epoch": 0.07659702389899294, "flos": 22269320150400.0, "grad_norm": 1.7399681078894738, "language_loss": 0.81519866, "learning_rate": 3.977267434870103e-06, "loss": 0.83769304, "num_input_tokens_seen": 27101175, "step": 1274, "time_per_iteration": 2.8570950031280518 }, { "auxiliary_loss_clip": 0.0118748, "auxiliary_loss_mlp": 0.01078696, "balance_loss_clip": 1.06516898, "balance_loss_mlp": 1.05164731, "epoch": 0.0766571471516609, "flos": 32636120209920.0, "grad_norm": 2.6845981005996453, "language_loss": 0.73083639, "learning_rate": 3.977208844083865e-06, "loss": 0.75349814, "num_input_tokens_seen": 27124505, "step": 1275, "time_per_iteration": 2.75947904586792 }, { "auxiliary_loss_clip": 0.0121081, "auxiliary_loss_mlp": 0.01063745, "balance_loss_clip": 1.06740415, "balance_loss_mlp": 1.03694642, "epoch": 0.07671727040432888, "flos": 15267386355840.0, "grad_norm": 2.828157953752124, "language_loss": 0.79507053, "learning_rate": 3.9771501783216685e-06, "loss": 0.81781602, "num_input_tokens_seen": 27140960, "step": 1276, "time_per_iteration": 2.626683473587036 }, { "auxiliary_loss_clip": 0.01198279, "auxiliary_loss_mlp": 0.01058719, "balance_loss_clip": 1.06486118, "balance_loss_mlp": 1.03485298, "epoch": 0.07677739365699685, "flos": 28184094956160.0, "grad_norm": 2.406514987231471, "language_loss": 0.58915478, "learning_rate": 3.97709143758574e-06, "loss": 0.61172473, "num_input_tokens_seen": 27160985, "step": 1277, "time_per_iteration": 2.6684958934783936 }, { "auxiliary_loss_clip": 0.01201282, "auxiliary_loss_mlp": 0.01064396, "balance_loss_clip": 1.06430948, "balance_loss_mlp": 1.03919542, "epoch": 0.07683751690966481, "flos": 18296128126080.0, "grad_norm": 2.8024245322836046, "language_loss": 0.74957907, "learning_rate": 3.977032621878305e-06, "loss": 0.77223587, "num_input_tokens_seen": 27178390, "step": 1278, "time_per_iteration": 2.723675012588501 }, { "auxiliary_loss_clip": 0.01160972, "auxiliary_loss_mlp": 0.01063133, "balance_loss_clip": 1.0584681, "balance_loss_mlp": 1.0390408, "epoch": 0.07689764016233278, "flos": 21981101420160.0, "grad_norm": 5.339853944094037, "language_loss": 0.88594604, "learning_rate": 3.976973731201596e-06, "loss": 0.90818715, "num_input_tokens_seen": 27197505, "step": 1279, "time_per_iteration": 2.655036211013794 }, { "auxiliary_loss_clip": 0.01172627, "auxiliary_loss_mlp": 0.01066586, "balance_loss_clip": 1.06065845, "balance_loss_mlp": 1.04077685, "epoch": 0.07695776341500075, "flos": 22235995307520.0, "grad_norm": 2.4937131241937256, "language_loss": 0.8300451, "learning_rate": 3.976914765557845e-06, "loss": 0.85243726, "num_input_tokens_seen": 27214260, "step": 1280, "time_per_iteration": 2.7717065811157227 }, { "auxiliary_loss_clip": 0.01194022, "auxiliary_loss_mlp": 0.01066533, "balance_loss_clip": 1.06593037, "balance_loss_mlp": 1.04104638, "epoch": 0.07701788666766872, "flos": 16143750380160.0, "grad_norm": 2.044864943195716, "language_loss": 0.7581439, "learning_rate": 3.9768557249492875e-06, "loss": 0.78074944, "num_input_tokens_seen": 27232525, "step": 1281, "time_per_iteration": 2.7444865703582764 }, { "auxiliary_loss_clip": 0.01170775, "auxiliary_loss_mlp": 0.01062526, "balance_loss_clip": 1.05879402, "balance_loss_mlp": 1.03669322, "epoch": 0.07707800992033668, "flos": 19463045264640.0, "grad_norm": 1.8925477349429178, "language_loss": 0.75091648, "learning_rate": 3.9767966093781634e-06, "loss": 0.77324951, "num_input_tokens_seen": 27249800, "step": 1282, "time_per_iteration": 2.829145908355713 }, { "auxiliary_loss_clip": 0.01213222, "auxiliary_loss_mlp": 0.01071082, "balance_loss_clip": 1.07007408, "balance_loss_mlp": 1.04549992, "epoch": 0.07713813317300466, "flos": 18990281433600.0, "grad_norm": 2.1558853998977527, "language_loss": 0.83863324, "learning_rate": 3.976737418846713e-06, "loss": 0.8614763, "num_input_tokens_seen": 27268895, "step": 1283, "time_per_iteration": 2.6955173015594482 }, { "auxiliary_loss_clip": 0.0119621, "auxiliary_loss_mlp": 0.01066889, "balance_loss_clip": 1.06603825, "balance_loss_mlp": 1.03925657, "epoch": 0.07719825642567263, "flos": 18113953322880.0, "grad_norm": 2.520477290704422, "language_loss": 0.75147104, "learning_rate": 3.976678153357181e-06, "loss": 0.77410209, "num_input_tokens_seen": 27288180, "step": 1284, "time_per_iteration": 2.6589291095733643 }, { "auxiliary_loss_clip": 0.01182212, "auxiliary_loss_mlp": 0.01068485, "balance_loss_clip": 1.06304765, "balance_loss_mlp": 1.0438329, "epoch": 0.0772583796783406, "flos": 42194426993280.0, "grad_norm": 5.2953301239297295, "language_loss": 0.76224041, "learning_rate": 3.976618812911817e-06, "loss": 0.78474742, "num_input_tokens_seen": 27311815, "step": 1285, "time_per_iteration": 2.847702741622925 }, { "auxiliary_loss_clip": 0.01216302, "auxiliary_loss_mlp": 0.01071451, "balance_loss_clip": 1.07193899, "balance_loss_mlp": 1.04729891, "epoch": 0.07731850293100857, "flos": 24753692327040.0, "grad_norm": 2.0564733507641, "language_loss": 0.84193194, "learning_rate": 3.9765593975128685e-06, "loss": 0.86480945, "num_input_tokens_seen": 27331890, "step": 1286, "time_per_iteration": 2.713963270187378 }, { "auxiliary_loss_clip": 0.01180469, "auxiliary_loss_mlp": 0.01061062, "balance_loss_clip": 1.06331325, "balance_loss_mlp": 1.03646958, "epoch": 0.07737862618367654, "flos": 17565884628480.0, "grad_norm": 2.810253293244863, "language_loss": 0.76899689, "learning_rate": 3.97649990716259e-06, "loss": 0.79141217, "num_input_tokens_seen": 27348320, "step": 1287, "time_per_iteration": 2.669168472290039 }, { "auxiliary_loss_clip": 0.011763, "auxiliary_loss_mlp": 0.01061108, "balance_loss_clip": 1.05891848, "balance_loss_mlp": 1.03696775, "epoch": 0.0774387494363445, "flos": 25627147349760.0, "grad_norm": 1.6525652726351308, "language_loss": 0.84699571, "learning_rate": 3.976440341863237e-06, "loss": 0.86936986, "num_input_tokens_seen": 27367670, "step": 1288, "time_per_iteration": 2.7794599533081055 }, { "auxiliary_loss_clip": 0.01206182, "auxiliary_loss_mlp": 0.0106604, "balance_loss_clip": 1.06214797, "balance_loss_mlp": 1.04203176, "epoch": 0.07749887268901248, "flos": 12239865648000.0, "grad_norm": 2.0424090794957523, "language_loss": 0.85576034, "learning_rate": 3.976380701617068e-06, "loss": 0.87848258, "num_input_tokens_seen": 27385485, "step": 1289, "time_per_iteration": 4.232934236526489 }, { "auxiliary_loss_clip": 0.01207527, "auxiliary_loss_mlp": 0.01052975, "balance_loss_clip": 1.06487668, "balance_loss_mlp": 1.0291574, "epoch": 0.07755899594168045, "flos": 25081736261760.0, "grad_norm": 2.840721047922519, "language_loss": 0.85548425, "learning_rate": 3.976320986426344e-06, "loss": 0.87808931, "num_input_tokens_seen": 27405110, "step": 1290, "time_per_iteration": 4.218302965164185 }, { "auxiliary_loss_clip": 0.0117374, "auxiliary_loss_mlp": 0.01066698, "balance_loss_clip": 1.06411862, "balance_loss_mlp": 1.04041266, "epoch": 0.07761911919434841, "flos": 14246410176000.0, "grad_norm": 2.3756178078405976, "language_loss": 0.91390574, "learning_rate": 3.9762611962933315e-06, "loss": 0.93631011, "num_input_tokens_seen": 27422855, "step": 1291, "time_per_iteration": 4.468304395675659 }, { "auxiliary_loss_clip": 0.01081301, "auxiliary_loss_mlp": 0.01043026, "balance_loss_clip": 1.04092944, "balance_loss_mlp": 1.03894901, "epoch": 0.07767924244701638, "flos": 67237202954880.0, "grad_norm": 0.8973948861970446, "language_loss": 0.65065891, "learning_rate": 3.9762013312202955e-06, "loss": 0.67190224, "num_input_tokens_seen": 27487190, "step": 1292, "time_per_iteration": 3.3142755031585693 }, { "auxiliary_loss_clip": 0.01195822, "auxiliary_loss_mlp": 0.01062751, "balance_loss_clip": 1.06527543, "balance_loss_mlp": 1.03846776, "epoch": 0.07773936569968436, "flos": 28550635292160.0, "grad_norm": 1.7595227960044768, "language_loss": 0.87530363, "learning_rate": 3.9761413912095075e-06, "loss": 0.89788938, "num_input_tokens_seen": 27510465, "step": 1293, "time_per_iteration": 2.801603078842163 }, { "auxiliary_loss_clip": 0.01116633, "auxiliary_loss_mlp": 0.01078659, "balance_loss_clip": 1.05041039, "balance_loss_mlp": 1.05012059, "epoch": 0.07779948895235232, "flos": 27490264871040.0, "grad_norm": 2.2898991349098528, "language_loss": 0.84518278, "learning_rate": 3.976081376263239e-06, "loss": 0.8671357, "num_input_tokens_seen": 27528645, "step": 1294, "time_per_iteration": 2.898597002029419 }, { "auxiliary_loss_clip": 0.01158796, "auxiliary_loss_mlp": 0.01059505, "balance_loss_clip": 1.05967593, "balance_loss_mlp": 1.0342207, "epoch": 0.07785961220502029, "flos": 18223301301120.0, "grad_norm": 2.7292442592472073, "language_loss": 0.79365373, "learning_rate": 3.976021286383768e-06, "loss": 0.81583679, "num_input_tokens_seen": 27546165, "step": 1295, "time_per_iteration": 2.8481552600860596 }, { "auxiliary_loss_clip": 0.01155886, "auxiliary_loss_mlp": 0.01061351, "balance_loss_clip": 1.06015158, "balance_loss_mlp": 1.0356493, "epoch": 0.07791973545768827, "flos": 24608218245120.0, "grad_norm": 3.472740252224496, "language_loss": 0.88351864, "learning_rate": 3.975961121573371e-06, "loss": 0.90569103, "num_input_tokens_seen": 27566520, "step": 1296, "time_per_iteration": 2.697831392288208 }, { "auxiliary_loss_clip": 0.0120756, "auxiliary_loss_mlp": 0.01074146, "balance_loss_clip": 1.06552935, "balance_loss_mlp": 1.04791999, "epoch": 0.07797985871035623, "flos": 14282069402880.0, "grad_norm": 2.384603846473911, "language_loss": 0.9625901, "learning_rate": 3.9759008818343305e-06, "loss": 0.98540717, "num_input_tokens_seen": 27581960, "step": 1297, "time_per_iteration": 2.62660551071167 }, { "auxiliary_loss_clip": 0.01175852, "auxiliary_loss_mlp": 0.01069298, "balance_loss_clip": 1.06147313, "balance_loss_mlp": 1.04517019, "epoch": 0.0780399819630242, "flos": 26610453141120.0, "grad_norm": 2.15152040651991, "language_loss": 0.7600193, "learning_rate": 3.97584056716893e-06, "loss": 0.78247076, "num_input_tokens_seen": 27601415, "step": 1298, "time_per_iteration": 2.8040499687194824 }, { "auxiliary_loss_clip": 0.0114505, "auxiliary_loss_mlp": 0.00783981, "balance_loss_clip": 1.05864501, "balance_loss_mlp": 1.0006063, "epoch": 0.07810010521569218, "flos": 21834514016640.0, "grad_norm": 1.6697657327886877, "language_loss": 0.8097105, "learning_rate": 3.9757801775794575e-06, "loss": 0.82900077, "num_input_tokens_seen": 27621490, "step": 1299, "time_per_iteration": 2.7667653560638428 }, { "auxiliary_loss_clip": 0.01162638, "auxiliary_loss_mlp": 0.01064395, "balance_loss_clip": 1.06191885, "balance_loss_mlp": 1.0393368, "epoch": 0.07816022846836014, "flos": 25081233471360.0, "grad_norm": 1.9748762517467437, "language_loss": 0.86755943, "learning_rate": 3.975719713068202e-06, "loss": 0.8898297, "num_input_tokens_seen": 27640600, "step": 1300, "time_per_iteration": 2.7819204330444336 }, { "auxiliary_loss_clip": 0.0120807, "auxiliary_loss_mlp": 0.01056805, "balance_loss_clip": 1.06663537, "balance_loss_mlp": 1.03180683, "epoch": 0.0782203517210281, "flos": 40917515431680.0, "grad_norm": 3.040560411644486, "language_loss": 0.71822268, "learning_rate": 3.975659173637458e-06, "loss": 0.74087137, "num_input_tokens_seen": 27663070, "step": 1301, "time_per_iteration": 2.845107316970825 }, { "auxiliary_loss_clip": 0.01196566, "auxiliary_loss_mlp": 0.01075534, "balance_loss_clip": 1.06426311, "balance_loss_mlp": 1.05100083, "epoch": 0.07828047497369607, "flos": 41172014269440.0, "grad_norm": 1.6425838754876312, "language_loss": 0.70782864, "learning_rate": 3.97559855928952e-06, "loss": 0.73054957, "num_input_tokens_seen": 27686425, "step": 1302, "time_per_iteration": 2.898069381713867 }, { "auxiliary_loss_clip": 0.01162032, "auxiliary_loss_mlp": 0.00783256, "balance_loss_clip": 1.06019354, "balance_loss_mlp": 1.00062823, "epoch": 0.07834059822636405, "flos": 23508130360320.0, "grad_norm": 2.067506704059933, "language_loss": 0.82100385, "learning_rate": 3.9755378700266864e-06, "loss": 0.84045678, "num_input_tokens_seen": 27704900, "step": 1303, "time_per_iteration": 2.7862839698791504 }, { "auxiliary_loss_clip": 0.01191742, "auxiliary_loss_mlp": 0.01074585, "balance_loss_clip": 1.06583321, "balance_loss_mlp": 1.04908574, "epoch": 0.07840072147903202, "flos": 20193899293440.0, "grad_norm": 1.8830773419754625, "language_loss": 0.75206572, "learning_rate": 3.9754771058512585e-06, "loss": 0.77472901, "num_input_tokens_seen": 27724890, "step": 1304, "time_per_iteration": 2.7380170822143555 }, { "auxiliary_loss_clip": 0.01211207, "auxiliary_loss_mlp": 0.01074343, "balance_loss_clip": 1.07114935, "balance_loss_mlp": 1.04922605, "epoch": 0.07846084473169998, "flos": 21360816432000.0, "grad_norm": 1.6118444643214749, "language_loss": 0.76141047, "learning_rate": 3.975416266765542e-06, "loss": 0.784266, "num_input_tokens_seen": 27743115, "step": 1305, "time_per_iteration": 2.6788928508758545 }, { "auxiliary_loss_clip": 0.01137547, "auxiliary_loss_mlp": 0.01064795, "balance_loss_clip": 1.05611205, "balance_loss_mlp": 1.04021358, "epoch": 0.07852096798436796, "flos": 25410965345280.0, "grad_norm": 1.9541638070229452, "language_loss": 0.85011744, "learning_rate": 3.975355352771841e-06, "loss": 0.87214082, "num_input_tokens_seen": 27763570, "step": 1306, "time_per_iteration": 3.048137903213501 }, { "auxiliary_loss_clip": 0.01194779, "auxiliary_loss_mlp": 0.01049822, "balance_loss_clip": 1.06754708, "balance_loss_mlp": 1.02668333, "epoch": 0.07858109123703592, "flos": 24571481610240.0, "grad_norm": 6.108459548145404, "language_loss": 0.90882134, "learning_rate": 3.975294363872468e-06, "loss": 0.93126732, "num_input_tokens_seen": 27780030, "step": 1307, "time_per_iteration": 3.1597135066986084 }, { "auxiliary_loss_clip": 0.01145989, "auxiliary_loss_mlp": 0.01060478, "balance_loss_clip": 1.05529833, "balance_loss_mlp": 1.034729, "epoch": 0.07864121448970389, "flos": 20698874645760.0, "grad_norm": 3.4991416096159136, "language_loss": 0.83695096, "learning_rate": 3.975233300069735e-06, "loss": 0.85901558, "num_input_tokens_seen": 27796225, "step": 1308, "time_per_iteration": 2.749174118041992 }, { "auxiliary_loss_clip": 0.01151044, "auxiliary_loss_mlp": 0.01061966, "balance_loss_clip": 1.05445218, "balance_loss_mlp": 1.03789735, "epoch": 0.07870133774237187, "flos": 22966526113920.0, "grad_norm": 1.7092634116882437, "language_loss": 0.77521002, "learning_rate": 3.975172161365958e-06, "loss": 0.7973401, "num_input_tokens_seen": 27815975, "step": 1309, "time_per_iteration": 2.752854108810425 }, { "auxiliary_loss_clip": 0.01200102, "auxiliary_loss_mlp": 0.01070583, "balance_loss_clip": 1.06396675, "balance_loss_mlp": 1.04449987, "epoch": 0.07876146099503983, "flos": 18842832103680.0, "grad_norm": 1.8729662604656268, "language_loss": 0.80561006, "learning_rate": 3.975110947763453e-06, "loss": 0.82831693, "num_input_tokens_seen": 27832255, "step": 1310, "time_per_iteration": 2.6966710090637207 }, { "auxiliary_loss_clip": 0.01173381, "auxiliary_loss_mlp": 0.0078245, "balance_loss_clip": 1.06193507, "balance_loss_mlp": 1.00060987, "epoch": 0.0788215842477078, "flos": 23805794367360.0, "grad_norm": 1.796715978968241, "language_loss": 0.73187977, "learning_rate": 3.9750496592645435e-06, "loss": 0.75143808, "num_input_tokens_seen": 27852180, "step": 1311, "time_per_iteration": 2.7588090896606445 }, { "auxiliary_loss_clip": 0.01188438, "auxiliary_loss_mlp": 0.01078546, "balance_loss_clip": 1.06358969, "balance_loss_mlp": 1.05342865, "epoch": 0.07888170750037576, "flos": 21579907438080.0, "grad_norm": 1.7490617386556226, "language_loss": 0.86002982, "learning_rate": 3.974988295871553e-06, "loss": 0.88269973, "num_input_tokens_seen": 27871435, "step": 1312, "time_per_iteration": 2.6969683170318604 }, { "auxiliary_loss_clip": 0.01178338, "auxiliary_loss_mlp": 0.01059112, "balance_loss_clip": 1.06324685, "balance_loss_mlp": 1.03633142, "epoch": 0.07894183075304374, "flos": 19864849777920.0, "grad_norm": 1.825664315845032, "language_loss": 0.82087892, "learning_rate": 3.9749268575868085e-06, "loss": 0.84325337, "num_input_tokens_seen": 27890625, "step": 1313, "time_per_iteration": 2.6936304569244385 }, { "auxiliary_loss_clip": 0.01184798, "auxiliary_loss_mlp": 0.00783631, "balance_loss_clip": 1.06229842, "balance_loss_mlp": 1.00053823, "epoch": 0.07900195400571171, "flos": 16143463071360.0, "grad_norm": 2.837190319075622, "language_loss": 0.73569417, "learning_rate": 3.97486534441264e-06, "loss": 0.75537837, "num_input_tokens_seen": 27906530, "step": 1314, "time_per_iteration": 2.653505325317383 }, { "auxiliary_loss_clip": 0.01154585, "auxiliary_loss_mlp": 0.00782352, "balance_loss_clip": 1.05730104, "balance_loss_mlp": 1.00044668, "epoch": 0.07906207725837967, "flos": 23730417676800.0, "grad_norm": 1.6153694611764058, "language_loss": 0.79490477, "learning_rate": 3.974803756351379e-06, "loss": 0.81427419, "num_input_tokens_seen": 27926725, "step": 1315, "time_per_iteration": 2.797306776046753 }, { "auxiliary_loss_clip": 0.01189107, "auxiliary_loss_mlp": 0.01060743, "balance_loss_clip": 1.05841756, "balance_loss_mlp": 1.03487444, "epoch": 0.07912220051104765, "flos": 24315905364480.0, "grad_norm": 1.6362349035659796, "language_loss": 0.73546493, "learning_rate": 3.974742093405362e-06, "loss": 0.75796348, "num_input_tokens_seen": 27947875, "step": 1316, "time_per_iteration": 2.688997507095337 }, { "auxiliary_loss_clip": 0.01162651, "auxiliary_loss_mlp": 0.01066617, "balance_loss_clip": 1.05845332, "balance_loss_mlp": 1.0418098, "epoch": 0.07918232376371562, "flos": 18880035615360.0, "grad_norm": 2.157376902111077, "language_loss": 0.65540409, "learning_rate": 3.974680355576927e-06, "loss": 0.67769682, "num_input_tokens_seen": 27965040, "step": 1317, "time_per_iteration": 2.6998519897460938 }, { "auxiliary_loss_clip": 0.01177674, "auxiliary_loss_mlp": 0.01068635, "balance_loss_clip": 1.06280386, "balance_loss_mlp": 1.0428021, "epoch": 0.07924244701638358, "flos": 27376284038400.0, "grad_norm": 2.382161374765057, "language_loss": 0.73105192, "learning_rate": 3.974618542868415e-06, "loss": 0.75351495, "num_input_tokens_seen": 27985330, "step": 1318, "time_per_iteration": 2.8350789546966553 }, { "auxiliary_loss_clip": 0.01139638, "auxiliary_loss_mlp": 0.01058798, "balance_loss_clip": 1.05582452, "balance_loss_mlp": 1.03515935, "epoch": 0.07930257026905156, "flos": 25120340403840.0, "grad_norm": 2.635941883481154, "language_loss": 0.90381306, "learning_rate": 3.97455665528217e-06, "loss": 0.92579746, "num_input_tokens_seen": 28007615, "step": 1319, "time_per_iteration": 2.8553895950317383 }, { "auxiliary_loss_clip": 0.01175059, "auxiliary_loss_mlp": 0.01055333, "balance_loss_clip": 1.05662942, "balance_loss_mlp": 1.03122926, "epoch": 0.07936269352171953, "flos": 21834478103040.0, "grad_norm": 1.9449065990449943, "language_loss": 0.80134505, "learning_rate": 3.974494692820539e-06, "loss": 0.82364893, "num_input_tokens_seen": 28027765, "step": 1320, "time_per_iteration": 2.6651997566223145 }, { "auxiliary_loss_clip": 0.01181808, "auxiliary_loss_mlp": 0.01060151, "balance_loss_clip": 1.06380332, "balance_loss_mlp": 1.03657198, "epoch": 0.07942281677438749, "flos": 16939889377920.0, "grad_norm": 2.1078540484546746, "language_loss": 0.6901226, "learning_rate": 3.974432655485872e-06, "loss": 0.71254218, "num_input_tokens_seen": 28044225, "step": 1321, "time_per_iteration": 2.6500401496887207 }, { "auxiliary_loss_clip": 0.01189002, "auxiliary_loss_mlp": 0.01060598, "balance_loss_clip": 1.06469131, "balance_loss_mlp": 1.03688753, "epoch": 0.07948294002705546, "flos": 18986941468800.0, "grad_norm": 1.9310950096267907, "language_loss": 0.8359012, "learning_rate": 3.9743705432805195e-06, "loss": 0.85839725, "num_input_tokens_seen": 28062915, "step": 1322, "time_per_iteration": 2.684978723526001 }, { "auxiliary_loss_clip": 0.01202147, "auxiliary_loss_mlp": 0.01057117, "balance_loss_clip": 1.06135976, "balance_loss_mlp": 1.03304851, "epoch": 0.07954306327972344, "flos": 21653452535040.0, "grad_norm": 2.128262121046283, "language_loss": 0.90555447, "learning_rate": 3.974308356206838e-06, "loss": 0.92814714, "num_input_tokens_seen": 28082175, "step": 1323, "time_per_iteration": 2.6192240715026855 }, { "auxiliary_loss_clip": 0.01164151, "auxiliary_loss_mlp": 0.01062303, "balance_loss_clip": 1.06272292, "balance_loss_mlp": 1.03809166, "epoch": 0.0796031865323914, "flos": 23220270766080.0, "grad_norm": 1.8373443631598505, "language_loss": 0.82521075, "learning_rate": 3.974246094267187e-06, "loss": 0.84747529, "num_input_tokens_seen": 28102645, "step": 1324, "time_per_iteration": 2.8283956050872803 }, { "auxiliary_loss_clip": 0.01180787, "auxiliary_loss_mlp": 0.01053463, "balance_loss_clip": 1.06256735, "balance_loss_mlp": 1.02834535, "epoch": 0.07966330978505937, "flos": 23294534135040.0, "grad_norm": 2.119290865165494, "language_loss": 0.79162025, "learning_rate": 3.974183757463925e-06, "loss": 0.8139627, "num_input_tokens_seen": 28122805, "step": 1325, "time_per_iteration": 2.6996092796325684 }, { "auxiliary_loss_clip": 0.01119286, "auxiliary_loss_mlp": 0.00785175, "balance_loss_clip": 1.04844928, "balance_loss_mlp": 1.00035501, "epoch": 0.07972343303772735, "flos": 18363783392640.0, "grad_norm": 2.2621745256944448, "language_loss": 0.88038248, "learning_rate": 3.974121345799418e-06, "loss": 0.89942712, "num_input_tokens_seen": 28140530, "step": 1326, "time_per_iteration": 2.881410837173462 }, { "auxiliary_loss_clip": 0.012, "auxiliary_loss_mlp": 0.01056877, "balance_loss_clip": 1.06257951, "balance_loss_mlp": 1.03168797, "epoch": 0.07978355629039531, "flos": 21762513204480.0, "grad_norm": 1.8538865301137586, "language_loss": 0.8328709, "learning_rate": 3.974058859276032e-06, "loss": 0.85543966, "num_input_tokens_seen": 28159640, "step": 1327, "time_per_iteration": 2.7277982234954834 }, { "auxiliary_loss_clip": 0.01207207, "auxiliary_loss_mlp": 0.01056886, "balance_loss_clip": 1.06532371, "balance_loss_mlp": 1.03223395, "epoch": 0.07984367954306328, "flos": 18551309322240.0, "grad_norm": 2.3216818645515636, "language_loss": 0.78599, "learning_rate": 3.9739962978961354e-06, "loss": 0.80863088, "num_input_tokens_seen": 28177050, "step": 1328, "time_per_iteration": 4.2137157917022705 }, { "auxiliary_loss_clip": 0.01201442, "auxiliary_loss_mlp": 0.01052053, "balance_loss_clip": 1.06778932, "balance_loss_mlp": 1.02722156, "epoch": 0.07990380279573125, "flos": 16904050583040.0, "grad_norm": 4.209530911932697, "language_loss": 0.73918134, "learning_rate": 3.973933661662101e-06, "loss": 0.76171625, "num_input_tokens_seen": 28193245, "step": 1329, "time_per_iteration": 5.853717565536499 }, { "auxiliary_loss_clip": 0.01169795, "auxiliary_loss_mlp": 0.01064631, "balance_loss_clip": 1.06039059, "balance_loss_mlp": 1.04069376, "epoch": 0.07996392604839922, "flos": 24098358643200.0, "grad_norm": 1.6102544328312476, "language_loss": 0.81743932, "learning_rate": 3.973870950576305e-06, "loss": 0.83978355, "num_input_tokens_seen": 28213570, "step": 1330, "time_per_iteration": 4.307915687561035 }, { "auxiliary_loss_clip": 0.01205148, "auxiliary_loss_mlp": 0.00780735, "balance_loss_clip": 1.06445098, "balance_loss_mlp": 1.00030971, "epoch": 0.08002404930106718, "flos": 14278729438080.0, "grad_norm": 3.0935981151455865, "language_loss": 0.88962448, "learning_rate": 3.9738081646411255e-06, "loss": 0.90948325, "num_input_tokens_seen": 28229980, "step": 1331, "time_per_iteration": 2.645198345184326 }, { "auxiliary_loss_clip": 0.01196019, "auxiliary_loss_mlp": 0.00781409, "balance_loss_clip": 1.05950165, "balance_loss_mlp": 1.00032377, "epoch": 0.08008417255373516, "flos": 40406219285760.0, "grad_norm": 1.8933982437719925, "language_loss": 0.7335732, "learning_rate": 3.973745303858942e-06, "loss": 0.75334752, "num_input_tokens_seen": 28253840, "step": 1332, "time_per_iteration": 2.792128562927246 }, { "auxiliary_loss_clip": 0.01180359, "auxiliary_loss_mlp": 0.01055118, "balance_loss_clip": 1.06217384, "balance_loss_mlp": 1.03216982, "epoch": 0.08014429580640313, "flos": 18478913460480.0, "grad_norm": 1.7464568676953767, "language_loss": 0.82765031, "learning_rate": 3.973682368232138e-06, "loss": 0.85000509, "num_input_tokens_seen": 28271675, "step": 1333, "time_per_iteration": 2.635579824447632 }, { "auxiliary_loss_clip": 0.01160554, "auxiliary_loss_mlp": 0.01059025, "balance_loss_clip": 1.05944169, "balance_loss_mlp": 1.03502798, "epoch": 0.0802044190590711, "flos": 22053461368320.0, "grad_norm": 2.677615191761892, "language_loss": 0.74862051, "learning_rate": 3.9736193577631015e-06, "loss": 0.77081633, "num_input_tokens_seen": 28291850, "step": 1334, "time_per_iteration": 2.8150298595428467 }, { "auxiliary_loss_clip": 0.01176175, "auxiliary_loss_mlp": 0.01063593, "balance_loss_clip": 1.06460369, "balance_loss_mlp": 1.04010868, "epoch": 0.08026454231173906, "flos": 24572128055040.0, "grad_norm": 1.8723728369534094, "language_loss": 0.79970533, "learning_rate": 3.973556272454221e-06, "loss": 0.82210302, "num_input_tokens_seen": 28310780, "step": 1335, "time_per_iteration": 2.6858503818511963 }, { "auxiliary_loss_clip": 0.01068232, "auxiliary_loss_mlp": 0.01020395, "balance_loss_clip": 1.04101062, "balance_loss_mlp": 1.01693749, "epoch": 0.08032466556440704, "flos": 52581841459200.0, "grad_norm": 0.7491611763509133, "language_loss": 0.56056821, "learning_rate": 3.973493112307889e-06, "loss": 0.58145452, "num_input_tokens_seen": 28369985, "step": 1336, "time_per_iteration": 3.324230670928955 }, { "auxiliary_loss_clip": 0.01179495, "auxiliary_loss_mlp": 0.01064433, "balance_loss_clip": 1.06005239, "balance_loss_mlp": 1.04149771, "epoch": 0.080384788817075, "flos": 23842602829440.0, "grad_norm": 2.8990759307469256, "language_loss": 0.67587668, "learning_rate": 3.9734298773265005e-06, "loss": 0.69831598, "num_input_tokens_seen": 28388670, "step": 1337, "time_per_iteration": 2.755451202392578 }, { "auxiliary_loss_clip": 0.01171763, "auxiliary_loss_mlp": 0.0107788, "balance_loss_clip": 1.06270492, "balance_loss_mlp": 1.05304837, "epoch": 0.08044491206974297, "flos": 25300719527040.0, "grad_norm": 1.9421039451316542, "language_loss": 0.86847901, "learning_rate": 3.973366567512453e-06, "loss": 0.89097536, "num_input_tokens_seen": 28411845, "step": 1338, "time_per_iteration": 2.758418560028076 }, { "auxiliary_loss_clip": 0.01136344, "auxiliary_loss_mlp": 0.01082295, "balance_loss_clip": 1.04883683, "balance_loss_mlp": 1.05596161, "epoch": 0.08050503532241095, "flos": 22376549226240.0, "grad_norm": 2.4557709650828157, "language_loss": 0.87217385, "learning_rate": 3.973303182868147e-06, "loss": 0.89436018, "num_input_tokens_seen": 28427875, "step": 1339, "time_per_iteration": 2.72682785987854 }, { "auxiliary_loss_clip": 0.01188632, "auxiliary_loss_mlp": 0.01055953, "balance_loss_clip": 1.06334567, "balance_loss_mlp": 1.03417385, "epoch": 0.08056515857507891, "flos": 18369421827840.0, "grad_norm": 10.603370056653041, "language_loss": 0.89504963, "learning_rate": 3.973239723395988e-06, "loss": 0.91749549, "num_input_tokens_seen": 28446615, "step": 1340, "time_per_iteration": 2.639601469039917 }, { "auxiliary_loss_clip": 0.01080107, "auxiliary_loss_mlp": 0.01012224, "balance_loss_clip": 1.02943289, "balance_loss_mlp": 1.00850451, "epoch": 0.08062528182774688, "flos": 51348130980480.0, "grad_norm": 0.8861598592181924, "language_loss": 0.64834231, "learning_rate": 3.97317618909838e-06, "loss": 0.66926563, "num_input_tokens_seen": 28505290, "step": 1341, "time_per_iteration": 3.0625648498535156 }, { "auxiliary_loss_clip": 0.01197538, "auxiliary_loss_mlp": 0.01061885, "balance_loss_clip": 1.0628854, "balance_loss_mlp": 1.0364095, "epoch": 0.08068540508041486, "flos": 17599712261760.0, "grad_norm": 3.3156125209451286, "language_loss": 0.89471233, "learning_rate": 3.973112579977733e-06, "loss": 0.9173066, "num_input_tokens_seen": 28522735, "step": 1342, "time_per_iteration": 2.6123783588409424 }, { "auxiliary_loss_clip": 0.01177687, "auxiliary_loss_mlp": 0.01062063, "balance_loss_clip": 1.0644995, "balance_loss_mlp": 1.03818512, "epoch": 0.08074552833308282, "flos": 10561185486720.0, "grad_norm": 2.2904075751929365, "language_loss": 0.76354575, "learning_rate": 3.973048896036459e-06, "loss": 0.78594327, "num_input_tokens_seen": 28539460, "step": 1343, "time_per_iteration": 2.7564918994903564 }, { "auxiliary_loss_clip": 0.01064182, "auxiliary_loss_mlp": 0.01010488, "balance_loss_clip": 1.02542567, "balance_loss_mlp": 1.0066731, "epoch": 0.08080565158575079, "flos": 60840254954880.0, "grad_norm": 0.8071281523255156, "language_loss": 0.57418531, "learning_rate": 3.972985137276974e-06, "loss": 0.59493202, "num_input_tokens_seen": 28599855, "step": 1344, "time_per_iteration": 3.170443058013916 }, { "auxiliary_loss_clip": 0.01158029, "auxiliary_loss_mlp": 0.01063108, "balance_loss_clip": 1.05839872, "balance_loss_mlp": 1.03846788, "epoch": 0.08086577483841875, "flos": 18332361970560.0, "grad_norm": 2.5953739346171676, "language_loss": 0.86569476, "learning_rate": 3.972921303701695e-06, "loss": 0.88790607, "num_input_tokens_seen": 28617585, "step": 1345, "time_per_iteration": 2.765254497528076 }, { "auxiliary_loss_clip": 0.01203428, "auxiliary_loss_mlp": 0.01057879, "balance_loss_clip": 1.06629944, "balance_loss_mlp": 1.03603959, "epoch": 0.08092589809108673, "flos": 21543601766400.0, "grad_norm": 1.8653844332842058, "language_loss": 0.87646407, "learning_rate": 3.972857395313042e-06, "loss": 0.89907712, "num_input_tokens_seen": 28636355, "step": 1346, "time_per_iteration": 2.655611991882324 }, { "auxiliary_loss_clip": 0.01191822, "auxiliary_loss_mlp": 0.0105414, "balance_loss_clip": 1.06450033, "balance_loss_mlp": 1.03047693, "epoch": 0.0809860213437547, "flos": 22128012046080.0, "grad_norm": 1.7047476553504466, "language_loss": 0.9298563, "learning_rate": 3.972793412113439e-06, "loss": 0.95231593, "num_input_tokens_seen": 28656260, "step": 1347, "time_per_iteration": 2.718355417251587 }, { "auxiliary_loss_clip": 0.01188696, "auxiliary_loss_mlp": 0.01066703, "balance_loss_clip": 1.06260633, "balance_loss_mlp": 1.04144263, "epoch": 0.08104614459642266, "flos": 21725489260800.0, "grad_norm": 1.9307860049130865, "language_loss": 0.89506733, "learning_rate": 3.972729354105312e-06, "loss": 0.91762137, "num_input_tokens_seen": 28675865, "step": 1348, "time_per_iteration": 2.763735771179199 }, { "auxiliary_loss_clip": 0.01137961, "auxiliary_loss_mlp": 0.01059733, "balance_loss_clip": 1.06026649, "balance_loss_mlp": 1.03730989, "epoch": 0.08110626784909064, "flos": 23951878980480.0, "grad_norm": 1.6214351378274148, "language_loss": 0.76906884, "learning_rate": 3.97266522129109e-06, "loss": 0.79104578, "num_input_tokens_seen": 28696255, "step": 1349, "time_per_iteration": 2.778050661087036 }, { "auxiliary_loss_clip": 0.01202122, "auxiliary_loss_mlp": 0.01065092, "balance_loss_clip": 1.06290889, "balance_loss_mlp": 1.04144049, "epoch": 0.0811663911017586, "flos": 19025689265280.0, "grad_norm": 1.777484449358279, "language_loss": 0.8877703, "learning_rate": 3.972601013673205e-06, "loss": 0.91044247, "num_input_tokens_seen": 28713905, "step": 1350, "time_per_iteration": 2.5871450901031494 }, { "auxiliary_loss_clip": 0.01164889, "auxiliary_loss_mlp": 0.00780958, "balance_loss_clip": 1.06011164, "balance_loss_mlp": 1.00028801, "epoch": 0.08122651435442657, "flos": 15341290588800.0, "grad_norm": 2.7472756845793156, "language_loss": 0.82298493, "learning_rate": 3.972536731254092e-06, "loss": 0.84244347, "num_input_tokens_seen": 28732075, "step": 1351, "time_per_iteration": 2.840271234512329 }, { "auxiliary_loss_clip": 0.01198177, "auxiliary_loss_mlp": 0.01055773, "balance_loss_clip": 1.06010592, "balance_loss_mlp": 1.03090644, "epoch": 0.08128663760709455, "flos": 23221563655680.0, "grad_norm": 2.2808101252466724, "language_loss": 0.75274944, "learning_rate": 3.972472374036189e-06, "loss": 0.775289, "num_input_tokens_seen": 28751150, "step": 1352, "time_per_iteration": 2.733644485473633 }, { "auxiliary_loss_clip": 0.01194643, "auxiliary_loss_mlp": 0.00783595, "balance_loss_clip": 1.06613326, "balance_loss_mlp": 1.00036311, "epoch": 0.08134676085976252, "flos": 22965628273920.0, "grad_norm": 1.678520960707938, "language_loss": 0.82936156, "learning_rate": 3.972407942021935e-06, "loss": 0.84914398, "num_input_tokens_seen": 28773360, "step": 1353, "time_per_iteration": 2.742149829864502 }, { "auxiliary_loss_clip": 0.01068236, "auxiliary_loss_mlp": 0.01015932, "balance_loss_clip": 1.02440155, "balance_loss_mlp": 1.01242769, "epoch": 0.08140688411243048, "flos": 64322115816960.0, "grad_norm": 0.8516312511934722, "language_loss": 0.59741521, "learning_rate": 3.972343435213775e-06, "loss": 0.61825693, "num_input_tokens_seen": 28833390, "step": 1354, "time_per_iteration": 3.1912426948547363 }, { "auxiliary_loss_clip": 0.01150343, "auxiliary_loss_mlp": 0.01058874, "balance_loss_clip": 1.0546236, "balance_loss_mlp": 1.03583086, "epoch": 0.08146700736509845, "flos": 22491858862080.0, "grad_norm": 2.1234068486581643, "language_loss": 0.82310611, "learning_rate": 3.972278853614154e-06, "loss": 0.84519827, "num_input_tokens_seen": 28852430, "step": 1355, "time_per_iteration": 2.782442808151245 }, { "auxiliary_loss_clip": 0.01186948, "auxiliary_loss_mlp": 0.01062856, "balance_loss_clip": 1.0600667, "balance_loss_mlp": 1.03801262, "epoch": 0.08152713061776642, "flos": 20447823513600.0, "grad_norm": 1.8366299277102565, "language_loss": 0.7135247, "learning_rate": 3.972214197225521e-06, "loss": 0.73602271, "num_input_tokens_seen": 28870685, "step": 1356, "time_per_iteration": 2.7777554988861084 }, { "auxiliary_loss_clip": 0.01194666, "auxiliary_loss_mlp": 0.01056522, "balance_loss_clip": 1.06462216, "balance_loss_mlp": 1.03259718, "epoch": 0.08158725387043439, "flos": 23550218121600.0, "grad_norm": 2.050923525150184, "language_loss": 0.70426142, "learning_rate": 3.972149466050329e-06, "loss": 0.72677326, "num_input_tokens_seen": 28889860, "step": 1357, "time_per_iteration": 2.852046012878418 }, { "auxiliary_loss_clip": 0.01186996, "auxiliary_loss_mlp": 0.01054475, "balance_loss_clip": 1.06138206, "balance_loss_mlp": 1.03070426, "epoch": 0.08164737712310235, "flos": 22017335264640.0, "grad_norm": 2.634204556872777, "language_loss": 0.84203482, "learning_rate": 3.97208466009103e-06, "loss": 0.8644495, "num_input_tokens_seen": 28905865, "step": 1358, "time_per_iteration": 2.7127115726470947 }, { "auxiliary_loss_clip": 0.01176629, "auxiliary_loss_mlp": 0.010566, "balance_loss_clip": 1.06037402, "balance_loss_mlp": 1.03154182, "epoch": 0.08170750037577033, "flos": 23367827836800.0, "grad_norm": 2.1726272773281097, "language_loss": 1.02781308, "learning_rate": 3.972019779350084e-06, "loss": 1.05014539, "num_input_tokens_seen": 28925250, "step": 1359, "time_per_iteration": 2.7171826362609863 }, { "auxiliary_loss_clip": 0.01128357, "auxiliary_loss_mlp": 0.01056774, "balance_loss_clip": 1.05009234, "balance_loss_mlp": 1.03263426, "epoch": 0.0817676236284383, "flos": 28397978490240.0, "grad_norm": 2.0494617207464945, "language_loss": 0.8313604, "learning_rate": 3.971954823829951e-06, "loss": 0.85321164, "num_input_tokens_seen": 28943445, "step": 1360, "time_per_iteration": 2.9020919799804688 }, { "auxiliary_loss_clip": 0.01202956, "auxiliary_loss_mlp": 0.0106887, "balance_loss_clip": 1.06274688, "balance_loss_mlp": 1.04469395, "epoch": 0.08182774688110626, "flos": 19208905562880.0, "grad_norm": 5.2377005088202075, "language_loss": 0.72322488, "learning_rate": 3.971889793533093e-06, "loss": 0.74594313, "num_input_tokens_seen": 28962695, "step": 1361, "time_per_iteration": 2.6643178462982178 }, { "auxiliary_loss_clip": 0.01166556, "auxiliary_loss_mlp": 0.01056311, "balance_loss_clip": 1.0552367, "balance_loss_mlp": 1.03184962, "epoch": 0.08188787013377424, "flos": 22784099915520.0, "grad_norm": 28.302545492028134, "language_loss": 0.76657653, "learning_rate": 3.971824688461976e-06, "loss": 0.78880513, "num_input_tokens_seen": 28982120, "step": 1362, "time_per_iteration": 2.7439064979553223 }, { "auxiliary_loss_clip": 0.01199728, "auxiliary_loss_mlp": 0.01053492, "balance_loss_clip": 1.06350708, "balance_loss_mlp": 1.03104496, "epoch": 0.08194799338644221, "flos": 16468095214080.0, "grad_norm": 2.1850191919210338, "language_loss": 0.72384715, "learning_rate": 3.971759508619069e-06, "loss": 0.74637932, "num_input_tokens_seen": 28998100, "step": 1363, "time_per_iteration": 2.7082791328430176 }, { "auxiliary_loss_clip": 0.01202887, "auxiliary_loss_mlp": 0.01066374, "balance_loss_clip": 1.06580126, "balance_loss_mlp": 1.04083955, "epoch": 0.08200811663911017, "flos": 23913633974400.0, "grad_norm": 2.142285699657122, "language_loss": 0.7726444, "learning_rate": 3.971694254006844e-06, "loss": 0.79533696, "num_input_tokens_seen": 29017095, "step": 1364, "time_per_iteration": 2.777156114578247 }, { "auxiliary_loss_clip": 0.01135428, "auxiliary_loss_mlp": 0.01063854, "balance_loss_clip": 1.05182433, "balance_loss_mlp": 1.03645968, "epoch": 0.08206823989177814, "flos": 17896550256000.0, "grad_norm": 1.85589982882842, "language_loss": 0.82242119, "learning_rate": 3.971628924627776e-06, "loss": 0.844414, "num_input_tokens_seen": 29037240, "step": 1365, "time_per_iteration": 2.8192803859710693 }, { "auxiliary_loss_clip": 0.01196582, "auxiliary_loss_mlp": 0.01059945, "balance_loss_clip": 1.07006347, "balance_loss_mlp": 1.03706884, "epoch": 0.08212836314444612, "flos": 22088186841600.0, "grad_norm": 1.7803424706125983, "language_loss": 0.82062519, "learning_rate": 3.97156352048434e-06, "loss": 0.84319043, "num_input_tokens_seen": 29056250, "step": 1366, "time_per_iteration": 2.7482311725616455 }, { "auxiliary_loss_clip": 0.01153262, "auxiliary_loss_mlp": 0.0107233, "balance_loss_clip": 1.05320215, "balance_loss_mlp": 1.04779685, "epoch": 0.08218848639711408, "flos": 17597485618560.0, "grad_norm": 2.010209091244133, "language_loss": 0.81944495, "learning_rate": 3.97149804157902e-06, "loss": 0.84170091, "num_input_tokens_seen": 29073380, "step": 1367, "time_per_iteration": 4.352729797363281 }, { "auxiliary_loss_clip": 0.01206125, "auxiliary_loss_mlp": 0.01066888, "balance_loss_clip": 1.06541765, "balance_loss_mlp": 1.04241478, "epoch": 0.08224860964978205, "flos": 17857838373120.0, "grad_norm": 2.518996379768439, "language_loss": 0.8331567, "learning_rate": 3.9714324879142946e-06, "loss": 0.85588682, "num_input_tokens_seen": 29091330, "step": 1368, "time_per_iteration": 6.077457666397095 }, { "auxiliary_loss_clip": 0.01159992, "auxiliary_loss_mlp": 0.01049874, "balance_loss_clip": 1.06314564, "balance_loss_mlp": 1.02790344, "epoch": 0.08230873290245003, "flos": 25227533566080.0, "grad_norm": 3.198110530618569, "language_loss": 0.81336468, "learning_rate": 3.971366859492653e-06, "loss": 0.8354634, "num_input_tokens_seen": 29110375, "step": 1369, "time_per_iteration": 2.769972085952759 }, { "auxiliary_loss_clip": 0.01137456, "auxiliary_loss_mlp": 0.00781814, "balance_loss_clip": 1.05438268, "balance_loss_mlp": 1.00027657, "epoch": 0.08236885615511799, "flos": 31759935753600.0, "grad_norm": 2.610758273724768, "language_loss": 0.74818152, "learning_rate": 3.971301156316582e-06, "loss": 0.76737428, "num_input_tokens_seen": 29129395, "step": 1370, "time_per_iteration": 4.497304201126099 }, { "auxiliary_loss_clip": 0.0115498, "auxiliary_loss_mlp": 0.01064278, "balance_loss_clip": 1.06403351, "balance_loss_mlp": 1.03987551, "epoch": 0.08242897940778596, "flos": 23185832601600.0, "grad_norm": 1.5246391685186451, "language_loss": 0.7398203, "learning_rate": 3.971235378388573e-06, "loss": 0.76201284, "num_input_tokens_seen": 29148650, "step": 1371, "time_per_iteration": 2.758089065551758 }, { "auxiliary_loss_clip": 0.01097162, "auxiliary_loss_mlp": 0.0106614, "balance_loss_clip": 1.05124569, "balance_loss_mlp": 1.04098701, "epoch": 0.08248910266045394, "flos": 34491480393600.0, "grad_norm": 1.9670948823939327, "language_loss": 0.70851803, "learning_rate": 3.971169525711122e-06, "loss": 0.73015106, "num_input_tokens_seen": 29170785, "step": 1372, "time_per_iteration": 4.069301605224609 }, { "auxiliary_loss_clip": 0.01162292, "auxiliary_loss_mlp": 0.01056859, "balance_loss_clip": 1.0571332, "balance_loss_mlp": 1.03261209, "epoch": 0.0825492259131219, "flos": 13436228960640.0, "grad_norm": 2.750431245604494, "language_loss": 0.88363653, "learning_rate": 3.9711035982867246e-06, "loss": 0.905828, "num_input_tokens_seen": 29185210, "step": 1373, "time_per_iteration": 3.9346964359283447 }, { "auxiliary_loss_clip": 0.01147291, "auxiliary_loss_mlp": 0.01062343, "balance_loss_clip": 1.05334187, "balance_loss_mlp": 1.03878665, "epoch": 0.08260934916578987, "flos": 25812446636160.0, "grad_norm": 2.128923272573014, "language_loss": 0.82465184, "learning_rate": 3.971037596117882e-06, "loss": 0.84674811, "num_input_tokens_seen": 29205210, "step": 1374, "time_per_iteration": 2.933377981185913 }, { "auxiliary_loss_clip": 0.01044322, "auxiliary_loss_mlp": 0.01017124, "balance_loss_clip": 1.03154135, "balance_loss_mlp": 1.0135479, "epoch": 0.08266947241845783, "flos": 63460009491840.0, "grad_norm": 0.8272339650193923, "language_loss": 0.60641956, "learning_rate": 3.970971519207095e-06, "loss": 0.62703401, "num_input_tokens_seen": 29265350, "step": 1375, "time_per_iteration": 3.3287038803100586 }, { "auxiliary_loss_clip": 0.01060461, "auxiliary_loss_mlp": 0.01013653, "balance_loss_clip": 1.02398169, "balance_loss_mlp": 1.01017237, "epoch": 0.08272959567112581, "flos": 69993704568960.0, "grad_norm": 0.9162492148708097, "language_loss": 0.62171799, "learning_rate": 3.970905367556871e-06, "loss": 0.64245915, "num_input_tokens_seen": 29321475, "step": 1376, "time_per_iteration": 3.218834161758423 }, { "auxiliary_loss_clip": 0.01159103, "auxiliary_loss_mlp": 0.0106347, "balance_loss_clip": 1.06229186, "balance_loss_mlp": 1.03942561, "epoch": 0.08278971892379378, "flos": 20413205781120.0, "grad_norm": 1.9191670647860084, "language_loss": 0.82577401, "learning_rate": 3.970839141169718e-06, "loss": 0.84799975, "num_input_tokens_seen": 29341405, "step": 1377, "time_per_iteration": 2.8763558864593506 }, { "auxiliary_loss_clip": 0.01176967, "auxiliary_loss_mlp": 0.01054072, "balance_loss_clip": 1.06486619, "balance_loss_mlp": 1.03011107, "epoch": 0.08284984217646174, "flos": 26250233598720.0, "grad_norm": 1.915539507671093, "language_loss": 0.84923226, "learning_rate": 3.970772840048147e-06, "loss": 0.87154263, "num_input_tokens_seen": 29361955, "step": 1378, "time_per_iteration": 2.8232595920562744 }, { "auxiliary_loss_clip": 0.01185329, "auxiliary_loss_mlp": 0.01058999, "balance_loss_clip": 1.06043923, "balance_loss_mlp": 1.0344305, "epoch": 0.08290996542912972, "flos": 27194683852800.0, "grad_norm": 6.4689921779024795, "language_loss": 0.87319231, "learning_rate": 3.970706464194672e-06, "loss": 0.8956356, "num_input_tokens_seen": 29382395, "step": 1379, "time_per_iteration": 2.756082534790039 }, { "auxiliary_loss_clip": 0.01158173, "auxiliary_loss_mlp": 0.01061479, "balance_loss_clip": 1.05779433, "balance_loss_mlp": 1.03829277, "epoch": 0.08297008868179769, "flos": 38618191146240.0, "grad_norm": 2.078993196749275, "language_loss": 0.78545237, "learning_rate": 3.970640013611812e-06, "loss": 0.8076489, "num_input_tokens_seen": 29404460, "step": 1380, "time_per_iteration": 2.9525601863861084 }, { "auxiliary_loss_clip": 0.01183492, "auxiliary_loss_mlp": 0.01059448, "balance_loss_clip": 1.06308961, "balance_loss_mlp": 1.0344255, "epoch": 0.08303021193446565, "flos": 19974736460160.0, "grad_norm": 2.6608111668609697, "language_loss": 0.86125714, "learning_rate": 3.970573488302083e-06, "loss": 0.88368654, "num_input_tokens_seen": 29422675, "step": 1381, "time_per_iteration": 2.735203742980957 }, { "auxiliary_loss_clip": 0.01197152, "auxiliary_loss_mlp": 0.00781814, "balance_loss_clip": 1.06611753, "balance_loss_mlp": 1.00034571, "epoch": 0.08309033518713363, "flos": 13662646341120.0, "grad_norm": 2.9433398182948203, "language_loss": 0.87471211, "learning_rate": 3.970506888268011e-06, "loss": 0.89450181, "num_input_tokens_seen": 29439840, "step": 1382, "time_per_iteration": 2.6392617225646973 }, { "auxiliary_loss_clip": 0.0115996, "auxiliary_loss_mlp": 0.01055463, "balance_loss_clip": 1.06138313, "balance_loss_mlp": 1.03337312, "epoch": 0.0831504584398016, "flos": 17968551068160.0, "grad_norm": 1.9901989904031434, "language_loss": 0.77085757, "learning_rate": 3.970440213512121e-06, "loss": 0.79301178, "num_input_tokens_seen": 29457360, "step": 1383, "time_per_iteration": 2.756565809249878 }, { "auxiliary_loss_clip": 0.01191549, "auxiliary_loss_mlp": 0.01058014, "balance_loss_clip": 1.06211782, "balance_loss_mlp": 1.03395748, "epoch": 0.08321058169246956, "flos": 22601386408320.0, "grad_norm": 1.818236548161018, "language_loss": 0.82858944, "learning_rate": 3.97037346403694e-06, "loss": 0.85108507, "num_input_tokens_seen": 29477040, "step": 1384, "time_per_iteration": 2.7848587036132812 }, { "auxiliary_loss_clip": 0.01148661, "auxiliary_loss_mlp": 0.01063605, "balance_loss_clip": 1.05671442, "balance_loss_mlp": 1.03610373, "epoch": 0.08327070494513754, "flos": 22850426378880.0, "grad_norm": 3.9982776391866346, "language_loss": 0.85219657, "learning_rate": 3.970306639845e-06, "loss": 0.8743192, "num_input_tokens_seen": 29492010, "step": 1385, "time_per_iteration": 2.803893566131592 }, { "auxiliary_loss_clip": 0.01157001, "auxiliary_loss_mlp": 0.01061891, "balance_loss_clip": 1.05823874, "balance_loss_mlp": 1.03750122, "epoch": 0.0833308281978055, "flos": 22782986593920.0, "grad_norm": 1.7071515381676081, "language_loss": 0.69195282, "learning_rate": 3.970239740938835e-06, "loss": 0.71414173, "num_input_tokens_seen": 29511850, "step": 1386, "time_per_iteration": 3.004786252975464 }, { "auxiliary_loss_clip": 0.01172803, "auxiliary_loss_mlp": 0.01058809, "balance_loss_clip": 1.05489016, "balance_loss_mlp": 1.03483546, "epoch": 0.08339095145047347, "flos": 20812604083200.0, "grad_norm": 1.672791522425571, "language_loss": 0.81894958, "learning_rate": 3.97017276732098e-06, "loss": 0.84126568, "num_input_tokens_seen": 29531415, "step": 1387, "time_per_iteration": 2.7678542137145996 }, { "auxiliary_loss_clip": 0.01179554, "auxiliary_loss_mlp": 0.01074251, "balance_loss_clip": 1.06179345, "balance_loss_mlp": 1.04817975, "epoch": 0.08345107470314143, "flos": 18515326872960.0, "grad_norm": 2.071322011459688, "language_loss": 0.77205479, "learning_rate": 3.970105718993978e-06, "loss": 0.7945928, "num_input_tokens_seen": 29549525, "step": 1388, "time_per_iteration": 2.8246304988861084 }, { "auxiliary_loss_clip": 0.01130856, "auxiliary_loss_mlp": 0.01062414, "balance_loss_clip": 1.05684018, "balance_loss_mlp": 1.03742766, "epoch": 0.08351119795580941, "flos": 18807567926400.0, "grad_norm": 2.0255270252506636, "language_loss": 0.79527366, "learning_rate": 3.970038595960369e-06, "loss": 0.81720638, "num_input_tokens_seen": 29568705, "step": 1389, "time_per_iteration": 2.8606414794921875 }, { "auxiliary_loss_clip": 0.01172785, "auxiliary_loss_mlp": 0.01064077, "balance_loss_clip": 1.05787444, "balance_loss_mlp": 1.03923428, "epoch": 0.08357132120847738, "flos": 18441817689600.0, "grad_norm": 2.546615132743645, "language_loss": 0.87427586, "learning_rate": 3.969971398222699e-06, "loss": 0.89664447, "num_input_tokens_seen": 29585855, "step": 1390, "time_per_iteration": 2.795931577682495 }, { "auxiliary_loss_clip": 0.01160426, "auxiliary_loss_mlp": 0.01067723, "balance_loss_clip": 1.05447149, "balance_loss_mlp": 1.04082966, "epoch": 0.08363144446114534, "flos": 25922333318400.0, "grad_norm": 1.8703157168219726, "language_loss": 0.86833143, "learning_rate": 3.969904125783517e-06, "loss": 0.89061296, "num_input_tokens_seen": 29607280, "step": 1391, "time_per_iteration": 2.811598062515259 }, { "auxiliary_loss_clip": 0.01156119, "auxiliary_loss_mlp": 0.01076482, "balance_loss_clip": 1.05575848, "balance_loss_mlp": 1.05180562, "epoch": 0.08369156771381332, "flos": 18041306065920.0, "grad_norm": 3.7979396758909263, "language_loss": 0.87688571, "learning_rate": 3.969836778645371e-06, "loss": 0.89921176, "num_input_tokens_seen": 29624130, "step": 1392, "time_per_iteration": 2.776819944381714 }, { "auxiliary_loss_clip": 0.01183316, "auxiliary_loss_mlp": 0.01058545, "balance_loss_clip": 1.05830503, "balance_loss_mlp": 1.03500128, "epoch": 0.08375169096648129, "flos": 22675111073280.0, "grad_norm": 8.95243370865895, "language_loss": 0.80574775, "learning_rate": 3.969769356810819e-06, "loss": 0.82816637, "num_input_tokens_seen": 29643210, "step": 1393, "time_per_iteration": 2.735761880874634 }, { "auxiliary_loss_clip": 0.01197686, "auxiliary_loss_mlp": 0.01058125, "balance_loss_clip": 1.06329441, "balance_loss_mlp": 1.03466487, "epoch": 0.08381181421914925, "flos": 26103215232000.0, "grad_norm": 1.7485261130451684, "language_loss": 0.85064757, "learning_rate": 3.969701860282415e-06, "loss": 0.87320572, "num_input_tokens_seen": 29663920, "step": 1394, "time_per_iteration": 2.950211524963379 }, { "auxiliary_loss_clip": 0.01145594, "auxiliary_loss_mlp": 0.01058123, "balance_loss_clip": 1.05994248, "balance_loss_mlp": 1.03432918, "epoch": 0.08387193747181723, "flos": 20629782835200.0, "grad_norm": 1.782466846937859, "language_loss": 0.82979721, "learning_rate": 3.969634289062719e-06, "loss": 0.85183442, "num_input_tokens_seen": 29683825, "step": 1395, "time_per_iteration": 2.883977174758911 }, { "auxiliary_loss_clip": 0.01187279, "auxiliary_loss_mlp": 0.00782865, "balance_loss_clip": 1.06065941, "balance_loss_mlp": 1.00028706, "epoch": 0.0839320607244852, "flos": 13443196199040.0, "grad_norm": 3.330409107955743, "language_loss": 0.82481396, "learning_rate": 3.969566643154293e-06, "loss": 0.84451544, "num_input_tokens_seen": 29698775, "step": 1396, "time_per_iteration": 2.6729378700256348 }, { "auxiliary_loss_clip": 0.0118605, "auxiliary_loss_mlp": 0.01060468, "balance_loss_clip": 1.06378388, "balance_loss_mlp": 1.03475475, "epoch": 0.08399218397715316, "flos": 23477247642240.0, "grad_norm": 1.780410555630689, "language_loss": 0.76843297, "learning_rate": 3.969498922559703e-06, "loss": 0.79089814, "num_input_tokens_seen": 29719430, "step": 1397, "time_per_iteration": 2.64888334274292 }, { "auxiliary_loss_clip": 0.01153742, "auxiliary_loss_mlp": 0.01050759, "balance_loss_clip": 1.05790138, "balance_loss_mlp": 1.02621412, "epoch": 0.08405230722982113, "flos": 25920717206400.0, "grad_norm": 2.1323769932413184, "language_loss": 0.77941638, "learning_rate": 3.969431127281516e-06, "loss": 0.8014614, "num_input_tokens_seen": 29739685, "step": 1398, "time_per_iteration": 2.8302125930786133 }, { "auxiliary_loss_clip": 0.01191086, "auxiliary_loss_mlp": 0.01052374, "balance_loss_clip": 1.05962944, "balance_loss_mlp": 1.02943766, "epoch": 0.0841124304824891, "flos": 17967437746560.0, "grad_norm": 2.150764713624159, "language_loss": 0.94635069, "learning_rate": 3.969363257322304e-06, "loss": 0.96878529, "num_input_tokens_seen": 29756165, "step": 1399, "time_per_iteration": 2.650517702102661 }, { "auxiliary_loss_clip": 0.01172403, "auxiliary_loss_mlp": 0.0106738, "balance_loss_clip": 1.0562712, "balance_loss_mlp": 1.04168999, "epoch": 0.08417255373515707, "flos": 25629661301760.0, "grad_norm": 3.6141849657848137, "language_loss": 0.81904209, "learning_rate": 3.96929531268464e-06, "loss": 0.8414399, "num_input_tokens_seen": 29776425, "step": 1400, "time_per_iteration": 2.777369260787964 }, { "auxiliary_loss_clip": 0.01170173, "auxiliary_loss_mlp": 0.01064292, "balance_loss_clip": 1.05968165, "balance_loss_mlp": 1.03957999, "epoch": 0.08423267698782504, "flos": 26249730808320.0, "grad_norm": 8.998651919840762, "language_loss": 0.8642807, "learning_rate": 3.969227293371099e-06, "loss": 0.88662529, "num_input_tokens_seen": 29796440, "step": 1401, "time_per_iteration": 2.91375732421875 }, { "auxiliary_loss_clip": 0.01196, "auxiliary_loss_mlp": 0.01066109, "balance_loss_clip": 1.05935979, "balance_loss_mlp": 1.04053831, "epoch": 0.08429280024049302, "flos": 20119707751680.0, "grad_norm": 2.9792515680869114, "language_loss": 0.87500131, "learning_rate": 3.969159199384263e-06, "loss": 0.89762247, "num_input_tokens_seen": 29814755, "step": 1402, "time_per_iteration": 2.7827296257019043 }, { "auxiliary_loss_clip": 0.01144907, "auxiliary_loss_mlp": 0.00781428, "balance_loss_clip": 1.05105817, "balance_loss_mlp": 1.00033188, "epoch": 0.08435292349316098, "flos": 42924526836480.0, "grad_norm": 2.1517994230241566, "language_loss": 0.8905524, "learning_rate": 3.9690910307267125e-06, "loss": 0.90981579, "num_input_tokens_seen": 29834785, "step": 1403, "time_per_iteration": 2.931666374206543 }, { "auxiliary_loss_clip": 0.01165276, "auxiliary_loss_mlp": 0.01061696, "balance_loss_clip": 1.05570936, "balance_loss_mlp": 1.03715038, "epoch": 0.08441304674582895, "flos": 22857285876480.0, "grad_norm": 1.790271378285476, "language_loss": 0.80321431, "learning_rate": 3.969022787401033e-06, "loss": 0.82548404, "num_input_tokens_seen": 29854695, "step": 1404, "time_per_iteration": 2.7397725582122803 }, { "auxiliary_loss_clip": 0.01181709, "auxiliary_loss_mlp": 0.01071408, "balance_loss_clip": 1.06211567, "balance_loss_mlp": 1.04649353, "epoch": 0.08447316999849692, "flos": 18697501676160.0, "grad_norm": 2.0849305916509193, "language_loss": 0.83557045, "learning_rate": 3.968954469409811e-06, "loss": 0.85810155, "num_input_tokens_seen": 29872180, "step": 1405, "time_per_iteration": 2.8052847385406494 }, { "auxiliary_loss_clip": 0.0118246, "auxiliary_loss_mlp": 0.01058347, "balance_loss_clip": 1.05636072, "balance_loss_mlp": 1.03588748, "epoch": 0.08453329325116489, "flos": 25483971738240.0, "grad_norm": 1.5225846020503528, "language_loss": 0.7991904, "learning_rate": 3.968886076755639e-06, "loss": 0.82159847, "num_input_tokens_seen": 29893205, "step": 1406, "time_per_iteration": 4.301243305206299 }, { "auxiliary_loss_clip": 0.0117117, "auxiliary_loss_mlp": 0.01068275, "balance_loss_clip": 1.05790758, "balance_loss_mlp": 1.04406369, "epoch": 0.08459341650383286, "flos": 20920048640640.0, "grad_norm": 1.717770739318623, "language_loss": 0.79441547, "learning_rate": 3.96881760944111e-06, "loss": 0.81680995, "num_input_tokens_seen": 29911970, "step": 1407, "time_per_iteration": 2.6535613536834717 }, { "auxiliary_loss_clip": 0.01186501, "auxiliary_loss_mlp": 0.01057881, "balance_loss_clip": 1.05982685, "balance_loss_mlp": 1.03409886, "epoch": 0.08465353975650082, "flos": 13043079624960.0, "grad_norm": 2.191354041218588, "language_loss": 0.91799384, "learning_rate": 3.968749067468819e-06, "loss": 0.94043779, "num_input_tokens_seen": 29929925, "step": 1408, "time_per_iteration": 5.774486064910889 }, { "auxiliary_loss_clip": 0.01058217, "auxiliary_loss_mlp": 0.01015213, "balance_loss_clip": 1.0231359, "balance_loss_mlp": 1.01139832, "epoch": 0.0847136630091688, "flos": 60877422552960.0, "grad_norm": 0.9559717259642487, "language_loss": 0.61891782, "learning_rate": 3.968680450841368e-06, "loss": 0.63965201, "num_input_tokens_seen": 29985950, "step": 1409, "time_per_iteration": 4.9455225467681885 }, { "auxiliary_loss_clip": 0.01188186, "auxiliary_loss_mlp": 0.01061718, "balance_loss_clip": 1.05840743, "balance_loss_mlp": 1.03878236, "epoch": 0.08477378626183676, "flos": 22046530043520.0, "grad_norm": 1.6980375913788566, "language_loss": 0.86357373, "learning_rate": 3.968611759561355e-06, "loss": 0.88607281, "num_input_tokens_seen": 30004330, "step": 1410, "time_per_iteration": 2.640355110168457 }, { "auxiliary_loss_clip": 0.01181512, "auxiliary_loss_mlp": 0.01053874, "balance_loss_clip": 1.0583061, "balance_loss_mlp": 1.02870846, "epoch": 0.08483390951450473, "flos": 16690059308160.0, "grad_norm": 2.248971712939306, "language_loss": 0.74384397, "learning_rate": 3.968542993631388e-06, "loss": 0.7661978, "num_input_tokens_seen": 30022555, "step": 1411, "time_per_iteration": 2.6200830936431885 }, { "auxiliary_loss_clip": 0.01077929, "auxiliary_loss_mlp": 0.01003535, "balance_loss_clip": 1.02317524, "balance_loss_mlp": 0.99991113, "epoch": 0.08489403276717271, "flos": 51584640082560.0, "grad_norm": 0.9014663966204861, "language_loss": 0.56748837, "learning_rate": 3.968474153054073e-06, "loss": 0.58830309, "num_input_tokens_seen": 30077220, "step": 1412, "time_per_iteration": 3.0746512413024902 }, { "auxiliary_loss_clip": 0.01156137, "auxiliary_loss_mlp": 0.01067795, "balance_loss_clip": 1.05325568, "balance_loss_mlp": 1.04265356, "epoch": 0.08495415601984067, "flos": 17092330698240.0, "grad_norm": 2.2757293876932945, "language_loss": 0.88754624, "learning_rate": 3.96840523783202e-06, "loss": 0.90978551, "num_input_tokens_seen": 30094600, "step": 1413, "time_per_iteration": 2.7309420108795166 }, { "auxiliary_loss_clip": 0.01164895, "auxiliary_loss_mlp": 0.01057479, "balance_loss_clip": 1.05780244, "balance_loss_mlp": 1.03295755, "epoch": 0.08501427927250864, "flos": 23148413608320.0, "grad_norm": 1.9781781646219805, "language_loss": 0.87963474, "learning_rate": 3.968336247967844e-06, "loss": 0.90185857, "num_input_tokens_seen": 30114475, "step": 1414, "time_per_iteration": 2.692030668258667 }, { "auxiliary_loss_clip": 0.01168145, "auxiliary_loss_mlp": 0.01063751, "balance_loss_clip": 1.05704033, "balance_loss_mlp": 1.04170966, "epoch": 0.08507440252517662, "flos": 19063467394560.0, "grad_norm": 1.9706021333256292, "language_loss": 0.77636635, "learning_rate": 3.96826718346416e-06, "loss": 0.79868531, "num_input_tokens_seen": 30133350, "step": 1415, "time_per_iteration": 2.8435540199279785 }, { "auxiliary_loss_clip": 0.01182108, "auxiliary_loss_mlp": 0.01059478, "balance_loss_clip": 1.0588963, "balance_loss_mlp": 1.03701878, "epoch": 0.08513452577784458, "flos": 60182296600320.0, "grad_norm": 1.7170282174092708, "language_loss": 0.70545506, "learning_rate": 3.968198044323587e-06, "loss": 0.72787094, "num_input_tokens_seen": 30159005, "step": 1416, "time_per_iteration": 3.021360158920288 }, { "auxiliary_loss_clip": 0.01174166, "auxiliary_loss_mlp": 0.01066487, "balance_loss_clip": 1.05930233, "balance_loss_mlp": 1.04131043, "epoch": 0.08519464903051255, "flos": 27308485117440.0, "grad_norm": 2.8159853289102053, "language_loss": 0.74938154, "learning_rate": 3.968128830548748e-06, "loss": 0.771788, "num_input_tokens_seen": 30179450, "step": 1417, "time_per_iteration": 2.738301992416382 }, { "auxiliary_loss_clip": 0.01171292, "auxiliary_loss_mlp": 0.01057092, "balance_loss_clip": 1.05715823, "balance_loss_mlp": 1.03313112, "epoch": 0.08525477228318051, "flos": 20266438809600.0, "grad_norm": 2.4132423968154635, "language_loss": 0.8258723, "learning_rate": 3.968059542142265e-06, "loss": 0.84815615, "num_input_tokens_seen": 30197235, "step": 1418, "time_per_iteration": 2.671574831008911 }, { "auxiliary_loss_clip": 0.0104499, "auxiliary_loss_mlp": 0.01004818, "balance_loss_clip": 1.02242994, "balance_loss_mlp": 1.0004549, "epoch": 0.08531489553584849, "flos": 67615017183360.0, "grad_norm": 0.8667411864001444, "language_loss": 0.56638753, "learning_rate": 3.9679901791067685e-06, "loss": 0.58688557, "num_input_tokens_seen": 30257410, "step": 1419, "time_per_iteration": 3.199730396270752 }, { "auxiliary_loss_clip": 0.01192231, "auxiliary_loss_mlp": 0.01067737, "balance_loss_clip": 1.05757999, "balance_loss_mlp": 1.04369283, "epoch": 0.08537501878851646, "flos": 27526965592320.0, "grad_norm": 2.2357492693560466, "language_loss": 0.70111859, "learning_rate": 3.967920741444886e-06, "loss": 0.72371829, "num_input_tokens_seen": 30277865, "step": 1420, "time_per_iteration": 2.7176027297973633 }, { "auxiliary_loss_clip": 0.01155207, "auxiliary_loss_mlp": 0.01050755, "balance_loss_clip": 1.05377483, "balance_loss_mlp": 1.02692556, "epoch": 0.08543514204118442, "flos": 22784243569920.0, "grad_norm": 1.5975069204011494, "language_loss": 0.88011539, "learning_rate": 3.967851229159252e-06, "loss": 0.90217495, "num_input_tokens_seen": 30298545, "step": 1421, "time_per_iteration": 2.7552106380462646 }, { "auxiliary_loss_clip": 0.01077473, "auxiliary_loss_mlp": 0.01013517, "balance_loss_clip": 1.02364218, "balance_loss_mlp": 1.01020324, "epoch": 0.0854952652938524, "flos": 60990721027200.0, "grad_norm": 0.9142209544576306, "language_loss": 0.63506877, "learning_rate": 3.967781642252502e-06, "loss": 0.65597868, "num_input_tokens_seen": 30361725, "step": 1422, "time_per_iteration": 3.134183168411255 }, { "auxiliary_loss_clip": 0.01152948, "auxiliary_loss_mlp": 0.01063847, "balance_loss_clip": 1.05932307, "balance_loss_mlp": 1.0406723, "epoch": 0.08555538854652037, "flos": 28038046256640.0, "grad_norm": 1.8757015124159093, "language_loss": 0.82691669, "learning_rate": 3.967711980727276e-06, "loss": 0.84908462, "num_input_tokens_seen": 30382180, "step": 1423, "time_per_iteration": 2.789393424987793 }, { "auxiliary_loss_clip": 0.01153439, "auxiliary_loss_mlp": 0.01064169, "balance_loss_clip": 1.0526228, "balance_loss_mlp": 1.04089928, "epoch": 0.08561551179918833, "flos": 23509279595520.0, "grad_norm": 1.6593534429066656, "language_loss": 0.75424892, "learning_rate": 3.967642244586213e-06, "loss": 0.776425, "num_input_tokens_seen": 30402980, "step": 1424, "time_per_iteration": 2.7805826663970947 }, { "auxiliary_loss_clip": 0.01139579, "auxiliary_loss_mlp": 0.01060342, "balance_loss_clip": 1.05769765, "balance_loss_mlp": 1.03751373, "epoch": 0.08567563505185631, "flos": 17926930183680.0, "grad_norm": 1.7999307606718091, "language_loss": 0.75948423, "learning_rate": 3.96757243383196e-06, "loss": 0.78148341, "num_input_tokens_seen": 30420800, "step": 1425, "time_per_iteration": 2.677889823913574 }, { "auxiliary_loss_clip": 0.0118966, "auxiliary_loss_mlp": 0.01055231, "balance_loss_clip": 1.05982256, "balance_loss_mlp": 1.03230715, "epoch": 0.08573575830452428, "flos": 19719519350400.0, "grad_norm": 2.1792756220437743, "language_loss": 0.93362999, "learning_rate": 3.9675025484671624e-06, "loss": 0.95607889, "num_input_tokens_seen": 30439620, "step": 1426, "time_per_iteration": 2.6270906925201416 }, { "auxiliary_loss_clip": 0.01145994, "auxiliary_loss_mlp": 0.01066219, "balance_loss_clip": 1.05707717, "balance_loss_mlp": 1.0406251, "epoch": 0.08579588155719224, "flos": 17931563038080.0, "grad_norm": 2.3679064075186553, "language_loss": 0.75424731, "learning_rate": 3.967432588494471e-06, "loss": 0.77636945, "num_input_tokens_seen": 30457300, "step": 1427, "time_per_iteration": 2.84614634513855 }, { "auxiliary_loss_clip": 0.01190697, "auxiliary_loss_mlp": 0.01052992, "balance_loss_clip": 1.06006169, "balance_loss_mlp": 1.0305804, "epoch": 0.08585600480986022, "flos": 16033324993920.0, "grad_norm": 3.503048788198607, "language_loss": 0.82108849, "learning_rate": 3.96736255391654e-06, "loss": 0.84352541, "num_input_tokens_seen": 30471580, "step": 1428, "time_per_iteration": 2.5882396697998047 }, { "auxiliary_loss_clip": 0.01173688, "auxiliary_loss_mlp": 0.0106298, "balance_loss_clip": 1.05633736, "balance_loss_mlp": 1.03832793, "epoch": 0.08591612806252819, "flos": 28657433404800.0, "grad_norm": 2.088481658755078, "language_loss": 0.79929984, "learning_rate": 3.967292444736023e-06, "loss": 0.82166648, "num_input_tokens_seen": 30492720, "step": 1429, "time_per_iteration": 2.720500946044922 }, { "auxiliary_loss_clip": 0.01169119, "auxiliary_loss_mlp": 0.010606, "balance_loss_clip": 1.05971265, "balance_loss_mlp": 1.0379504, "epoch": 0.08597625131519615, "flos": 20959119659520.0, "grad_norm": 1.9029222975672677, "language_loss": 0.87716508, "learning_rate": 3.967222260955578e-06, "loss": 0.89946228, "num_input_tokens_seen": 30509535, "step": 1430, "time_per_iteration": 2.6914596557617188 }, { "auxiliary_loss_clip": 0.01144304, "auxiliary_loss_mlp": 0.01074633, "balance_loss_clip": 1.05802035, "balance_loss_mlp": 1.05125606, "epoch": 0.08603637456786412, "flos": 23256360956160.0, "grad_norm": 1.6366623508781384, "language_loss": 0.81859726, "learning_rate": 3.96715200257787e-06, "loss": 0.84078664, "num_input_tokens_seen": 30529490, "step": 1431, "time_per_iteration": 2.834402322769165 }, { "auxiliary_loss_clip": 0.01148362, "auxiliary_loss_mlp": 0.01054323, "balance_loss_clip": 1.05620182, "balance_loss_mlp": 1.03132737, "epoch": 0.0860964978205321, "flos": 28694170039680.0, "grad_norm": 1.5497375505717568, "language_loss": 0.78109461, "learning_rate": 3.967081669605559e-06, "loss": 0.80312145, "num_input_tokens_seen": 30550205, "step": 1432, "time_per_iteration": 2.767860174179077 }, { "auxiliary_loss_clip": 0.01167351, "auxiliary_loss_mlp": 0.0106333, "balance_loss_clip": 1.0540905, "balance_loss_mlp": 1.03914225, "epoch": 0.08615662107320006, "flos": 19318397195520.0, "grad_norm": 1.9631692713893694, "language_loss": 0.73365706, "learning_rate": 3.967011262041315e-06, "loss": 0.75596392, "num_input_tokens_seen": 30568830, "step": 1433, "time_per_iteration": 2.6930699348449707 }, { "auxiliary_loss_clip": 0.01150098, "auxiliary_loss_mlp": 0.00781967, "balance_loss_clip": 1.05335927, "balance_loss_mlp": 1.00044179, "epoch": 0.08621674432586802, "flos": 15851688894720.0, "grad_norm": 2.468588778716135, "language_loss": 0.85340321, "learning_rate": 3.9669407798878065e-06, "loss": 0.87272388, "num_input_tokens_seen": 30585730, "step": 1434, "time_per_iteration": 2.735690116882324 }, { "auxiliary_loss_clip": 0.01170363, "auxiliary_loss_mlp": 0.01057659, "balance_loss_clip": 1.05604434, "balance_loss_mlp": 1.0344249, "epoch": 0.086276867578536, "flos": 14100648785280.0, "grad_norm": 2.160640509122794, "language_loss": 0.7870298, "learning_rate": 3.966870223147707e-06, "loss": 0.80931008, "num_input_tokens_seen": 30603180, "step": 1435, "time_per_iteration": 2.776567220687866 }, { "auxiliary_loss_clip": 0.01047768, "auxiliary_loss_mlp": 0.01015597, "balance_loss_clip": 1.023893, "balance_loss_mlp": 1.01206815, "epoch": 0.08633699083120397, "flos": 70184857772160.0, "grad_norm": 0.8900716332014227, "language_loss": 0.57975936, "learning_rate": 3.96679959182369e-06, "loss": 0.60039294, "num_input_tokens_seen": 30668895, "step": 1436, "time_per_iteration": 3.344207763671875 }, { "auxiliary_loss_clip": 0.0117372, "auxiliary_loss_mlp": 0.01056829, "balance_loss_clip": 1.05617976, "balance_loss_mlp": 1.03153312, "epoch": 0.08639711408387193, "flos": 30298874140800.0, "grad_norm": 2.240343996649645, "language_loss": 0.69169062, "learning_rate": 3.966728885918437e-06, "loss": 0.71399617, "num_input_tokens_seen": 30688955, "step": 1437, "time_per_iteration": 2.7171547412872314 }, { "auxiliary_loss_clip": 0.01121044, "auxiliary_loss_mlp": 0.01055264, "balance_loss_clip": 1.05334914, "balance_loss_mlp": 1.03223276, "epoch": 0.08645723733653991, "flos": 20297680663680.0, "grad_norm": 2.1340571114707245, "language_loss": 0.72624576, "learning_rate": 3.966658105434627e-06, "loss": 0.74800885, "num_input_tokens_seen": 30706095, "step": 1438, "time_per_iteration": 2.7815651893615723 }, { "auxiliary_loss_clip": 0.01179626, "auxiliary_loss_mlp": 0.01052578, "balance_loss_clip": 1.06052637, "balance_loss_mlp": 1.02872419, "epoch": 0.08651736058920788, "flos": 32890583134080.0, "grad_norm": 1.5339762166114281, "language_loss": 0.64377135, "learning_rate": 3.966587250374945e-06, "loss": 0.66609335, "num_input_tokens_seen": 30729025, "step": 1439, "time_per_iteration": 2.8935797214508057 }, { "auxiliary_loss_clip": 0.01153286, "auxiliary_loss_mlp": 0.01056452, "balance_loss_clip": 1.05530453, "balance_loss_mlp": 1.03213322, "epoch": 0.08657748384187584, "flos": 22637368857600.0, "grad_norm": 5.193932354158579, "language_loss": 0.87521696, "learning_rate": 3.966516320742077e-06, "loss": 0.89731431, "num_input_tokens_seen": 30746155, "step": 1440, "time_per_iteration": 2.731531858444214 }, { "auxiliary_loss_clip": 0.01155923, "auxiliary_loss_mlp": 0.00782787, "balance_loss_clip": 1.05752945, "balance_loss_mlp": 1.00043201, "epoch": 0.08663760709454381, "flos": 23658380951040.0, "grad_norm": 2.023462963415533, "language_loss": 0.83434939, "learning_rate": 3.9664453165387124e-06, "loss": 0.85373652, "num_input_tokens_seen": 30761410, "step": 1441, "time_per_iteration": 2.7126500606536865 }, { "auxiliary_loss_clip": 0.01074667, "auxiliary_loss_mlp": 0.01004602, "balance_loss_clip": 1.0222367, "balance_loss_mlp": 1.00100195, "epoch": 0.08669773034721179, "flos": 62686564911360.0, "grad_norm": 0.8541685426878655, "language_loss": 0.60479522, "learning_rate": 3.966374237767545e-06, "loss": 0.62558794, "num_input_tokens_seen": 30823010, "step": 1442, "time_per_iteration": 3.25555157661438 }, { "auxiliary_loss_clip": 0.0116729, "auxiliary_loss_mlp": 0.01054262, "balance_loss_clip": 1.05768681, "balance_loss_mlp": 1.03075421, "epoch": 0.08675785359987975, "flos": 20667489137280.0, "grad_norm": 2.8449103562639073, "language_loss": 0.79304373, "learning_rate": 3.96630308443127e-06, "loss": 0.81525922, "num_input_tokens_seen": 30841980, "step": 1443, "time_per_iteration": 2.7314631938934326 }, { "auxiliary_loss_clip": 0.01180858, "auxiliary_loss_mlp": 0.01051075, "balance_loss_clip": 1.05780149, "balance_loss_mlp": 1.02755547, "epoch": 0.08681797685254772, "flos": 26941118768640.0, "grad_norm": 1.6739262813835734, "language_loss": 0.82399666, "learning_rate": 3.966231856532584e-06, "loss": 0.84631598, "num_input_tokens_seen": 30863280, "step": 1444, "time_per_iteration": 2.7341418266296387 }, { "auxiliary_loss_clip": 0.01196759, "auxiliary_loss_mlp": 0.01051473, "balance_loss_clip": 1.06044626, "balance_loss_mlp": 1.02810788, "epoch": 0.0868781001052157, "flos": 17712831168000.0, "grad_norm": 2.3015461969915747, "language_loss": 0.87354827, "learning_rate": 3.966160554074189e-06, "loss": 0.8960306, "num_input_tokens_seen": 30881710, "step": 1445, "time_per_iteration": 4.25179386138916 }, { "auxiliary_loss_clip": 0.01180784, "auxiliary_loss_mlp": 0.01055896, "balance_loss_clip": 1.06094933, "balance_loss_mlp": 1.03446186, "epoch": 0.08693822335788366, "flos": 19896522595200.0, "grad_norm": 1.8066650797875201, "language_loss": 0.81863767, "learning_rate": 3.96608917705879e-06, "loss": 0.84100449, "num_input_tokens_seen": 30900225, "step": 1446, "time_per_iteration": 4.197181940078735 }, { "auxiliary_loss_clip": 0.01056056, "auxiliary_loss_mlp": 0.01004371, "balance_loss_clip": 1.01782191, "balance_loss_mlp": 1.00031781, "epoch": 0.08699834661055163, "flos": 67023747406080.0, "grad_norm": 0.7255245569613363, "language_loss": 0.54762936, "learning_rate": 3.966017725489091e-06, "loss": 0.56823361, "num_input_tokens_seen": 30959580, "step": 1447, "time_per_iteration": 3.2158126831054688 }, { "auxiliary_loss_clip": 0.0114861, "auxiliary_loss_mlp": 0.01056824, "balance_loss_clip": 1.05373001, "balance_loss_mlp": 1.03518772, "epoch": 0.0870584698632196, "flos": 13480507451520.0, "grad_norm": 2.1586118179593696, "language_loss": 0.84592307, "learning_rate": 3.965946199367804e-06, "loss": 0.86797738, "num_input_tokens_seen": 30976775, "step": 1448, "time_per_iteration": 4.262767314910889 }, { "auxiliary_loss_clip": 0.01194173, "auxiliary_loss_mlp": 0.01050219, "balance_loss_clip": 1.05891991, "balance_loss_mlp": 1.02768826, "epoch": 0.08711859311588757, "flos": 16107013745280.0, "grad_norm": 3.4326906921347096, "language_loss": 0.80644608, "learning_rate": 3.965874598697638e-06, "loss": 0.82888997, "num_input_tokens_seen": 30990495, "step": 1449, "time_per_iteration": 4.553676128387451 }, { "auxiliary_loss_clip": 0.01138548, "auxiliary_loss_mlp": 0.01052142, "balance_loss_clip": 1.05437374, "balance_loss_mlp": 1.02946854, "epoch": 0.08717871636855554, "flos": 38472357928320.0, "grad_norm": 1.5251600336566102, "language_loss": 0.70971417, "learning_rate": 3.965802923481313e-06, "loss": 0.73162109, "num_input_tokens_seen": 31014080, "step": 1450, "time_per_iteration": 2.9082705974578857 }, { "auxiliary_loss_clip": 0.01124466, "auxiliary_loss_mlp": 0.01054883, "balance_loss_clip": 1.05164719, "balance_loss_mlp": 1.03207827, "epoch": 0.0872388396212235, "flos": 17600574188160.0, "grad_norm": 1.9392114767205617, "language_loss": 0.83684897, "learning_rate": 3.965731173721542e-06, "loss": 0.85864246, "num_input_tokens_seen": 31031210, "step": 1451, "time_per_iteration": 2.809880495071411 }, { "auxiliary_loss_clip": 0.01134251, "auxiliary_loss_mlp": 0.00780873, "balance_loss_clip": 1.05147851, "balance_loss_mlp": 1.00039482, "epoch": 0.08729896287389148, "flos": 25259385951360.0, "grad_norm": 2.5160845512367773, "language_loss": 0.74654591, "learning_rate": 3.965659349421049e-06, "loss": 0.76569718, "num_input_tokens_seen": 31049710, "step": 1452, "time_per_iteration": 2.88580060005188 }, { "auxiliary_loss_clip": 0.01157134, "auxiliary_loss_mlp": 0.01063328, "balance_loss_clip": 1.05607891, "balance_loss_mlp": 1.0388428, "epoch": 0.08735908612655945, "flos": 15632454234240.0, "grad_norm": 4.56941406999875, "language_loss": 0.80543101, "learning_rate": 3.965587450582556e-06, "loss": 0.82763565, "num_input_tokens_seen": 31066160, "step": 1453, "time_per_iteration": 2.733632802963257 }, { "auxiliary_loss_clip": 0.01169707, "auxiliary_loss_mlp": 0.01059533, "balance_loss_clip": 1.05905569, "balance_loss_mlp": 1.03625154, "epoch": 0.08741920937922741, "flos": 20339660684160.0, "grad_norm": 2.0102093196988102, "language_loss": 0.71041977, "learning_rate": 3.96551547720879e-06, "loss": 0.73271215, "num_input_tokens_seen": 31085270, "step": 1454, "time_per_iteration": 2.7568745613098145 }, { "auxiliary_loss_clip": 0.0106426, "auxiliary_loss_mlp": 0.01008112, "balance_loss_clip": 1.0215131, "balance_loss_mlp": 1.00463128, "epoch": 0.08747933263189539, "flos": 62819795433600.0, "grad_norm": 0.7713706503543015, "language_loss": 0.5859946, "learning_rate": 3.96544342930248e-06, "loss": 0.6067183, "num_input_tokens_seen": 31148445, "step": 1455, "time_per_iteration": 3.2372186183929443 }, { "auxiliary_loss_clip": 0.01189404, "auxiliary_loss_mlp": 0.01060742, "balance_loss_clip": 1.05742884, "balance_loss_mlp": 1.03688788, "epoch": 0.08753945588456336, "flos": 33035877648000.0, "grad_norm": 1.6485208275358016, "language_loss": 0.77564865, "learning_rate": 3.965371306866359e-06, "loss": 0.79815018, "num_input_tokens_seen": 31168770, "step": 1456, "time_per_iteration": 2.790663003921509 }, { "auxiliary_loss_clip": 0.01127959, "auxiliary_loss_mlp": 0.01054526, "balance_loss_clip": 1.04962158, "balance_loss_mlp": 1.03071976, "epoch": 0.08759957913723132, "flos": 35547182046720.0, "grad_norm": 1.83889407784057, "language_loss": 0.72420907, "learning_rate": 3.96529910990316e-06, "loss": 0.74603397, "num_input_tokens_seen": 31189270, "step": 1457, "time_per_iteration": 2.9099740982055664 }, { "auxiliary_loss_clip": 0.01176549, "auxiliary_loss_mlp": 0.0104866, "balance_loss_clip": 1.05627227, "balance_loss_mlp": 1.02633214, "epoch": 0.0876597023898993, "flos": 23911120022400.0, "grad_norm": 1.5250401870177361, "language_loss": 0.86412215, "learning_rate": 3.965226838415622e-06, "loss": 0.88637424, "num_input_tokens_seen": 31210385, "step": 1458, "time_per_iteration": 2.7517166137695312 }, { "auxiliary_loss_clip": 0.01169535, "auxiliary_loss_mlp": 0.01061413, "balance_loss_clip": 1.05884266, "balance_loss_mlp": 1.03825045, "epoch": 0.08771982564256726, "flos": 18114025150080.0, "grad_norm": 1.7412813512419094, "language_loss": 0.80268395, "learning_rate": 3.965154492406486e-06, "loss": 0.82499349, "num_input_tokens_seen": 31229745, "step": 1459, "time_per_iteration": 2.71455717086792 }, { "auxiliary_loss_clip": 0.01130491, "auxiliary_loss_mlp": 0.01054334, "balance_loss_clip": 1.05256546, "balance_loss_mlp": 1.03018188, "epoch": 0.08777994889523523, "flos": 17712005155200.0, "grad_norm": 2.1450339680450714, "language_loss": 0.84538847, "learning_rate": 3.9650820718784945e-06, "loss": 0.86723673, "num_input_tokens_seen": 31248280, "step": 1460, "time_per_iteration": 2.8737733364105225 }, { "auxiliary_loss_clip": 0.01177787, "auxiliary_loss_mlp": 0.01057974, "balance_loss_clip": 1.0572983, "balance_loss_mlp": 1.03640938, "epoch": 0.0878400721479032, "flos": 12819930382080.0, "grad_norm": 4.917361835698274, "language_loss": 0.79993135, "learning_rate": 3.965009576834394e-06, "loss": 0.82228899, "num_input_tokens_seen": 31262190, "step": 1461, "time_per_iteration": 2.8436062335968018 }, { "auxiliary_loss_clip": 0.01169165, "auxiliary_loss_mlp": 0.01058947, "balance_loss_clip": 1.05800629, "balance_loss_mlp": 1.03704822, "epoch": 0.08790019540057117, "flos": 26392690938240.0, "grad_norm": 1.566202508611165, "language_loss": 0.76571167, "learning_rate": 3.964937007276932e-06, "loss": 0.78799284, "num_input_tokens_seen": 31283690, "step": 1462, "time_per_iteration": 2.7895474433898926 }, { "auxiliary_loss_clip": 0.0117563, "auxiliary_loss_mlp": 0.01060064, "balance_loss_clip": 1.05839491, "balance_loss_mlp": 1.03580475, "epoch": 0.08796031865323914, "flos": 19134031662720.0, "grad_norm": 2.89717114041641, "language_loss": 0.74710488, "learning_rate": 3.9648643632088634e-06, "loss": 0.76946187, "num_input_tokens_seen": 31302505, "step": 1463, "time_per_iteration": 2.760404348373413 }, { "auxiliary_loss_clip": 0.01191543, "auxiliary_loss_mlp": 0.01061609, "balance_loss_clip": 1.06145048, "balance_loss_mlp": 1.03680158, "epoch": 0.0880204419059071, "flos": 26064287867520.0, "grad_norm": 2.431514195311041, "language_loss": 0.83797103, "learning_rate": 3.964791644632941e-06, "loss": 0.8605026, "num_input_tokens_seen": 31323070, "step": 1464, "time_per_iteration": 2.7417759895324707 }, { "auxiliary_loss_clip": 0.011733, "auxiliary_loss_mlp": 0.01063475, "balance_loss_clip": 1.05683231, "balance_loss_mlp": 1.04093289, "epoch": 0.08808056515857508, "flos": 22377842115840.0, "grad_norm": 2.1775753375634963, "language_loss": 0.78104752, "learning_rate": 3.964718851551923e-06, "loss": 0.8034153, "num_input_tokens_seen": 31341880, "step": 1465, "time_per_iteration": 2.6852309703826904 }, { "auxiliary_loss_clip": 0.01199489, "auxiliary_loss_mlp": 0.01059873, "balance_loss_clip": 1.0619812, "balance_loss_mlp": 1.03791499, "epoch": 0.08814068841124305, "flos": 23185293897600.0, "grad_norm": 2.412657222564686, "language_loss": 0.85187089, "learning_rate": 3.9646459839685675e-06, "loss": 0.87446451, "num_input_tokens_seen": 31361995, "step": 1466, "time_per_iteration": 2.706264019012451 }, { "auxiliary_loss_clip": 0.01120627, "auxiliary_loss_mlp": 0.00782645, "balance_loss_clip": 1.04989958, "balance_loss_mlp": 1.00037241, "epoch": 0.08820081166391101, "flos": 25155281358720.0, "grad_norm": 1.9900601596102498, "language_loss": 0.84168816, "learning_rate": 3.964573041885641e-06, "loss": 0.86072087, "num_input_tokens_seen": 31381515, "step": 1467, "time_per_iteration": 2.8636934757232666 }, { "auxiliary_loss_clip": 0.01178935, "auxiliary_loss_mlp": 0.01055379, "balance_loss_clip": 1.05910301, "balance_loss_mlp": 1.03219247, "epoch": 0.08826093491657899, "flos": 22231685675520.0, "grad_norm": 1.660218686828999, "language_loss": 0.75506544, "learning_rate": 3.964500025305907e-06, "loss": 0.77740854, "num_input_tokens_seen": 31400345, "step": 1468, "time_per_iteration": 2.661501884460449 }, { "auxiliary_loss_clip": 0.01181261, "auxiliary_loss_mlp": 0.01054252, "balance_loss_clip": 1.0629456, "balance_loss_mlp": 1.03266358, "epoch": 0.08832105816924696, "flos": 22126826897280.0, "grad_norm": 4.868504388441724, "language_loss": 0.80322379, "learning_rate": 3.9644269342321355e-06, "loss": 0.82557893, "num_input_tokens_seen": 31419620, "step": 1469, "time_per_iteration": 2.7473137378692627 }, { "auxiliary_loss_clip": 0.01198542, "auxiliary_loss_mlp": 0.01059353, "balance_loss_clip": 1.0627017, "balance_loss_mlp": 1.03677487, "epoch": 0.08838118142191492, "flos": 17566495159680.0, "grad_norm": 2.0179242193855806, "language_loss": 0.77437651, "learning_rate": 3.9643537686670974e-06, "loss": 0.79695547, "num_input_tokens_seen": 31437970, "step": 1470, "time_per_iteration": 2.7672410011291504 }, { "auxiliary_loss_clip": 0.01193825, "auxiliary_loss_mlp": 0.01067102, "balance_loss_clip": 1.06180143, "balance_loss_mlp": 1.04281926, "epoch": 0.0884413046745829, "flos": 20777196251520.0, "grad_norm": 1.6812425162011504, "language_loss": 0.84297001, "learning_rate": 3.964280528613569e-06, "loss": 0.86557925, "num_input_tokens_seen": 31457040, "step": 1471, "time_per_iteration": 2.7584216594696045 }, { "auxiliary_loss_clip": 0.01156315, "auxiliary_loss_mlp": 0.01054307, "balance_loss_clip": 1.05682266, "balance_loss_mlp": 1.03342199, "epoch": 0.08850142792725087, "flos": 22125462180480.0, "grad_norm": 1.6938350729430058, "language_loss": 0.83321345, "learning_rate": 3.964207214074324e-06, "loss": 0.85531968, "num_input_tokens_seen": 31477520, "step": 1472, "time_per_iteration": 2.7895469665527344 }, { "auxiliary_loss_clip": 0.01176151, "auxiliary_loss_mlp": 0.01058616, "balance_loss_clip": 1.06106544, "balance_loss_mlp": 1.03529835, "epoch": 0.08856155117991883, "flos": 22418744728320.0, "grad_norm": 2.3638705809965, "language_loss": 0.82781172, "learning_rate": 3.964133825052146e-06, "loss": 0.85015941, "num_input_tokens_seen": 31495575, "step": 1473, "time_per_iteration": 2.7361483573913574 }, { "auxiliary_loss_clip": 0.01129906, "auxiliary_loss_mlp": 0.01064148, "balance_loss_clip": 1.05552769, "balance_loss_mlp": 1.04263091, "epoch": 0.0886216744325868, "flos": 29937002572800.0, "grad_norm": 1.6022277785896435, "language_loss": 0.78712153, "learning_rate": 3.964060361549816e-06, "loss": 0.80906206, "num_input_tokens_seen": 31520020, "step": 1474, "time_per_iteration": 2.894319534301758 }, { "auxiliary_loss_clip": 0.01146238, "auxiliary_loss_mlp": 0.01068131, "balance_loss_clip": 1.05575764, "balance_loss_mlp": 1.04175043, "epoch": 0.08868179768525478, "flos": 23982833525760.0, "grad_norm": 1.6120869011213488, "language_loss": 0.79030406, "learning_rate": 3.963986823570121e-06, "loss": 0.81244779, "num_input_tokens_seen": 31539265, "step": 1475, "time_per_iteration": 2.8806042671203613 }, { "auxiliary_loss_clip": 0.01191986, "auxiliary_loss_mlp": 0.01047451, "balance_loss_clip": 1.05980015, "balance_loss_mlp": 1.02478909, "epoch": 0.08874192093792274, "flos": 43177553216640.0, "grad_norm": 1.4679464237421194, "language_loss": 0.74202317, "learning_rate": 3.963913211115848e-06, "loss": 0.76441753, "num_input_tokens_seen": 31563425, "step": 1476, "time_per_iteration": 2.8381049633026123 }, { "auxiliary_loss_clip": 0.01174628, "auxiliary_loss_mlp": 0.01059934, "balance_loss_clip": 1.06217527, "balance_loss_mlp": 1.03678358, "epoch": 0.0888020441905907, "flos": 32852445868800.0, "grad_norm": 1.712954575149443, "language_loss": 0.74220836, "learning_rate": 3.9638395241897895e-06, "loss": 0.76455402, "num_input_tokens_seen": 31584525, "step": 1477, "time_per_iteration": 2.8452210426330566 }, { "auxiliary_loss_clip": 0.01191865, "auxiliary_loss_mlp": 0.01051229, "balance_loss_clip": 1.06062829, "balance_loss_mlp": 1.0278163, "epoch": 0.08886216744325869, "flos": 23149347361920.0, "grad_norm": 1.95844459768748, "language_loss": 0.87194049, "learning_rate": 3.963765762794739e-06, "loss": 0.89437139, "num_input_tokens_seen": 31603325, "step": 1478, "time_per_iteration": 2.644918203353882 }, { "auxiliary_loss_clip": 0.01176299, "auxiliary_loss_mlp": 0.01058069, "balance_loss_clip": 1.0572443, "balance_loss_mlp": 1.03546739, "epoch": 0.08892229069592665, "flos": 23331593992320.0, "grad_norm": 1.6306868156426517, "language_loss": 0.77571511, "learning_rate": 3.963691926933495e-06, "loss": 0.79805881, "num_input_tokens_seen": 31624820, "step": 1479, "time_per_iteration": 2.738168954849243 }, { "auxiliary_loss_clip": 0.01164179, "auxiliary_loss_mlp": 0.010526, "balance_loss_clip": 1.05629039, "balance_loss_mlp": 1.02801871, "epoch": 0.08898241394859462, "flos": 26213784272640.0, "grad_norm": 2.199164032289915, "language_loss": 0.77797234, "learning_rate": 3.9636180166088555e-06, "loss": 0.80014014, "num_input_tokens_seen": 31646080, "step": 1480, "time_per_iteration": 2.837562322616577 }, { "auxiliary_loss_clip": 0.01180168, "auxiliary_loss_mlp": 0.01060894, "balance_loss_clip": 1.05762577, "balance_loss_mlp": 1.03656292, "epoch": 0.0890425372012626, "flos": 23550613171200.0, "grad_norm": 2.9471668635954273, "language_loss": 0.66437578, "learning_rate": 3.963544031823624e-06, "loss": 0.68678641, "num_input_tokens_seen": 31665770, "step": 1481, "time_per_iteration": 2.742422580718994 }, { "auxiliary_loss_clip": 0.01143445, "auxiliary_loss_mlp": 0.01055318, "balance_loss_clip": 1.05510306, "balance_loss_mlp": 1.03273988, "epoch": 0.08910266045393056, "flos": 23002795872000.0, "grad_norm": 2.124586862599894, "language_loss": 0.96630967, "learning_rate": 3.9634699725806065e-06, "loss": 0.9882974, "num_input_tokens_seen": 31683805, "step": 1482, "time_per_iteration": 2.8150243759155273 }, { "auxiliary_loss_clip": 0.0115336, "auxiliary_loss_mlp": 0.01057266, "balance_loss_clip": 1.05521989, "balance_loss_mlp": 1.03353167, "epoch": 0.08916278370659853, "flos": 31936508035200.0, "grad_norm": 1.7904792435575492, "language_loss": 0.78683239, "learning_rate": 3.96339583888261e-06, "loss": 0.80893862, "num_input_tokens_seen": 31704630, "step": 1483, "time_per_iteration": 2.869084119796753 }, { "auxiliary_loss_clip": 0.0116904, "auxiliary_loss_mlp": 0.01082082, "balance_loss_clip": 1.05540919, "balance_loss_mlp": 1.05829978, "epoch": 0.08922290695926649, "flos": 17530404969600.0, "grad_norm": 2.2229749189835677, "language_loss": 0.85424453, "learning_rate": 3.963321630732448e-06, "loss": 0.87675571, "num_input_tokens_seen": 31723255, "step": 1484, "time_per_iteration": 4.280332326889038 }, { "auxiliary_loss_clip": 0.01199312, "auxiliary_loss_mlp": 0.01060639, "balance_loss_clip": 1.06350458, "balance_loss_mlp": 1.03701186, "epoch": 0.08928303021193447, "flos": 32125075459200.0, "grad_norm": 1.7208139316694195, "language_loss": 0.80205405, "learning_rate": 3.963247348132932e-06, "loss": 0.82465357, "num_input_tokens_seen": 31747045, "step": 1485, "time_per_iteration": 2.761733055114746 }, { "auxiliary_loss_clip": 0.01173167, "auxiliary_loss_mlp": 0.01056554, "balance_loss_clip": 1.0563333, "balance_loss_mlp": 1.03228331, "epoch": 0.08934315346460243, "flos": 22125210785280.0, "grad_norm": 1.8969438127775513, "language_loss": 0.82859123, "learning_rate": 3.96317299108688e-06, "loss": 0.85088843, "num_input_tokens_seen": 31766615, "step": 1486, "time_per_iteration": 4.144649028778076 }, { "auxiliary_loss_clip": 0.01144509, "auxiliary_loss_mlp": 0.01063805, "balance_loss_clip": 1.05592823, "balance_loss_mlp": 1.04021382, "epoch": 0.0894032767172704, "flos": 22565583527040.0, "grad_norm": 2.1520807598980185, "language_loss": 0.76365155, "learning_rate": 3.963098559597111e-06, "loss": 0.78573477, "num_input_tokens_seen": 31785855, "step": 1487, "time_per_iteration": 4.432489395141602 }, { "auxiliary_loss_clip": 0.01157327, "auxiliary_loss_mlp": 0.01060261, "balance_loss_clip": 1.05041027, "balance_loss_mlp": 1.03542995, "epoch": 0.08946339996993838, "flos": 20193396503040.0, "grad_norm": 3.851280697857004, "language_loss": 0.83030224, "learning_rate": 3.963024053666449e-06, "loss": 0.85247803, "num_input_tokens_seen": 31804210, "step": 1488, "time_per_iteration": 2.7262001037597656 }, { "auxiliary_loss_clip": 0.01171869, "auxiliary_loss_mlp": 0.01051875, "balance_loss_clip": 1.05546355, "balance_loss_mlp": 1.02916527, "epoch": 0.08952352322260634, "flos": 48360181104000.0, "grad_norm": 1.7759111472560039, "language_loss": 0.71783459, "learning_rate": 3.962949473297718e-06, "loss": 0.74007201, "num_input_tokens_seen": 31826150, "step": 1489, "time_per_iteration": 4.562536954879761 }, { "auxiliary_loss_clip": 0.01150585, "auxiliary_loss_mlp": 0.01051382, "balance_loss_clip": 1.05190349, "balance_loss_mlp": 1.02830291, "epoch": 0.08958364647527431, "flos": 31793081028480.0, "grad_norm": 1.6999724957706692, "language_loss": 0.89717221, "learning_rate": 3.962874818493745e-06, "loss": 0.91919196, "num_input_tokens_seen": 31848060, "step": 1490, "time_per_iteration": 2.838327646255493 }, { "auxiliary_loss_clip": 0.01184278, "auxiliary_loss_mlp": 0.01064168, "balance_loss_clip": 1.05656135, "balance_loss_mlp": 1.04102957, "epoch": 0.08964376972794229, "flos": 23368186972800.0, "grad_norm": 3.9062133325383126, "language_loss": 0.73075998, "learning_rate": 3.9628000892573635e-06, "loss": 0.7532444, "num_input_tokens_seen": 31870040, "step": 1491, "time_per_iteration": 2.7007367610931396 }, { "auxiliary_loss_clip": 0.01189564, "auxiliary_loss_mlp": 0.00780167, "balance_loss_clip": 1.05968356, "balance_loss_mlp": 1.00023544, "epoch": 0.08970389298061025, "flos": 23294785530240.0, "grad_norm": 1.7021050418948058, "language_loss": 0.77235049, "learning_rate": 3.9627252855914055e-06, "loss": 0.79204774, "num_input_tokens_seen": 31890400, "step": 1492, "time_per_iteration": 2.7799623012542725 }, { "auxiliary_loss_clip": 0.01187114, "auxiliary_loss_mlp": 0.01057952, "balance_loss_clip": 1.05902028, "balance_loss_mlp": 1.03512359, "epoch": 0.08976401623327822, "flos": 33761703772800.0, "grad_norm": 1.9236790530591625, "language_loss": 0.71429193, "learning_rate": 3.962650407498707e-06, "loss": 0.73674262, "num_input_tokens_seen": 31913435, "step": 1493, "time_per_iteration": 2.8479840755462646 }, { "auxiliary_loss_clip": 0.01188796, "auxiliary_loss_mlp": 0.01057103, "balance_loss_clip": 1.05757976, "balance_loss_mlp": 1.03371406, "epoch": 0.08982413948594618, "flos": 23911335504000.0, "grad_norm": 2.6977604073852053, "language_loss": 0.87175488, "learning_rate": 3.962575454982109e-06, "loss": 0.8942138, "num_input_tokens_seen": 31932435, "step": 1494, "time_per_iteration": 2.855658769607544 }, { "auxiliary_loss_clip": 0.0108466, "auxiliary_loss_mlp": 0.01070478, "balance_loss_clip": 1.04641223, "balance_loss_mlp": 1.04551601, "epoch": 0.08988426273861416, "flos": 16837544551680.0, "grad_norm": 1.6162523894431247, "language_loss": 0.82929438, "learning_rate": 3.962500428044454e-06, "loss": 0.85084569, "num_input_tokens_seen": 31950125, "step": 1495, "time_per_iteration": 2.9265449047088623 }, { "auxiliary_loss_clip": 0.01171464, "auxiliary_loss_mlp": 0.01059756, "balance_loss_clip": 1.05779243, "balance_loss_mlp": 1.03682017, "epoch": 0.08994438599128213, "flos": 14793365548800.0, "grad_norm": 9.387255385257733, "language_loss": 0.70191383, "learning_rate": 3.962425326688585e-06, "loss": 0.72422606, "num_input_tokens_seen": 31968050, "step": 1496, "time_per_iteration": 2.773693799972534 }, { "auxiliary_loss_clip": 0.01164171, "auxiliary_loss_mlp": 0.01049454, "balance_loss_clip": 1.05397439, "balance_loss_mlp": 1.02888989, "epoch": 0.09000450924395009, "flos": 17384320356480.0, "grad_norm": 1.6327835891742186, "language_loss": 0.79752576, "learning_rate": 3.962350150917351e-06, "loss": 0.81966203, "num_input_tokens_seen": 31985675, "step": 1497, "time_per_iteration": 2.6850852966308594 }, { "auxiliary_loss_clip": 0.01129609, "auxiliary_loss_mlp": 0.01054903, "balance_loss_clip": 1.05307686, "balance_loss_mlp": 1.03146648, "epoch": 0.09006463249661807, "flos": 24280317964800.0, "grad_norm": 8.517000212139891, "language_loss": 0.82940567, "learning_rate": 3.9622749007336035e-06, "loss": 0.85125089, "num_input_tokens_seen": 32005180, "step": 1498, "time_per_iteration": 2.786205768585205 }, { "auxiliary_loss_clip": 0.01170006, "auxiliary_loss_mlp": 0.01059397, "balance_loss_clip": 1.0577898, "balance_loss_mlp": 1.03718853, "epoch": 0.09012475574928604, "flos": 13661928069120.0, "grad_norm": 2.220597323082783, "language_loss": 0.78609937, "learning_rate": 3.962199576140195e-06, "loss": 0.80839342, "num_input_tokens_seen": 32022970, "step": 1499, "time_per_iteration": 2.71785831451416 }, { "auxiliary_loss_clip": 0.01161539, "auxiliary_loss_mlp": 0.00780528, "balance_loss_clip": 1.05444527, "balance_loss_mlp": 1.00024021, "epoch": 0.090184879001954, "flos": 23327751237120.0, "grad_norm": 2.049001350461653, "language_loss": 0.93337607, "learning_rate": 3.962124177139981e-06, "loss": 0.95279682, "num_input_tokens_seen": 32043055, "step": 1500, "time_per_iteration": 2.7077536582946777 }, { "auxiliary_loss_clip": 0.01148009, "auxiliary_loss_mlp": 0.01055246, "balance_loss_clip": 1.05371249, "balance_loss_mlp": 1.0308435, "epoch": 0.09024500225462198, "flos": 23002688131200.0, "grad_norm": 3.0778515668575492, "language_loss": 0.74595469, "learning_rate": 3.962048703735822e-06, "loss": 0.76798725, "num_input_tokens_seen": 32061900, "step": 1501, "time_per_iteration": 2.7073416709899902 }, { "auxiliary_loss_clip": 0.01056535, "auxiliary_loss_mlp": 0.01013118, "balance_loss_clip": 1.03392363, "balance_loss_mlp": 1.00963676, "epoch": 0.09030512550728995, "flos": 62189203242240.0, "grad_norm": 0.7274487593473578, "language_loss": 0.58316052, "learning_rate": 3.96197315593058e-06, "loss": 0.60385704, "num_input_tokens_seen": 32122745, "step": 1502, "time_per_iteration": 3.274049997329712 }, { "auxiliary_loss_clip": 0.0114469, "auxiliary_loss_mlp": 0.01062533, "balance_loss_clip": 1.04626393, "balance_loss_mlp": 1.03896546, "epoch": 0.09036524875995791, "flos": 38800689171840.0, "grad_norm": 2.1727281711500095, "language_loss": 0.69501173, "learning_rate": 3.961897533727119e-06, "loss": 0.71708393, "num_input_tokens_seen": 32145125, "step": 1503, "time_per_iteration": 2.87554669380188 }, { "auxiliary_loss_clip": 0.01133108, "auxiliary_loss_mlp": 0.0105903, "balance_loss_clip": 1.04783726, "balance_loss_mlp": 1.03660655, "epoch": 0.09042537201262588, "flos": 21690081429120.0, "grad_norm": 2.169205134580129, "language_loss": 0.86124271, "learning_rate": 3.961821837128306e-06, "loss": 0.88316405, "num_input_tokens_seen": 32166255, "step": 1504, "time_per_iteration": 2.844688892364502 }, { "auxiliary_loss_clip": 0.01146301, "auxiliary_loss_mlp": 0.01069714, "balance_loss_clip": 1.05341232, "balance_loss_mlp": 1.04261804, "epoch": 0.09048549526529386, "flos": 22267021680000.0, "grad_norm": 2.178155372989796, "language_loss": 0.7233696, "learning_rate": 3.961746066137014e-06, "loss": 0.74552977, "num_input_tokens_seen": 32184010, "step": 1505, "time_per_iteration": 2.7992677688598633 }, { "auxiliary_loss_clip": 0.01137399, "auxiliary_loss_mlp": 0.01056414, "balance_loss_clip": 1.05097985, "balance_loss_mlp": 1.03302479, "epoch": 0.09054561851796182, "flos": 14610939350400.0, "grad_norm": 2.5107188210784526, "language_loss": 0.80730999, "learning_rate": 3.961670220756114e-06, "loss": 0.82924813, "num_input_tokens_seen": 32201635, "step": 1506, "time_per_iteration": 2.7458760738372803 }, { "auxiliary_loss_clip": 0.01140643, "auxiliary_loss_mlp": 0.01053315, "balance_loss_clip": 1.05161858, "balance_loss_mlp": 1.03197718, "epoch": 0.09060574177062979, "flos": 27636169916160.0, "grad_norm": 2.166956120197676, "language_loss": 0.75915337, "learning_rate": 3.961594300988482e-06, "loss": 0.78109294, "num_input_tokens_seen": 32221940, "step": 1507, "time_per_iteration": 2.873826742172241 }, { "auxiliary_loss_clip": 0.01051873, "auxiliary_loss_mlp": 0.01005715, "balance_loss_clip": 1.02043629, "balance_loss_mlp": 1.00175714, "epoch": 0.09066586502329776, "flos": 66085797513600.0, "grad_norm": 0.7272435825555993, "language_loss": 0.57699698, "learning_rate": 3.961518306836998e-06, "loss": 0.59757286, "num_input_tokens_seen": 32276495, "step": 1508, "time_per_iteration": 3.064926862716675 }, { "auxiliary_loss_clip": 0.01165416, "auxiliary_loss_mlp": 0.01054804, "balance_loss_clip": 1.055233, "balance_loss_mlp": 1.03155804, "epoch": 0.09072598827596573, "flos": 18916449027840.0, "grad_norm": 1.7601330807914457, "language_loss": 0.85090744, "learning_rate": 3.961442238304543e-06, "loss": 0.87310958, "num_input_tokens_seen": 32294130, "step": 1509, "time_per_iteration": 2.6664113998413086 }, { "auxiliary_loss_clip": 0.01168837, "auxiliary_loss_mlp": 0.01064138, "balance_loss_clip": 1.05745769, "balance_loss_mlp": 1.03949761, "epoch": 0.0907861115286337, "flos": 24821742643200.0, "grad_norm": 2.3794507710009203, "language_loss": 0.84110659, "learning_rate": 3.961366095394002e-06, "loss": 0.8634364, "num_input_tokens_seen": 32313555, "step": 1510, "time_per_iteration": 2.783484697341919 }, { "auxiliary_loss_clip": 0.01153141, "auxiliary_loss_mlp": 0.01058569, "balance_loss_clip": 1.05423617, "balance_loss_mlp": 1.03482211, "epoch": 0.09084623478130167, "flos": 21652842003840.0, "grad_norm": 1.8490761573484715, "language_loss": 0.85247588, "learning_rate": 3.961289878108262e-06, "loss": 0.87459302, "num_input_tokens_seen": 32331430, "step": 1511, "time_per_iteration": 2.714620351791382 }, { "auxiliary_loss_clip": 0.01145395, "auxiliary_loss_mlp": 0.01052919, "balance_loss_clip": 1.05182219, "balance_loss_mlp": 1.02983987, "epoch": 0.09090635803396964, "flos": 27639258485760.0, "grad_norm": 1.5734326837562458, "language_loss": 0.84977764, "learning_rate": 3.9612135864502135e-06, "loss": 0.87176073, "num_input_tokens_seen": 32353705, "step": 1512, "time_per_iteration": 2.75361704826355 }, { "auxiliary_loss_clip": 0.01155239, "auxiliary_loss_mlp": 0.01053669, "balance_loss_clip": 1.05740952, "balance_loss_mlp": 1.03185391, "epoch": 0.0909664812866376, "flos": 17669127294720.0, "grad_norm": 3.0235926431973654, "language_loss": 0.87346804, "learning_rate": 3.961137220422749e-06, "loss": 0.89555705, "num_input_tokens_seen": 32370520, "step": 1513, "time_per_iteration": 2.6864211559295654 }, { "auxiliary_loss_clip": 0.01168585, "auxiliary_loss_mlp": 0.01049408, "balance_loss_clip": 1.05562937, "balance_loss_mlp": 1.02841544, "epoch": 0.09102660453930557, "flos": 23951448017280.0, "grad_norm": 1.7883280971870592, "language_loss": 0.86802679, "learning_rate": 3.961060780028764e-06, "loss": 0.89020675, "num_input_tokens_seen": 32389105, "step": 1514, "time_per_iteration": 2.6788065433502197 }, { "auxiliary_loss_clip": 0.01134005, "auxiliary_loss_mlp": 0.01064386, "balance_loss_clip": 1.05571628, "balance_loss_mlp": 1.04252315, "epoch": 0.09108672779197355, "flos": 25812949426560.0, "grad_norm": 1.7666120550996132, "language_loss": 0.89944756, "learning_rate": 3.960984265271159e-06, "loss": 0.92143154, "num_input_tokens_seen": 32408065, "step": 1515, "time_per_iteration": 2.757390022277832 }, { "auxiliary_loss_clip": 0.01162518, "auxiliary_loss_mlp": 0.01056937, "balance_loss_clip": 1.05547726, "balance_loss_mlp": 1.03360808, "epoch": 0.09114685104464151, "flos": 29639482220160.0, "grad_norm": 2.1090985009837646, "language_loss": 0.85576892, "learning_rate": 3.9609076761528335e-06, "loss": 0.87796342, "num_input_tokens_seen": 32427225, "step": 1516, "time_per_iteration": 2.704784870147705 }, { "auxiliary_loss_clip": 0.01158781, "auxiliary_loss_mlp": 0.01057165, "balance_loss_clip": 1.05135357, "balance_loss_mlp": 1.03451526, "epoch": 0.09120697429730948, "flos": 33729635905920.0, "grad_norm": 2.086405156201108, "language_loss": 0.81167233, "learning_rate": 3.960831012676692e-06, "loss": 0.83383185, "num_input_tokens_seen": 32450510, "step": 1517, "time_per_iteration": 2.8586854934692383 }, { "auxiliary_loss_clip": 0.0117857, "auxiliary_loss_mlp": 0.01065492, "balance_loss_clip": 1.05741739, "balance_loss_mlp": 1.04280686, "epoch": 0.09126709754997746, "flos": 18401381953920.0, "grad_norm": 2.104468567304263, "language_loss": 0.78067243, "learning_rate": 3.960754274845642e-06, "loss": 0.80311304, "num_input_tokens_seen": 32468425, "step": 1518, "time_per_iteration": 2.7862088680267334 }, { "auxiliary_loss_clip": 0.01165395, "auxiliary_loss_mlp": 0.01061371, "balance_loss_clip": 1.05285823, "balance_loss_mlp": 1.03900695, "epoch": 0.09132722080264542, "flos": 22091957769600.0, "grad_norm": 1.6816479812467473, "language_loss": 0.86124098, "learning_rate": 3.960677462662594e-06, "loss": 0.88350856, "num_input_tokens_seen": 32487510, "step": 1519, "time_per_iteration": 2.723714828491211 }, { "auxiliary_loss_clip": 0.01163599, "auxiliary_loss_mlp": 0.01052792, "balance_loss_clip": 1.05454183, "balance_loss_mlp": 1.02914131, "epoch": 0.09138734405531339, "flos": 21033131633280.0, "grad_norm": 1.9681293960876167, "language_loss": 0.73279071, "learning_rate": 3.96060057613046e-06, "loss": 0.75495458, "num_input_tokens_seen": 32507250, "step": 1520, "time_per_iteration": 2.8098628520965576 }, { "auxiliary_loss_clip": 0.01161166, "auxiliary_loss_mlp": 0.01058035, "balance_loss_clip": 1.05696058, "balance_loss_mlp": 1.03469419, "epoch": 0.09144746730798137, "flos": 20083940784000.0, "grad_norm": 2.6988457876937066, "language_loss": 0.85236609, "learning_rate": 3.960523615252156e-06, "loss": 0.87455815, "num_input_tokens_seen": 32526045, "step": 1521, "time_per_iteration": 2.7134172916412354 }, { "auxiliary_loss_clip": 0.01120174, "auxiliary_loss_mlp": 0.01063979, "balance_loss_clip": 1.05189717, "balance_loss_mlp": 1.03991079, "epoch": 0.09150759056064933, "flos": 22778210085120.0, "grad_norm": 1.6991603177293335, "language_loss": 0.83933008, "learning_rate": 3.960446580030599e-06, "loss": 0.8611716, "num_input_tokens_seen": 32546575, "step": 1522, "time_per_iteration": 2.93745493888855 }, { "auxiliary_loss_clip": 0.01182362, "auxiliary_loss_mlp": 0.01064589, "balance_loss_clip": 1.05630755, "balance_loss_mlp": 1.04153395, "epoch": 0.0915677138133173, "flos": 27564205017600.0, "grad_norm": 1.647915064434875, "language_loss": 0.81012994, "learning_rate": 3.960369470468711e-06, "loss": 0.8325994, "num_input_tokens_seen": 32568795, "step": 1523, "time_per_iteration": 4.378152847290039 }, { "auxiliary_loss_clip": 0.01157976, "auxiliary_loss_mlp": 0.00781395, "balance_loss_clip": 1.05422449, "balance_loss_mlp": 1.00037968, "epoch": 0.09162783706598528, "flos": 17674765729920.0, "grad_norm": 2.106497620262502, "language_loss": 0.7460072, "learning_rate": 3.960292286569418e-06, "loss": 0.76540089, "num_input_tokens_seen": 32587010, "step": 1524, "time_per_iteration": 2.7146124839782715 }, { "auxiliary_loss_clip": 0.01135228, "auxiliary_loss_mlp": 0.0106119, "balance_loss_clip": 1.05092478, "balance_loss_mlp": 1.03782487, "epoch": 0.09168796031865324, "flos": 18478195188480.0, "grad_norm": 2.0992608845945413, "language_loss": 0.86498803, "learning_rate": 3.960215028335644e-06, "loss": 0.88695222, "num_input_tokens_seen": 32602375, "step": 1525, "time_per_iteration": 4.314826965332031 }, { "auxiliary_loss_clip": 0.01164396, "auxiliary_loss_mlp": 0.01049506, "balance_loss_clip": 1.05688822, "balance_loss_mlp": 1.0263319, "epoch": 0.0917480835713212, "flos": 29387605075200.0, "grad_norm": 2.1146348399758237, "language_loss": 0.74512708, "learning_rate": 3.96013769577032e-06, "loss": 0.76726609, "num_input_tokens_seen": 32621460, "step": 1526, "time_per_iteration": 5.878855466842651 }, { "auxiliary_loss_clip": 0.01186002, "auxiliary_loss_mlp": 0.01055817, "balance_loss_clip": 1.05732703, "balance_loss_mlp": 1.03392982, "epoch": 0.09180820682398917, "flos": 19829262378240.0, "grad_norm": 2.5135282962071215, "language_loss": 0.77581728, "learning_rate": 3.960060288876378e-06, "loss": 0.79823542, "num_input_tokens_seen": 32640440, "step": 1527, "time_per_iteration": 2.693847179412842 }, { "auxiliary_loss_clip": 0.01173605, "auxiliary_loss_mlp": 0.01052264, "balance_loss_clip": 1.0534333, "balance_loss_mlp": 1.02868414, "epoch": 0.09186833007665715, "flos": 23841848643840.0, "grad_norm": 2.655631139677705, "language_loss": 0.78546697, "learning_rate": 3.959982807656753e-06, "loss": 0.80772561, "num_input_tokens_seen": 32660020, "step": 1528, "time_per_iteration": 2.774219512939453 }, { "auxiliary_loss_clip": 0.01146017, "auxiliary_loss_mlp": 0.01050376, "balance_loss_clip": 1.0499053, "balance_loss_mlp": 1.02827477, "epoch": 0.09192845332932512, "flos": 12932726065920.0, "grad_norm": 2.682547324044482, "language_loss": 0.76732361, "learning_rate": 3.959905252114384e-06, "loss": 0.78928751, "num_input_tokens_seen": 32678170, "step": 1529, "time_per_iteration": 4.603156089782715 }, { "auxiliary_loss_clip": 0.01186538, "auxiliary_loss_mlp": 0.00780856, "balance_loss_clip": 1.05415928, "balance_loss_mlp": 1.00045025, "epoch": 0.09198857658199308, "flos": 24568177559040.0, "grad_norm": 1.7410660090049153, "language_loss": 0.82906747, "learning_rate": 3.959827622252211e-06, "loss": 0.84874141, "num_input_tokens_seen": 32697540, "step": 1530, "time_per_iteration": 2.7118582725524902 }, { "auxiliary_loss_clip": 0.01130108, "auxiliary_loss_mlp": 0.0106509, "balance_loss_clip": 1.04975331, "balance_loss_mlp": 1.04220152, "epoch": 0.09204869983466106, "flos": 20266941600000.0, "grad_norm": 2.182960664479704, "language_loss": 0.84001881, "learning_rate": 3.959749918073179e-06, "loss": 0.86197078, "num_input_tokens_seen": 32716805, "step": 1531, "time_per_iteration": 2.791947603225708 }, { "auxiliary_loss_clip": 0.0113655, "auxiliary_loss_mlp": 0.01051554, "balance_loss_clip": 1.04906452, "balance_loss_mlp": 1.02853465, "epoch": 0.09210882308732903, "flos": 20885646389760.0, "grad_norm": 1.7570281394880602, "language_loss": 0.81253195, "learning_rate": 3.959672139580233e-06, "loss": 0.83441293, "num_input_tokens_seen": 32736385, "step": 1532, "time_per_iteration": 2.737739324569702 }, { "auxiliary_loss_clip": 0.01157728, "auxiliary_loss_mlp": 0.01056753, "balance_loss_clip": 1.052163, "balance_loss_mlp": 1.03385305, "epoch": 0.09216894633999699, "flos": 30956326727040.0, "grad_norm": 2.2821036564882182, "language_loss": 0.84194255, "learning_rate": 3.9595942867763235e-06, "loss": 0.86408734, "num_input_tokens_seen": 32757140, "step": 1533, "time_per_iteration": 2.7542598247528076 }, { "auxiliary_loss_clip": 0.01149262, "auxiliary_loss_mlp": 0.01053623, "balance_loss_clip": 1.05813503, "balance_loss_mlp": 1.03190327, "epoch": 0.09222906959266497, "flos": 13151565676800.0, "grad_norm": 1.9396914937933663, "language_loss": 0.9009546, "learning_rate": 3.959516359664402e-06, "loss": 0.92298347, "num_input_tokens_seen": 32774860, "step": 1534, "time_per_iteration": 2.6450984477996826 }, { "auxiliary_loss_clip": 0.01150273, "auxiliary_loss_mlp": 0.0106298, "balance_loss_clip": 1.0495038, "balance_loss_mlp": 1.03849435, "epoch": 0.09228919284533293, "flos": 25994477784960.0, "grad_norm": 5.065477266086046, "language_loss": 0.75779241, "learning_rate": 3.959438358247424e-06, "loss": 0.77992499, "num_input_tokens_seen": 32795250, "step": 1535, "time_per_iteration": 2.730915069580078 }, { "auxiliary_loss_clip": 0.01168283, "auxiliary_loss_mlp": 0.01045276, "balance_loss_clip": 1.05278873, "balance_loss_mlp": 1.02403271, "epoch": 0.0923493160980009, "flos": 18660800954880.0, "grad_norm": 1.8085584532497372, "language_loss": 0.81631637, "learning_rate": 3.959360282528346e-06, "loss": 0.83845198, "num_input_tokens_seen": 32813805, "step": 1536, "time_per_iteration": 2.7326817512512207 }, { "auxiliary_loss_clip": 0.01181977, "auxiliary_loss_mlp": 0.01053699, "balance_loss_clip": 1.05431938, "balance_loss_mlp": 1.03224182, "epoch": 0.09240943935066886, "flos": 21140576190720.0, "grad_norm": 2.0929096884707556, "language_loss": 0.89092755, "learning_rate": 3.959282132510131e-06, "loss": 0.9132843, "num_input_tokens_seen": 32830960, "step": 1537, "time_per_iteration": 2.675771713256836 }, { "auxiliary_loss_clip": 0.01157238, "auxiliary_loss_mlp": 0.01058647, "balance_loss_clip": 1.05114293, "balance_loss_mlp": 1.03605688, "epoch": 0.09246956260333684, "flos": 20592435669120.0, "grad_norm": 1.9480116987165197, "language_loss": 0.80702311, "learning_rate": 3.959203908195741e-06, "loss": 0.82918191, "num_input_tokens_seen": 32848275, "step": 1538, "time_per_iteration": 2.71618390083313 }, { "auxiliary_loss_clip": 0.01060495, "auxiliary_loss_mlp": 0.0101237, "balance_loss_clip": 1.03095436, "balance_loss_mlp": 1.00872231, "epoch": 0.09252968585600481, "flos": 67558710614400.0, "grad_norm": 0.7534074452314953, "language_loss": 0.57429332, "learning_rate": 3.959125609588142e-06, "loss": 0.59502202, "num_input_tokens_seen": 32917730, "step": 1539, "time_per_iteration": 3.3933441638946533 }, { "auxiliary_loss_clip": 0.01159831, "auxiliary_loss_mlp": 0.01050602, "balance_loss_clip": 1.05638027, "balance_loss_mlp": 1.02863121, "epoch": 0.09258980910867277, "flos": 17383853479680.0, "grad_norm": 2.849299216868502, "language_loss": 0.67554641, "learning_rate": 3.959047236690304e-06, "loss": 0.69765073, "num_input_tokens_seen": 32934910, "step": 1540, "time_per_iteration": 2.757084608078003 }, { "auxiliary_loss_clip": 0.01144239, "auxiliary_loss_mlp": 0.01048444, "balance_loss_clip": 1.04954028, "balance_loss_mlp": 1.026438, "epoch": 0.09264993236134075, "flos": 19865927185920.0, "grad_norm": 2.044335478602743, "language_loss": 0.83917534, "learning_rate": 3.958968789505198e-06, "loss": 0.86110216, "num_input_tokens_seen": 32953840, "step": 1541, "time_per_iteration": 2.8497180938720703 }, { "auxiliary_loss_clip": 0.01077839, "auxiliary_loss_mlp": 0.01013078, "balance_loss_clip": 1.02602255, "balance_loss_mlp": 1.0097636, "epoch": 0.09271005561400872, "flos": 62284401262080.0, "grad_norm": 0.8790732834061692, "language_loss": 0.61881655, "learning_rate": 3.9588902680358e-06, "loss": 0.63972563, "num_input_tokens_seen": 33011410, "step": 1542, "time_per_iteration": 3.3079330921173096 }, { "auxiliary_loss_clip": 0.01161232, "auxiliary_loss_mlp": 0.01059438, "balance_loss_clip": 1.05441117, "balance_loss_mlp": 1.03808808, "epoch": 0.09277017886667668, "flos": 23329870139520.0, "grad_norm": 1.6256118826429122, "language_loss": 0.82802349, "learning_rate": 3.958811672285086e-06, "loss": 0.85023022, "num_input_tokens_seen": 33031675, "step": 1543, "time_per_iteration": 2.7408807277679443 }, { "auxiliary_loss_clip": 0.01135873, "auxiliary_loss_mlp": 0.01060295, "balance_loss_clip": 1.04848838, "balance_loss_mlp": 1.03863442, "epoch": 0.09283030211934466, "flos": 54745169875200.0, "grad_norm": 1.706948475246468, "language_loss": 0.72265279, "learning_rate": 3.958733002256038e-06, "loss": 0.74461448, "num_input_tokens_seen": 33056355, "step": 1544, "time_per_iteration": 3.104156255722046 }, { "auxiliary_loss_clip": 0.01166071, "auxiliary_loss_mlp": 0.01055881, "balance_loss_clip": 1.05165935, "balance_loss_mlp": 1.03138375, "epoch": 0.09289042537201263, "flos": 30334784762880.0, "grad_norm": 1.7720844214030114, "language_loss": 0.77286768, "learning_rate": 3.958654257951637e-06, "loss": 0.79508722, "num_input_tokens_seen": 33079520, "step": 1545, "time_per_iteration": 2.808180570602417 }, { "auxiliary_loss_clip": 0.01140161, "auxiliary_loss_mlp": 0.01050495, "balance_loss_clip": 1.0526737, "balance_loss_mlp": 1.02872682, "epoch": 0.09295054862468059, "flos": 17746838369280.0, "grad_norm": 2.7089619481030076, "language_loss": 0.74396008, "learning_rate": 3.9585754393748706e-06, "loss": 0.76586664, "num_input_tokens_seen": 33096135, "step": 1546, "time_per_iteration": 2.7634081840515137 }, { "auxiliary_loss_clip": 0.01163775, "auxiliary_loss_mlp": 0.0105305, "balance_loss_clip": 1.05357957, "balance_loss_mlp": 1.02956545, "epoch": 0.09301067187734856, "flos": 23658021815040.0, "grad_norm": 1.9423225100503794, "language_loss": 0.84200966, "learning_rate": 3.9584965465287275e-06, "loss": 0.86417794, "num_input_tokens_seen": 33115245, "step": 1547, "time_per_iteration": 2.790003776550293 }, { "auxiliary_loss_clip": 0.01141839, "auxiliary_loss_mlp": 0.01053941, "balance_loss_clip": 1.04740989, "balance_loss_mlp": 1.03195918, "epoch": 0.09307079513001654, "flos": 27527719777920.0, "grad_norm": 2.6545433694843488, "language_loss": 0.67698336, "learning_rate": 3.958417579416199e-06, "loss": 0.69894123, "num_input_tokens_seen": 33136640, "step": 1548, "time_per_iteration": 2.8367013931274414 }, { "auxiliary_loss_clip": 0.01123899, "auxiliary_loss_mlp": 0.01059885, "balance_loss_clip": 1.04744387, "balance_loss_mlp": 1.03754544, "epoch": 0.0931309183826845, "flos": 20627340710400.0, "grad_norm": 1.6829727803454704, "language_loss": 0.8326273, "learning_rate": 3.9583385380402795e-06, "loss": 0.85446513, "num_input_tokens_seen": 33155060, "step": 1549, "time_per_iteration": 2.8462016582489014 }, { "auxiliary_loss_clip": 0.01176243, "auxiliary_loss_mlp": 0.0104617, "balance_loss_clip": 1.05815506, "balance_loss_mlp": 1.02473652, "epoch": 0.09319104163535247, "flos": 29020921084800.0, "grad_norm": 1.5528514681372962, "language_loss": 0.75838119, "learning_rate": 3.958259422403966e-06, "loss": 0.78060532, "num_input_tokens_seen": 33175420, "step": 1550, "time_per_iteration": 2.7325351238250732 }, { "auxiliary_loss_clip": 0.01150315, "auxiliary_loss_mlp": 0.01069257, "balance_loss_clip": 1.05249369, "balance_loss_mlp": 1.04483092, "epoch": 0.09325116488802045, "flos": 25301545539840.0, "grad_norm": 2.1922696027472233, "language_loss": 0.82828665, "learning_rate": 3.95818023251026e-06, "loss": 0.85048234, "num_input_tokens_seen": 33194120, "step": 1551, "time_per_iteration": 2.852602481842041 }, { "auxiliary_loss_clip": 0.01064371, "auxiliary_loss_mlp": 0.00760109, "balance_loss_clip": 1.02203059, "balance_loss_mlp": 0.99984246, "epoch": 0.09331128814068841, "flos": 61536203942400.0, "grad_norm": 0.7384225982202158, "language_loss": 0.61837572, "learning_rate": 3.958100968362163e-06, "loss": 0.63662052, "num_input_tokens_seen": 33261080, "step": 1552, "time_per_iteration": 3.3453099727630615 }, { "auxiliary_loss_clip": 0.01059175, "auxiliary_loss_mlp": 0.01016654, "balance_loss_clip": 1.02415061, "balance_loss_mlp": 1.01338792, "epoch": 0.09337141139335638, "flos": 53293700171520.0, "grad_norm": 0.8524917480784928, "language_loss": 0.58986926, "learning_rate": 3.958021629962681e-06, "loss": 0.61062753, "num_input_tokens_seen": 33330235, "step": 1553, "time_per_iteration": 3.37673282623291 }, { "auxiliary_loss_clip": 0.01146955, "auxiliary_loss_mlp": 0.01056683, "balance_loss_clip": 1.05026984, "balance_loss_mlp": 1.03336585, "epoch": 0.09343153464602436, "flos": 23476852592640.0, "grad_norm": 2.3365109182487, "language_loss": 0.87665397, "learning_rate": 3.957942217314823e-06, "loss": 0.8986904, "num_input_tokens_seen": 33349035, "step": 1554, "time_per_iteration": 2.8098127841949463 }, { "auxiliary_loss_clip": 0.01153047, "auxiliary_loss_mlp": 0.01057257, "balance_loss_clip": 1.05439448, "balance_loss_mlp": 1.03393972, "epoch": 0.09349165789869232, "flos": 19353481804800.0, "grad_norm": 4.388884220182432, "language_loss": 0.81678319, "learning_rate": 3.957862730421599e-06, "loss": 0.83888626, "num_input_tokens_seen": 33368060, "step": 1555, "time_per_iteration": 2.726207971572876 }, { "auxiliary_loss_clip": 0.01058869, "auxiliary_loss_mlp": 0.01003892, "balance_loss_clip": 1.0202632, "balance_loss_mlp": 1.00045919, "epoch": 0.09355178115136029, "flos": 67502580635520.0, "grad_norm": 0.8683826280274983, "language_loss": 0.59606886, "learning_rate": 3.957783169286024e-06, "loss": 0.61669648, "num_input_tokens_seen": 33430825, "step": 1556, "time_per_iteration": 3.209326982498169 }, { "auxiliary_loss_clip": 0.01174249, "auxiliary_loss_mlp": 0.01059741, "balance_loss_clip": 1.05518138, "balance_loss_mlp": 1.03727031, "epoch": 0.09361190440402825, "flos": 37341638720640.0, "grad_norm": 1.6803158790244075, "language_loss": 0.84290808, "learning_rate": 3.9577035339111155e-06, "loss": 0.86524796, "num_input_tokens_seen": 33454855, "step": 1557, "time_per_iteration": 2.831650733947754 }, { "auxiliary_loss_clip": 0.01110857, "auxiliary_loss_mlp": 0.01065156, "balance_loss_clip": 1.04900038, "balance_loss_mlp": 1.04112351, "epoch": 0.09367202765669623, "flos": 24899705112960.0, "grad_norm": 1.6725809358966677, "language_loss": 0.780913, "learning_rate": 3.957623824299893e-06, "loss": 0.8026731, "num_input_tokens_seen": 33476000, "step": 1558, "time_per_iteration": 3.0111780166625977 }, { "auxiliary_loss_clip": 0.01164994, "auxiliary_loss_mlp": 0.01051229, "balance_loss_clip": 1.0558666, "balance_loss_mlp": 1.02881753, "epoch": 0.0937321509093642, "flos": 15705568368000.0, "grad_norm": 2.0141986314124414, "language_loss": 0.80066288, "learning_rate": 3.957544040455379e-06, "loss": 0.82282507, "num_input_tokens_seen": 33493845, "step": 1559, "time_per_iteration": 3.024117946624756 }, { "auxiliary_loss_clip": 0.01141277, "auxiliary_loss_mlp": 0.01061718, "balance_loss_clip": 1.05060387, "balance_loss_mlp": 1.04012942, "epoch": 0.09379227416203216, "flos": 20483698222080.0, "grad_norm": 1.8358373674042003, "language_loss": 0.76418209, "learning_rate": 3.957464182380599e-06, "loss": 0.78621197, "num_input_tokens_seen": 33510850, "step": 1560, "time_per_iteration": 2.68558406829834 }, { "auxiliary_loss_clip": 0.01137939, "auxiliary_loss_mlp": 0.01054925, "balance_loss_clip": 1.05014277, "balance_loss_mlp": 1.03213274, "epoch": 0.09385239741470014, "flos": 24352498344960.0, "grad_norm": 3.575155933252121, "language_loss": 0.80784953, "learning_rate": 3.95738425007858e-06, "loss": 0.82977819, "num_input_tokens_seen": 33530430, "step": 1561, "time_per_iteration": 2.759148359298706 }, { "auxiliary_loss_clip": 0.01173652, "auxiliary_loss_mlp": 0.01052448, "balance_loss_clip": 1.05276573, "balance_loss_mlp": 1.02989376, "epoch": 0.0939125206673681, "flos": 33291489807360.0, "grad_norm": 2.448664627367939, "language_loss": 0.6140722, "learning_rate": 3.957304243552354e-06, "loss": 0.63633323, "num_input_tokens_seen": 33551975, "step": 1562, "time_per_iteration": 2.9014978408813477 }, { "auxiliary_loss_clip": 0.01162693, "auxiliary_loss_mlp": 0.0106374, "balance_loss_clip": 1.05719543, "balance_loss_mlp": 1.04213953, "epoch": 0.09397264392003607, "flos": 19244923925760.0, "grad_norm": 3.5098220300578555, "language_loss": 0.8496151, "learning_rate": 3.957224162804956e-06, "loss": 0.87187934, "num_input_tokens_seen": 33569850, "step": 1563, "time_per_iteration": 4.404061555862427 }, { "auxiliary_loss_clip": 0.01164811, "auxiliary_loss_mlp": 0.01047932, "balance_loss_clip": 1.05775142, "balance_loss_mlp": 1.02652228, "epoch": 0.09403276717270405, "flos": 19317930318720.0, "grad_norm": 1.6765528861156813, "language_loss": 0.76511294, "learning_rate": 3.9571440078394205e-06, "loss": 0.78724039, "num_input_tokens_seen": 33590510, "step": 1564, "time_per_iteration": 4.255565166473389 }, { "auxiliary_loss_clip": 0.01151297, "auxiliary_loss_mlp": 0.01063256, "balance_loss_clip": 1.05196142, "balance_loss_mlp": 1.04172707, "epoch": 0.09409289042537201, "flos": 23583471137280.0, "grad_norm": 1.9762038777899962, "language_loss": 0.80134326, "learning_rate": 3.9570637786587895e-06, "loss": 0.82348871, "num_input_tokens_seen": 33608810, "step": 1565, "time_per_iteration": 2.8548545837402344 }, { "auxiliary_loss_clip": 0.01158602, "auxiliary_loss_mlp": 0.01063767, "balance_loss_clip": 1.05420566, "balance_loss_mlp": 1.04233313, "epoch": 0.09415301367803998, "flos": 20078446003200.0, "grad_norm": 1.6810250981626251, "language_loss": 0.75134379, "learning_rate": 3.956983475266103e-06, "loss": 0.77356744, "num_input_tokens_seen": 33627265, "step": 1566, "time_per_iteration": 4.889045715332031 }, { "auxiliary_loss_clip": 0.01145856, "auxiliary_loss_mlp": 0.00780689, "balance_loss_clip": 1.05168366, "balance_loss_mlp": 1.00022864, "epoch": 0.09421313693070796, "flos": 21062075016960.0, "grad_norm": 1.6828919748843199, "language_loss": 0.77958012, "learning_rate": 3.956903097664407e-06, "loss": 0.79884553, "num_input_tokens_seen": 33644810, "step": 1567, "time_per_iteration": 4.445765972137451 }, { "auxiliary_loss_clip": 0.01156815, "auxiliary_loss_mlp": 0.01056228, "balance_loss_clip": 1.05256855, "balance_loss_mlp": 1.03591454, "epoch": 0.09427326018337592, "flos": 24316156759680.0, "grad_norm": 2.008686295040646, "language_loss": 0.82608044, "learning_rate": 3.956822645856749e-06, "loss": 0.84821093, "num_input_tokens_seen": 33665665, "step": 1568, "time_per_iteration": 2.881535768508911 }, { "auxiliary_loss_clip": 0.01187915, "auxiliary_loss_mlp": 0.01051731, "balance_loss_clip": 1.05717778, "balance_loss_mlp": 1.02927184, "epoch": 0.09433338343604389, "flos": 20263888944000.0, "grad_norm": 1.9573151026586577, "language_loss": 0.76943743, "learning_rate": 3.9567421198461814e-06, "loss": 0.79183388, "num_input_tokens_seen": 33684760, "step": 1569, "time_per_iteration": 2.6097726821899414 }, { "auxiliary_loss_clip": 0.01120191, "auxiliary_loss_mlp": 0.01060805, "balance_loss_clip": 1.04771852, "balance_loss_mlp": 1.03625941, "epoch": 0.09439350668871185, "flos": 12742973493120.0, "grad_norm": 3.3813700161908917, "language_loss": 0.85488856, "learning_rate": 3.956661519635756e-06, "loss": 0.87669849, "num_input_tokens_seen": 33700750, "step": 1570, "time_per_iteration": 2.7571377754211426 }, { "auxiliary_loss_clip": 0.01122458, "auxiliary_loss_mlp": 0.01055939, "balance_loss_clip": 1.04927301, "balance_loss_mlp": 1.03183508, "epoch": 0.09445362994137983, "flos": 25962266263680.0, "grad_norm": 1.540414635950846, "language_loss": 0.76415235, "learning_rate": 3.95658084522853e-06, "loss": 0.7859363, "num_input_tokens_seen": 33724430, "step": 1571, "time_per_iteration": 2.913569211959839 }, { "auxiliary_loss_clip": 0.01135683, "auxiliary_loss_mlp": 0.01057111, "balance_loss_clip": 1.0490278, "balance_loss_mlp": 1.0349735, "epoch": 0.0945137531940478, "flos": 19715353372800.0, "grad_norm": 1.6745378641752047, "language_loss": 0.79397607, "learning_rate": 3.956500096627561e-06, "loss": 0.81590402, "num_input_tokens_seen": 33743455, "step": 1572, "time_per_iteration": 2.813410758972168 }, { "auxiliary_loss_clip": 0.01148251, "auxiliary_loss_mlp": 0.0106927, "balance_loss_clip": 1.05619979, "balance_loss_mlp": 1.04524922, "epoch": 0.09457387644671576, "flos": 23617047375360.0, "grad_norm": 1.7559396294879055, "language_loss": 0.87707287, "learning_rate": 3.956419273835913e-06, "loss": 0.89924812, "num_input_tokens_seen": 33763435, "step": 1573, "time_per_iteration": 2.776535987854004 }, { "auxiliary_loss_clip": 0.01161183, "auxiliary_loss_mlp": 0.01063326, "balance_loss_clip": 1.05485129, "balance_loss_mlp": 1.03804219, "epoch": 0.09463399969938374, "flos": 26907291135360.0, "grad_norm": 2.9707854698090097, "language_loss": 0.81982428, "learning_rate": 3.95633837685665e-06, "loss": 0.84206939, "num_input_tokens_seen": 33784325, "step": 1574, "time_per_iteration": 2.7604806423187256 }, { "auxiliary_loss_clip": 0.01156287, "auxiliary_loss_mlp": 0.01055594, "balance_loss_clip": 1.05234718, "balance_loss_mlp": 1.0344342, "epoch": 0.0946941229520517, "flos": 23659566099840.0, "grad_norm": 1.7178511535677499, "language_loss": 0.80855322, "learning_rate": 3.95625740569284e-06, "loss": 0.83067203, "num_input_tokens_seen": 33802510, "step": 1575, "time_per_iteration": 2.713247299194336 }, { "auxiliary_loss_clip": 0.01182326, "auxiliary_loss_mlp": 0.01068689, "balance_loss_clip": 1.05578864, "balance_loss_mlp": 1.04581285, "epoch": 0.09475424620471967, "flos": 24134053783680.0, "grad_norm": 1.9110861379460222, "language_loss": 0.86483347, "learning_rate": 3.956176360347553e-06, "loss": 0.88734365, "num_input_tokens_seen": 33819980, "step": 1576, "time_per_iteration": 2.682644844055176 }, { "auxiliary_loss_clip": 0.01056441, "auxiliary_loss_mlp": 0.01027284, "balance_loss_clip": 1.0225811, "balance_loss_mlp": 1.02344561, "epoch": 0.09481436945738765, "flos": 68426168065920.0, "grad_norm": 0.9789918611127905, "language_loss": 0.6582402, "learning_rate": 3.956095240823862e-06, "loss": 0.67907751, "num_input_tokens_seen": 33878925, "step": 1577, "time_per_iteration": 3.2106685638427734 }, { "auxiliary_loss_clip": 0.01147668, "auxiliary_loss_mlp": 0.01051958, "balance_loss_clip": 1.05218005, "balance_loss_mlp": 1.03098869, "epoch": 0.09487449271005562, "flos": 16654076858880.0, "grad_norm": 1.8223175005615506, "language_loss": 0.79152733, "learning_rate": 3.956014047124844e-06, "loss": 0.81352365, "num_input_tokens_seen": 33897600, "step": 1578, "time_per_iteration": 2.820089340209961 }, { "auxiliary_loss_clip": 0.01185941, "auxiliary_loss_mlp": 0.01066432, "balance_loss_clip": 1.05838132, "balance_loss_mlp": 1.04437804, "epoch": 0.09493461596272358, "flos": 24275685110400.0, "grad_norm": 3.480730999818176, "language_loss": 0.78161818, "learning_rate": 3.955932779253578e-06, "loss": 0.80414188, "num_input_tokens_seen": 33917365, "step": 1579, "time_per_iteration": 2.6518983840942383 }, { "auxiliary_loss_clip": 0.01128319, "auxiliary_loss_mlp": 0.01065633, "balance_loss_clip": 1.04771328, "balance_loss_mlp": 1.04001498, "epoch": 0.09499473921539155, "flos": 21870173243520.0, "grad_norm": 2.0084876987684526, "language_loss": 0.73410392, "learning_rate": 3.955851437213144e-06, "loss": 0.75604343, "num_input_tokens_seen": 33936680, "step": 1580, "time_per_iteration": 2.679461717605591 }, { "auxiliary_loss_clip": 0.01157568, "auxiliary_loss_mlp": 0.01062628, "balance_loss_clip": 1.05573344, "balance_loss_mlp": 1.04095626, "epoch": 0.09505486246805953, "flos": 33547137880320.0, "grad_norm": 14.809542792179553, "language_loss": 0.77565914, "learning_rate": 3.955770021006627e-06, "loss": 0.7978611, "num_input_tokens_seen": 33960685, "step": 1581, "time_per_iteration": 2.765394449234009 }, { "auxiliary_loss_clip": 0.01144835, "auxiliary_loss_mlp": 0.0106468, "balance_loss_clip": 1.05426359, "balance_loss_mlp": 1.04276967, "epoch": 0.09511498572072749, "flos": 21215342350080.0, "grad_norm": 1.8617167187056045, "language_loss": 0.87230825, "learning_rate": 3.955688530637116e-06, "loss": 0.89440346, "num_input_tokens_seen": 33980015, "step": 1582, "time_per_iteration": 2.691364288330078 }, { "auxiliary_loss_clip": 0.01174295, "auxiliary_loss_mlp": 0.0106431, "balance_loss_clip": 1.05508888, "balance_loss_mlp": 1.04039705, "epoch": 0.09517510897339546, "flos": 14611262572800.0, "grad_norm": 1.8512060219658202, "language_loss": 0.67043924, "learning_rate": 3.955606966107699e-06, "loss": 0.69282532, "num_input_tokens_seen": 33997705, "step": 1583, "time_per_iteration": 2.6693732738494873 }, { "auxiliary_loss_clip": 0.01177751, "auxiliary_loss_mlp": 0.01053743, "balance_loss_clip": 1.0593859, "balance_loss_mlp": 1.03035378, "epoch": 0.09523523222606343, "flos": 27817339138560.0, "grad_norm": 2.144216926782962, "language_loss": 0.70752859, "learning_rate": 3.95552532742147e-06, "loss": 0.7298435, "num_input_tokens_seen": 34017465, "step": 1584, "time_per_iteration": 2.7164390087127686 }, { "auxiliary_loss_clip": 0.01138507, "auxiliary_loss_mlp": 0.0105762, "balance_loss_clip": 1.05243039, "balance_loss_mlp": 1.03584039, "epoch": 0.0952953554787314, "flos": 20706272847360.0, "grad_norm": 1.4654737580846544, "language_loss": 0.8080442, "learning_rate": 3.955443614581525e-06, "loss": 0.83000553, "num_input_tokens_seen": 34038550, "step": 1585, "time_per_iteration": 2.879831314086914 }, { "auxiliary_loss_clip": 0.01159374, "auxiliary_loss_mlp": 0.01057717, "balance_loss_clip": 1.05387473, "balance_loss_mlp": 1.03355336, "epoch": 0.09535547873139937, "flos": 24787627701120.0, "grad_norm": 1.638250735795891, "language_loss": 0.71921158, "learning_rate": 3.955361827590961e-06, "loss": 0.74138248, "num_input_tokens_seen": 34058665, "step": 1586, "time_per_iteration": 2.750436544418335 }, { "auxiliary_loss_clip": 0.01048565, "auxiliary_loss_mlp": 0.01003302, "balance_loss_clip": 1.03115988, "balance_loss_mlp": 0.99901009, "epoch": 0.09541560198406734, "flos": 71912194905600.0, "grad_norm": 0.8099482252624973, "language_loss": 0.55475175, "learning_rate": 3.955279966452883e-06, "loss": 0.57527041, "num_input_tokens_seen": 34109655, "step": 1587, "time_per_iteration": 3.0975699424743652 }, { "auxiliary_loss_clip": 0.01128884, "auxiliary_loss_mlp": 0.0105965, "balance_loss_clip": 1.04768586, "balance_loss_mlp": 1.03661847, "epoch": 0.09547572523673531, "flos": 28982604251520.0, "grad_norm": 1.708481785076906, "language_loss": 0.81062275, "learning_rate": 3.955198031170391e-06, "loss": 0.83250809, "num_input_tokens_seen": 34131115, "step": 1588, "time_per_iteration": 2.7718451023101807 }, { "auxiliary_loss_clip": 0.01131602, "auxiliary_loss_mlp": 0.01056117, "balance_loss_clip": 1.04894614, "balance_loss_mlp": 1.03438473, "epoch": 0.09553584848940327, "flos": 24133910129280.0, "grad_norm": 1.5119879232668088, "language_loss": 0.81481898, "learning_rate": 3.955116021746594e-06, "loss": 0.83669615, "num_input_tokens_seen": 34151925, "step": 1589, "time_per_iteration": 2.782468795776367 }, { "auxiliary_loss_clip": 0.0112194, "auxiliary_loss_mlp": 0.00780573, "balance_loss_clip": 1.0508883, "balance_loss_mlp": 1.00013089, "epoch": 0.09559597174207124, "flos": 42851376789120.0, "grad_norm": 1.525287399882202, "language_loss": 0.64882791, "learning_rate": 3.955033938184601e-06, "loss": 0.667853, "num_input_tokens_seen": 34175395, "step": 1590, "time_per_iteration": 3.0783450603485107 }, { "auxiliary_loss_clip": 0.01143501, "auxiliary_loss_mlp": 0.01058399, "balance_loss_clip": 1.05087948, "balance_loss_mlp": 1.0358206, "epoch": 0.09565609499473922, "flos": 32670845683200.0, "grad_norm": 2.0745314237741916, "language_loss": 0.83290577, "learning_rate": 3.954951780487526e-06, "loss": 0.85492468, "num_input_tokens_seen": 34197760, "step": 1591, "time_per_iteration": 2.8393962383270264 }, { "auxiliary_loss_clip": 0.01163486, "auxiliary_loss_mlp": 0.01065588, "balance_loss_clip": 1.0522387, "balance_loss_mlp": 1.04266405, "epoch": 0.09571621824740718, "flos": 18478410670080.0, "grad_norm": 2.825705290827541, "language_loss": 0.74087322, "learning_rate": 3.9548695486584835e-06, "loss": 0.76316392, "num_input_tokens_seen": 34215330, "step": 1592, "time_per_iteration": 2.6828882694244385 }, { "auxiliary_loss_clip": 0.01169239, "auxiliary_loss_mlp": 0.01055073, "balance_loss_clip": 1.05161428, "balance_loss_mlp": 1.03337741, "epoch": 0.09577634150007515, "flos": 29387497334400.0, "grad_norm": 2.18277080043521, "language_loss": 0.74483889, "learning_rate": 3.954787242700592e-06, "loss": 0.76708198, "num_input_tokens_seen": 34237745, "step": 1593, "time_per_iteration": 2.7193498611450195 }, { "auxiliary_loss_clip": 0.01177343, "auxiliary_loss_mlp": 0.01055096, "balance_loss_clip": 1.05910873, "balance_loss_mlp": 1.03307831, "epoch": 0.09583646475274313, "flos": 22747830157440.0, "grad_norm": 1.887493467708827, "language_loss": 0.69782627, "learning_rate": 3.954704862616971e-06, "loss": 0.72015071, "num_input_tokens_seen": 34256565, "step": 1594, "time_per_iteration": 2.635383367538452 }, { "auxiliary_loss_clip": 0.01173222, "auxiliary_loss_mlp": 0.01051806, "balance_loss_clip": 1.05618978, "balance_loss_mlp": 1.03037214, "epoch": 0.0958965880054111, "flos": 23218367345280.0, "grad_norm": 2.1411006117727682, "language_loss": 0.82780552, "learning_rate": 3.954622408410747e-06, "loss": 0.85005581, "num_input_tokens_seen": 34275970, "step": 1595, "time_per_iteration": 2.7158257961273193 }, { "auxiliary_loss_clip": 0.01153253, "auxiliary_loss_mlp": 0.01054246, "balance_loss_clip": 1.05143809, "balance_loss_mlp": 1.0301652, "epoch": 0.09595671125807906, "flos": 21324438933120.0, "grad_norm": 1.7751890788987925, "language_loss": 0.84513396, "learning_rate": 3.954539880085045e-06, "loss": 0.86720896, "num_input_tokens_seen": 34295490, "step": 1596, "time_per_iteration": 2.710228204727173 }, { "auxiliary_loss_clip": 0.01166586, "auxiliary_loss_mlp": 0.0105804, "balance_loss_clip": 1.05440903, "balance_loss_mlp": 1.03376901, "epoch": 0.09601683451074704, "flos": 39603472185600.0, "grad_norm": 1.8335529067237837, "language_loss": 0.69328064, "learning_rate": 3.9544572776429945e-06, "loss": 0.71552688, "num_input_tokens_seen": 34319990, "step": 1597, "time_per_iteration": 2.802959442138672 }, { "auxiliary_loss_clip": 0.01167235, "auxiliary_loss_mlp": 0.00780978, "balance_loss_clip": 1.0503217, "balance_loss_mlp": 1.00010371, "epoch": 0.096076957763415, "flos": 23732716147200.0, "grad_norm": 2.0491570740921885, "language_loss": 0.7486403, "learning_rate": 3.954374601087729e-06, "loss": 0.76812243, "num_input_tokens_seen": 34339225, "step": 1598, "time_per_iteration": 2.6502270698547363 }, { "auxiliary_loss_clip": 0.01176661, "auxiliary_loss_mlp": 0.01053936, "balance_loss_clip": 1.05745888, "balance_loss_mlp": 1.03009462, "epoch": 0.09613708101608297, "flos": 34678108483200.0, "grad_norm": 1.6831440826618358, "language_loss": 0.68804371, "learning_rate": 3.954291850422382e-06, "loss": 0.71034968, "num_input_tokens_seen": 34361020, "step": 1599, "time_per_iteration": 2.74243426322937 }, { "auxiliary_loss_clip": 0.01157322, "auxiliary_loss_mlp": 0.01059883, "balance_loss_clip": 1.05754852, "balance_loss_mlp": 1.0371263, "epoch": 0.09619720426875093, "flos": 20740028653440.0, "grad_norm": 2.9774251326108367, "language_loss": 0.83950365, "learning_rate": 3.954209025650093e-06, "loss": 0.86167574, "num_input_tokens_seen": 34378630, "step": 1600, "time_per_iteration": 2.702907085418701 }, { "auxiliary_loss_clip": 0.01150263, "auxiliary_loss_mlp": 0.01054168, "balance_loss_clip": 1.05129707, "balance_loss_mlp": 1.03093433, "epoch": 0.09625732752141891, "flos": 13042720488960.0, "grad_norm": 2.287254549480118, "language_loss": 0.80520785, "learning_rate": 3.954126126774001e-06, "loss": 0.82725215, "num_input_tokens_seen": 34397110, "step": 1601, "time_per_iteration": 2.693399429321289 }, { "auxiliary_loss_clip": 0.01181247, "auxiliary_loss_mlp": 0.01054578, "balance_loss_clip": 1.05711937, "balance_loss_mlp": 1.03133249, "epoch": 0.09631745077408688, "flos": 22273629782400.0, "grad_norm": 2.4356926646094954, "language_loss": 0.81959623, "learning_rate": 3.954043153797251e-06, "loss": 0.84195447, "num_input_tokens_seen": 34414165, "step": 1602, "time_per_iteration": 2.639479875564575 }, { "auxiliary_loss_clip": 0.01137855, "auxiliary_loss_mlp": 0.01051495, "balance_loss_clip": 1.05295444, "balance_loss_mlp": 1.02681863, "epoch": 0.09637757402675484, "flos": 24754266944640.0, "grad_norm": 3.099164686790191, "language_loss": 0.62498438, "learning_rate": 3.953960106722989e-06, "loss": 0.64687788, "num_input_tokens_seen": 34434445, "step": 1603, "time_per_iteration": 4.341834306716919 }, { "auxiliary_loss_clip": 0.01189954, "auxiliary_loss_mlp": 0.01054376, "balance_loss_clip": 1.05902839, "balance_loss_mlp": 1.02918696, "epoch": 0.09643769727942282, "flos": 22525758322560.0, "grad_norm": 3.121905357886113, "language_loss": 0.70996022, "learning_rate": 3.953876985554364e-06, "loss": 0.73240346, "num_input_tokens_seen": 34453095, "step": 1604, "time_per_iteration": 2.6520893573760986 }, { "auxiliary_loss_clip": 0.01176446, "auxiliary_loss_mlp": 0.01055314, "balance_loss_clip": 1.0570209, "balance_loss_mlp": 1.03358221, "epoch": 0.09649782053209079, "flos": 30921026636160.0, "grad_norm": 2.082890345500055, "language_loss": 0.7993719, "learning_rate": 3.953793790294527e-06, "loss": 0.82168949, "num_input_tokens_seen": 34473680, "step": 1605, "time_per_iteration": 4.5557661056518555 }, { "auxiliary_loss_clip": 0.01161047, "auxiliary_loss_mlp": 0.01047918, "balance_loss_clip": 1.05455577, "balance_loss_mlp": 1.0245893, "epoch": 0.09655794378475875, "flos": 25337635729920.0, "grad_norm": 1.990204665194141, "language_loss": 0.74550986, "learning_rate": 3.953710520946634e-06, "loss": 0.76759952, "num_input_tokens_seen": 34492610, "step": 1606, "time_per_iteration": 2.7172651290893555 }, { "auxiliary_loss_clip": 0.01172416, "auxiliary_loss_mlp": 0.01046772, "balance_loss_clip": 1.05834222, "balance_loss_mlp": 1.02378857, "epoch": 0.09661806703742673, "flos": 22346061557760.0, "grad_norm": 1.6403710807101601, "language_loss": 0.7571919, "learning_rate": 3.953627177513843e-06, "loss": 0.77938372, "num_input_tokens_seen": 34511855, "step": 1607, "time_per_iteration": 4.302686452865601 }, { "auxiliary_loss_clip": 0.01139491, "auxiliary_loss_mlp": 0.01051546, "balance_loss_clip": 1.04833579, "balance_loss_mlp": 1.0289799, "epoch": 0.0966781902900947, "flos": 17457578144640.0, "grad_norm": 1.975850982703557, "language_loss": 0.86756283, "learning_rate": 3.953543759999312e-06, "loss": 0.88947326, "num_input_tokens_seen": 34528905, "step": 1608, "time_per_iteration": 2.6280455589294434 }, { "auxiliary_loss_clip": 0.01126253, "auxiliary_loss_mlp": 0.01064704, "balance_loss_clip": 1.05433142, "balance_loss_mlp": 1.03940821, "epoch": 0.09673831354276266, "flos": 36903995412480.0, "grad_norm": 2.3082762386200266, "language_loss": 0.71363097, "learning_rate": 3.953460268406207e-06, "loss": 0.73554057, "num_input_tokens_seen": 34548480, "step": 1609, "time_per_iteration": 2.9116146564483643 }, { "auxiliary_loss_clip": 0.01149353, "auxiliary_loss_mlp": 0.01058179, "balance_loss_clip": 1.0546515, "balance_loss_mlp": 1.03606534, "epoch": 0.09679843679543064, "flos": 20701388597760.0, "grad_norm": 1.9988414994799784, "language_loss": 0.84810984, "learning_rate": 3.953376702737693e-06, "loss": 0.87018514, "num_input_tokens_seen": 34565410, "step": 1610, "time_per_iteration": 2.8005051612854004 }, { "auxiliary_loss_clip": 0.01161389, "auxiliary_loss_mlp": 0.01056267, "balance_loss_clip": 1.05790925, "balance_loss_mlp": 1.03228188, "epoch": 0.0968585600480986, "flos": 23514415240320.0, "grad_norm": 2.176236379770122, "language_loss": 0.6696198, "learning_rate": 3.953293062996939e-06, "loss": 0.69179636, "num_input_tokens_seen": 34584840, "step": 1611, "time_per_iteration": 2.731931447982788 }, { "auxiliary_loss_clip": 0.01125259, "auxiliary_loss_mlp": 0.01057116, "balance_loss_clip": 1.04740572, "balance_loss_mlp": 1.03385806, "epoch": 0.09691868330076657, "flos": 20121072468480.0, "grad_norm": 1.6508278294088392, "language_loss": 0.81067657, "learning_rate": 3.953209349187115e-06, "loss": 0.83250034, "num_input_tokens_seen": 34603360, "step": 1612, "time_per_iteration": 2.7998390197753906 }, { "auxiliary_loss_clip": 0.01182404, "auxiliary_loss_mlp": 0.01069551, "balance_loss_clip": 1.06046534, "balance_loss_mlp": 1.04600716, "epoch": 0.09697880655343454, "flos": 16544692967040.0, "grad_norm": 3.304939197664143, "language_loss": 0.80836105, "learning_rate": 3.953125561311398e-06, "loss": 0.83088064, "num_input_tokens_seen": 34620760, "step": 1613, "time_per_iteration": 2.624218702316284 }, { "auxiliary_loss_clip": 0.01148565, "auxiliary_loss_mlp": 0.01054743, "balance_loss_clip": 1.05542159, "balance_loss_mlp": 1.03047192, "epoch": 0.09703892980610251, "flos": 26104184899200.0, "grad_norm": 1.7164386274315457, "language_loss": 0.84289789, "learning_rate": 3.953041699372964e-06, "loss": 0.86493099, "num_input_tokens_seen": 34640695, "step": 1614, "time_per_iteration": 2.744340419769287 }, { "auxiliary_loss_clip": 0.01066618, "auxiliary_loss_mlp": 0.00759744, "balance_loss_clip": 1.02654934, "balance_loss_mlp": 1.00008702, "epoch": 0.09709905305877048, "flos": 60443622000000.0, "grad_norm": 0.7127167896900892, "language_loss": 0.54629624, "learning_rate": 3.952957763374992e-06, "loss": 0.56455994, "num_input_tokens_seen": 34702395, "step": 1615, "time_per_iteration": 3.1547679901123047 }, { "auxiliary_loss_clip": 0.01033143, "auxiliary_loss_mlp": 0.01017555, "balance_loss_clip": 1.02384067, "balance_loss_mlp": 1.01381195, "epoch": 0.09715917631143844, "flos": 57639932893440.0, "grad_norm": 0.7689373847786285, "language_loss": 0.58190405, "learning_rate": 3.952873753320666e-06, "loss": 0.60241103, "num_input_tokens_seen": 34768910, "step": 1616, "time_per_iteration": 3.3940556049346924 }, { "auxiliary_loss_clip": 0.01155533, "auxiliary_loss_mlp": 0.01067983, "balance_loss_clip": 1.05504358, "balance_loss_mlp": 1.04205465, "epoch": 0.09721929956410642, "flos": 20558212986240.0, "grad_norm": 1.8932449927934136, "language_loss": 0.69031835, "learning_rate": 3.952789669213172e-06, "loss": 0.7125535, "num_input_tokens_seen": 34787680, "step": 1617, "time_per_iteration": 2.714629888534546 }, { "auxiliary_loss_clip": 0.01152637, "auxiliary_loss_mlp": 0.01057882, "balance_loss_clip": 1.05386162, "balance_loss_mlp": 1.03127456, "epoch": 0.09727942281677439, "flos": 27344359825920.0, "grad_norm": 1.755493071880773, "language_loss": 0.80910909, "learning_rate": 3.952705511055698e-06, "loss": 0.83121431, "num_input_tokens_seen": 34808330, "step": 1618, "time_per_iteration": 2.8081507682800293 }, { "auxiliary_loss_clip": 0.01168356, "auxiliary_loss_mlp": 0.01058179, "balance_loss_clip": 1.06048679, "balance_loss_mlp": 1.03678131, "epoch": 0.09733954606944235, "flos": 24900028335360.0, "grad_norm": 1.667659488760432, "language_loss": 0.92901695, "learning_rate": 3.952621278851435e-06, "loss": 0.95128226, "num_input_tokens_seen": 34830020, "step": 1619, "time_per_iteration": 2.7752275466918945 }, { "auxiliary_loss_clip": 0.01175515, "auxiliary_loss_mlp": 0.01058252, "balance_loss_clip": 1.05952573, "balance_loss_mlp": 1.03512526, "epoch": 0.09739966932211033, "flos": 31503928544640.0, "grad_norm": 2.1973967195348902, "language_loss": 0.88978708, "learning_rate": 3.9525369726035784e-06, "loss": 0.91212475, "num_input_tokens_seen": 34850330, "step": 1620, "time_per_iteration": 2.771176338195801 }, { "auxiliary_loss_clip": 0.01153763, "auxiliary_loss_mlp": 0.01065329, "balance_loss_clip": 1.05353975, "balance_loss_mlp": 1.0397464, "epoch": 0.0974597925747783, "flos": 23878764846720.0, "grad_norm": 2.154793183838835, "language_loss": 0.77331412, "learning_rate": 3.952452592315324e-06, "loss": 0.79550499, "num_input_tokens_seen": 34871640, "step": 1621, "time_per_iteration": 2.6740832328796387 }, { "auxiliary_loss_clip": 0.01131342, "auxiliary_loss_mlp": 0.01082359, "balance_loss_clip": 1.04798269, "balance_loss_mlp": 1.05640674, "epoch": 0.09751991582744626, "flos": 17019575700480.0, "grad_norm": 1.9420195171733425, "language_loss": 0.77671158, "learning_rate": 3.952368137989871e-06, "loss": 0.79884863, "num_input_tokens_seen": 34888100, "step": 1622, "time_per_iteration": 2.7247347831726074 }, { "auxiliary_loss_clip": 0.01150185, "auxiliary_loss_mlp": 0.01064277, "balance_loss_clip": 1.05335355, "balance_loss_mlp": 1.04025626, "epoch": 0.09758003908011423, "flos": 28402826826240.0, "grad_norm": 1.8603109065807166, "language_loss": 0.85784447, "learning_rate": 3.9522836096304225e-06, "loss": 0.87998909, "num_input_tokens_seen": 34910485, "step": 1623, "time_per_iteration": 2.785388469696045 }, { "auxiliary_loss_clip": 0.0117659, "auxiliary_loss_mlp": 0.01064102, "balance_loss_clip": 1.05769634, "balance_loss_mlp": 1.04043913, "epoch": 0.09764016233278221, "flos": 18144297336960.0, "grad_norm": 2.39630116599036, "language_loss": 0.80534065, "learning_rate": 3.952199007240184e-06, "loss": 0.82774758, "num_input_tokens_seen": 34928615, "step": 1624, "time_per_iteration": 2.6818184852600098 }, { "auxiliary_loss_clip": 0.01176335, "auxiliary_loss_mlp": 0.01056788, "balance_loss_clip": 1.05616927, "balance_loss_mlp": 1.03465128, "epoch": 0.09770028558545017, "flos": 15265842071040.0, "grad_norm": 2.44379144971104, "language_loss": 0.85556966, "learning_rate": 3.952114330822364e-06, "loss": 0.8779009, "num_input_tokens_seen": 34946045, "step": 1625, "time_per_iteration": 2.6594324111938477 }, { "auxiliary_loss_clip": 0.01181411, "auxiliary_loss_mlp": 0.0106682, "balance_loss_clip": 1.06004012, "balance_loss_mlp": 1.04411101, "epoch": 0.09776040883811814, "flos": 23472435219840.0, "grad_norm": 2.058269503362464, "language_loss": 0.85431635, "learning_rate": 3.952029580380172e-06, "loss": 0.87679869, "num_input_tokens_seen": 34962865, "step": 1626, "time_per_iteration": 2.7384841442108154 }, { "auxiliary_loss_clip": 0.01165311, "auxiliary_loss_mlp": 0.007823, "balance_loss_clip": 1.05467701, "balance_loss_mlp": 1.000211, "epoch": 0.09782053209078612, "flos": 24499480798080.0, "grad_norm": 2.0701580273163036, "language_loss": 0.83370024, "learning_rate": 3.9519447559168234e-06, "loss": 0.85317636, "num_input_tokens_seen": 34983505, "step": 1627, "time_per_iteration": 2.8269948959350586 }, { "auxiliary_loss_clip": 0.01168188, "auxiliary_loss_mlp": 0.01065332, "balance_loss_clip": 1.05557203, "balance_loss_mlp": 1.04275417, "epoch": 0.09788065534345408, "flos": 21580158833280.0, "grad_norm": 1.8143281262319713, "language_loss": 0.84674478, "learning_rate": 3.951859857435534e-06, "loss": 0.86907995, "num_input_tokens_seen": 35001825, "step": 1628, "time_per_iteration": 2.6151821613311768 }, { "auxiliary_loss_clip": 0.01170257, "auxiliary_loss_mlp": 0.01058367, "balance_loss_clip": 1.05374515, "balance_loss_mlp": 1.03558636, "epoch": 0.09794077859612205, "flos": 23842459175040.0, "grad_norm": 1.5658807312485334, "language_loss": 0.75531614, "learning_rate": 3.951774884939523e-06, "loss": 0.77760237, "num_input_tokens_seen": 35023075, "step": 1629, "time_per_iteration": 2.6794557571411133 }, { "auxiliary_loss_clip": 0.01129604, "auxiliary_loss_mlp": 0.01056904, "balance_loss_clip": 1.0577755, "balance_loss_mlp": 1.03169131, "epoch": 0.09800090184879003, "flos": 23659889322240.0, "grad_norm": 1.6755762488260617, "language_loss": 0.78487194, "learning_rate": 3.951689838432013e-06, "loss": 0.80673707, "num_input_tokens_seen": 35043480, "step": 1630, "time_per_iteration": 2.7986228466033936 }, { "auxiliary_loss_clip": 0.01167766, "auxiliary_loss_mlp": 0.01063441, "balance_loss_clip": 1.05938148, "balance_loss_mlp": 1.03804946, "epoch": 0.09806102510145799, "flos": 17055773631360.0, "grad_norm": 1.8175370389297836, "language_loss": 0.86677933, "learning_rate": 3.951604717916228e-06, "loss": 0.88909143, "num_input_tokens_seen": 35061490, "step": 1631, "time_per_iteration": 2.6350157260894775 }, { "auxiliary_loss_clip": 0.01171369, "auxiliary_loss_mlp": 0.01058643, "balance_loss_clip": 1.0610745, "balance_loss_mlp": 1.03625536, "epoch": 0.09812114835412596, "flos": 23878477537920.0, "grad_norm": 2.2030333753544773, "language_loss": 0.82996809, "learning_rate": 3.9515195233953975e-06, "loss": 0.85226822, "num_input_tokens_seen": 35079670, "step": 1632, "time_per_iteration": 2.7990314960479736 }, { "auxiliary_loss_clip": 0.01148453, "auxiliary_loss_mlp": 0.01064004, "balance_loss_clip": 1.05554819, "balance_loss_mlp": 1.04102039, "epoch": 0.09818127160679392, "flos": 20595488325120.0, "grad_norm": 1.531777801288569, "language_loss": 0.7882973, "learning_rate": 3.951434254872751e-06, "loss": 0.81042188, "num_input_tokens_seen": 35099205, "step": 1633, "time_per_iteration": 2.735353708267212 }, { "auxiliary_loss_clip": 0.01170992, "auxiliary_loss_mlp": 0.01061681, "balance_loss_clip": 1.05558002, "balance_loss_mlp": 1.03731489, "epoch": 0.0982413948594619, "flos": 15487339288320.0, "grad_norm": 2.4037572513069687, "language_loss": 0.73209554, "learning_rate": 3.951348912351521e-06, "loss": 0.75442231, "num_input_tokens_seen": 35115270, "step": 1634, "time_per_iteration": 2.688596248626709 }, { "auxiliary_loss_clip": 0.01162743, "auxiliary_loss_mlp": 0.01071164, "balance_loss_clip": 1.05591321, "balance_loss_mlp": 1.04672611, "epoch": 0.09830151811212987, "flos": 24207958016640.0, "grad_norm": 3.2244021303311405, "language_loss": 0.72553629, "learning_rate": 3.951263495834947e-06, "loss": 0.74787533, "num_input_tokens_seen": 35134065, "step": 1635, "time_per_iteration": 2.720266342163086 }, { "auxiliary_loss_clip": 0.01154765, "auxiliary_loss_mlp": 0.01068349, "balance_loss_clip": 1.05526268, "balance_loss_mlp": 1.04177701, "epoch": 0.09836164136479783, "flos": 20594590485120.0, "grad_norm": 1.7699592352066487, "language_loss": 0.78026646, "learning_rate": 3.951178005326264e-06, "loss": 0.80249763, "num_input_tokens_seen": 35154870, "step": 1636, "time_per_iteration": 2.9618239402770996 }, { "auxiliary_loss_clip": 0.01162744, "auxiliary_loss_mlp": 0.01060716, "balance_loss_clip": 1.05561686, "balance_loss_mlp": 1.0368979, "epoch": 0.09842176461746581, "flos": 19934157070080.0, "grad_norm": 1.8332710343018006, "language_loss": 0.69524407, "learning_rate": 3.951092440828715e-06, "loss": 0.71747863, "num_input_tokens_seen": 35171850, "step": 1637, "time_per_iteration": 2.671178102493286 }, { "auxiliary_loss_clip": 0.01188316, "auxiliary_loss_mlp": 0.01058851, "balance_loss_clip": 1.05926394, "balance_loss_mlp": 1.03500926, "epoch": 0.09848188787013377, "flos": 21214659991680.0, "grad_norm": 2.3775286970935503, "language_loss": 0.77050996, "learning_rate": 3.951006802345545e-06, "loss": 0.79298162, "num_input_tokens_seen": 35188795, "step": 1638, "time_per_iteration": 2.62457537651062 }, { "auxiliary_loss_clip": 0.01140265, "auxiliary_loss_mlp": 0.01052026, "balance_loss_clip": 1.05538166, "balance_loss_mlp": 1.02941203, "epoch": 0.09854201112280174, "flos": 30154226071680.0, "grad_norm": 1.4014263071342075, "language_loss": 0.72620296, "learning_rate": 3.950921089880003e-06, "loss": 0.74812591, "num_input_tokens_seen": 35212100, "step": 1639, "time_per_iteration": 2.7499618530273438 }, { "auxiliary_loss_clip": 0.01173752, "auxiliary_loss_mlp": 0.01051382, "balance_loss_clip": 1.0582087, "balance_loss_mlp": 1.02831531, "epoch": 0.09860213437546972, "flos": 21795730306560.0, "grad_norm": 1.7213189449892274, "language_loss": 0.88679075, "learning_rate": 3.950835303435337e-06, "loss": 0.90904212, "num_input_tokens_seen": 35230390, "step": 1640, "time_per_iteration": 2.664133071899414 }, { "auxiliary_loss_clip": 0.01177786, "auxiliary_loss_mlp": 0.01044457, "balance_loss_clip": 1.05981517, "balance_loss_mlp": 1.02130616, "epoch": 0.09866225762813768, "flos": 21835555511040.0, "grad_norm": 2.0701766566296915, "language_loss": 0.80567038, "learning_rate": 3.950749443014801e-06, "loss": 0.82789278, "num_input_tokens_seen": 35250405, "step": 1641, "time_per_iteration": 2.645353317260742 }, { "auxiliary_loss_clip": 0.011756, "auxiliary_loss_mlp": 0.01062641, "balance_loss_clip": 1.05896795, "balance_loss_mlp": 1.03742838, "epoch": 0.09872238088080565, "flos": 17599855916160.0, "grad_norm": 2.64335263522248, "language_loss": 0.86117625, "learning_rate": 3.95066350862165e-06, "loss": 0.88355863, "num_input_tokens_seen": 35262820, "step": 1642, "time_per_iteration": 5.81004524230957 }, { "auxiliary_loss_clip": 0.01151329, "auxiliary_loss_mlp": 0.01056693, "balance_loss_clip": 1.05857074, "balance_loss_mlp": 1.03404331, "epoch": 0.09878250413347361, "flos": 27636134002560.0, "grad_norm": 2.7092208079201607, "language_loss": 0.8058275, "learning_rate": 3.950577500259144e-06, "loss": 0.82790768, "num_input_tokens_seen": 35284490, "step": 1643, "time_per_iteration": 2.7235090732574463 }, { "auxiliary_loss_clip": 0.01174075, "auxiliary_loss_mlp": 0.01077435, "balance_loss_clip": 1.05761337, "balance_loss_mlp": 1.05470192, "epoch": 0.0988426273861416, "flos": 16544728880640.0, "grad_norm": 2.0561742686210676, "language_loss": 0.82546467, "learning_rate": 3.950491417930543e-06, "loss": 0.84797978, "num_input_tokens_seen": 35302815, "step": 1644, "time_per_iteration": 4.318823575973511 }, { "auxiliary_loss_clip": 0.01163142, "auxiliary_loss_mlp": 0.00782463, "balance_loss_clip": 1.05607629, "balance_loss_mlp": 1.00010633, "epoch": 0.09890275063880956, "flos": 21215270522880.0, "grad_norm": 1.6945489721625269, "language_loss": 0.68219113, "learning_rate": 3.9504052616391124e-06, "loss": 0.70164716, "num_input_tokens_seen": 35321175, "step": 1645, "time_per_iteration": 2.6626670360565186 }, { "auxiliary_loss_clip": 0.01059795, "auxiliary_loss_mlp": 0.01047617, "balance_loss_clip": 1.02852345, "balance_loss_mlp": 1.04404068, "epoch": 0.09896287389147752, "flos": 59379372910080.0, "grad_norm": 0.8512889940087613, "language_loss": 0.60885167, "learning_rate": 3.950319031388119e-06, "loss": 0.62992585, "num_input_tokens_seen": 35381740, "step": 1646, "time_per_iteration": 4.752669095993042 }, { "auxiliary_loss_clip": 0.01147006, "auxiliary_loss_mlp": 0.0105976, "balance_loss_clip": 1.0574733, "balance_loss_mlp": 1.03464222, "epoch": 0.0990229971441455, "flos": 29642678530560.0, "grad_norm": 5.785751121573768, "language_loss": 0.73211443, "learning_rate": 3.950232727180833e-06, "loss": 0.7541821, "num_input_tokens_seen": 35403760, "step": 1647, "time_per_iteration": 2.783442974090576 }, { "auxiliary_loss_clip": 0.01161789, "auxiliary_loss_mlp": 0.01066314, "balance_loss_clip": 1.06016421, "balance_loss_mlp": 1.04445136, "epoch": 0.09908312039681347, "flos": 21834873152640.0, "grad_norm": 1.828428298130997, "language_loss": 0.84094375, "learning_rate": 3.950146349020525e-06, "loss": 0.86322474, "num_input_tokens_seen": 35424050, "step": 1648, "time_per_iteration": 2.709559679031372 }, { "auxiliary_loss_clip": 0.01065954, "auxiliary_loss_mlp": 0.01020799, "balance_loss_clip": 1.02565169, "balance_loss_mlp": 1.01722264, "epoch": 0.09914324364948143, "flos": 57564304807680.0, "grad_norm": 0.7317434537206132, "language_loss": 0.55672908, "learning_rate": 3.950059896910473e-06, "loss": 0.5775966, "num_input_tokens_seen": 35481690, "step": 1649, "time_per_iteration": 3.0944156646728516 }, { "auxiliary_loss_clip": 0.0117133, "auxiliary_loss_mlp": 0.01049543, "balance_loss_clip": 1.05603158, "balance_loss_mlp": 1.02723897, "epoch": 0.09920336690214941, "flos": 34123934476800.0, "grad_norm": 2.195431109372502, "language_loss": 0.8975327, "learning_rate": 3.949973370853954e-06, "loss": 0.91974139, "num_input_tokens_seen": 35498635, "step": 1650, "time_per_iteration": 2.7438554763793945 }, { "auxiliary_loss_clip": 0.01033978, "auxiliary_loss_mlp": 0.00758727, "balance_loss_clip": 1.02943921, "balance_loss_mlp": 0.9997822, "epoch": 0.09926349015481738, "flos": 71216428464000.0, "grad_norm": 0.8036050505402587, "language_loss": 0.63734978, "learning_rate": 3.94988677085425e-06, "loss": 0.65527683, "num_input_tokens_seen": 35565720, "step": 1651, "time_per_iteration": 3.40269136428833 }, { "auxiliary_loss_clip": 0.01170347, "auxiliary_loss_mlp": 0.01062486, "balance_loss_clip": 1.05790281, "balance_loss_mlp": 1.03842974, "epoch": 0.09932361340748534, "flos": 23148700917120.0, "grad_norm": 1.9744130417114842, "language_loss": 0.88115525, "learning_rate": 3.949800096914643e-06, "loss": 0.90348363, "num_input_tokens_seen": 35586000, "step": 1652, "time_per_iteration": 2.6695117950439453 }, { "auxiliary_loss_clip": 0.0116773, "auxiliary_loss_mlp": 0.01062073, "balance_loss_clip": 1.06095552, "balance_loss_mlp": 1.03895831, "epoch": 0.09938373666015332, "flos": 19828651847040.0, "grad_norm": 2.166773052437996, "language_loss": 0.81789082, "learning_rate": 3.949713349038422e-06, "loss": 0.84018886, "num_input_tokens_seen": 35604355, "step": 1653, "time_per_iteration": 2.7136831283569336 }, { "auxiliary_loss_clip": 0.01173152, "auxiliary_loss_mlp": 0.00780466, "balance_loss_clip": 1.05683279, "balance_loss_mlp": 1.00016594, "epoch": 0.09944385991282129, "flos": 22090664880000.0, "grad_norm": 1.662037391605293, "language_loss": 0.79489207, "learning_rate": 3.949626527228875e-06, "loss": 0.81442821, "num_input_tokens_seen": 35625495, "step": 1654, "time_per_iteration": 2.645875930786133 }, { "auxiliary_loss_clip": 0.01187918, "auxiliary_loss_mlp": 0.01056849, "balance_loss_clip": 1.06405056, "balance_loss_mlp": 1.03561759, "epoch": 0.09950398316548925, "flos": 19828867328640.0, "grad_norm": 1.7263610037420916, "language_loss": 0.81038272, "learning_rate": 3.949539631489295e-06, "loss": 0.83283037, "num_input_tokens_seen": 35645030, "step": 1655, "time_per_iteration": 2.630404233932495 }, { "auxiliary_loss_clip": 0.01181205, "auxiliary_loss_mlp": 0.01055977, "balance_loss_clip": 1.05679035, "balance_loss_mlp": 1.03294599, "epoch": 0.09956410641815722, "flos": 25003701964800.0, "grad_norm": 2.426795421082641, "language_loss": 0.80429518, "learning_rate": 3.9494526618229765e-06, "loss": 0.82666701, "num_input_tokens_seen": 35664305, "step": 1656, "time_per_iteration": 2.6283950805664062 }, { "auxiliary_loss_clip": 0.01170003, "auxiliary_loss_mlp": 0.01061881, "balance_loss_clip": 1.05787742, "balance_loss_mlp": 1.03870714, "epoch": 0.0996242296708252, "flos": 19317714837120.0, "grad_norm": 1.4960238412267362, "language_loss": 0.89040691, "learning_rate": 3.949365618233217e-06, "loss": 0.91272575, "num_input_tokens_seen": 35684060, "step": 1657, "time_per_iteration": 2.653674602508545 }, { "auxiliary_loss_clip": 0.01165842, "auxiliary_loss_mlp": 0.01057352, "balance_loss_clip": 1.05830753, "balance_loss_mlp": 1.0329144, "epoch": 0.09968435292349316, "flos": 21871609787520.0, "grad_norm": 2.1866084372248062, "language_loss": 0.84684521, "learning_rate": 3.9492785007233195e-06, "loss": 0.86907715, "num_input_tokens_seen": 35703250, "step": 1658, "time_per_iteration": 2.6897473335266113 }, { "auxiliary_loss_clip": 0.01069806, "auxiliary_loss_mlp": 0.01015844, "balance_loss_clip": 1.02042234, "balance_loss_mlp": 1.01292348, "epoch": 0.09974447617616113, "flos": 65384533313280.0, "grad_norm": 0.9123227767672076, "language_loss": 0.60828507, "learning_rate": 3.949191309296585e-06, "loss": 0.62914157, "num_input_tokens_seen": 35762165, "step": 1659, "time_per_iteration": 3.273890495300293 }, { "auxiliary_loss_clip": 0.01152432, "auxiliary_loss_mlp": 0.01051829, "balance_loss_clip": 1.05082798, "balance_loss_mlp": 1.02814245, "epoch": 0.0998045994288291, "flos": 23659817495040.0, "grad_norm": 1.9344290476513741, "language_loss": 0.84892076, "learning_rate": 3.949104043956321e-06, "loss": 0.87096334, "num_input_tokens_seen": 35781520, "step": 1660, "time_per_iteration": 2.788018226623535 }, { "auxiliary_loss_clip": 0.01149163, "auxiliary_loss_mlp": 0.01060092, "balance_loss_clip": 1.05374026, "balance_loss_mlp": 1.03514171, "epoch": 0.09986472268149707, "flos": 19609704495360.0, "grad_norm": 1.9493882663610318, "language_loss": 0.80024737, "learning_rate": 3.949016704705836e-06, "loss": 0.82234001, "num_input_tokens_seen": 35799565, "step": 1661, "time_per_iteration": 2.6537399291992188 }, { "auxiliary_loss_clip": 0.01172787, "auxiliary_loss_mlp": 0.01055532, "balance_loss_clip": 1.05715156, "balance_loss_mlp": 1.03153503, "epoch": 0.09992484593416504, "flos": 26213317395840.0, "grad_norm": 2.0152235709188377, "language_loss": 0.83560598, "learning_rate": 3.948929291548443e-06, "loss": 0.85788912, "num_input_tokens_seen": 35821085, "step": 1662, "time_per_iteration": 2.753807783126831 }, { "auxiliary_loss_clip": 0.01154838, "auxiliary_loss_mlp": 0.01061466, "balance_loss_clip": 1.05079484, "balance_loss_mlp": 1.03616929, "epoch": 0.09998496918683301, "flos": 17493632421120.0, "grad_norm": 1.9355779644050557, "language_loss": 0.88865256, "learning_rate": 3.9488418044874546e-06, "loss": 0.91081554, "num_input_tokens_seen": 35839840, "step": 1663, "time_per_iteration": 2.6829047203063965 }, { "auxiliary_loss_clip": 0.0118246, "auxiliary_loss_mlp": 0.01061692, "balance_loss_clip": 1.06228638, "balance_loss_mlp": 1.03825521, "epoch": 0.10004509243950098, "flos": 22784925928320.0, "grad_norm": 1.7925330820671084, "language_loss": 0.70140731, "learning_rate": 3.948754243526191e-06, "loss": 0.72384882, "num_input_tokens_seen": 35861545, "step": 1664, "time_per_iteration": 2.809300184249878 }, { "auxiliary_loss_clip": 0.01142878, "auxiliary_loss_mlp": 0.01055306, "balance_loss_clip": 1.05475903, "balance_loss_mlp": 1.03312087, "epoch": 0.10010521569216894, "flos": 16253385667200.0, "grad_norm": 2.4978474602303895, "language_loss": 0.78981555, "learning_rate": 3.94866660866797e-06, "loss": 0.81179744, "num_input_tokens_seen": 35878295, "step": 1665, "time_per_iteration": 2.7010488510131836 }, { "auxiliary_loss_clip": 0.01175861, "auxiliary_loss_mlp": 0.01070341, "balance_loss_clip": 1.06286561, "balance_loss_mlp": 1.04742861, "epoch": 0.10016533894483691, "flos": 23402589223680.0, "grad_norm": 3.1438625724360945, "language_loss": 0.70054829, "learning_rate": 3.9485788999161165e-06, "loss": 0.7230103, "num_input_tokens_seen": 35898990, "step": 1666, "time_per_iteration": 2.689879894256592 }, { "auxiliary_loss_clip": 0.01110848, "auxiliary_loss_mlp": 0.01074593, "balance_loss_clip": 1.05082703, "balance_loss_mlp": 1.04946339, "epoch": 0.10022546219750489, "flos": 19354164163200.0, "grad_norm": 1.7583449522195267, "language_loss": 0.78647351, "learning_rate": 3.948491117273956e-06, "loss": 0.80832791, "num_input_tokens_seen": 35916225, "step": 1667, "time_per_iteration": 2.8973352909088135 }, { "auxiliary_loss_clip": 0.01153352, "auxiliary_loss_mlp": 0.01062819, "balance_loss_clip": 1.05452693, "balance_loss_mlp": 1.03752255, "epoch": 0.10028558545017285, "flos": 27085766837760.0, "grad_norm": 2.4011089045072187, "language_loss": 0.77357388, "learning_rate": 3.948403260744817e-06, "loss": 0.7957356, "num_input_tokens_seen": 35934630, "step": 1668, "time_per_iteration": 3.2600321769714355 }, { "auxiliary_loss_clip": 0.01184879, "auxiliary_loss_mlp": 0.01059367, "balance_loss_clip": 1.05833495, "balance_loss_mlp": 1.03523922, "epoch": 0.10034570870284082, "flos": 25847136195840.0, "grad_norm": 1.7407668002390366, "language_loss": 0.77520061, "learning_rate": 3.948315330332031e-06, "loss": 0.79764307, "num_input_tokens_seen": 35953855, "step": 1669, "time_per_iteration": 2.6899471282958984 }, { "auxiliary_loss_clip": 0.0118887, "auxiliary_loss_mlp": 0.01067842, "balance_loss_clip": 1.05948365, "balance_loss_mlp": 1.04416728, "epoch": 0.1004058319555088, "flos": 26249587153920.0, "grad_norm": 5.441134829238958, "language_loss": 0.85160148, "learning_rate": 3.948227326038933e-06, "loss": 0.87416857, "num_input_tokens_seen": 35974555, "step": 1670, "time_per_iteration": 2.616867780685425 }, { "auxiliary_loss_clip": 0.011763, "auxiliary_loss_mlp": 0.01055607, "balance_loss_clip": 1.05584121, "balance_loss_mlp": 1.03354108, "epoch": 0.10046595520817676, "flos": 25374480105600.0, "grad_norm": 1.4849262119454174, "language_loss": 0.76836258, "learning_rate": 3.9481392478688586e-06, "loss": 0.79068166, "num_input_tokens_seen": 35996830, "step": 1671, "time_per_iteration": 2.658254384994507 }, { "auxiliary_loss_clip": 0.01061447, "auxiliary_loss_mlp": 0.01017561, "balance_loss_clip": 1.02178144, "balance_loss_mlp": 1.01454473, "epoch": 0.10052607846084473, "flos": 67461821677440.0, "grad_norm": 0.7781454358921105, "language_loss": 0.60718858, "learning_rate": 3.948051095825149e-06, "loss": 0.62797856, "num_input_tokens_seen": 36054465, "step": 1672, "time_per_iteration": 3.1269097328186035 }, { "auxiliary_loss_clip": 0.01143177, "auxiliary_loss_mlp": 0.01063346, "balance_loss_clip": 1.05112922, "balance_loss_mlp": 1.04055333, "epoch": 0.10058620171351271, "flos": 21360493209600.0, "grad_norm": 2.433278134910662, "language_loss": 0.7711426, "learning_rate": 3.947962869911147e-06, "loss": 0.79320776, "num_input_tokens_seen": 36073480, "step": 1673, "time_per_iteration": 2.6931638717651367 }, { "auxiliary_loss_clip": 0.01132094, "auxiliary_loss_mlp": 0.01056611, "balance_loss_clip": 1.04989302, "balance_loss_mlp": 1.03262639, "epoch": 0.10064632496618067, "flos": 16800125558400.0, "grad_norm": 2.074683072839241, "language_loss": 0.73173523, "learning_rate": 3.947874570130197e-06, "loss": 0.75362229, "num_input_tokens_seen": 36091830, "step": 1674, "time_per_iteration": 2.7188127040863037 }, { "auxiliary_loss_clip": 0.01172389, "auxiliary_loss_mlp": 0.00779533, "balance_loss_clip": 1.0556165, "balance_loss_mlp": 1.00024796, "epoch": 0.10070644821884864, "flos": 23624445576960.0, "grad_norm": 2.1982379565146872, "language_loss": 0.79456973, "learning_rate": 3.947786196485649e-06, "loss": 0.81408894, "num_input_tokens_seen": 36111400, "step": 1675, "time_per_iteration": 2.712090253829956 }, { "auxiliary_loss_clip": 0.01182659, "auxiliary_loss_mlp": 0.01063327, "balance_loss_clip": 1.05801332, "balance_loss_mlp": 1.04239404, "epoch": 0.1007665714715166, "flos": 24462564595200.0, "grad_norm": 2.408955682155161, "language_loss": 0.8120935, "learning_rate": 3.947697748980853e-06, "loss": 0.83455336, "num_input_tokens_seen": 36129345, "step": 1676, "time_per_iteration": 2.685472249984741 }, { "auxiliary_loss_clip": 0.01175397, "auxiliary_loss_mlp": 0.01057105, "balance_loss_clip": 1.05950332, "balance_loss_mlp": 1.03546858, "epoch": 0.10082669472418458, "flos": 16799119977600.0, "grad_norm": 2.008035557658629, "language_loss": 0.86132157, "learning_rate": 3.947609227619163e-06, "loss": 0.88364655, "num_input_tokens_seen": 36146255, "step": 1677, "time_per_iteration": 2.6589157581329346 }, { "auxiliary_loss_clip": 0.01162997, "auxiliary_loss_mlp": 0.010508, "balance_loss_clip": 1.05363441, "balance_loss_mlp": 1.02896047, "epoch": 0.10088681797685255, "flos": 13553513844480.0, "grad_norm": 2.160847391025828, "language_loss": 0.86006588, "learning_rate": 3.947520632403936e-06, "loss": 0.88220382, "num_input_tokens_seen": 36164050, "step": 1678, "time_per_iteration": 2.694347858428955 }, { "auxiliary_loss_clip": 0.0116292, "auxiliary_loss_mlp": 0.01056376, "balance_loss_clip": 1.0587275, "balance_loss_mlp": 1.03406048, "epoch": 0.10094694122952051, "flos": 25265706744960.0, "grad_norm": 12.700254532531051, "language_loss": 0.89978886, "learning_rate": 3.947431963338532e-06, "loss": 0.92198181, "num_input_tokens_seen": 36183530, "step": 1679, "time_per_iteration": 2.6741397380828857 }, { "auxiliary_loss_clip": 0.01071086, "auxiliary_loss_mlp": 0.0101685, "balance_loss_clip": 1.02328789, "balance_loss_mlp": 1.01360798, "epoch": 0.10100706448218849, "flos": 69854299885440.0, "grad_norm": 0.7882499243548835, "language_loss": 0.52985126, "learning_rate": 3.947343220426312e-06, "loss": 0.55073065, "num_input_tokens_seen": 36248550, "step": 1680, "time_per_iteration": 3.169893503189087 }, { "auxiliary_loss_clip": 0.01185252, "auxiliary_loss_mlp": 0.00779951, "balance_loss_clip": 1.06022644, "balance_loss_mlp": 1.00017488, "epoch": 0.10106718773485646, "flos": 20007163463040.0, "grad_norm": 1.6642182724084642, "language_loss": 0.76869059, "learning_rate": 3.947254403670641e-06, "loss": 0.7883426, "num_input_tokens_seen": 36266065, "step": 1681, "time_per_iteration": 4.146950006484985 }, { "auxiliary_loss_clip": 0.01156046, "auxiliary_loss_mlp": 0.01059972, "balance_loss_clip": 1.0539515, "balance_loss_mlp": 1.03469992, "epoch": 0.10112731098752442, "flos": 13479825093120.0, "grad_norm": 2.3884003317971225, "language_loss": 0.93957508, "learning_rate": 3.947165513074889e-06, "loss": 0.96173531, "num_input_tokens_seen": 36280960, "step": 1682, "time_per_iteration": 4.220505237579346 }, { "auxiliary_loss_clip": 0.01173183, "auxiliary_loss_mlp": 0.01053261, "balance_loss_clip": 1.05487084, "balance_loss_mlp": 1.03133821, "epoch": 0.1011874342401924, "flos": 18515901490560.0, "grad_norm": 3.5300660189263917, "language_loss": 0.87618893, "learning_rate": 3.947076548642425e-06, "loss": 0.89845335, "num_input_tokens_seen": 36299010, "step": 1683, "time_per_iteration": 2.635636329650879 }, { "auxiliary_loss_clip": 0.01128888, "auxiliary_loss_mlp": 0.01063089, "balance_loss_clip": 1.04814756, "balance_loss_mlp": 1.04008126, "epoch": 0.10124755749286037, "flos": 20702861055360.0, "grad_norm": 2.3337760241024923, "language_loss": 0.74566805, "learning_rate": 3.946987510376624e-06, "loss": 0.76758784, "num_input_tokens_seen": 36318400, "step": 1684, "time_per_iteration": 4.417364835739136 }, { "auxiliary_loss_clip": 0.01053031, "auxiliary_loss_mlp": 0.0101182, "balance_loss_clip": 1.02547038, "balance_loss_mlp": 1.00853014, "epoch": 0.10130768074552833, "flos": 56109456247680.0, "grad_norm": 0.7564631726021327, "language_loss": 0.61085057, "learning_rate": 3.9468983982808615e-06, "loss": 0.63149905, "num_input_tokens_seen": 36381815, "step": 1685, "time_per_iteration": 4.87179970741272 }, { "auxiliary_loss_clip": 0.01157045, "auxiliary_loss_mlp": 0.01056064, "balance_loss_clip": 1.05233479, "balance_loss_mlp": 1.0341655, "epoch": 0.1013678039981963, "flos": 33402346156800.0, "grad_norm": 4.297801792672815, "language_loss": 0.61381406, "learning_rate": 3.946809212358516e-06, "loss": 0.6359452, "num_input_tokens_seen": 36404320, "step": 1686, "time_per_iteration": 2.8289108276367188 }, { "auxiliary_loss_clip": 0.01144631, "auxiliary_loss_mlp": 0.01059888, "balance_loss_clip": 1.05645001, "balance_loss_mlp": 1.03678524, "epoch": 0.10142792725086427, "flos": 31905338008320.0, "grad_norm": 2.21923850158845, "language_loss": 0.81216162, "learning_rate": 3.946719952612972e-06, "loss": 0.83420682, "num_input_tokens_seen": 36427510, "step": 1687, "time_per_iteration": 2.947535276412964 }, { "auxiliary_loss_clip": 0.0117612, "auxiliary_loss_mlp": 0.0105614, "balance_loss_clip": 1.05933213, "balance_loss_mlp": 1.03403926, "epoch": 0.10148805050353224, "flos": 28475905046400.0, "grad_norm": 1.7955898786084035, "language_loss": 0.71943259, "learning_rate": 3.94663061904761e-06, "loss": 0.74175525, "num_input_tokens_seen": 36448230, "step": 1688, "time_per_iteration": 2.693249225616455 }, { "auxiliary_loss_clip": 0.01151953, "auxiliary_loss_mlp": 0.01063362, "balance_loss_clip": 1.05288756, "balance_loss_mlp": 1.04079556, "epoch": 0.1015481737562002, "flos": 25148888737920.0, "grad_norm": 2.636795901714516, "language_loss": 0.86876953, "learning_rate": 3.94654121166582e-06, "loss": 0.89092261, "num_input_tokens_seen": 36464395, "step": 1689, "time_per_iteration": 2.677992820739746 }, { "auxiliary_loss_clip": 0.01172188, "auxiliary_loss_mlp": 0.01057982, "balance_loss_clip": 1.05476904, "balance_loss_mlp": 1.0378834, "epoch": 0.10160829700886818, "flos": 30882781630080.0, "grad_norm": 2.2211105929909696, "language_loss": 0.88170946, "learning_rate": 3.946451730470993e-06, "loss": 0.90401113, "num_input_tokens_seen": 36486475, "step": 1690, "time_per_iteration": 2.707209348678589 }, { "auxiliary_loss_clip": 0.01158767, "auxiliary_loss_mlp": 0.01052386, "balance_loss_clip": 1.05507553, "balance_loss_mlp": 1.02973664, "epoch": 0.10166842026153615, "flos": 20412020632320.0, "grad_norm": 2.08291471600754, "language_loss": 0.83348423, "learning_rate": 3.946362175466521e-06, "loss": 0.85559577, "num_input_tokens_seen": 36505310, "step": 1691, "time_per_iteration": 2.6521170139312744 }, { "auxiliary_loss_clip": 0.01162159, "auxiliary_loss_mlp": 0.01051716, "balance_loss_clip": 1.05550599, "balance_loss_mlp": 1.03016281, "epoch": 0.10172854351420411, "flos": 33476968661760.0, "grad_norm": 1.704519528530946, "language_loss": 0.66773653, "learning_rate": 3.946272546655801e-06, "loss": 0.68987525, "num_input_tokens_seen": 36529820, "step": 1692, "time_per_iteration": 2.799353837966919 }, { "auxiliary_loss_clip": 0.01144502, "auxiliary_loss_mlp": 0.0107473, "balance_loss_clip": 1.05057836, "balance_loss_mlp": 1.05258095, "epoch": 0.1017886667668721, "flos": 23550325862400.0, "grad_norm": 1.8345924563029705, "language_loss": 0.75939322, "learning_rate": 3.94618284404223e-06, "loss": 0.78158557, "num_input_tokens_seen": 36549000, "step": 1693, "time_per_iteration": 2.6711113452911377 }, { "auxiliary_loss_clip": 0.01132621, "auxiliary_loss_mlp": 0.01057162, "balance_loss_clip": 1.04893303, "balance_loss_mlp": 1.03289056, "epoch": 0.10184879001954006, "flos": 23296078419840.0, "grad_norm": 1.7027745569702395, "language_loss": 0.87503564, "learning_rate": 3.9460930676292105e-06, "loss": 0.89693356, "num_input_tokens_seen": 36567515, "step": 1694, "time_per_iteration": 2.749119520187378 }, { "auxiliary_loss_clip": 0.01130673, "auxiliary_loss_mlp": 0.01058451, "balance_loss_clip": 1.04954553, "balance_loss_mlp": 1.033095, "epoch": 0.10190891327220802, "flos": 18333116156160.0, "grad_norm": 1.7649462193878245, "language_loss": 0.79299057, "learning_rate": 3.946003217420147e-06, "loss": 0.8148818, "num_input_tokens_seen": 36586190, "step": 1695, "time_per_iteration": 2.839081048965454 }, { "auxiliary_loss_clip": 0.0112732, "auxiliary_loss_mlp": 0.01061103, "balance_loss_clip": 1.04818296, "balance_loss_mlp": 1.03772628, "epoch": 0.10196903652487599, "flos": 26465374108800.0, "grad_norm": 2.7190993931598446, "language_loss": 0.86494684, "learning_rate": 3.945913293418447e-06, "loss": 0.88683105, "num_input_tokens_seen": 36607495, "step": 1696, "time_per_iteration": 2.7802348136901855 }, { "auxiliary_loss_clip": 0.01168675, "auxiliary_loss_mlp": 0.01054661, "balance_loss_clip": 1.05711746, "balance_loss_mlp": 1.03315568, "epoch": 0.10202915977754397, "flos": 21869526798720.0, "grad_norm": 1.7889048836535288, "language_loss": 0.82350796, "learning_rate": 3.945823295627519e-06, "loss": 0.84574133, "num_input_tokens_seen": 36628555, "step": 1697, "time_per_iteration": 2.667962074279785 }, { "auxiliary_loss_clip": 0.01184333, "auxiliary_loss_mlp": 0.01055548, "balance_loss_clip": 1.05680871, "balance_loss_mlp": 1.033149, "epoch": 0.10208928303021193, "flos": 22309755886080.0, "grad_norm": 2.0464291543972006, "language_loss": 0.81198204, "learning_rate": 3.9457332240507775e-06, "loss": 0.83438087, "num_input_tokens_seen": 36646250, "step": 1698, "time_per_iteration": 2.6484432220458984 }, { "auxiliary_loss_clip": 0.01150498, "auxiliary_loss_mlp": 0.01053546, "balance_loss_clip": 1.05696845, "balance_loss_mlp": 1.03226686, "epoch": 0.1021494062828799, "flos": 22125569921280.0, "grad_norm": 2.3020250981163226, "language_loss": 0.75612724, "learning_rate": 3.945643078691637e-06, "loss": 0.77816761, "num_input_tokens_seen": 36666675, "step": 1699, "time_per_iteration": 2.8040614128112793 }, { "auxiliary_loss_clip": 0.01162088, "auxiliary_loss_mlp": 0.01050379, "balance_loss_clip": 1.06041551, "balance_loss_mlp": 1.02827764, "epoch": 0.10220952953554788, "flos": 19646728439040.0, "grad_norm": 1.6839869206777538, "language_loss": 0.80395639, "learning_rate": 3.945552859553516e-06, "loss": 0.8260811, "num_input_tokens_seen": 36685225, "step": 1700, "time_per_iteration": 2.6701290607452393 }, { "auxiliary_loss_clip": 0.0117076, "auxiliary_loss_mlp": 0.0104804, "balance_loss_clip": 1.05714083, "balance_loss_mlp": 1.02653444, "epoch": 0.10226965278821584, "flos": 29787290686080.0, "grad_norm": 2.102621975458346, "language_loss": 0.76877582, "learning_rate": 3.945462566639836e-06, "loss": 0.79096377, "num_input_tokens_seen": 36705985, "step": 1701, "time_per_iteration": 2.748201847076416 }, { "auxiliary_loss_clip": 0.01182259, "auxiliary_loss_mlp": 0.01050364, "balance_loss_clip": 1.06157088, "balance_loss_mlp": 1.02852523, "epoch": 0.10232977604088381, "flos": 27016818681600.0, "grad_norm": 2.1099726588763965, "language_loss": 0.77922845, "learning_rate": 3.945372199954019e-06, "loss": 0.80155474, "num_input_tokens_seen": 36725815, "step": 1702, "time_per_iteration": 2.6703274250030518 }, { "auxiliary_loss_clip": 0.01156323, "auxiliary_loss_mlp": 0.01052524, "balance_loss_clip": 1.05596721, "balance_loss_mlp": 1.03126872, "epoch": 0.10238989929355179, "flos": 20777519473920.0, "grad_norm": 2.2326457826946293, "language_loss": 0.94093609, "learning_rate": 3.945281759499494e-06, "loss": 0.96302462, "num_input_tokens_seen": 36742345, "step": 1703, "time_per_iteration": 2.6712698936462402 }, { "auxiliary_loss_clip": 0.01034483, "auxiliary_loss_mlp": 0.01037784, "balance_loss_clip": 1.02765131, "balance_loss_mlp": 1.03315914, "epoch": 0.10245002254621975, "flos": 57698322451200.0, "grad_norm": 0.8815387598011586, "language_loss": 0.55096036, "learning_rate": 3.94519124527969e-06, "loss": 0.57168299, "num_input_tokens_seen": 36798775, "step": 1704, "time_per_iteration": 3.2863855361938477 }, { "auxiliary_loss_clip": 0.01186822, "auxiliary_loss_mlp": 0.01053701, "balance_loss_clip": 1.06026638, "balance_loss_mlp": 1.03088403, "epoch": 0.10251014579888772, "flos": 16800125558400.0, "grad_norm": 2.051901555713709, "language_loss": 0.84025991, "learning_rate": 3.945100657298039e-06, "loss": 0.86266518, "num_input_tokens_seen": 36816295, "step": 1705, "time_per_iteration": 2.8991851806640625 }, { "auxiliary_loss_clip": 0.01045354, "auxiliary_loss_mlp": 0.01018361, "balance_loss_clip": 1.02622223, "balance_loss_mlp": 1.01526153, "epoch": 0.1025702690515557, "flos": 68565500922240.0, "grad_norm": 0.7692746082941451, "language_loss": 0.60408181, "learning_rate": 3.9450099955579765e-06, "loss": 0.62471896, "num_input_tokens_seen": 36882030, "step": 1706, "time_per_iteration": 3.2174558639526367 }, { "auxiliary_loss_clip": 0.01149922, "auxiliary_loss_mlp": 0.01051211, "balance_loss_clip": 1.05388391, "balance_loss_mlp": 1.02812052, "epoch": 0.10263039230422366, "flos": 14866623336960.0, "grad_norm": 2.201796189576969, "language_loss": 0.85937822, "learning_rate": 3.94491926006294e-06, "loss": 0.88138962, "num_input_tokens_seen": 36899245, "step": 1707, "time_per_iteration": 2.689208507537842 }, { "auxiliary_loss_clip": 0.01169165, "auxiliary_loss_mlp": 0.0105297, "balance_loss_clip": 1.05941081, "balance_loss_mlp": 1.03114319, "epoch": 0.10269051555689163, "flos": 25337599816320.0, "grad_norm": 1.471109036018689, "language_loss": 0.73299325, "learning_rate": 3.944828450816369e-06, "loss": 0.75521457, "num_input_tokens_seen": 36920950, "step": 1708, "time_per_iteration": 2.679760456085205 }, { "auxiliary_loss_clip": 0.01155833, "auxiliary_loss_mlp": 0.00780571, "balance_loss_clip": 1.05718231, "balance_loss_mlp": 1.00042295, "epoch": 0.10275063880955959, "flos": 21068826773760.0, "grad_norm": 1.7051644476897239, "language_loss": 0.91616452, "learning_rate": 3.944737567821709e-06, "loss": 0.93552846, "num_input_tokens_seen": 36938900, "step": 1709, "time_per_iteration": 2.6754679679870605 }, { "auxiliary_loss_clip": 0.01124911, "auxiliary_loss_mlp": 0.01057008, "balance_loss_clip": 1.05144072, "balance_loss_mlp": 1.0343945, "epoch": 0.10281076206222757, "flos": 30366780802560.0, "grad_norm": 2.1056252966717275, "language_loss": 0.88004494, "learning_rate": 3.944646611082406e-06, "loss": 0.90186411, "num_input_tokens_seen": 36957010, "step": 1710, "time_per_iteration": 2.708723306655884 }, { "auxiliary_loss_clip": 0.01171004, "auxiliary_loss_mlp": 0.0105967, "balance_loss_clip": 1.05658317, "balance_loss_mlp": 1.036973, "epoch": 0.10287088531489554, "flos": 22418313765120.0, "grad_norm": 1.7046493271202992, "language_loss": 0.79370153, "learning_rate": 3.944555580601908e-06, "loss": 0.81600821, "num_input_tokens_seen": 36977690, "step": 1711, "time_per_iteration": 2.631908416748047 }, { "auxiliary_loss_clip": 0.01156003, "auxiliary_loss_mlp": 0.01055126, "balance_loss_clip": 1.05841637, "balance_loss_mlp": 1.03189242, "epoch": 0.1029310085675635, "flos": 25115994858240.0, "grad_norm": 3.2168061349371135, "language_loss": 0.73666596, "learning_rate": 3.944464476383668e-06, "loss": 0.75877726, "num_input_tokens_seen": 36997300, "step": 1712, "time_per_iteration": 2.7107467651367188 }, { "auxiliary_loss_clip": 0.01133407, "auxiliary_loss_mlp": 0.01056055, "balance_loss_clip": 1.05496907, "balance_loss_mlp": 1.03334546, "epoch": 0.10299113182023148, "flos": 19865639877120.0, "grad_norm": 1.974447377126898, "language_loss": 0.87049067, "learning_rate": 3.94437329843114e-06, "loss": 0.89238536, "num_input_tokens_seen": 37016110, "step": 1713, "time_per_iteration": 2.6532411575317383 }, { "auxiliary_loss_clip": 0.0116832, "auxiliary_loss_mlp": 0.01060237, "balance_loss_clip": 1.05669498, "balance_loss_mlp": 1.03877962, "epoch": 0.10305125507289944, "flos": 20447608032000.0, "grad_norm": 1.57388574383124, "language_loss": 0.72406238, "learning_rate": 3.944282046747782e-06, "loss": 0.74634796, "num_input_tokens_seen": 37036405, "step": 1714, "time_per_iteration": 2.5987610816955566 }, { "auxiliary_loss_clip": 0.01174482, "auxiliary_loss_mlp": 0.01063165, "balance_loss_clip": 1.05715692, "balance_loss_mlp": 1.03934693, "epoch": 0.10311137832556741, "flos": 26250772302720.0, "grad_norm": 2.1959530175190434, "language_loss": 0.91065919, "learning_rate": 3.944190721337053e-06, "loss": 0.93303567, "num_input_tokens_seen": 37057580, "step": 1715, "time_per_iteration": 2.743833303451538 }, { "auxiliary_loss_clip": 0.01170297, "auxiliary_loss_mlp": 0.01054891, "balance_loss_clip": 1.05448914, "balance_loss_mlp": 1.03305221, "epoch": 0.10317150157823539, "flos": 35298932175360.0, "grad_norm": 1.8741123562687005, "language_loss": 0.75969976, "learning_rate": 3.944099322202418e-06, "loss": 0.78195167, "num_input_tokens_seen": 37079120, "step": 1716, "time_per_iteration": 2.748903274536133 }, { "auxiliary_loss_clip": 0.01162664, "auxiliary_loss_mlp": 0.01061895, "balance_loss_clip": 1.05617428, "balance_loss_mlp": 1.03804111, "epoch": 0.10323162483090335, "flos": 25739943033600.0, "grad_norm": 3.178190042364093, "language_loss": 0.85308528, "learning_rate": 3.944007849347342e-06, "loss": 0.87533092, "num_input_tokens_seen": 37099710, "step": 1717, "time_per_iteration": 2.690772533416748 }, { "auxiliary_loss_clip": 0.01127019, "auxiliary_loss_mlp": 0.01067935, "balance_loss_clip": 1.05048633, "balance_loss_mlp": 1.04436755, "epoch": 0.10329174808357132, "flos": 16289870906880.0, "grad_norm": 1.8474438265561113, "language_loss": 0.82945001, "learning_rate": 3.943916302775292e-06, "loss": 0.85139954, "num_input_tokens_seen": 37117775, "step": 1718, "time_per_iteration": 2.7029476165771484 }, { "auxiliary_loss_clip": 0.01171184, "auxiliary_loss_mlp": 0.01049869, "balance_loss_clip": 1.05912328, "balance_loss_mlp": 1.02701616, "epoch": 0.10335187133623928, "flos": 36687166963200.0, "grad_norm": 1.7728224248964342, "language_loss": 0.73396438, "learning_rate": 3.943824682489742e-06, "loss": 0.75617492, "num_input_tokens_seen": 37140280, "step": 1719, "time_per_iteration": 2.7653820514678955 }, { "auxiliary_loss_clip": 0.01168859, "auxiliary_loss_mlp": 0.01048444, "balance_loss_clip": 1.05861163, "balance_loss_mlp": 1.02786827, "epoch": 0.10341199458890726, "flos": 14975648092800.0, "grad_norm": 1.7819459058763836, "language_loss": 0.92692196, "learning_rate": 3.9437329884941665e-06, "loss": 0.94909501, "num_input_tokens_seen": 37158350, "step": 1720, "time_per_iteration": 4.1962480545043945 }, { "auxiliary_loss_clip": 0.01139894, "auxiliary_loss_mlp": 0.01051033, "balance_loss_clip": 1.05092323, "balance_loss_mlp": 1.02827597, "epoch": 0.10347211784157523, "flos": 21031587348480.0, "grad_norm": 1.6861044154168399, "language_loss": 0.79497123, "learning_rate": 3.943641220792039e-06, "loss": 0.81688046, "num_input_tokens_seen": 37177120, "step": 1721, "time_per_iteration": 4.524151802062988 }, { "auxiliary_loss_clip": 0.01130482, "auxiliary_loss_mlp": 0.01067754, "balance_loss_clip": 1.05380797, "balance_loss_mlp": 1.04109859, "epoch": 0.1035322410942432, "flos": 19792094780160.0, "grad_norm": 1.951940775381607, "language_loss": 0.80707669, "learning_rate": 3.9435493793868434e-06, "loss": 0.829059, "num_input_tokens_seen": 37195895, "step": 1722, "time_per_iteration": 2.7972562313079834 }, { "auxiliary_loss_clip": 0.01059018, "auxiliary_loss_mlp": 0.01038991, "balance_loss_clip": 1.02668202, "balance_loss_mlp": 1.03536737, "epoch": 0.10359236434691117, "flos": 52698874947840.0, "grad_norm": 0.9413879826908518, "language_loss": 0.67161834, "learning_rate": 3.943457464282059e-06, "loss": 0.69259846, "num_input_tokens_seen": 37247270, "step": 1723, "time_per_iteration": 4.899553060531616 }, { "auxiliary_loss_clip": 0.01169875, "auxiliary_loss_mlp": 0.01062977, "balance_loss_clip": 1.05482125, "balance_loss_mlp": 1.04193664, "epoch": 0.10365248759957914, "flos": 18405404277120.0, "grad_norm": 2.8641520576523116, "language_loss": 0.77715755, "learning_rate": 3.9433654754811745e-06, "loss": 0.7994861, "num_input_tokens_seen": 37265595, "step": 1724, "time_per_iteration": 2.7613437175750732 }, { "auxiliary_loss_clip": 0.01151829, "auxiliary_loss_mlp": 0.01069246, "balance_loss_clip": 1.05667496, "balance_loss_mlp": 1.04753852, "epoch": 0.1037126108522471, "flos": 47553555335040.0, "grad_norm": 2.6433978354033543, "language_loss": 0.74533165, "learning_rate": 3.943273412987676e-06, "loss": 0.76754242, "num_input_tokens_seen": 37286660, "step": 1725, "time_per_iteration": 4.557274580001831 }, { "auxiliary_loss_clip": 0.01137065, "auxiliary_loss_mlp": 0.01081067, "balance_loss_clip": 1.05264461, "balance_loss_mlp": 1.05832207, "epoch": 0.10377273410491508, "flos": 22816670572800.0, "grad_norm": 2.2241153649877865, "language_loss": 0.75043738, "learning_rate": 3.943181276805054e-06, "loss": 0.77261865, "num_input_tokens_seen": 37304915, "step": 1726, "time_per_iteration": 2.7098495960235596 }, { "auxiliary_loss_clip": 0.01150932, "auxiliary_loss_mlp": 0.0107864, "balance_loss_clip": 1.05345368, "balance_loss_mlp": 1.05610991, "epoch": 0.10383285735758305, "flos": 26138694890880.0, "grad_norm": 2.783771441956431, "language_loss": 0.73243797, "learning_rate": 3.9430890669368035e-06, "loss": 0.75473368, "num_input_tokens_seen": 37325265, "step": 1727, "time_per_iteration": 2.74774169921875 }, { "auxiliary_loss_clip": 0.01157922, "auxiliary_loss_mlp": 0.01068007, "balance_loss_clip": 1.05303776, "balance_loss_mlp": 1.04625082, "epoch": 0.10389298061025101, "flos": 17091791994240.0, "grad_norm": 2.172978726198527, "language_loss": 0.84373868, "learning_rate": 3.942996783386422e-06, "loss": 0.86599791, "num_input_tokens_seen": 37341650, "step": 1728, "time_per_iteration": 2.675724744796753 }, { "auxiliary_loss_clip": 0.01154897, "auxiliary_loss_mlp": 0.01060505, "balance_loss_clip": 1.0545603, "balance_loss_mlp": 1.0393219, "epoch": 0.10395310386291898, "flos": 20776513893120.0, "grad_norm": 2.1406499008555513, "language_loss": 0.70776087, "learning_rate": 3.942904426157406e-06, "loss": 0.7299149, "num_input_tokens_seen": 37360270, "step": 1729, "time_per_iteration": 2.6885008811950684 }, { "auxiliary_loss_clip": 0.01158623, "auxiliary_loss_mlp": 0.01068311, "balance_loss_clip": 1.05437422, "balance_loss_mlp": 1.04520774, "epoch": 0.10401322711558696, "flos": 12820540913280.0, "grad_norm": 2.4133379049648283, "language_loss": 0.81237471, "learning_rate": 3.9428119952532605e-06, "loss": 0.83464402, "num_input_tokens_seen": 37375225, "step": 1730, "time_per_iteration": 2.6659536361694336 }, { "auxiliary_loss_clip": 0.01085856, "auxiliary_loss_mlp": 0.01063394, "balance_loss_clip": 1.04733562, "balance_loss_mlp": 1.04314065, "epoch": 0.10407335036825492, "flos": 23184683366400.0, "grad_norm": 1.6634499611984725, "language_loss": 0.75829297, "learning_rate": 3.942719490677489e-06, "loss": 0.77978551, "num_input_tokens_seen": 37395165, "step": 1731, "time_per_iteration": 3.043125629425049 }, { "auxiliary_loss_clip": 0.01129913, "auxiliary_loss_mlp": 0.01065783, "balance_loss_clip": 1.0526607, "balance_loss_mlp": 1.04604149, "epoch": 0.10413347362092289, "flos": 26104184899200.0, "grad_norm": 1.8280179918091173, "language_loss": 0.8268069, "learning_rate": 3.9426269124336e-06, "loss": 0.84876388, "num_input_tokens_seen": 37414845, "step": 1732, "time_per_iteration": 2.96221661567688 }, { "auxiliary_loss_clip": 0.01141505, "auxiliary_loss_mlp": 0.01067805, "balance_loss_clip": 1.05805755, "balance_loss_mlp": 1.04852867, "epoch": 0.10419359687359087, "flos": 12641059630080.0, "grad_norm": 1.9919813178368582, "language_loss": 0.83320522, "learning_rate": 3.942534260525104e-06, "loss": 0.85529828, "num_input_tokens_seen": 37432490, "step": 1733, "time_per_iteration": 2.7364420890808105 }, { "auxiliary_loss_clip": 0.01153374, "auxiliary_loss_mlp": 0.0106675, "balance_loss_clip": 1.05592012, "balance_loss_mlp": 1.04654372, "epoch": 0.10425372012625883, "flos": 12125094716160.0, "grad_norm": 2.4441875881355597, "language_loss": 0.76683885, "learning_rate": 3.942441534955514e-06, "loss": 0.78904009, "num_input_tokens_seen": 37449435, "step": 1734, "time_per_iteration": 2.669623851776123 }, { "auxiliary_loss_clip": 0.0113597, "auxiliary_loss_mlp": 0.01052567, "balance_loss_clip": 1.05042601, "balance_loss_mlp": 1.03255177, "epoch": 0.1043138433789268, "flos": 25337563902720.0, "grad_norm": 1.6775801166329647, "language_loss": 0.74826896, "learning_rate": 3.9423487357283465e-06, "loss": 0.7701543, "num_input_tokens_seen": 37469105, "step": 1735, "time_per_iteration": 2.8477160930633545 }, { "auxiliary_loss_clip": 0.01167698, "auxiliary_loss_mlp": 0.01055716, "balance_loss_clip": 1.05678105, "balance_loss_mlp": 1.0344727, "epoch": 0.10437396663159478, "flos": 29167149352320.0, "grad_norm": 1.7228393064183538, "language_loss": 0.78835273, "learning_rate": 3.94225586284712e-06, "loss": 0.81058681, "num_input_tokens_seen": 37490540, "step": 1736, "time_per_iteration": 2.690453052520752 }, { "auxiliary_loss_clip": 0.0116734, "auxiliary_loss_mlp": 0.01064692, "balance_loss_clip": 1.05800533, "balance_loss_mlp": 1.04357982, "epoch": 0.10443408988426274, "flos": 25080946162560.0, "grad_norm": 1.8549131823334455, "language_loss": 0.7058785, "learning_rate": 3.942162916315356e-06, "loss": 0.72819883, "num_input_tokens_seen": 37511905, "step": 1737, "time_per_iteration": 2.6296744346618652 }, { "auxiliary_loss_clip": 0.01150138, "auxiliary_loss_mlp": 0.01059407, "balance_loss_clip": 1.04806042, "balance_loss_mlp": 1.03600669, "epoch": 0.1044942131369307, "flos": 26759662237440.0, "grad_norm": 2.415613377802324, "language_loss": 0.81624997, "learning_rate": 3.942069896136581e-06, "loss": 0.83834541, "num_input_tokens_seen": 37533635, "step": 1738, "time_per_iteration": 2.7436723709106445 }, { "auxiliary_loss_clip": 0.01181471, "auxiliary_loss_mlp": 0.01062035, "balance_loss_clip": 1.05579174, "balance_loss_mlp": 1.03950453, "epoch": 0.10455433638959867, "flos": 18442571875200.0, "grad_norm": 2.1004590024567897, "language_loss": 0.75419426, "learning_rate": 3.9419768023143196e-06, "loss": 0.77662933, "num_input_tokens_seen": 37552035, "step": 1739, "time_per_iteration": 2.585538148880005 }, { "auxiliary_loss_clip": 0.01146716, "auxiliary_loss_mlp": 0.01054893, "balance_loss_clip": 1.05417264, "balance_loss_mlp": 1.03348303, "epoch": 0.10461445964226665, "flos": 23218977876480.0, "grad_norm": 1.586314706443492, "language_loss": 0.77523744, "learning_rate": 3.941883634852104e-06, "loss": 0.79725355, "num_input_tokens_seen": 37571540, "step": 1740, "time_per_iteration": 2.8947789669036865 }, { "auxiliary_loss_clip": 0.01152077, "auxiliary_loss_mlp": 0.01049503, "balance_loss_clip": 1.05725431, "balance_loss_mlp": 1.0288676, "epoch": 0.10467458289493461, "flos": 24345243797760.0, "grad_norm": 1.964868695493703, "language_loss": 0.85976374, "learning_rate": 3.941790393753467e-06, "loss": 0.88177955, "num_input_tokens_seen": 37588265, "step": 1741, "time_per_iteration": 2.7706260681152344 }, { "auxiliary_loss_clip": 0.01158134, "auxiliary_loss_mlp": 0.01056311, "balance_loss_clip": 1.05614483, "balance_loss_mlp": 1.03350592, "epoch": 0.10473470614760258, "flos": 21287953693440.0, "grad_norm": 5.197245251055922, "language_loss": 0.75592613, "learning_rate": 3.941697079021942e-06, "loss": 0.77807057, "num_input_tokens_seen": 37606860, "step": 1742, "time_per_iteration": 2.784748077392578 }, { "auxiliary_loss_clip": 0.0113066, "auxiliary_loss_mlp": 0.01057571, "balance_loss_clip": 1.05678856, "balance_loss_mlp": 1.03735304, "epoch": 0.10479482940027056, "flos": 21687208341120.0, "grad_norm": 2.1426857583950416, "language_loss": 0.87614191, "learning_rate": 3.94160369066107e-06, "loss": 0.89802414, "num_input_tokens_seen": 37625210, "step": 1743, "time_per_iteration": 2.819350004196167 }, { "auxiliary_loss_clip": 0.01139959, "auxiliary_loss_mlp": 0.01048534, "balance_loss_clip": 1.0552268, "balance_loss_mlp": 1.0254786, "epoch": 0.10485495265293852, "flos": 21573694385280.0, "grad_norm": 2.060686178474056, "language_loss": 0.75927812, "learning_rate": 3.941510228674391e-06, "loss": 0.7811631, "num_input_tokens_seen": 37644110, "step": 1744, "time_per_iteration": 2.7817211151123047 }, { "auxiliary_loss_clip": 0.01170232, "auxiliary_loss_mlp": 0.01054483, "balance_loss_clip": 1.05992889, "balance_loss_mlp": 1.03442037, "epoch": 0.10491507590560649, "flos": 37961923708800.0, "grad_norm": 1.9689383181633062, "language_loss": 0.78905094, "learning_rate": 3.941416693065451e-06, "loss": 0.81129813, "num_input_tokens_seen": 37665800, "step": 1745, "time_per_iteration": 2.88080096244812 }, { "auxiliary_loss_clip": 0.01180482, "auxiliary_loss_mlp": 0.01060479, "balance_loss_clip": 1.05740213, "balance_loss_mlp": 1.03920031, "epoch": 0.10497519915827447, "flos": 26396282298240.0, "grad_norm": 2.64819141351011, "language_loss": 0.82568693, "learning_rate": 3.941323083837794e-06, "loss": 0.84809649, "num_input_tokens_seen": 37685095, "step": 1746, "time_per_iteration": 2.7068004608154297 }, { "auxiliary_loss_clip": 0.01158367, "auxiliary_loss_mlp": 0.0105595, "balance_loss_clip": 1.05737162, "balance_loss_mlp": 1.03448033, "epoch": 0.10503532241094243, "flos": 40662190581120.0, "grad_norm": 1.6274602877205533, "language_loss": 0.70573747, "learning_rate": 3.941229400994971e-06, "loss": 0.7278806, "num_input_tokens_seen": 37707445, "step": 1747, "time_per_iteration": 2.8689963817596436 }, { "auxiliary_loss_clip": 0.01159389, "auxiliary_loss_mlp": 0.01056346, "balance_loss_clip": 1.06035507, "balance_loss_mlp": 1.03492367, "epoch": 0.1050954456636104, "flos": 29789409588480.0, "grad_norm": 2.386885173400054, "language_loss": 0.8447504, "learning_rate": 3.941135644540535e-06, "loss": 0.86690772, "num_input_tokens_seen": 37728325, "step": 1748, "time_per_iteration": 2.8022749423980713 }, { "auxiliary_loss_clip": 0.01175489, "auxiliary_loss_mlp": 0.01049407, "balance_loss_clip": 1.05471563, "balance_loss_mlp": 1.02701974, "epoch": 0.10515556891627838, "flos": 23948754497280.0, "grad_norm": 1.759895679837136, "language_loss": 0.71681082, "learning_rate": 3.941041814478041e-06, "loss": 0.73905981, "num_input_tokens_seen": 37748910, "step": 1749, "time_per_iteration": 2.6568849086761475 }, { "auxiliary_loss_clip": 0.01158221, "auxiliary_loss_mlp": 0.01058697, "balance_loss_clip": 1.05427456, "balance_loss_mlp": 1.03590393, "epoch": 0.10521569216894634, "flos": 18259606972800.0, "grad_norm": 2.95022560634889, "language_loss": 0.81510806, "learning_rate": 3.940947910811047e-06, "loss": 0.83727717, "num_input_tokens_seen": 37765745, "step": 1750, "time_per_iteration": 2.6282739639282227 }, { "auxiliary_loss_clip": 0.01156475, "auxiliary_loss_mlp": 0.01062657, "balance_loss_clip": 1.06022298, "balance_loss_mlp": 1.03973269, "epoch": 0.10527581542161431, "flos": 15630909949440.0, "grad_norm": 2.2218325288878953, "language_loss": 0.92364043, "learning_rate": 3.940853933543114e-06, "loss": 0.94583178, "num_input_tokens_seen": 37780520, "step": 1751, "time_per_iteration": 2.703376531600952 }, { "auxiliary_loss_clip": 0.01165779, "auxiliary_loss_mlp": 0.01053304, "balance_loss_clip": 1.0570029, "balance_loss_mlp": 1.03171563, "epoch": 0.10533593867428227, "flos": 18296559089280.0, "grad_norm": 2.0356912608722877, "language_loss": 0.79293752, "learning_rate": 3.940759882677805e-06, "loss": 0.81512833, "num_input_tokens_seen": 37799515, "step": 1752, "time_per_iteration": 2.6501150131225586 }, { "auxiliary_loss_clip": 0.01116865, "auxiliary_loss_mlp": 0.01055489, "balance_loss_clip": 1.05116987, "balance_loss_mlp": 1.03264856, "epoch": 0.10539606192695025, "flos": 29023219555200.0, "grad_norm": 2.022904639316529, "language_loss": 0.75978744, "learning_rate": 3.940665758218686e-06, "loss": 0.78151095, "num_input_tokens_seen": 37818695, "step": 1753, "time_per_iteration": 2.871335744857788 }, { "auxiliary_loss_clip": 0.01141721, "auxiliary_loss_mlp": 0.01057356, "balance_loss_clip": 1.05547547, "balance_loss_mlp": 1.03415775, "epoch": 0.10545618517961822, "flos": 19969313506560.0, "grad_norm": 2.0563919939847914, "language_loss": 0.83969283, "learning_rate": 3.940571560169328e-06, "loss": 0.86168355, "num_input_tokens_seen": 37837860, "step": 1754, "time_per_iteration": 2.685591459274292 }, { "auxiliary_loss_clip": 0.01136802, "auxiliary_loss_mlp": 0.01053577, "balance_loss_clip": 1.05587101, "balance_loss_mlp": 1.03034329, "epoch": 0.10551630843228618, "flos": 16143427157760.0, "grad_norm": 2.7567281016961087, "language_loss": 0.68732727, "learning_rate": 3.940477288533302e-06, "loss": 0.70923102, "num_input_tokens_seen": 37856260, "step": 1755, "time_per_iteration": 2.754117727279663 }, { "auxiliary_loss_clip": 0.01161626, "auxiliary_loss_mlp": 0.010623, "balance_loss_clip": 1.05367684, "balance_loss_mlp": 1.040187, "epoch": 0.10557643168495416, "flos": 23440115957760.0, "grad_norm": 2.26658946748733, "language_loss": 0.76382339, "learning_rate": 3.940382943314182e-06, "loss": 0.7860626, "num_input_tokens_seen": 37876960, "step": 1756, "time_per_iteration": 2.686790943145752 }, { "auxiliary_loss_clip": 0.01182062, "auxiliary_loss_mlp": 0.01062906, "balance_loss_clip": 1.05688286, "balance_loss_mlp": 1.04203284, "epoch": 0.10563655493762213, "flos": 21799034357760.0, "grad_norm": 1.5917029795724482, "language_loss": 0.79926664, "learning_rate": 3.940288524515547e-06, "loss": 0.82171631, "num_input_tokens_seen": 37897070, "step": 1757, "time_per_iteration": 2.6543681621551514 }, { "auxiliary_loss_clip": 0.01149304, "auxiliary_loss_mlp": 0.01057523, "balance_loss_clip": 1.0524838, "balance_loss_mlp": 1.03563643, "epoch": 0.10569667819029009, "flos": 53800863275520.0, "grad_norm": 1.6583181970862437, "language_loss": 0.78714895, "learning_rate": 3.940194032140976e-06, "loss": 0.80921721, "num_input_tokens_seen": 37923635, "step": 1758, "time_per_iteration": 3.013157367706299 }, { "auxiliary_loss_clip": 0.01165597, "auxiliary_loss_mlp": 0.01054919, "balance_loss_clip": 1.05894113, "balance_loss_mlp": 1.03347349, "epoch": 0.10575680144295807, "flos": 22925515760640.0, "grad_norm": 1.870482409236857, "language_loss": 0.91388202, "learning_rate": 3.940099466194054e-06, "loss": 0.93608713, "num_input_tokens_seen": 37942650, "step": 1759, "time_per_iteration": 4.1841137409210205 }, { "auxiliary_loss_clip": 0.0115455, "auxiliary_loss_mlp": 0.01056708, "balance_loss_clip": 1.05242109, "balance_loss_mlp": 1.03346229, "epoch": 0.10581692469562604, "flos": 14136667148160.0, "grad_norm": 2.509404173865799, "language_loss": 0.77406812, "learning_rate": 3.940004826678365e-06, "loss": 0.79618067, "num_input_tokens_seen": 37960660, "step": 1760, "time_per_iteration": 4.476959228515625 }, { "auxiliary_loss_clip": 0.01161737, "auxiliary_loss_mlp": 0.01064522, "balance_loss_clip": 1.0536418, "balance_loss_mlp": 1.04053712, "epoch": 0.105877047948294, "flos": 25958674903680.0, "grad_norm": 2.27300461956159, "language_loss": 0.88896096, "learning_rate": 3.939910113597498e-06, "loss": 0.91122353, "num_input_tokens_seen": 37978625, "step": 1761, "time_per_iteration": 2.6907520294189453 }, { "auxiliary_loss_clip": 0.01110571, "auxiliary_loss_mlp": 0.00782389, "balance_loss_clip": 1.04964042, "balance_loss_mlp": 1.00012767, "epoch": 0.10593717120096197, "flos": 30664768032000.0, "grad_norm": 2.010693315376097, "language_loss": 0.7809304, "learning_rate": 3.9398153269550464e-06, "loss": 0.79986, "num_input_tokens_seen": 38000005, "step": 1762, "time_per_iteration": 2.869051456451416 }, { "auxiliary_loss_clip": 0.01053171, "auxiliary_loss_mlp": 0.0105371, "balance_loss_clip": 1.02694225, "balance_loss_mlp": 1.05056334, "epoch": 0.10599729445362994, "flos": 66436682497920.0, "grad_norm": 0.8956567750819878, "language_loss": 0.60503203, "learning_rate": 3.939720466754602e-06, "loss": 0.6261009, "num_input_tokens_seen": 38066165, "step": 1763, "time_per_iteration": 5.049196720123291 }, { "auxiliary_loss_clip": 0.01156865, "auxiliary_loss_mlp": 0.01048706, "balance_loss_clip": 1.05424261, "balance_loss_mlp": 1.02708137, "epoch": 0.10605741770629791, "flos": 23948179879680.0, "grad_norm": 2.0510547250099633, "language_loss": 0.80232942, "learning_rate": 3.939625532999763e-06, "loss": 0.82438517, "num_input_tokens_seen": 38086150, "step": 1764, "time_per_iteration": 4.288762807846069 }, { "auxiliary_loss_clip": 0.01136032, "auxiliary_loss_mlp": 0.01055975, "balance_loss_clip": 1.04879069, "balance_loss_mlp": 1.03218043, "epoch": 0.10611754095896588, "flos": 19387524919680.0, "grad_norm": 1.693202084864273, "language_loss": 0.801691, "learning_rate": 3.9395305256941314e-06, "loss": 0.82361102, "num_input_tokens_seen": 38104205, "step": 1765, "time_per_iteration": 2.931269407272339 }, { "auxiliary_loss_clip": 0.01163261, "auxiliary_loss_mlp": 0.01058956, "balance_loss_clip": 1.05457163, "balance_loss_mlp": 1.0367949, "epoch": 0.10617766421163385, "flos": 22237755073920.0, "grad_norm": 1.7665774264343403, "language_loss": 0.76864165, "learning_rate": 3.939435444841306e-06, "loss": 0.79086387, "num_input_tokens_seen": 38122005, "step": 1766, "time_per_iteration": 2.5976176261901855 }, { "auxiliary_loss_clip": 0.01182495, "auxiliary_loss_mlp": 0.01059246, "balance_loss_clip": 1.05923963, "balance_loss_mlp": 1.03766894, "epoch": 0.10623778746430182, "flos": 28404407024640.0, "grad_norm": 1.6265727447650185, "language_loss": 0.77311498, "learning_rate": 3.939340290444895e-06, "loss": 0.79553241, "num_input_tokens_seen": 38143365, "step": 1767, "time_per_iteration": 2.6356630325317383 }, { "auxiliary_loss_clip": 0.01006515, "auxiliary_loss_mlp": 0.01018751, "balance_loss_clip": 1.03004837, "balance_loss_mlp": 1.0151509, "epoch": 0.10629791071696978, "flos": 64234639221120.0, "grad_norm": 0.9172341423433896, "language_loss": 0.57889944, "learning_rate": 3.939245062508506e-06, "loss": 0.59915209, "num_input_tokens_seen": 38210035, "step": 1768, "time_per_iteration": 3.6866471767425537 }, { "auxiliary_loss_clip": 0.01144481, "auxiliary_loss_mlp": 0.01047419, "balance_loss_clip": 1.0546546, "balance_loss_mlp": 1.02687907, "epoch": 0.10635803396963776, "flos": 22747578762240.0, "grad_norm": 1.4529696494540971, "language_loss": 0.86711109, "learning_rate": 3.939149761035749e-06, "loss": 0.8890301, "num_input_tokens_seen": 38231230, "step": 1769, "time_per_iteration": 3.936905860900879 }, { "auxiliary_loss_clip": 0.01141219, "auxiliary_loss_mlp": 0.00780338, "balance_loss_clip": 1.05321527, "balance_loss_mlp": 1.00008726, "epoch": 0.10641815722230573, "flos": 31395586147200.0, "grad_norm": 1.8275276693890916, "language_loss": 0.61906171, "learning_rate": 3.9390543860302395e-06, "loss": 0.63827729, "num_input_tokens_seen": 38253890, "step": 1770, "time_per_iteration": 2.8926138877868652 }, { "auxiliary_loss_clip": 0.01057689, "auxiliary_loss_mlp": 0.01010808, "balance_loss_clip": 1.02007711, "balance_loss_mlp": 1.00775671, "epoch": 0.1064782804749737, "flos": 58552527784320.0, "grad_norm": 0.9163874753670794, "language_loss": 0.57049137, "learning_rate": 3.9389589374955925e-06, "loss": 0.59117633, "num_input_tokens_seen": 38304290, "step": 1771, "time_per_iteration": 3.0783088207244873 }, { "auxiliary_loss_clip": 0.01146276, "auxiliary_loss_mlp": 0.01065918, "balance_loss_clip": 1.05574095, "balance_loss_mlp": 1.04465103, "epoch": 0.10653840372764166, "flos": 23987825516160.0, "grad_norm": 12.794881398939157, "language_loss": 0.88265753, "learning_rate": 3.938863415435429e-06, "loss": 0.90477949, "num_input_tokens_seen": 38324725, "step": 1772, "time_per_iteration": 2.770202159881592 }, { "auxiliary_loss_clip": 0.0118421, "auxiliary_loss_mlp": 0.01058161, "balance_loss_clip": 1.05697048, "balance_loss_mlp": 1.03497458, "epoch": 0.10659852698030964, "flos": 18294655668480.0, "grad_norm": 2.576940958490313, "language_loss": 0.76030588, "learning_rate": 3.93876781985337e-06, "loss": 0.78272957, "num_input_tokens_seen": 38340735, "step": 1773, "time_per_iteration": 2.6177070140838623 }, { "auxiliary_loss_clip": 0.01122733, "auxiliary_loss_mlp": 0.01067657, "balance_loss_clip": 1.04691553, "balance_loss_mlp": 1.04205084, "epoch": 0.1066586502329776, "flos": 32160591031680.0, "grad_norm": 1.868288871406422, "language_loss": 0.8330853, "learning_rate": 3.938672150753041e-06, "loss": 0.85498923, "num_input_tokens_seen": 38361315, "step": 1774, "time_per_iteration": 2.7396061420440674 }, { "auxiliary_loss_clip": 0.01156305, "auxiliary_loss_mlp": 0.00780518, "balance_loss_clip": 1.05627465, "balance_loss_mlp": 1.00011277, "epoch": 0.10671877348564557, "flos": 17785155202560.0, "grad_norm": 2.73383407032925, "language_loss": 0.76446521, "learning_rate": 3.9385764081380704e-06, "loss": 0.78383344, "num_input_tokens_seen": 38377425, "step": 1775, "time_per_iteration": 2.624208927154541 }, { "auxiliary_loss_clip": 0.01063199, "auxiliary_loss_mlp": 0.01007654, "balance_loss_clip": 1.01726675, "balance_loss_mlp": 1.00443542, "epoch": 0.10677889673831355, "flos": 63510177813120.0, "grad_norm": 0.8200823962511624, "language_loss": 0.57477289, "learning_rate": 3.9384805920120876e-06, "loss": 0.5954814, "num_input_tokens_seen": 38440275, "step": 1776, "time_per_iteration": 3.1782386302948 }, { "auxiliary_loss_clip": 0.01150087, "auxiliary_loss_mlp": 0.01066244, "balance_loss_clip": 1.05192852, "balance_loss_mlp": 1.0407691, "epoch": 0.10683901999098151, "flos": 22017694400640.0, "grad_norm": 1.4232532718517703, "language_loss": 0.83442962, "learning_rate": 3.938384702378727e-06, "loss": 0.85659301, "num_input_tokens_seen": 38461820, "step": 1777, "time_per_iteration": 2.7342305183410645 }, { "auxiliary_loss_clip": 0.01113855, "auxiliary_loss_mlp": 0.00780712, "balance_loss_clip": 1.04919302, "balance_loss_mlp": 1.00015831, "epoch": 0.10689914324364948, "flos": 25042952551680.0, "grad_norm": 1.8326039994575831, "language_loss": 0.87207437, "learning_rate": 3.938288739241625e-06, "loss": 0.89102006, "num_input_tokens_seen": 38482235, "step": 1778, "time_per_iteration": 2.859834671020508 }, { "auxiliary_loss_clip": 0.01152509, "auxiliary_loss_mlp": 0.00780436, "balance_loss_clip": 1.06804752, "balance_loss_mlp": 1.00019765, "epoch": 0.10695926649631746, "flos": 16435129507200.0, "grad_norm": 2.4525249429301823, "language_loss": 0.84165859, "learning_rate": 3.938192702604417e-06, "loss": 0.86098808, "num_input_tokens_seen": 38500690, "step": 1779, "time_per_iteration": 2.81423020362854 }, { "auxiliary_loss_clip": 0.01141718, "auxiliary_loss_mlp": 0.00779857, "balance_loss_clip": 1.05215359, "balance_loss_mlp": 1.0001775, "epoch": 0.10701938974898542, "flos": 16979211792000.0, "grad_norm": 1.9378348403129941, "language_loss": 0.66915894, "learning_rate": 3.9380965924707495e-06, "loss": 0.68837464, "num_input_tokens_seen": 38518405, "step": 1780, "time_per_iteration": 2.616684913635254 }, { "auxiliary_loss_clip": 0.01166288, "auxiliary_loss_mlp": 0.01054109, "balance_loss_clip": 1.05843914, "balance_loss_mlp": 1.03268683, "epoch": 0.10707951300165339, "flos": 15888102307200.0, "grad_norm": 1.9168180254288365, "language_loss": 0.92058647, "learning_rate": 3.938000408844265e-06, "loss": 0.94279045, "num_input_tokens_seen": 38535060, "step": 1781, "time_per_iteration": 2.6167802810668945 }, { "auxiliary_loss_clip": 0.0113109, "auxiliary_loss_mlp": 0.01064554, "balance_loss_clip": 1.0531441, "balance_loss_mlp": 1.04344225, "epoch": 0.10713963625432135, "flos": 14247164361600.0, "grad_norm": 1.8357670097294174, "language_loss": 0.79336482, "learning_rate": 3.9379041517286105e-06, "loss": 0.81532121, "num_input_tokens_seen": 38552855, "step": 1782, "time_per_iteration": 2.7669336795806885 }, { "auxiliary_loss_clip": 0.01158369, "auxiliary_loss_mlp": 0.01061646, "balance_loss_clip": 1.05510604, "balance_loss_mlp": 1.04016423, "epoch": 0.10719975950698933, "flos": 16756780821120.0, "grad_norm": 2.0914095256513945, "language_loss": 0.79086542, "learning_rate": 3.937807821127436e-06, "loss": 0.81306553, "num_input_tokens_seen": 38570075, "step": 1783, "time_per_iteration": 2.6349542140960693 }, { "auxiliary_loss_clip": 0.01164267, "auxiliary_loss_mlp": 0.01065333, "balance_loss_clip": 1.0570296, "balance_loss_mlp": 1.04299295, "epoch": 0.1072598827596573, "flos": 22710626645760.0, "grad_norm": 2.1874612027367806, "language_loss": 0.86421812, "learning_rate": 3.937711417044395e-06, "loss": 0.88651407, "num_input_tokens_seen": 38587970, "step": 1784, "time_per_iteration": 2.8452541828155518 }, { "auxiliary_loss_clip": 0.01153461, "auxiliary_loss_mlp": 0.01055605, "balance_loss_clip": 1.05502176, "balance_loss_mlp": 1.03321707, "epoch": 0.10732000601232526, "flos": 23258264376960.0, "grad_norm": 2.4649130783319553, "language_loss": 1.01192284, "learning_rate": 3.937614939483143e-06, "loss": 1.03401351, "num_input_tokens_seen": 38605840, "step": 1785, "time_per_iteration": 2.690018653869629 }, { "auxiliary_loss_clip": 0.01168517, "auxiliary_loss_mlp": 0.01060763, "balance_loss_clip": 1.05854678, "balance_loss_mlp": 1.03984189, "epoch": 0.10738012926499324, "flos": 24207060176640.0, "grad_norm": 1.397915549237645, "language_loss": 0.84951413, "learning_rate": 3.937518388447339e-06, "loss": 0.87180698, "num_input_tokens_seen": 38627070, "step": 1786, "time_per_iteration": 2.637430191040039 }, { "auxiliary_loss_clip": 0.01183118, "auxiliary_loss_mlp": 0.01059079, "balance_loss_clip": 1.05716729, "balance_loss_mlp": 1.03520155, "epoch": 0.1074402525176612, "flos": 20923065383040.0, "grad_norm": 1.7951357311742837, "language_loss": 0.78861409, "learning_rate": 3.937421763940642e-06, "loss": 0.81103605, "num_input_tokens_seen": 38645840, "step": 1787, "time_per_iteration": 2.54508900642395 }, { "auxiliary_loss_clip": 0.01174896, "auxiliary_loss_mlp": 0.01047406, "balance_loss_clip": 1.05971575, "balance_loss_mlp": 1.02528071, "epoch": 0.10750037577032917, "flos": 16946928443520.0, "grad_norm": 1.8536072321218278, "language_loss": 0.82307518, "learning_rate": 3.937325065966719e-06, "loss": 0.84529817, "num_input_tokens_seen": 38664770, "step": 1788, "time_per_iteration": 2.706247568130493 }, { "auxiliary_loss_clip": 0.01180896, "auxiliary_loss_mlp": 0.01064682, "balance_loss_clip": 1.05843878, "balance_loss_mlp": 1.04427314, "epoch": 0.10756049902299715, "flos": 20266546550400.0, "grad_norm": 2.110245519520894, "language_loss": 0.77840686, "learning_rate": 3.9372282945292335e-06, "loss": 0.80086267, "num_input_tokens_seen": 38683865, "step": 1789, "time_per_iteration": 2.6274654865264893 }, { "auxiliary_loss_clip": 0.01185566, "auxiliary_loss_mlp": 0.01065099, "balance_loss_clip": 1.0604099, "balance_loss_mlp": 1.04049408, "epoch": 0.10762062227566511, "flos": 23586523793280.0, "grad_norm": 2.7248977042722524, "language_loss": 0.74817526, "learning_rate": 3.937131449631859e-06, "loss": 0.77068192, "num_input_tokens_seen": 38702485, "step": 1790, "time_per_iteration": 2.624382972717285 }, { "auxiliary_loss_clip": 0.01178128, "auxiliary_loss_mlp": 0.00780572, "balance_loss_clip": 1.06110644, "balance_loss_mlp": 1.00021124, "epoch": 0.10768074552833308, "flos": 24310626065280.0, "grad_norm": 2.350797373347828, "language_loss": 0.78764236, "learning_rate": 3.9370345312782645e-06, "loss": 0.80722934, "num_input_tokens_seen": 38722475, "step": 1791, "time_per_iteration": 2.696162223815918 }, { "auxiliary_loss_clip": 0.01134133, "auxiliary_loss_mlp": 0.01065057, "balance_loss_clip": 1.05280125, "balance_loss_mlp": 1.04117918, "epoch": 0.10774086878100106, "flos": 25299965341440.0, "grad_norm": 1.5879424734455678, "language_loss": 0.70638013, "learning_rate": 3.936937539472126e-06, "loss": 0.7283721, "num_input_tokens_seen": 38743285, "step": 1792, "time_per_iteration": 2.770874261856079 }, { "auxiliary_loss_clip": 0.01149934, "auxiliary_loss_mlp": 0.01051019, "balance_loss_clip": 1.05610943, "balance_loss_mlp": 1.02764249, "epoch": 0.10780099203366902, "flos": 22054035985920.0, "grad_norm": 1.920104493539276, "language_loss": 0.76565266, "learning_rate": 3.9368404742171236e-06, "loss": 0.78766215, "num_input_tokens_seen": 38763035, "step": 1793, "time_per_iteration": 2.7218761444091797 }, { "auxiliary_loss_clip": 0.01116412, "auxiliary_loss_mlp": 0.01064574, "balance_loss_clip": 1.05029237, "balance_loss_mlp": 1.0414238, "epoch": 0.10786111528633699, "flos": 22747471021440.0, "grad_norm": 1.7475786500241859, "language_loss": 0.85103315, "learning_rate": 3.936743335516936e-06, "loss": 0.87284303, "num_input_tokens_seen": 38784900, "step": 1794, "time_per_iteration": 2.7590620517730713 }, { "auxiliary_loss_clip": 0.01115198, "auxiliary_loss_mlp": 0.01055294, "balance_loss_clip": 1.04807687, "balance_loss_mlp": 1.03146446, "epoch": 0.10792123853900495, "flos": 20851064570880.0, "grad_norm": 2.5236234593460924, "language_loss": 0.74585378, "learning_rate": 3.936646123375246e-06, "loss": 0.76755869, "num_input_tokens_seen": 38804695, "step": 1795, "time_per_iteration": 2.8500585556030273 }, { "auxiliary_loss_clip": 0.01124895, "auxiliary_loss_mlp": 0.01058294, "balance_loss_clip": 1.04831553, "balance_loss_mlp": 1.03479767, "epoch": 0.10798136179167293, "flos": 17748705876480.0, "grad_norm": 2.842374039298248, "language_loss": 0.81653619, "learning_rate": 3.936548837795741e-06, "loss": 0.83836806, "num_input_tokens_seen": 38822395, "step": 1796, "time_per_iteration": 2.7549750804901123 }, { "auxiliary_loss_clip": 0.01140492, "auxiliary_loss_mlp": 0.01083966, "balance_loss_clip": 1.05246449, "balance_loss_mlp": 1.05721593, "epoch": 0.1080414850443409, "flos": 13589639948160.0, "grad_norm": 2.59635455269928, "language_loss": 0.74233043, "learning_rate": 3.936451478782111e-06, "loss": 0.764575, "num_input_tokens_seen": 38839865, "step": 1797, "time_per_iteration": 2.6396753787994385 }, { "auxiliary_loss_clip": 0.01160286, "auxiliary_loss_mlp": 0.01049954, "balance_loss_clip": 1.05505061, "balance_loss_mlp": 1.02874684, "epoch": 0.10810160829700886, "flos": 16253421580800.0, "grad_norm": 2.0852339617015025, "language_loss": 0.81855786, "learning_rate": 3.936354046338046e-06, "loss": 0.84066033, "num_input_tokens_seen": 38857300, "step": 1798, "time_per_iteration": 2.7105324268341064 }, { "auxiliary_loss_clip": 0.01142859, "auxiliary_loss_mlp": 0.01054502, "balance_loss_clip": 1.05379176, "balance_loss_mlp": 1.03117299, "epoch": 0.10816173154967684, "flos": 15158002464000.0, "grad_norm": 2.4443000829323687, "language_loss": 0.85516405, "learning_rate": 3.936256540467242e-06, "loss": 0.87713766, "num_input_tokens_seen": 38874960, "step": 1799, "time_per_iteration": 4.159978628158569 }, { "auxiliary_loss_clip": 0.01154352, "auxiliary_loss_mlp": 0.01062903, "balance_loss_clip": 1.05493283, "balance_loss_mlp": 1.04114687, "epoch": 0.10822185480234481, "flos": 17785334770560.0, "grad_norm": 2.7405734706827825, "language_loss": 0.77434146, "learning_rate": 3.9361589611733955e-06, "loss": 0.79651403, "num_input_tokens_seen": 38893610, "step": 1800, "time_per_iteration": 4.52047872543335 }, { "auxiliary_loss_clip": 0.01178634, "auxiliary_loss_mlp": 0.0104758, "balance_loss_clip": 1.05722904, "balance_loss_mlp": 1.02689719, "epoch": 0.10828197805501277, "flos": 25556654908800.0, "grad_norm": 1.582468034859118, "language_loss": 0.72897375, "learning_rate": 3.9360613084602075e-06, "loss": 0.75123584, "num_input_tokens_seen": 38913485, "step": 1801, "time_per_iteration": 4.291400909423828 }, { "auxiliary_loss_clip": 0.01190595, "auxiliary_loss_mlp": 0.01056056, "balance_loss_clip": 1.06095624, "balance_loss_mlp": 1.03478956, "epoch": 0.10834210130768075, "flos": 28984435845120.0, "grad_norm": 1.951139287607183, "language_loss": 0.6634692, "learning_rate": 3.935963582331381e-06, "loss": 0.68593562, "num_input_tokens_seen": 38935650, "step": 1802, "time_per_iteration": 2.722628355026245 }, { "auxiliary_loss_clip": 0.01155661, "auxiliary_loss_mlp": 0.01059375, "balance_loss_clip": 1.05326533, "balance_loss_mlp": 1.03695142, "epoch": 0.10840222456034872, "flos": 20264212166400.0, "grad_norm": 2.084551157592464, "language_loss": 0.81612957, "learning_rate": 3.935865782790621e-06, "loss": 0.8382799, "num_input_tokens_seen": 38954130, "step": 1803, "time_per_iteration": 4.239379167556763 }, { "auxiliary_loss_clip": 0.01163104, "auxiliary_loss_mlp": 0.01061781, "balance_loss_clip": 1.0567112, "balance_loss_mlp": 1.03921473, "epoch": 0.10846234781301668, "flos": 19863054097920.0, "grad_norm": 1.9102934552723363, "language_loss": 0.91127038, "learning_rate": 3.9357679098416365e-06, "loss": 0.93351918, "num_input_tokens_seen": 38972905, "step": 1804, "time_per_iteration": 2.5836737155914307 }, { "auxiliary_loss_clip": 0.01136188, "auxiliary_loss_mlp": 0.01060133, "balance_loss_clip": 1.05617714, "balance_loss_mlp": 1.03718543, "epoch": 0.10852247106568465, "flos": 26469037296000.0, "grad_norm": 2.5742522317806262, "language_loss": 0.76198906, "learning_rate": 3.935669963488139e-06, "loss": 0.78395224, "num_input_tokens_seen": 38993255, "step": 1805, "time_per_iteration": 2.783137321472168 }, { "auxiliary_loss_clip": 0.01149468, "auxiliary_loss_mlp": 0.01050946, "balance_loss_clip": 1.05419612, "balance_loss_mlp": 1.03050184, "epoch": 0.10858259431835263, "flos": 30081506987520.0, "grad_norm": 1.7049574807827799, "language_loss": 0.85876733, "learning_rate": 3.935571943733843e-06, "loss": 0.88077152, "num_input_tokens_seen": 39012610, "step": 1806, "time_per_iteration": 2.8148701190948486 }, { "auxiliary_loss_clip": 0.01168733, "auxiliary_loss_mlp": 0.00779888, "balance_loss_clip": 1.05462408, "balance_loss_mlp": 1.00006652, "epoch": 0.10864271757102059, "flos": 19063180085760.0, "grad_norm": 2.554050049117878, "language_loss": 0.8108198, "learning_rate": 3.9354738505824635e-06, "loss": 0.83030605, "num_input_tokens_seen": 39030120, "step": 1807, "time_per_iteration": 2.6275649070739746 }, { "auxiliary_loss_clip": 0.01139085, "auxiliary_loss_mlp": 0.01055438, "balance_loss_clip": 1.05193985, "balance_loss_mlp": 1.03522038, "epoch": 0.10870284082368856, "flos": 24715052271360.0, "grad_norm": 1.834914777588586, "language_loss": 0.78910971, "learning_rate": 3.9353756840377225e-06, "loss": 0.81105494, "num_input_tokens_seen": 39049875, "step": 1808, "time_per_iteration": 2.722910165786743 }, { "auxiliary_loss_clip": 0.01157997, "auxiliary_loss_mlp": 0.01056971, "balance_loss_clip": 1.05918014, "balance_loss_mlp": 1.03548992, "epoch": 0.10876296407635654, "flos": 20627663932800.0, "grad_norm": 1.6201371380093192, "language_loss": 0.79013431, "learning_rate": 3.935277444103342e-06, "loss": 0.81228393, "num_input_tokens_seen": 39068935, "step": 1809, "time_per_iteration": 2.7261481285095215 }, { "auxiliary_loss_clip": 0.01180468, "auxiliary_loss_mlp": 0.01057915, "balance_loss_clip": 1.0568099, "balance_loss_mlp": 1.03705359, "epoch": 0.1088230873290245, "flos": 21579835610880.0, "grad_norm": 1.9004896030263678, "language_loss": 0.85129547, "learning_rate": 3.935179130783046e-06, "loss": 0.87367928, "num_input_tokens_seen": 39087370, "step": 1810, "time_per_iteration": 2.672696828842163 }, { "auxiliary_loss_clip": 0.01124301, "auxiliary_loss_mlp": 0.01057363, "balance_loss_clip": 1.04580724, "balance_loss_mlp": 1.0335803, "epoch": 0.10888321058169247, "flos": 26469037296000.0, "grad_norm": 1.5993643379141278, "language_loss": 0.63822675, "learning_rate": 3.935080744080564e-06, "loss": 0.66004336, "num_input_tokens_seen": 39106635, "step": 1811, "time_per_iteration": 2.7731611728668213 }, { "auxiliary_loss_clip": 0.01151891, "auxiliary_loss_mlp": 0.01050225, "balance_loss_clip": 1.05335796, "balance_loss_mlp": 1.02836192, "epoch": 0.10894333383436045, "flos": 25848608653440.0, "grad_norm": 1.9284151803363307, "language_loss": 0.74238706, "learning_rate": 3.934982283999626e-06, "loss": 0.76440823, "num_input_tokens_seen": 39126335, "step": 1812, "time_per_iteration": 2.727743625640869 }, { "auxiliary_loss_clip": 0.01142498, "auxiliary_loss_mlp": 0.01057826, "balance_loss_clip": 1.05199611, "balance_loss_mlp": 1.03546214, "epoch": 0.10900345708702841, "flos": 19537093152000.0, "grad_norm": 1.5783196636767667, "language_loss": 0.72746086, "learning_rate": 3.934883750543966e-06, "loss": 0.74946409, "num_input_tokens_seen": 39144820, "step": 1813, "time_per_iteration": 2.798297166824341 }, { "auxiliary_loss_clip": 0.0113892, "auxiliary_loss_mlp": 0.01056639, "balance_loss_clip": 1.0511452, "balance_loss_mlp": 1.03515792, "epoch": 0.10906358033969638, "flos": 23623296341760.0, "grad_norm": 1.635228619121262, "language_loss": 0.82981038, "learning_rate": 3.93478514371732e-06, "loss": 0.85176599, "num_input_tokens_seen": 39165945, "step": 1814, "time_per_iteration": 2.7120048999786377 }, { "auxiliary_loss_clip": 0.01141958, "auxiliary_loss_mlp": 0.01058857, "balance_loss_clip": 1.0537864, "balance_loss_mlp": 1.03787625, "epoch": 0.10912370359236434, "flos": 21214731818880.0, "grad_norm": 1.9556743991494996, "language_loss": 0.84310579, "learning_rate": 3.934686463523429e-06, "loss": 0.86511397, "num_input_tokens_seen": 39183520, "step": 1815, "time_per_iteration": 2.788870096206665 }, { "auxiliary_loss_clip": 0.01146878, "auxiliary_loss_mlp": 0.01055141, "balance_loss_clip": 1.05443966, "balance_loss_mlp": 1.03182411, "epoch": 0.10918382684503232, "flos": 13553190622080.0, "grad_norm": 2.5374826422013195, "language_loss": 0.71670222, "learning_rate": 3.9345877099660315e-06, "loss": 0.73872244, "num_input_tokens_seen": 39201190, "step": 1816, "time_per_iteration": 2.8424103260040283 }, { "auxiliary_loss_clip": 0.01164173, "auxiliary_loss_mlp": 0.01064184, "balance_loss_clip": 1.05216932, "balance_loss_mlp": 1.04052126, "epoch": 0.10924395009770028, "flos": 27964321591680.0, "grad_norm": 2.016899555923086, "language_loss": 0.72880268, "learning_rate": 3.9344888830488744e-06, "loss": 0.75108624, "num_input_tokens_seen": 39221210, "step": 1817, "time_per_iteration": 2.7320947647094727 }, { "auxiliary_loss_clip": 0.01116915, "auxiliary_loss_mlp": 0.01057856, "balance_loss_clip": 1.05173278, "balance_loss_mlp": 1.03517008, "epoch": 0.10930407335036825, "flos": 25593750679680.0, "grad_norm": 1.5988628345308824, "language_loss": 0.67275256, "learning_rate": 3.934389982775706e-06, "loss": 0.69450033, "num_input_tokens_seen": 39242025, "step": 1818, "time_per_iteration": 2.8700790405273438 }, { "auxiliary_loss_clip": 0.01155804, "auxiliary_loss_mlp": 0.01065952, "balance_loss_clip": 1.05673873, "balance_loss_mlp": 1.04313517, "epoch": 0.10936419660303623, "flos": 18406194376320.0, "grad_norm": 3.593580913512793, "language_loss": 0.73149616, "learning_rate": 3.934291009150275e-06, "loss": 0.75371373, "num_input_tokens_seen": 39259870, "step": 1819, "time_per_iteration": 2.7091007232666016 }, { "auxiliary_loss_clip": 0.01142955, "auxiliary_loss_mlp": 0.00779155, "balance_loss_clip": 1.05341268, "balance_loss_mlp": 1.00027704, "epoch": 0.1094243198557042, "flos": 23840052963840.0, "grad_norm": 4.531598275817935, "language_loss": 0.73764241, "learning_rate": 3.934191962176335e-06, "loss": 0.75686359, "num_input_tokens_seen": 39278500, "step": 1820, "time_per_iteration": 2.6513099670410156 }, { "auxiliary_loss_clip": 0.01179358, "auxiliary_loss_mlp": 0.01056073, "balance_loss_clip": 1.05747604, "balance_loss_mlp": 1.03297031, "epoch": 0.10948444310837216, "flos": 14643940970880.0, "grad_norm": 2.2567103978329337, "language_loss": 0.82532805, "learning_rate": 3.934092841857642e-06, "loss": 0.84768236, "num_input_tokens_seen": 39294800, "step": 1821, "time_per_iteration": 2.5348384380340576 }, { "auxiliary_loss_clip": 0.01148016, "auxiliary_loss_mlp": 0.01052031, "balance_loss_clip": 1.05133605, "balance_loss_mlp": 1.03077567, "epoch": 0.10954456636104014, "flos": 27818811596160.0, "grad_norm": 2.0770330480401578, "language_loss": 0.76271641, "learning_rate": 3.933993648197955e-06, "loss": 0.7847169, "num_input_tokens_seen": 39314625, "step": 1822, "time_per_iteration": 2.730079174041748 }, { "auxiliary_loss_clip": 0.01142446, "auxiliary_loss_mlp": 0.01049259, "balance_loss_clip": 1.04849207, "balance_loss_mlp": 1.02856421, "epoch": 0.1096046896137081, "flos": 33620934372480.0, "grad_norm": 1.734419613996414, "language_loss": 0.79309607, "learning_rate": 3.933894381201034e-06, "loss": 0.81501311, "num_input_tokens_seen": 39336465, "step": 1823, "time_per_iteration": 2.756969928741455 }, { "auxiliary_loss_clip": 0.01148165, "auxiliary_loss_mlp": 0.01049595, "balance_loss_clip": 1.05160606, "balance_loss_mlp": 1.02745807, "epoch": 0.10966481286637607, "flos": 26980010219520.0, "grad_norm": 1.4318009514182364, "language_loss": 0.79590744, "learning_rate": 3.933795040870645e-06, "loss": 0.81788504, "num_input_tokens_seen": 39357930, "step": 1824, "time_per_iteration": 2.798168182373047 }, { "auxiliary_loss_clip": 0.01142146, "auxiliary_loss_mlp": 0.01055513, "balance_loss_clip": 1.05104232, "balance_loss_mlp": 1.03381693, "epoch": 0.10972493611904403, "flos": 23036551678080.0, "grad_norm": 2.127143421089703, "language_loss": 0.88138539, "learning_rate": 3.933695627210554e-06, "loss": 0.90336192, "num_input_tokens_seen": 39376380, "step": 1825, "time_per_iteration": 2.6804513931274414 }, { "auxiliary_loss_clip": 0.01128623, "auxiliary_loss_mlp": 0.01056127, "balance_loss_clip": 1.04586983, "balance_loss_mlp": 1.03439498, "epoch": 0.10978505937171201, "flos": 38104632443520.0, "grad_norm": 1.721192594935189, "language_loss": 0.76441038, "learning_rate": 3.933596140224532e-06, "loss": 0.78625786, "num_input_tokens_seen": 39399935, "step": 1826, "time_per_iteration": 2.8315086364746094 }, { "auxiliary_loss_clip": 0.01063155, "auxiliary_loss_mlp": 0.01016957, "balance_loss_clip": 1.02709544, "balance_loss_mlp": 1.01409554, "epoch": 0.10984518262437998, "flos": 59849694616320.0, "grad_norm": 0.8518463216820418, "language_loss": 0.54997343, "learning_rate": 3.93349657991635e-06, "loss": 0.57077461, "num_input_tokens_seen": 39460685, "step": 1827, "time_per_iteration": 3.1425766944885254 }, { "auxiliary_loss_clip": 0.01072651, "auxiliary_loss_mlp": 0.01010167, "balance_loss_clip": 1.02693772, "balance_loss_mlp": 1.00717473, "epoch": 0.10990530587704794, "flos": 66719837410560.0, "grad_norm": 0.7375455878808789, "language_loss": 0.55382878, "learning_rate": 3.933396946289784e-06, "loss": 0.57465696, "num_input_tokens_seen": 39524765, "step": 1828, "time_per_iteration": 3.168165922164917 }, { "auxiliary_loss_clip": 0.01156998, "auxiliary_loss_mlp": 0.01059335, "balance_loss_clip": 1.05407059, "balance_loss_mlp": 1.03618491, "epoch": 0.10996542912971592, "flos": 25447199189760.0, "grad_norm": 2.250827401167328, "language_loss": 0.84010404, "learning_rate": 3.933297239348612e-06, "loss": 0.86226743, "num_input_tokens_seen": 39543640, "step": 1829, "time_per_iteration": 2.7341628074645996 }, { "auxiliary_loss_clip": 0.01130747, "auxiliary_loss_mlp": 0.01053464, "balance_loss_clip": 1.0547024, "balance_loss_mlp": 1.03036165, "epoch": 0.11002555238238389, "flos": 44018186186880.0, "grad_norm": 2.342204785330024, "language_loss": 0.88880253, "learning_rate": 3.933197459096614e-06, "loss": 0.91064465, "num_input_tokens_seen": 39567525, "step": 1830, "time_per_iteration": 2.9093260765075684 }, { "auxiliary_loss_clip": 0.01049643, "auxiliary_loss_mlp": 0.01009685, "balance_loss_clip": 1.02618647, "balance_loss_mlp": 1.00681162, "epoch": 0.11008567563505185, "flos": 54065133590400.0, "grad_norm": 0.6882192363357665, "language_loss": 0.55566543, "learning_rate": 3.9330976055375756e-06, "loss": 0.57625872, "num_input_tokens_seen": 39628470, "step": 1831, "time_per_iteration": 3.1713974475860596 }, { "auxiliary_loss_clip": 0.01156783, "auxiliary_loss_mlp": 0.01073931, "balance_loss_clip": 1.05708003, "balance_loss_mlp": 1.04965997, "epoch": 0.11014579888771983, "flos": 24243150366720.0, "grad_norm": 2.4937725361201495, "language_loss": 0.90836191, "learning_rate": 3.932997678675282e-06, "loss": 0.93066907, "num_input_tokens_seen": 39646670, "step": 1832, "time_per_iteration": 2.6786489486694336 }, { "auxiliary_loss_clip": 0.0106111, "auxiliary_loss_mlp": 0.01010664, "balance_loss_clip": 1.02332854, "balance_loss_mlp": 1.00769615, "epoch": 0.1102059221403878, "flos": 57743965658880.0, "grad_norm": 0.7154576595208243, "language_loss": 0.59911001, "learning_rate": 3.932897678513523e-06, "loss": 0.61982775, "num_input_tokens_seen": 39712915, "step": 1833, "time_per_iteration": 3.1802401542663574 }, { "auxiliary_loss_clip": 0.01167201, "auxiliary_loss_mlp": 0.0105502, "balance_loss_clip": 1.05312014, "balance_loss_mlp": 1.03285873, "epoch": 0.11026604539305576, "flos": 16795923667200.0, "grad_norm": 2.6772934272606923, "language_loss": 0.80799395, "learning_rate": 3.93279760505609e-06, "loss": 0.83021617, "num_input_tokens_seen": 39730650, "step": 1834, "time_per_iteration": 2.591374635696411 }, { "auxiliary_loss_clip": 0.01141662, "auxiliary_loss_mlp": 0.01054827, "balance_loss_clip": 1.05557871, "balance_loss_mlp": 1.03004324, "epoch": 0.11032616864572373, "flos": 23988076911360.0, "grad_norm": 2.4853906687508247, "language_loss": 0.89856094, "learning_rate": 3.932697458306779e-06, "loss": 0.92052579, "num_input_tokens_seen": 39751065, "step": 1835, "time_per_iteration": 2.742330312728882 }, { "auxiliary_loss_clip": 0.01131787, "auxiliary_loss_mlp": 0.01063812, "balance_loss_clip": 1.0524013, "balance_loss_mlp": 1.03758645, "epoch": 0.1103862918983917, "flos": 19683141851520.0, "grad_norm": 2.2754442269720023, "language_loss": 0.63256055, "learning_rate": 3.932597238269386e-06, "loss": 0.65451658, "num_input_tokens_seen": 39769245, "step": 1836, "time_per_iteration": 2.6935038566589355 }, { "auxiliary_loss_clip": 0.01138919, "auxiliary_loss_mlp": 0.01061469, "balance_loss_clip": 1.05021358, "balance_loss_mlp": 1.03954661, "epoch": 0.11044641515105967, "flos": 32160878340480.0, "grad_norm": 1.6726289784191204, "language_loss": 0.72792488, "learning_rate": 3.932496944947711e-06, "loss": 0.74992871, "num_input_tokens_seen": 39790830, "step": 1837, "time_per_iteration": 2.7790510654449463 }, { "auxiliary_loss_clip": 0.01165472, "auxiliary_loss_mlp": 0.01057035, "balance_loss_clip": 1.05463088, "balance_loss_mlp": 1.03551781, "epoch": 0.11050653840372764, "flos": 16689233295360.0, "grad_norm": 2.027055787194766, "language_loss": 0.78489268, "learning_rate": 3.93239657834556e-06, "loss": 0.8071177, "num_input_tokens_seen": 39809475, "step": 1838, "time_per_iteration": 4.098532438278198 }, { "auxiliary_loss_clip": 0.01154042, "auxiliary_loss_mlp": 0.01062407, "balance_loss_clip": 1.05542612, "balance_loss_mlp": 1.03970969, "epoch": 0.11056666165639562, "flos": 21208877902080.0, "grad_norm": 2.046221888979386, "language_loss": 0.71451718, "learning_rate": 3.932296138466736e-06, "loss": 0.7366817, "num_input_tokens_seen": 39826355, "step": 1839, "time_per_iteration": 4.205714464187622 }, { "auxiliary_loss_clip": 0.01187588, "auxiliary_loss_mlp": 0.00781104, "balance_loss_clip": 1.06183171, "balance_loss_mlp": 1.00018013, "epoch": 0.11062678490906358, "flos": 19165488998400.0, "grad_norm": 2.623062836625425, "language_loss": 0.79027873, "learning_rate": 3.93219562531505e-06, "loss": 0.80996567, "num_input_tokens_seen": 39845335, "step": 1840, "time_per_iteration": 2.6023378372192383 }, { "auxiliary_loss_clip": 0.01156508, "auxiliary_loss_mlp": 0.01052512, "balance_loss_clip": 1.05206251, "balance_loss_mlp": 1.02887261, "epoch": 0.11068690816173155, "flos": 24895287740160.0, "grad_norm": 1.7551987843009527, "language_loss": 0.88083529, "learning_rate": 3.932095038894311e-06, "loss": 0.90292549, "num_input_tokens_seen": 39865065, "step": 1841, "time_per_iteration": 4.3361639976501465 }, { "auxiliary_loss_clip": 0.01130203, "auxiliary_loss_mlp": 0.01067683, "balance_loss_clip": 1.05036247, "balance_loss_mlp": 1.04453301, "epoch": 0.11074703141439952, "flos": 16472368932480.0, "grad_norm": 3.1603067125494126, "language_loss": 0.90521991, "learning_rate": 3.931994379208334e-06, "loss": 0.92719877, "num_input_tokens_seen": 39882780, "step": 1842, "time_per_iteration": 2.7086760997772217 }, { "auxiliary_loss_clip": 0.01152506, "auxiliary_loss_mlp": 0.01061227, "balance_loss_clip": 1.05065131, "balance_loss_mlp": 1.03982854, "epoch": 0.11080715466706749, "flos": 19172420323200.0, "grad_norm": 2.112801816568727, "language_loss": 0.85845053, "learning_rate": 3.931893646260937e-06, "loss": 0.88058788, "num_input_tokens_seen": 39900295, "step": 1843, "time_per_iteration": 4.263117790222168 }, { "auxiliary_loss_clip": 0.01119254, "auxiliary_loss_mlp": 0.00783076, "balance_loss_clip": 1.05050898, "balance_loss_mlp": 1.00012159, "epoch": 0.11086727791973545, "flos": 27704687109120.0, "grad_norm": 1.4511349711086798, "language_loss": 0.74735641, "learning_rate": 3.931792840055941e-06, "loss": 0.76637971, "num_input_tokens_seen": 39922075, "step": 1844, "time_per_iteration": 2.7999000549316406 }, { "auxiliary_loss_clip": 0.01180395, "auxiliary_loss_mlp": 0.01055824, "balance_loss_clip": 1.05662274, "balance_loss_mlp": 1.03238785, "epoch": 0.11092740117240343, "flos": 18514967736960.0, "grad_norm": 2.017286766878137, "language_loss": 0.7566812, "learning_rate": 3.931691960597165e-06, "loss": 0.77904338, "num_input_tokens_seen": 39940115, "step": 1845, "time_per_iteration": 2.5305535793304443 }, { "auxiliary_loss_clip": 0.01153403, "auxiliary_loss_mlp": 0.01058911, "balance_loss_clip": 1.05442989, "balance_loss_mlp": 1.03807366, "epoch": 0.1109875244250714, "flos": 20522446018560.0, "grad_norm": 1.9628359583393364, "language_loss": 0.75953126, "learning_rate": 3.9315910078884375e-06, "loss": 0.78165436, "num_input_tokens_seen": 39959920, "step": 1846, "time_per_iteration": 2.719325542449951 }, { "auxiliary_loss_clip": 0.01173899, "auxiliary_loss_mlp": 0.01059369, "balance_loss_clip": 1.05823123, "balance_loss_mlp": 1.03717244, "epoch": 0.11104764767773936, "flos": 14098601710080.0, "grad_norm": 2.612459533347621, "language_loss": 0.8620472, "learning_rate": 3.931489981933584e-06, "loss": 0.88437986, "num_input_tokens_seen": 39974755, "step": 1847, "time_per_iteration": 2.7705559730529785 }, { "auxiliary_loss_clip": 0.01181158, "auxiliary_loss_mlp": 0.01055145, "balance_loss_clip": 1.05562854, "balance_loss_mlp": 1.0322808, "epoch": 0.11110777093040733, "flos": 20594518657920.0, "grad_norm": 1.8452742714770096, "language_loss": 0.76981926, "learning_rate": 3.931388882736438e-06, "loss": 0.79218227, "num_input_tokens_seen": 39993355, "step": 1848, "time_per_iteration": 2.605933666229248 }, { "auxiliary_loss_clip": 0.01172398, "auxiliary_loss_mlp": 0.01056349, "balance_loss_clip": 1.06262445, "balance_loss_mlp": 1.03455794, "epoch": 0.11116789418307531, "flos": 21870065502720.0, "grad_norm": 1.6943193134392138, "language_loss": 0.77621841, "learning_rate": 3.931287710300832e-06, "loss": 0.7985059, "num_input_tokens_seen": 40012410, "step": 1849, "time_per_iteration": 2.678415536880493 }, { "auxiliary_loss_clip": 0.01138995, "auxiliary_loss_mlp": 0.00781122, "balance_loss_clip": 1.05277848, "balance_loss_mlp": 1.00010324, "epoch": 0.11122801743574327, "flos": 15523106256000.0, "grad_norm": 3.3234972538165066, "language_loss": 0.72098577, "learning_rate": 3.931186464630601e-06, "loss": 0.74018693, "num_input_tokens_seen": 40029315, "step": 1850, "time_per_iteration": 2.7763028144836426 }, { "auxiliary_loss_clip": 0.01170569, "auxiliary_loss_mlp": 0.01061108, "balance_loss_clip": 1.05759382, "balance_loss_mlp": 1.03874469, "epoch": 0.11128814068841124, "flos": 14392279307520.0, "grad_norm": 2.0638339407107873, "language_loss": 0.81499028, "learning_rate": 3.931085145729588e-06, "loss": 0.83730704, "num_input_tokens_seen": 40045765, "step": 1851, "time_per_iteration": 2.688854694366455 }, { "auxiliary_loss_clip": 0.01164692, "auxiliary_loss_mlp": 0.01061301, "balance_loss_clip": 1.05789042, "balance_loss_mlp": 1.04027295, "epoch": 0.11134826394107922, "flos": 16653933204480.0, "grad_norm": 2.365035468310974, "language_loss": 0.88270009, "learning_rate": 3.930983753601631e-06, "loss": 0.90496004, "num_input_tokens_seen": 40061660, "step": 1852, "time_per_iteration": 2.659914493560791 }, { "auxiliary_loss_clip": 0.01166772, "auxiliary_loss_mlp": 0.01060698, "balance_loss_clip": 1.05489326, "balance_loss_mlp": 1.03791702, "epoch": 0.11140838719374718, "flos": 16690993061760.0, "grad_norm": 2.1825610274136054, "language_loss": 0.72492862, "learning_rate": 3.930882288250578e-06, "loss": 0.74720335, "num_input_tokens_seen": 40080180, "step": 1853, "time_per_iteration": 2.7840964794158936 }, { "auxiliary_loss_clip": 0.01069898, "auxiliary_loss_mlp": 0.01019902, "balance_loss_clip": 1.02549517, "balance_loss_mlp": 1.01701725, "epoch": 0.11146851044641515, "flos": 60976355587200.0, "grad_norm": 0.772231443606995, "language_loss": 0.53664064, "learning_rate": 3.930780749680273e-06, "loss": 0.55753863, "num_input_tokens_seen": 40138910, "step": 1854, "time_per_iteration": 3.089354991912842 }, { "auxiliary_loss_clip": 0.01159576, "auxiliary_loss_mlp": 0.0105585, "balance_loss_clip": 1.05390525, "balance_loss_mlp": 1.03184092, "epoch": 0.11152863369908313, "flos": 22193835719040.0, "grad_norm": 1.863523240792578, "language_loss": 0.8468501, "learning_rate": 3.9306791378945705e-06, "loss": 0.86900431, "num_input_tokens_seen": 40157745, "step": 1855, "time_per_iteration": 2.7361156940460205 }, { "auxiliary_loss_clip": 0.01147504, "auxiliary_loss_mlp": 0.01064479, "balance_loss_clip": 1.05225825, "balance_loss_mlp": 1.0424726, "epoch": 0.11158875695175109, "flos": 19537524115200.0, "grad_norm": 2.1217067547931756, "language_loss": 0.81187081, "learning_rate": 3.9305774528973205e-06, "loss": 0.83399057, "num_input_tokens_seen": 40175375, "step": 1856, "time_per_iteration": 2.7158002853393555 }, { "auxiliary_loss_clip": 0.01168288, "auxiliary_loss_mlp": 0.01052259, "balance_loss_clip": 1.05843937, "balance_loss_mlp": 1.02957392, "epoch": 0.11164888020441906, "flos": 25442709989760.0, "grad_norm": 2.0555738298465314, "language_loss": 0.82761133, "learning_rate": 3.93047569469238e-06, "loss": 0.8498168, "num_input_tokens_seen": 40195715, "step": 1857, "time_per_iteration": 2.647184133529663 }, { "auxiliary_loss_clip": 0.01144196, "auxiliary_loss_mlp": 0.01044915, "balance_loss_clip": 1.05255508, "balance_loss_mlp": 1.02395833, "epoch": 0.11170900345708702, "flos": 15632741543040.0, "grad_norm": 2.3199985887988914, "language_loss": 0.83131742, "learning_rate": 3.930373863283608e-06, "loss": 0.85320854, "num_input_tokens_seen": 40213975, "step": 1858, "time_per_iteration": 2.726905107498169 }, { "auxiliary_loss_clip": 0.01134962, "auxiliary_loss_mlp": 0.01067658, "balance_loss_clip": 1.04900265, "balance_loss_mlp": 1.04350638, "epoch": 0.111769126709755, "flos": 23039424766080.0, "grad_norm": 2.0395414997027657, "language_loss": 0.9133389, "learning_rate": 3.930271958674866e-06, "loss": 0.93536508, "num_input_tokens_seen": 40233905, "step": 1859, "time_per_iteration": 3.0006766319274902 }, { "auxiliary_loss_clip": 0.01167289, "auxiliary_loss_mlp": 0.01049698, "balance_loss_clip": 1.05445409, "balance_loss_mlp": 1.02751315, "epoch": 0.11182924996242297, "flos": 20850705434880.0, "grad_norm": 2.048197345879043, "language_loss": 0.81528586, "learning_rate": 3.930169980870018e-06, "loss": 0.83745575, "num_input_tokens_seen": 40252810, "step": 1860, "time_per_iteration": 2.7216553688049316 }, { "auxiliary_loss_clip": 0.01154007, "auxiliary_loss_mlp": 0.01060885, "balance_loss_clip": 1.05737674, "balance_loss_mlp": 1.03920078, "epoch": 0.11188937321509093, "flos": 17455315587840.0, "grad_norm": 2.00330439318394, "language_loss": 0.75250578, "learning_rate": 3.930067929872931e-06, "loss": 0.77465475, "num_input_tokens_seen": 40272000, "step": 1861, "time_per_iteration": 2.6878490447998047 }, { "auxiliary_loss_clip": 0.01177651, "auxiliary_loss_mlp": 0.01054452, "balance_loss_clip": 1.0565964, "balance_loss_mlp": 1.03360212, "epoch": 0.11194949646775891, "flos": 24095916518400.0, "grad_norm": 1.9427039767358767, "language_loss": 0.88888168, "learning_rate": 3.929965805687474e-06, "loss": 0.91120267, "num_input_tokens_seen": 40290660, "step": 1862, "time_per_iteration": 2.615057945251465 }, { "auxiliary_loss_clip": 0.01164251, "auxiliary_loss_mlp": 0.01062894, "balance_loss_clip": 1.05994737, "balance_loss_mlp": 1.04086459, "epoch": 0.11200961972042688, "flos": 25153880728320.0, "grad_norm": 2.2273555113866847, "language_loss": 0.87719512, "learning_rate": 3.92986360831752e-06, "loss": 0.89946657, "num_input_tokens_seen": 40307820, "step": 1863, "time_per_iteration": 2.6778175830841064 }, { "auxiliary_loss_clip": 0.01158667, "auxiliary_loss_mlp": 0.01055299, "balance_loss_clip": 1.05455208, "balance_loss_mlp": 1.03071773, "epoch": 0.11206974297309484, "flos": 21288312829440.0, "grad_norm": 2.8013407816012226, "language_loss": 0.64245486, "learning_rate": 3.929761337766945e-06, "loss": 0.66459453, "num_input_tokens_seen": 40327430, "step": 1864, "time_per_iteration": 2.724076509475708 }, { "auxiliary_loss_clip": 0.01110154, "auxiliary_loss_mlp": 0.01047933, "balance_loss_clip": 1.04924703, "balance_loss_mlp": 1.02672601, "epoch": 0.11212986622576282, "flos": 18915982151040.0, "grad_norm": 2.0303098144917135, "language_loss": 0.74043733, "learning_rate": 3.929658994039627e-06, "loss": 0.7620182, "num_input_tokens_seen": 40344545, "step": 1865, "time_per_iteration": 2.8119356632232666 }, { "auxiliary_loss_clip": 0.01114683, "auxiliary_loss_mlp": 0.01070203, "balance_loss_clip": 1.05348182, "balance_loss_mlp": 1.04483545, "epoch": 0.11218998947843078, "flos": 22054754257920.0, "grad_norm": 2.7389427033573375, "language_loss": 0.84692436, "learning_rate": 3.929556577139446e-06, "loss": 0.86877316, "num_input_tokens_seen": 40362300, "step": 1866, "time_per_iteration": 2.8022067546844482 }, { "auxiliary_loss_clip": 0.01092364, "auxiliary_loss_mlp": 0.00781014, "balance_loss_clip": 1.04227424, "balance_loss_mlp": 1.00006938, "epoch": 0.11225011273109875, "flos": 24571697091840.0, "grad_norm": 1.704208120094955, "language_loss": 0.8104012, "learning_rate": 3.929454087070286e-06, "loss": 0.82913494, "num_input_tokens_seen": 40384720, "step": 1867, "time_per_iteration": 2.915989875793457 }, { "auxiliary_loss_clip": 0.01179505, "auxiliary_loss_mlp": 0.01060529, "balance_loss_clip": 1.05720687, "balance_loss_mlp": 1.03959608, "epoch": 0.11231023598376672, "flos": 28438665621120.0, "grad_norm": 2.0811636681692844, "language_loss": 0.86840278, "learning_rate": 3.929351523836035e-06, "loss": 0.8908031, "num_input_tokens_seen": 40404000, "step": 1868, "time_per_iteration": 2.6855647563934326 }, { "auxiliary_loss_clip": 0.01161412, "auxiliary_loss_mlp": 0.00779977, "balance_loss_clip": 1.06005311, "balance_loss_mlp": 1.00010097, "epoch": 0.1123703592364347, "flos": 14426466076800.0, "grad_norm": 2.1491178409138376, "language_loss": 0.68308532, "learning_rate": 3.9292488874405795e-06, "loss": 0.70249927, "num_input_tokens_seen": 40418665, "step": 1869, "time_per_iteration": 2.7404487133026123 }, { "auxiliary_loss_clip": 0.01133783, "auxiliary_loss_mlp": 0.01066188, "balance_loss_clip": 1.04932964, "balance_loss_mlp": 1.04225063, "epoch": 0.11243048248910266, "flos": 22236282616320.0, "grad_norm": 1.5255545896853626, "language_loss": 0.76943326, "learning_rate": 3.929146177887814e-06, "loss": 0.79143298, "num_input_tokens_seen": 40437870, "step": 1870, "time_per_iteration": 2.809734344482422 }, { "auxiliary_loss_clip": 0.01129358, "auxiliary_loss_mlp": 0.01056867, "balance_loss_clip": 1.0509038, "balance_loss_mlp": 1.03300166, "epoch": 0.11249060574177062, "flos": 18584167288320.0, "grad_norm": 1.8186132867503446, "language_loss": 0.76056099, "learning_rate": 3.929043395181631e-06, "loss": 0.78242326, "num_input_tokens_seen": 40455570, "step": 1871, "time_per_iteration": 2.727161169052124 }, { "auxiliary_loss_clip": 0.01105662, "auxiliary_loss_mlp": 0.01051114, "balance_loss_clip": 1.04993379, "balance_loss_mlp": 1.03026426, "epoch": 0.1125507289944386, "flos": 22856567604480.0, "grad_norm": 1.9425066802508644, "language_loss": 0.81811988, "learning_rate": 3.928940539325929e-06, "loss": 0.83968765, "num_input_tokens_seen": 40473600, "step": 1872, "time_per_iteration": 2.851868152618408 }, { "auxiliary_loss_clip": 0.01179923, "auxiliary_loss_mlp": 0.01055722, "balance_loss_clip": 1.05722499, "balance_loss_mlp": 1.03359652, "epoch": 0.11261085224710657, "flos": 19676390094720.0, "grad_norm": 2.186176467187071, "language_loss": 0.8361913, "learning_rate": 3.9288376103246095e-06, "loss": 0.85854775, "num_input_tokens_seen": 40490025, "step": 1873, "time_per_iteration": 2.6668763160705566 }, { "auxiliary_loss_clip": 0.01144862, "auxiliary_loss_mlp": 0.01054726, "balance_loss_clip": 1.0525465, "balance_loss_mlp": 1.03196871, "epoch": 0.11267097549977453, "flos": 26063246373120.0, "grad_norm": 1.8822875514234196, "language_loss": 0.92342389, "learning_rate": 3.928734608181575e-06, "loss": 0.94541967, "num_input_tokens_seen": 40511580, "step": 1874, "time_per_iteration": 2.700533866882324 }, { "auxiliary_loss_clip": 0.01140327, "auxiliary_loss_mlp": 0.01056402, "balance_loss_clip": 1.05100179, "balance_loss_mlp": 1.03509891, "epoch": 0.11273109875244251, "flos": 21068036674560.0, "grad_norm": 1.6564425098873434, "language_loss": 0.75359404, "learning_rate": 3.928631532900729e-06, "loss": 0.77556133, "num_input_tokens_seen": 40530155, "step": 1875, "time_per_iteration": 2.7642719745635986 }, { "auxiliary_loss_clip": 0.01167091, "auxiliary_loss_mlp": 0.01055271, "balance_loss_clip": 1.05893159, "balance_loss_mlp": 1.0348264, "epoch": 0.11279122200511048, "flos": 27088999061760.0, "grad_norm": 2.12758140825061, "language_loss": 0.71578634, "learning_rate": 3.928528384485984e-06, "loss": 0.73800993, "num_input_tokens_seen": 40549500, "step": 1876, "time_per_iteration": 2.8505096435546875 }, { "auxiliary_loss_clip": 0.01147417, "auxiliary_loss_mlp": 0.01054094, "balance_loss_clip": 1.05223966, "balance_loss_mlp": 1.03200495, "epoch": 0.11285134525777844, "flos": 20187901722240.0, "grad_norm": 1.8103612630164048, "language_loss": 0.76795971, "learning_rate": 3.9284251629412475e-06, "loss": 0.78997481, "num_input_tokens_seen": 40567475, "step": 1877, "time_per_iteration": 2.6972849369049072 }, { "auxiliary_loss_clip": 0.01168106, "auxiliary_loss_mlp": 0.01063056, "balance_loss_clip": 1.05518627, "balance_loss_mlp": 1.04026341, "epoch": 0.11291146851044641, "flos": 12458453863680.0, "grad_norm": 2.1601834607000368, "language_loss": 0.87843502, "learning_rate": 3.928321868270436e-06, "loss": 0.90074658, "num_input_tokens_seen": 40583280, "step": 1878, "time_per_iteration": 5.6992692947387695 }, { "auxiliary_loss_clip": 0.01140682, "auxiliary_loss_mlp": 0.01054902, "balance_loss_clip": 1.05420399, "balance_loss_mlp": 1.03333724, "epoch": 0.11297159176311439, "flos": 23842315520640.0, "grad_norm": 2.151084139284284, "language_loss": 0.81623232, "learning_rate": 3.928218500477466e-06, "loss": 0.83818817, "num_input_tokens_seen": 40603080, "step": 1879, "time_per_iteration": 2.8688366413116455 }, { "auxiliary_loss_clip": 0.01155904, "auxiliary_loss_mlp": 0.01059079, "balance_loss_clip": 1.05238748, "balance_loss_mlp": 1.03609526, "epoch": 0.11303171501578235, "flos": 29930538124800.0, "grad_norm": 1.941623939252122, "language_loss": 0.70234305, "learning_rate": 3.928115059566259e-06, "loss": 0.72449279, "num_input_tokens_seen": 40623255, "step": 1880, "time_per_iteration": 5.567574739456177 }, { "auxiliary_loss_clip": 0.01155691, "auxiliary_loss_mlp": 0.01052309, "balance_loss_clip": 1.05585837, "balance_loss_mlp": 1.0306015, "epoch": 0.11309183826845032, "flos": 16180558842240.0, "grad_norm": 1.6696082535169858, "language_loss": 0.72690225, "learning_rate": 3.928011545540734e-06, "loss": 0.74898225, "num_input_tokens_seen": 40641570, "step": 1881, "time_per_iteration": 2.792428493499756 }, { "auxiliary_loss_clip": 0.011425, "auxiliary_loss_mlp": 0.00781179, "balance_loss_clip": 1.05046606, "balance_loss_mlp": 1.00008667, "epoch": 0.1131519615211183, "flos": 12020702814720.0, "grad_norm": 2.2964043184115783, "language_loss": 0.74205768, "learning_rate": 3.927907958404819e-06, "loss": 0.76129448, "num_input_tokens_seen": 40658775, "step": 1882, "time_per_iteration": 4.414916515350342 }, { "auxiliary_loss_clip": 0.01177281, "auxiliary_loss_mlp": 0.01054815, "balance_loss_clip": 1.05680335, "balance_loss_mlp": 1.03203452, "epoch": 0.11321208477378626, "flos": 26250125857920.0, "grad_norm": 2.4326158086005965, "language_loss": 0.7923016, "learning_rate": 3.92780429816244e-06, "loss": 0.81462252, "num_input_tokens_seen": 40679555, "step": 1883, "time_per_iteration": 2.762615919113159 }, { "auxiliary_loss_clip": 0.01140926, "auxiliary_loss_mlp": 0.01058465, "balance_loss_clip": 1.05226314, "balance_loss_mlp": 1.03520727, "epoch": 0.11327220802645423, "flos": 13626376583040.0, "grad_norm": 2.2898863699254974, "language_loss": 0.77047318, "learning_rate": 3.927700564817529e-06, "loss": 0.79246712, "num_input_tokens_seen": 40697295, "step": 1884, "time_per_iteration": 2.835468292236328 }, { "auxiliary_loss_clip": 0.01074478, "auxiliary_loss_mlp": 0.01009476, "balance_loss_clip": 1.03993821, "balance_loss_mlp": 1.00620937, "epoch": 0.1133323312791222, "flos": 57191802814080.0, "grad_norm": 0.8138652948403053, "language_loss": 0.55151373, "learning_rate": 3.927596758374019e-06, "loss": 0.5723533, "num_input_tokens_seen": 40758095, "step": 1885, "time_per_iteration": 3.179532289505005 }, { "auxiliary_loss_clip": 0.01083888, "auxiliary_loss_mlp": 0.01050751, "balance_loss_clip": 1.04415166, "balance_loss_mlp": 1.02910316, "epoch": 0.11339245453179017, "flos": 24351708245760.0, "grad_norm": 1.9836288003076585, "language_loss": 0.90384823, "learning_rate": 3.927492878835848e-06, "loss": 0.92519462, "num_input_tokens_seen": 40777140, "step": 1886, "time_per_iteration": 3.038928747177124 }, { "auxiliary_loss_clip": 0.01116325, "auxiliary_loss_mlp": 0.01057697, "balance_loss_clip": 1.05137897, "balance_loss_mlp": 1.03634632, "epoch": 0.11345257778445814, "flos": 22670693700480.0, "grad_norm": 2.0132756022974023, "language_loss": 0.84852886, "learning_rate": 3.927388926206953e-06, "loss": 0.87026906, "num_input_tokens_seen": 40797505, "step": 1887, "time_per_iteration": 3.178863048553467 }, { "auxiliary_loss_clip": 0.01136567, "auxiliary_loss_mlp": 0.01056557, "balance_loss_clip": 1.05091035, "balance_loss_mlp": 1.03549314, "epoch": 0.11351270103712612, "flos": 20988242611200.0, "grad_norm": 2.847610033990257, "language_loss": 0.75826252, "learning_rate": 3.927284900491277e-06, "loss": 0.78019381, "num_input_tokens_seen": 40812970, "step": 1888, "time_per_iteration": 2.7349846363067627 }, { "auxiliary_loss_clip": 0.0113463, "auxiliary_loss_mlp": 0.01062359, "balance_loss_clip": 1.05614805, "balance_loss_mlp": 1.03892243, "epoch": 0.11357282428979408, "flos": 37347923600640.0, "grad_norm": 2.0598279187313624, "language_loss": 0.68104899, "learning_rate": 3.927180801692764e-06, "loss": 0.7030189, "num_input_tokens_seen": 40837745, "step": 1889, "time_per_iteration": 3.144444465637207 }, { "auxiliary_loss_clip": 0.01177206, "auxiliary_loss_mlp": 0.01049162, "balance_loss_clip": 1.05653095, "balance_loss_mlp": 1.02694094, "epoch": 0.11363294754246205, "flos": 21757018423680.0, "grad_norm": 1.7896678692754837, "language_loss": 0.83947051, "learning_rate": 3.927076629815362e-06, "loss": 0.86173415, "num_input_tokens_seen": 40856490, "step": 1890, "time_per_iteration": 2.73126482963562 }, { "auxiliary_loss_clip": 0.01145149, "auxiliary_loss_mlp": 0.01056017, "balance_loss_clip": 1.05039728, "balance_loss_mlp": 1.03395164, "epoch": 0.11369307079513001, "flos": 22601637803520.0, "grad_norm": 2.1678723202845256, "language_loss": 0.64663875, "learning_rate": 3.926972384863022e-06, "loss": 0.66865045, "num_input_tokens_seen": 40874070, "step": 1891, "time_per_iteration": 2.7474160194396973 }, { "auxiliary_loss_clip": 0.01145505, "auxiliary_loss_mlp": 0.01049015, "balance_loss_clip": 1.05395687, "balance_loss_mlp": 1.02773631, "epoch": 0.11375319404779799, "flos": 21944257044480.0, "grad_norm": 2.126575023047711, "language_loss": 0.87889415, "learning_rate": 3.9268680668396956e-06, "loss": 0.90083933, "num_input_tokens_seen": 40892425, "step": 1892, "time_per_iteration": 2.795269250869751 }, { "auxiliary_loss_clip": 0.01119535, "auxiliary_loss_mlp": 0.01079586, "balance_loss_clip": 1.05541015, "balance_loss_mlp": 1.05461168, "epoch": 0.11381331730046595, "flos": 26395456285440.0, "grad_norm": 3.1806920305576973, "language_loss": 0.72902197, "learning_rate": 3.926763675749339e-06, "loss": 0.75101316, "num_input_tokens_seen": 40912190, "step": 1893, "time_per_iteration": 2.890289306640625 }, { "auxiliary_loss_clip": 0.01175698, "auxiliary_loss_mlp": 0.0106591, "balance_loss_clip": 1.05438137, "balance_loss_mlp": 1.04290223, "epoch": 0.11387344055313392, "flos": 23804716959360.0, "grad_norm": 1.8842571229841023, "language_loss": 0.79247093, "learning_rate": 3.92665921159591e-06, "loss": 0.81488699, "num_input_tokens_seen": 40928395, "step": 1894, "time_per_iteration": 2.6820743083953857 }, { "auxiliary_loss_clip": 0.01150233, "auxiliary_loss_mlp": 0.01061956, "balance_loss_clip": 1.05356526, "balance_loss_mlp": 1.03944933, "epoch": 0.1139335638058019, "flos": 34522865902080.0, "grad_norm": 3.429237983174195, "language_loss": 0.79718482, "learning_rate": 3.926554674383371e-06, "loss": 0.81930667, "num_input_tokens_seen": 40946555, "step": 1895, "time_per_iteration": 2.829946994781494 }, { "auxiliary_loss_clip": 0.01075529, "auxiliary_loss_mlp": 0.01018518, "balance_loss_clip": 1.03062391, "balance_loss_mlp": 1.0155375, "epoch": 0.11399368705846986, "flos": 70587811520640.0, "grad_norm": 0.8041110638842961, "language_loss": 0.63357508, "learning_rate": 3.926450064115686e-06, "loss": 0.65451556, "num_input_tokens_seen": 41004910, "step": 1896, "time_per_iteration": 3.3087315559387207 }, { "auxiliary_loss_clip": 0.01147265, "auxiliary_loss_mlp": 0.0106086, "balance_loss_clip": 1.05560398, "balance_loss_mlp": 1.03663635, "epoch": 0.11405381031113783, "flos": 21324259365120.0, "grad_norm": 1.5952307342327186, "language_loss": 0.85055745, "learning_rate": 3.926345380796821e-06, "loss": 0.8726387, "num_input_tokens_seen": 41026385, "step": 1897, "time_per_iteration": 2.8522274494171143 }, { "auxiliary_loss_clip": 0.0117836, "auxiliary_loss_mlp": 0.00780276, "balance_loss_clip": 1.05591989, "balance_loss_mlp": 1.0001986, "epoch": 0.11411393356380581, "flos": 19719627091200.0, "grad_norm": 3.3624139627125587, "language_loss": 0.79675245, "learning_rate": 3.9262406244307465e-06, "loss": 0.81633884, "num_input_tokens_seen": 41045315, "step": 1898, "time_per_iteration": 2.760057210922241 }, { "auxiliary_loss_clip": 0.01115338, "auxiliary_loss_mlp": 0.01064417, "balance_loss_clip": 1.04594529, "balance_loss_mlp": 1.03965724, "epoch": 0.11417405681647377, "flos": 17530440883200.0, "grad_norm": 2.0191769665152903, "language_loss": 0.73251313, "learning_rate": 3.926135795021435e-06, "loss": 0.75431061, "num_input_tokens_seen": 41063390, "step": 1899, "time_per_iteration": 2.7363204956054688 }, { "auxiliary_loss_clip": 0.01042449, "auxiliary_loss_mlp": 0.01003313, "balance_loss_clip": 1.03643703, "balance_loss_mlp": 1.0003922, "epoch": 0.11423418006914174, "flos": 59674666619520.0, "grad_norm": 0.9089505356695228, "language_loss": 0.63434029, "learning_rate": 3.92603089257286e-06, "loss": 0.65479791, "num_input_tokens_seen": 41124180, "step": 1900, "time_per_iteration": 3.2045955657958984 }, { "auxiliary_loss_clip": 0.01113626, "auxiliary_loss_mlp": 0.01066815, "balance_loss_clip": 1.04929233, "balance_loss_mlp": 1.04378414, "epoch": 0.1142943033218097, "flos": 22963114321920.0, "grad_norm": 1.577500478750639, "language_loss": 0.77943742, "learning_rate": 3.925925917089001e-06, "loss": 0.80124187, "num_input_tokens_seen": 41143485, "step": 1901, "time_per_iteration": 2.745089530944824 }, { "auxiliary_loss_clip": 0.01171621, "auxiliary_loss_mlp": 0.01057834, "balance_loss_clip": 1.05803061, "balance_loss_mlp": 1.0359118, "epoch": 0.11435442657447768, "flos": 18256267008000.0, "grad_norm": 2.175933638179557, "language_loss": 0.84158623, "learning_rate": 3.925820868573839e-06, "loss": 0.86388075, "num_input_tokens_seen": 41161695, "step": 1902, "time_per_iteration": 2.6433799266815186 }, { "auxiliary_loss_clip": 0.01159941, "auxiliary_loss_mlp": 0.01056662, "balance_loss_clip": 1.05280399, "balance_loss_mlp": 1.03122306, "epoch": 0.11441454982714565, "flos": 24061191045120.0, "grad_norm": 1.7702735053047673, "language_loss": 0.77720451, "learning_rate": 3.925715747031356e-06, "loss": 0.79937053, "num_input_tokens_seen": 41181715, "step": 1903, "time_per_iteration": 2.6385905742645264 }, { "auxiliary_loss_clip": 0.01145143, "auxiliary_loss_mlp": 0.0104196, "balance_loss_clip": 1.05293322, "balance_loss_mlp": 1.02174175, "epoch": 0.11447467307981361, "flos": 25337707557120.0, "grad_norm": 2.212790565732917, "language_loss": 0.75751555, "learning_rate": 3.925610552465539e-06, "loss": 0.77938658, "num_input_tokens_seen": 41201770, "step": 1904, "time_per_iteration": 2.632152557373047 }, { "auxiliary_loss_clip": 0.01149375, "auxiliary_loss_mlp": 0.01056532, "balance_loss_clip": 1.05207586, "balance_loss_mlp": 1.03279781, "epoch": 0.11453479633248159, "flos": 21726063878400.0, "grad_norm": 2.4422699353972006, "language_loss": 0.91853034, "learning_rate": 3.9255052848803764e-06, "loss": 0.94058943, "num_input_tokens_seen": 41220590, "step": 1905, "time_per_iteration": 2.7421486377716064 }, { "auxiliary_loss_clip": 0.01161686, "auxiliary_loss_mlp": 0.01050264, "balance_loss_clip": 1.04978943, "balance_loss_mlp": 1.02612448, "epoch": 0.11459491958514956, "flos": 12969714096000.0, "grad_norm": 2.5117992419356066, "language_loss": 0.77484202, "learning_rate": 3.925399944279861e-06, "loss": 0.79696143, "num_input_tokens_seen": 41237250, "step": 1906, "time_per_iteration": 2.69333553314209 }, { "auxiliary_loss_clip": 0.0117911, "auxiliary_loss_mlp": 0.01055129, "balance_loss_clip": 1.05697322, "balance_loss_mlp": 1.03222847, "epoch": 0.11465504283781752, "flos": 22711273090560.0, "grad_norm": 2.0720467666322113, "language_loss": 0.81739306, "learning_rate": 3.925294530667986e-06, "loss": 0.83973539, "num_input_tokens_seen": 41256680, "step": 1907, "time_per_iteration": 2.6531317234039307 }, { "auxiliary_loss_clip": 0.0113647, "auxiliary_loss_mlp": 0.01065473, "balance_loss_clip": 1.05235374, "balance_loss_mlp": 1.04227471, "epoch": 0.1147151660904855, "flos": 23398387332480.0, "grad_norm": 2.1769364553121293, "language_loss": 0.84901214, "learning_rate": 3.92518904404875e-06, "loss": 0.87103164, "num_input_tokens_seen": 41270955, "step": 1908, "time_per_iteration": 2.8768258094787598 }, { "auxiliary_loss_clip": 0.01029536, "auxiliary_loss_mlp": 0.01020856, "balance_loss_clip": 1.02524137, "balance_loss_mlp": 1.01694632, "epoch": 0.11477528934315347, "flos": 63011843498880.0, "grad_norm": 0.9197306473097341, "language_loss": 0.61072773, "learning_rate": 3.925083484426153e-06, "loss": 0.63123173, "num_input_tokens_seen": 41319180, "step": 1909, "time_per_iteration": 3.0845727920532227 }, { "auxiliary_loss_clip": 0.01182744, "auxiliary_loss_mlp": 0.01054075, "balance_loss_clip": 1.06014562, "balance_loss_mlp": 1.03219986, "epoch": 0.11483541259582143, "flos": 16325601960960.0, "grad_norm": 7.319166590530674, "language_loss": 0.79170966, "learning_rate": 3.924977851804197e-06, "loss": 0.81407785, "num_input_tokens_seen": 41337480, "step": 1910, "time_per_iteration": 2.708704710006714 }, { "auxiliary_loss_clip": 0.01156489, "auxiliary_loss_mlp": 0.01052406, "balance_loss_clip": 1.0580864, "balance_loss_mlp": 1.03029275, "epoch": 0.1148955358484894, "flos": 21580410228480.0, "grad_norm": 2.117911712245717, "language_loss": 0.7702589, "learning_rate": 3.9248721461868875e-06, "loss": 0.79234779, "num_input_tokens_seen": 41354650, "step": 1911, "time_per_iteration": 2.7597720623016357 }, { "auxiliary_loss_clip": 0.01159986, "auxiliary_loss_mlp": 0.01054599, "balance_loss_clip": 1.05726957, "balance_loss_mlp": 1.03227139, "epoch": 0.11495565910115738, "flos": 27673696650240.0, "grad_norm": 1.677508784227342, "language_loss": 0.79177421, "learning_rate": 3.9247663675782336e-06, "loss": 0.81392002, "num_input_tokens_seen": 41376935, "step": 1912, "time_per_iteration": 2.8143310546875 }, { "auxiliary_loss_clip": 0.01183047, "auxiliary_loss_mlp": 0.00779659, "balance_loss_clip": 1.06065917, "balance_loss_mlp": 1.00014925, "epoch": 0.11501578235382534, "flos": 20632368614400.0, "grad_norm": 2.291252405113977, "language_loss": 0.77942276, "learning_rate": 3.924660515982246e-06, "loss": 0.79904979, "num_input_tokens_seen": 41396105, "step": 1913, "time_per_iteration": 2.696430206298828 }, { "auxiliary_loss_clip": 0.01166892, "auxiliary_loss_mlp": 0.01052769, "balance_loss_clip": 1.05442226, "balance_loss_mlp": 1.02953506, "epoch": 0.1150759056064933, "flos": 19829046896640.0, "grad_norm": 1.8145547055361753, "language_loss": 0.7003395, "learning_rate": 3.924554591402939e-06, "loss": 0.72253609, "num_input_tokens_seen": 41415600, "step": 1914, "time_per_iteration": 2.739251136779785 }, { "auxiliary_loss_clip": 0.01007182, "auxiliary_loss_mlp": 0.01004682, "balance_loss_clip": 1.02677619, "balance_loss_mlp": 1.00191641, "epoch": 0.11513602885916129, "flos": 70045776311040.0, "grad_norm": 0.7558771871458172, "language_loss": 0.61059874, "learning_rate": 3.92444859384433e-06, "loss": 0.6307174, "num_input_tokens_seen": 41478760, "step": 1915, "time_per_iteration": 3.56019926071167 }, { "auxiliary_loss_clip": 0.01166434, "auxiliary_loss_mlp": 0.01058573, "balance_loss_clip": 1.05994964, "balance_loss_mlp": 1.03595936, "epoch": 0.11519615211182925, "flos": 15741730385280.0, "grad_norm": 2.437201506258279, "language_loss": 0.93116963, "learning_rate": 3.924342523310436e-06, "loss": 0.95341969, "num_input_tokens_seen": 41495720, "step": 1916, "time_per_iteration": 3.244772434234619 }, { "auxiliary_loss_clip": 0.01161132, "auxiliary_loss_mlp": 0.01059827, "balance_loss_clip": 1.05798697, "balance_loss_mlp": 1.03470993, "epoch": 0.11525627536449722, "flos": 20667632791680.0, "grad_norm": 1.8909260082350545, "language_loss": 0.72560197, "learning_rate": 3.9242363798052806e-06, "loss": 0.74781156, "num_input_tokens_seen": 41513585, "step": 1917, "time_per_iteration": 4.502236843109131 }, { "auxiliary_loss_clip": 0.01138773, "auxiliary_loss_mlp": 0.0104964, "balance_loss_clip": 1.05739903, "balance_loss_mlp": 1.02700245, "epoch": 0.1153163986171652, "flos": 20303283185280.0, "grad_norm": 9.147356795176979, "language_loss": 0.74213129, "learning_rate": 3.92413016333289e-06, "loss": 0.76401544, "num_input_tokens_seen": 41533390, "step": 1918, "time_per_iteration": 4.344711065292358 }, { "auxiliary_loss_clip": 0.0114898, "auxiliary_loss_mlp": 0.010469, "balance_loss_clip": 1.05532503, "balance_loss_mlp": 1.02450073, "epoch": 0.11537652186983316, "flos": 17639321984640.0, "grad_norm": 3.182152136597976, "language_loss": 0.86367452, "learning_rate": 3.92402387389729e-06, "loss": 0.88563335, "num_input_tokens_seen": 41551015, "step": 1919, "time_per_iteration": 4.540036201477051 }, { "auxiliary_loss_clip": 0.01134044, "auxiliary_loss_mlp": 0.01067867, "balance_loss_clip": 1.0496366, "balance_loss_mlp": 1.04172444, "epoch": 0.11543664512250112, "flos": 21069401391360.0, "grad_norm": 1.93595243799445, "language_loss": 0.86735415, "learning_rate": 3.923917511502512e-06, "loss": 0.8893733, "num_input_tokens_seen": 41568055, "step": 1920, "time_per_iteration": 2.7719242572784424 }, { "auxiliary_loss_clip": 0.011686, "auxiliary_loss_mlp": 0.010528, "balance_loss_clip": 1.0593946, "balance_loss_mlp": 1.0302341, "epoch": 0.11549676837516909, "flos": 22747542848640.0, "grad_norm": 4.512761907267092, "language_loss": 0.79294932, "learning_rate": 3.923811076152589e-06, "loss": 0.81516337, "num_input_tokens_seen": 41587435, "step": 1921, "time_per_iteration": 2.798673629760742 }, { "auxiliary_loss_clip": 0.01174604, "auxiliary_loss_mlp": 0.01063526, "balance_loss_clip": 1.05685806, "balance_loss_mlp": 1.04007721, "epoch": 0.11555689162783707, "flos": 19168972617600.0, "grad_norm": 2.4057040360661484, "language_loss": 0.78464305, "learning_rate": 3.923704567851557e-06, "loss": 0.80702436, "num_input_tokens_seen": 41604975, "step": 1922, "time_per_iteration": 4.352341651916504 }, { "auxiliary_loss_clip": 0.01092284, "auxiliary_loss_mlp": 0.01064602, "balance_loss_clip": 1.04645681, "balance_loss_mlp": 1.04229808, "epoch": 0.11561701488050503, "flos": 24572056227840.0, "grad_norm": 1.8560991769949675, "language_loss": 0.84293079, "learning_rate": 3.923597986603456e-06, "loss": 0.86449969, "num_input_tokens_seen": 41626155, "step": 1923, "time_per_iteration": 3.2956740856170654 }, { "auxiliary_loss_clip": 0.01171957, "auxiliary_loss_mlp": 0.01056739, "balance_loss_clip": 1.0600003, "balance_loss_mlp": 1.03317094, "epoch": 0.115677138133173, "flos": 17092546179840.0, "grad_norm": 1.944851076041885, "language_loss": 0.80890471, "learning_rate": 3.9234913324123264e-06, "loss": 0.83119166, "num_input_tokens_seen": 41644805, "step": 1924, "time_per_iteration": 3.0939247608184814 }, { "auxiliary_loss_clip": 0.01055916, "auxiliary_loss_mlp": 0.01027131, "balance_loss_clip": 1.03045607, "balance_loss_mlp": 1.02436543, "epoch": 0.11573726138584098, "flos": 62703875266560.0, "grad_norm": 0.8171642061509322, "language_loss": 0.61196578, "learning_rate": 3.923384605282212e-06, "loss": 0.63279623, "num_input_tokens_seen": 41709345, "step": 1925, "time_per_iteration": 3.3765265941619873 }, { "auxiliary_loss_clip": 0.01155845, "auxiliary_loss_mlp": 0.01079328, "balance_loss_clip": 1.05374098, "balance_loss_mlp": 1.0549382, "epoch": 0.11579738463850894, "flos": 22601135013120.0, "grad_norm": 1.7772533553430212, "language_loss": 0.74766397, "learning_rate": 3.923277805217161e-06, "loss": 0.77001572, "num_input_tokens_seen": 41730210, "step": 1926, "time_per_iteration": 2.754974126815796 }, { "auxiliary_loss_clip": 0.01116228, "auxiliary_loss_mlp": 0.00781701, "balance_loss_clip": 1.04683304, "balance_loss_mlp": 1.00016665, "epoch": 0.11585750789117691, "flos": 21726135705600.0, "grad_norm": 4.731879086182685, "language_loss": 0.71978599, "learning_rate": 3.923170932221222e-06, "loss": 0.7387653, "num_input_tokens_seen": 41750270, "step": 1927, "time_per_iteration": 2.9454004764556885 }, { "auxiliary_loss_clip": 0.01137955, "auxiliary_loss_mlp": 0.01058796, "balance_loss_clip": 1.05250621, "balance_loss_mlp": 1.03572917, "epoch": 0.11591763114384489, "flos": 26287544851200.0, "grad_norm": 1.5938674022456252, "language_loss": 0.86854041, "learning_rate": 3.92306398629845e-06, "loss": 0.89050794, "num_input_tokens_seen": 41772975, "step": 1928, "time_per_iteration": 2.832750082015991 }, { "auxiliary_loss_clip": 0.01129041, "auxiliary_loss_mlp": 0.01060836, "balance_loss_clip": 1.05032003, "balance_loss_mlp": 1.03706551, "epoch": 0.11597775439651285, "flos": 23000461488000.0, "grad_norm": 1.6639520350020578, "language_loss": 0.77450585, "learning_rate": 3.922956967452898e-06, "loss": 0.79640466, "num_input_tokens_seen": 41791765, "step": 1929, "time_per_iteration": 2.7876811027526855 }, { "auxiliary_loss_clip": 0.01176887, "auxiliary_loss_mlp": 0.01063611, "balance_loss_clip": 1.05667901, "balance_loss_mlp": 1.0424509, "epoch": 0.11603787764918082, "flos": 31941715507200.0, "grad_norm": 1.8085677541874856, "language_loss": 0.76831949, "learning_rate": 3.922849875688626e-06, "loss": 0.79072452, "num_input_tokens_seen": 41815615, "step": 1930, "time_per_iteration": 2.819934844970703 }, { "auxiliary_loss_clip": 0.01145781, "auxiliary_loss_mlp": 0.01054046, "balance_loss_clip": 1.05066586, "balance_loss_mlp": 1.03165817, "epoch": 0.1160980009018488, "flos": 22271654534400.0, "grad_norm": 1.9434791543130712, "language_loss": 0.72291863, "learning_rate": 3.922742711009693e-06, "loss": 0.74491692, "num_input_tokens_seen": 41834810, "step": 1931, "time_per_iteration": 2.8078088760375977 }, { "auxiliary_loss_clip": 0.01146409, "auxiliary_loss_mlp": 0.01061336, "balance_loss_clip": 1.05090261, "balance_loss_mlp": 1.03575325, "epoch": 0.11615812415451676, "flos": 22783633038720.0, "grad_norm": 1.7378937044391531, "language_loss": 0.8222791, "learning_rate": 3.922635473420164e-06, "loss": 0.8443566, "num_input_tokens_seen": 41854975, "step": 1932, "time_per_iteration": 2.7495200634002686 }, { "auxiliary_loss_clip": 0.01030493, "auxiliary_loss_mlp": 0.01018834, "balance_loss_clip": 1.02184403, "balance_loss_mlp": 1.01556778, "epoch": 0.11621824740718473, "flos": 67146096107520.0, "grad_norm": 0.7669378012870447, "language_loss": 0.61050332, "learning_rate": 3.922528162924105e-06, "loss": 0.63099658, "num_input_tokens_seen": 41911105, "step": 1933, "time_per_iteration": 3.256678581237793 }, { "auxiliary_loss_clip": 0.01108577, "auxiliary_loss_mlp": 0.00780156, "balance_loss_clip": 1.04764509, "balance_loss_mlp": 1.00006175, "epoch": 0.11627837065985269, "flos": 20375930442240.0, "grad_norm": 2.830760437639296, "language_loss": 0.85790741, "learning_rate": 3.922420779525586e-06, "loss": 0.8767947, "num_input_tokens_seen": 41931750, "step": 1934, "time_per_iteration": 2.9144253730773926 }, { "auxiliary_loss_clip": 0.01117671, "auxiliary_loss_mlp": 0.01059839, "balance_loss_clip": 1.04929256, "balance_loss_mlp": 1.03453088, "epoch": 0.11633849391252067, "flos": 21725812483200.0, "grad_norm": 2.625764216143105, "language_loss": 0.66222906, "learning_rate": 3.9223133232286776e-06, "loss": 0.68400419, "num_input_tokens_seen": 41949400, "step": 1935, "time_per_iteration": 2.867152452468872 }, { "auxiliary_loss_clip": 0.01183991, "auxiliary_loss_mlp": 0.01052492, "balance_loss_clip": 1.05868936, "balance_loss_mlp": 1.03111792, "epoch": 0.11639861716518864, "flos": 18805341283200.0, "grad_norm": 2.025938843377603, "language_loss": 0.75678742, "learning_rate": 3.922205794037456e-06, "loss": 0.77915227, "num_input_tokens_seen": 41968100, "step": 1936, "time_per_iteration": 2.7282185554504395 }, { "auxiliary_loss_clip": 0.01179718, "auxiliary_loss_mlp": 0.01049532, "balance_loss_clip": 1.05632091, "balance_loss_mlp": 1.02639306, "epoch": 0.1164587404178566, "flos": 21214983214080.0, "grad_norm": 2.0032002399718905, "language_loss": 0.84086847, "learning_rate": 3.922098191955998e-06, "loss": 0.86316097, "num_input_tokens_seen": 41986375, "step": 1937, "time_per_iteration": 2.715386152267456 }, { "auxiliary_loss_clip": 0.01152084, "auxiliary_loss_mlp": 0.01048961, "balance_loss_clip": 1.05258632, "balance_loss_mlp": 1.0268234, "epoch": 0.11651886367052458, "flos": 27818632028160.0, "grad_norm": 3.0485930101216607, "language_loss": 0.7617709, "learning_rate": 3.921990516988384e-06, "loss": 0.78378135, "num_input_tokens_seen": 42006055, "step": 1938, "time_per_iteration": 2.7624804973602295 }, { "auxiliary_loss_clip": 0.01182576, "auxiliary_loss_mlp": 0.01055104, "balance_loss_clip": 1.05742419, "balance_loss_mlp": 1.03250146, "epoch": 0.11657898692319255, "flos": 22889569224960.0, "grad_norm": 1.7682499083089231, "language_loss": 0.79677606, "learning_rate": 3.921882769138696e-06, "loss": 0.81915289, "num_input_tokens_seen": 42024995, "step": 1939, "time_per_iteration": 2.71458101272583 }, { "auxiliary_loss_clip": 0.01148291, "auxiliary_loss_mlp": 0.01057951, "balance_loss_clip": 1.05209351, "balance_loss_mlp": 1.03508627, "epoch": 0.11663911017586051, "flos": 24315905364480.0, "grad_norm": 2.2281245193552475, "language_loss": 0.85916591, "learning_rate": 3.9217749484110215e-06, "loss": 0.88122833, "num_input_tokens_seen": 42042640, "step": 1940, "time_per_iteration": 2.7322728633880615 }, { "auxiliary_loss_clip": 0.01153746, "auxiliary_loss_mlp": 0.01056301, "balance_loss_clip": 1.05659437, "balance_loss_mlp": 1.03548717, "epoch": 0.11669923342852849, "flos": 42340152470400.0, "grad_norm": 1.4952807995381137, "language_loss": 0.75590646, "learning_rate": 3.921667054809449e-06, "loss": 0.77800703, "num_input_tokens_seen": 42067005, "step": 1941, "time_per_iteration": 2.9211390018463135 }, { "auxiliary_loss_clip": 0.01149585, "auxiliary_loss_mlp": 0.00780203, "balance_loss_clip": 1.05181897, "balance_loss_mlp": 1.00006557, "epoch": 0.11675935668119646, "flos": 14642288945280.0, "grad_norm": 2.277225749463833, "language_loss": 0.88847101, "learning_rate": 3.921559088338068e-06, "loss": 0.90776885, "num_input_tokens_seen": 42082295, "step": 1942, "time_per_iteration": 2.7145469188690186 }, { "auxiliary_loss_clip": 0.01165183, "auxiliary_loss_mlp": 0.01056257, "balance_loss_clip": 1.05553317, "balance_loss_mlp": 1.03552663, "epoch": 0.11681947993386442, "flos": 35116470063360.0, "grad_norm": 1.6547450593003057, "language_loss": 0.67979252, "learning_rate": 3.921451049000975e-06, "loss": 0.70200694, "num_input_tokens_seen": 42105295, "step": 1943, "time_per_iteration": 2.789701461791992 }, { "auxiliary_loss_clip": 0.01153022, "auxiliary_loss_mlp": 0.01047648, "balance_loss_clip": 1.05515063, "balance_loss_mlp": 1.02591634, "epoch": 0.11687960318653239, "flos": 38983259024640.0, "grad_norm": 1.9817763000300312, "language_loss": 0.69831288, "learning_rate": 3.921342936802265e-06, "loss": 0.72031963, "num_input_tokens_seen": 42125520, "step": 1944, "time_per_iteration": 2.827150583267212 }, { "auxiliary_loss_clip": 0.01155915, "auxiliary_loss_mlp": 0.01051888, "balance_loss_clip": 1.05038309, "balance_loss_mlp": 1.03158641, "epoch": 0.11693972643920036, "flos": 25994980575360.0, "grad_norm": 1.4963028532298175, "language_loss": 0.82662582, "learning_rate": 3.921234751746038e-06, "loss": 0.84870374, "num_input_tokens_seen": 42146335, "step": 1945, "time_per_iteration": 2.7190194129943848 }, { "auxiliary_loss_clip": 0.01137101, "auxiliary_loss_mlp": 0.01062082, "balance_loss_clip": 1.04682803, "balance_loss_mlp": 1.04005265, "epoch": 0.11699984969186833, "flos": 27272107618560.0, "grad_norm": 2.3643045784637735, "language_loss": 0.76298034, "learning_rate": 3.9211264938363975e-06, "loss": 0.78497219, "num_input_tokens_seen": 42165320, "step": 1946, "time_per_iteration": 2.792555093765259 }, { "auxiliary_loss_clip": 0.01134728, "auxiliary_loss_mlp": 0.01056112, "balance_loss_clip": 1.0507704, "balance_loss_mlp": 1.03536999, "epoch": 0.1170599729445363, "flos": 15267853232640.0, "grad_norm": 2.058923240355934, "language_loss": 0.69014907, "learning_rate": 3.921018163077448e-06, "loss": 0.71205747, "num_input_tokens_seen": 42182955, "step": 1947, "time_per_iteration": 2.643807888031006 }, { "auxiliary_loss_clip": 0.01154759, "auxiliary_loss_mlp": 0.01067767, "balance_loss_clip": 1.05707347, "balance_loss_mlp": 1.04604673, "epoch": 0.11712009619720427, "flos": 17164439251200.0, "grad_norm": 2.0690991629011615, "language_loss": 0.85044622, "learning_rate": 3.920909759473295e-06, "loss": 0.87267148, "num_input_tokens_seen": 42200760, "step": 1948, "time_per_iteration": 2.6399292945861816 }, { "auxiliary_loss_clip": 0.01051031, "auxiliary_loss_mlp": 0.0075782, "balance_loss_clip": 1.0245688, "balance_loss_mlp": 0.99997467, "epoch": 0.11718021944987224, "flos": 70940991997440.0, "grad_norm": 0.8206821069070506, "language_loss": 0.65139282, "learning_rate": 3.920801283028054e-06, "loss": 0.66948134, "num_input_tokens_seen": 42265745, "step": 1949, "time_per_iteration": 3.3030900955200195 }, { "auxiliary_loss_clip": 0.01159399, "auxiliary_loss_mlp": 0.01061163, "balance_loss_clip": 1.05735683, "balance_loss_mlp": 1.04054022, "epoch": 0.1172403427025402, "flos": 27453456408960.0, "grad_norm": 1.512876015443777, "language_loss": 0.71746683, "learning_rate": 3.920692733745835e-06, "loss": 0.73967248, "num_input_tokens_seen": 42286245, "step": 1950, "time_per_iteration": 2.739341974258423 }, { "auxiliary_loss_clip": 0.01175731, "auxiliary_loss_mlp": 0.01061149, "balance_loss_clip": 1.06152189, "balance_loss_mlp": 1.03907192, "epoch": 0.11730046595520818, "flos": 15668723992320.0, "grad_norm": 2.1258853115079996, "language_loss": 0.76671386, "learning_rate": 3.920584111630755e-06, "loss": 0.78908259, "num_input_tokens_seen": 42302710, "step": 1951, "time_per_iteration": 2.624788999557495 }, { "auxiliary_loss_clip": 0.01129104, "auxiliary_loss_mlp": 0.0106562, "balance_loss_clip": 1.05285251, "balance_loss_mlp": 1.04435349, "epoch": 0.11736058920787615, "flos": 25630164092160.0, "grad_norm": 1.7264952730121887, "language_loss": 0.75964963, "learning_rate": 3.9204754166869325e-06, "loss": 0.7815969, "num_input_tokens_seen": 42324115, "step": 1952, "time_per_iteration": 2.824826955795288 }, { "auxiliary_loss_clip": 0.01123677, "auxiliary_loss_mlp": 0.01065929, "balance_loss_clip": 1.04589534, "balance_loss_mlp": 1.04451907, "epoch": 0.11742071246054411, "flos": 21434289701760.0, "grad_norm": 2.2111022500713453, "language_loss": 0.72316217, "learning_rate": 3.920366648918491e-06, "loss": 0.74505818, "num_input_tokens_seen": 42342505, "step": 1953, "time_per_iteration": 2.7456531524658203 }, { "auxiliary_loss_clip": 0.01149214, "auxiliary_loss_mlp": 0.00781136, "balance_loss_clip": 1.0549686, "balance_loss_mlp": 1.0000577, "epoch": 0.11748083571321208, "flos": 15997845335040.0, "grad_norm": 2.1208802652878522, "language_loss": 0.79780388, "learning_rate": 3.920257808329552e-06, "loss": 0.81710744, "num_input_tokens_seen": 42360525, "step": 1954, "time_per_iteration": 2.653949737548828 }, { "auxiliary_loss_clip": 0.01112399, "auxiliary_loss_mlp": 0.01059787, "balance_loss_clip": 1.04880822, "balance_loss_mlp": 1.03763783, "epoch": 0.11754095896588006, "flos": 16180056051840.0, "grad_norm": 1.9673692595826442, "language_loss": 0.8553021, "learning_rate": 3.920148894924246e-06, "loss": 0.87702394, "num_input_tokens_seen": 42377045, "step": 1955, "time_per_iteration": 2.7987124919891357 }, { "auxiliary_loss_clip": 0.01163172, "auxiliary_loss_mlp": 0.00779783, "balance_loss_clip": 1.05209899, "balance_loss_mlp": 1.00016606, "epoch": 0.11760108221854802, "flos": 13261596013440.0, "grad_norm": 2.12926288831445, "language_loss": 0.78105426, "learning_rate": 3.920039908706701e-06, "loss": 0.80048382, "num_input_tokens_seen": 42393960, "step": 1956, "time_per_iteration": 2.6247944831848145 }, { "auxiliary_loss_clip": 0.01158287, "auxiliary_loss_mlp": 0.01058454, "balance_loss_clip": 1.05559933, "balance_loss_mlp": 1.03601909, "epoch": 0.11766120547121599, "flos": 24498439303680.0, "grad_norm": 2.264983200322237, "language_loss": 0.80487299, "learning_rate": 3.91993084968105e-06, "loss": 0.82704043, "num_input_tokens_seen": 42413160, "step": 1957, "time_per_iteration": 5.862411260604858 }, { "auxiliary_loss_clip": 0.01168294, "auxiliary_loss_mlp": 0.0105259, "balance_loss_clip": 1.05703866, "balance_loss_mlp": 1.0308696, "epoch": 0.11772132872388397, "flos": 17784005967360.0, "grad_norm": 4.8672025609093215, "language_loss": 0.77955222, "learning_rate": 3.919821717851428e-06, "loss": 0.80176103, "num_input_tokens_seen": 42432590, "step": 1958, "time_per_iteration": 4.4218549728393555 }, { "auxiliary_loss_clip": 0.01149976, "auxiliary_loss_mlp": 0.0105003, "balance_loss_clip": 1.05451894, "balance_loss_mlp": 1.02680755, "epoch": 0.11778145197655193, "flos": 13217030213760.0, "grad_norm": 1.7537692363765556, "language_loss": 0.77002251, "learning_rate": 3.919712513221976e-06, "loss": 0.79202259, "num_input_tokens_seen": 42450135, "step": 1959, "time_per_iteration": 2.674323558807373 }, { "auxiliary_loss_clip": 0.01162585, "auxiliary_loss_mlp": 0.01057019, "balance_loss_clip": 1.05857027, "balance_loss_mlp": 1.03484631, "epoch": 0.1178415752292199, "flos": 20230204965120.0, "grad_norm": 2.2026367524708927, "language_loss": 0.70078689, "learning_rate": 3.919603235796832e-06, "loss": 0.722983, "num_input_tokens_seen": 42470050, "step": 1960, "time_per_iteration": 2.7704508304595947 }, { "auxiliary_loss_clip": 0.01161089, "auxiliary_loss_mlp": 0.01055224, "balance_loss_clip": 1.05841374, "balance_loss_mlp": 1.03228831, "epoch": 0.11790169848188788, "flos": 13040134709760.0, "grad_norm": 2.663996374773888, "language_loss": 0.81045067, "learning_rate": 3.9194938855801406e-06, "loss": 0.83261371, "num_input_tokens_seen": 42484335, "step": 1961, "time_per_iteration": 4.67006778717041 }, { "auxiliary_loss_clip": 0.01163817, "auxiliary_loss_mlp": 0.00779643, "balance_loss_clip": 1.05658793, "balance_loss_mlp": 1.00009537, "epoch": 0.11796182173455584, "flos": 22265728790400.0, "grad_norm": 1.71345119244153, "language_loss": 0.92273545, "learning_rate": 3.919384462576049e-06, "loss": 0.94217002, "num_input_tokens_seen": 42502720, "step": 1962, "time_per_iteration": 2.6559524536132812 }, { "auxiliary_loss_clip": 0.01139826, "auxiliary_loss_mlp": 0.01058964, "balance_loss_clip": 1.05222392, "balance_loss_mlp": 1.03704107, "epoch": 0.1180219449872238, "flos": 10635017892480.0, "grad_norm": 2.157203116008796, "language_loss": 0.87635934, "learning_rate": 3.919274966788707e-06, "loss": 0.8983472, "num_input_tokens_seen": 42519460, "step": 1963, "time_per_iteration": 2.710042715072632 }, { "auxiliary_loss_clip": 0.0115823, "auxiliary_loss_mlp": 0.00779391, "balance_loss_clip": 1.05600929, "balance_loss_mlp": 1.00011134, "epoch": 0.11808206823989177, "flos": 20923532259840.0, "grad_norm": 2.8331529324994333, "language_loss": 0.83879703, "learning_rate": 3.919165398222265e-06, "loss": 0.85817325, "num_input_tokens_seen": 42539420, "step": 1964, "time_per_iteration": 2.734941244125366 }, { "auxiliary_loss_clip": 0.01122529, "auxiliary_loss_mlp": 0.01069054, "balance_loss_clip": 1.05171156, "balance_loss_mlp": 1.04628491, "epoch": 0.11814219149255975, "flos": 20777770869120.0, "grad_norm": 3.9132941826799543, "language_loss": 0.8313272, "learning_rate": 3.919055756880879e-06, "loss": 0.85324299, "num_input_tokens_seen": 42558225, "step": 1965, "time_per_iteration": 2.7427306175231934 }, { "auxiliary_loss_clip": 0.01178673, "auxiliary_loss_mlp": 0.01053338, "balance_loss_clip": 1.05815279, "balance_loss_mlp": 1.03163004, "epoch": 0.11820231474522772, "flos": 48759938542080.0, "grad_norm": 1.6720023918141877, "language_loss": 0.74227381, "learning_rate": 3.918946042768707e-06, "loss": 0.76459396, "num_input_tokens_seen": 42580790, "step": 1966, "time_per_iteration": 2.8265397548675537 }, { "auxiliary_loss_clip": 0.01163407, "auxiliary_loss_mlp": 0.0106081, "balance_loss_clip": 1.06309748, "balance_loss_mlp": 1.03836274, "epoch": 0.11826243799789568, "flos": 16690598012160.0, "grad_norm": 2.5628488285375397, "language_loss": 0.73137337, "learning_rate": 3.918836255889908e-06, "loss": 0.7536155, "num_input_tokens_seen": 42597355, "step": 1967, "time_per_iteration": 2.706193685531616 }, { "auxiliary_loss_clip": 0.01167052, "auxiliary_loss_mlp": 0.01053471, "balance_loss_clip": 1.05852592, "balance_loss_mlp": 1.03141701, "epoch": 0.11832256125056366, "flos": 16909868586240.0, "grad_norm": 5.332816815546028, "language_loss": 0.8831054, "learning_rate": 3.9187263962486456e-06, "loss": 0.90531063, "num_input_tokens_seen": 42616060, "step": 1968, "time_per_iteration": 2.6308343410491943 }, { "auxiliary_loss_clip": 0.01168356, "auxiliary_loss_mlp": 0.01051817, "balance_loss_clip": 1.06406927, "balance_loss_mlp": 1.0294776, "epoch": 0.11838268450323162, "flos": 22820405587200.0, "grad_norm": 2.252087054693662, "language_loss": 0.67010254, "learning_rate": 3.918616463849087e-06, "loss": 0.69230425, "num_input_tokens_seen": 42636285, "step": 1969, "time_per_iteration": 2.662480592727661 }, { "auxiliary_loss_clip": 0.01130071, "auxiliary_loss_mlp": 0.0106143, "balance_loss_clip": 1.05177045, "balance_loss_mlp": 1.03774357, "epoch": 0.11844280775589959, "flos": 33545844990720.0, "grad_norm": 2.153814675458072, "language_loss": 0.80455101, "learning_rate": 3.918506458695399e-06, "loss": 0.82646602, "num_input_tokens_seen": 42658320, "step": 1970, "time_per_iteration": 2.798050880432129 }, { "auxiliary_loss_clip": 0.01060284, "auxiliary_loss_mlp": 0.01021383, "balance_loss_clip": 1.02553701, "balance_loss_mlp": 1.01892686, "epoch": 0.11850293100856757, "flos": 66350998604160.0, "grad_norm": 0.8165911228106061, "language_loss": 0.66192186, "learning_rate": 3.918396380791754e-06, "loss": 0.68273854, "num_input_tokens_seen": 42721500, "step": 1971, "time_per_iteration": 3.167018413543701 }, { "auxiliary_loss_clip": 0.01151504, "auxiliary_loss_mlp": 0.0105629, "balance_loss_clip": 1.05294323, "balance_loss_mlp": 1.03422379, "epoch": 0.11856305426123553, "flos": 24681045070080.0, "grad_norm": 2.1839859106137554, "language_loss": 0.79782552, "learning_rate": 3.918286230142327e-06, "loss": 0.81990343, "num_input_tokens_seen": 42739825, "step": 1972, "time_per_iteration": 2.6908793449401855 }, { "auxiliary_loss_clip": 0.01133219, "auxiliary_loss_mlp": 0.00778766, "balance_loss_clip": 1.05341005, "balance_loss_mlp": 1.00005877, "epoch": 0.1186231775139035, "flos": 24280102483200.0, "grad_norm": 2.0473813607633384, "language_loss": 0.72843599, "learning_rate": 3.918176006751292e-06, "loss": 0.74755585, "num_input_tokens_seen": 42758695, "step": 1973, "time_per_iteration": 2.7801859378814697 }, { "auxiliary_loss_clip": 0.01138022, "auxiliary_loss_mlp": 0.01049764, "balance_loss_clip": 1.05580497, "balance_loss_mlp": 1.02707887, "epoch": 0.11868330076657148, "flos": 21757413473280.0, "grad_norm": 1.6449677647733996, "language_loss": 0.72019619, "learning_rate": 3.918065710622832e-06, "loss": 0.74207413, "num_input_tokens_seen": 42778510, "step": 1974, "time_per_iteration": 2.7337663173675537 }, { "auxiliary_loss_clip": 0.01129602, "auxiliary_loss_mlp": 0.01043161, "balance_loss_clip": 1.05265522, "balance_loss_mlp": 1.02086854, "epoch": 0.11874342401923944, "flos": 17193274894080.0, "grad_norm": 2.017372400194955, "language_loss": 0.77409399, "learning_rate": 3.917955341761128e-06, "loss": 0.79582161, "num_input_tokens_seen": 42793995, "step": 1975, "time_per_iteration": 2.669546604156494 }, { "auxiliary_loss_clip": 0.01131477, "auxiliary_loss_mlp": 0.01059968, "balance_loss_clip": 1.05880177, "balance_loss_mlp": 1.03908277, "epoch": 0.11880354727190741, "flos": 15229572312960.0, "grad_norm": 2.3842578575289, "language_loss": 0.75110453, "learning_rate": 3.917844900170364e-06, "loss": 0.77301902, "num_input_tokens_seen": 42809000, "step": 1976, "time_per_iteration": 2.8439090251922607 }, { "auxiliary_loss_clip": 0.0116819, "auxiliary_loss_mlp": 0.01049523, "balance_loss_clip": 1.05999744, "balance_loss_mlp": 1.02835166, "epoch": 0.11886367052457537, "flos": 27309706179840.0, "grad_norm": 1.8674311015318124, "language_loss": 0.74877423, "learning_rate": 3.91773438585473e-06, "loss": 0.77095133, "num_input_tokens_seen": 42831585, "step": 1977, "time_per_iteration": 2.6747169494628906 }, { "auxiliary_loss_clip": 0.01182095, "auxiliary_loss_mlp": 0.01059621, "balance_loss_clip": 1.05954552, "balance_loss_mlp": 1.03805614, "epoch": 0.11892379377724335, "flos": 21798280172160.0, "grad_norm": 2.1793079873879604, "language_loss": 0.74207634, "learning_rate": 3.9176237988184165e-06, "loss": 0.76449353, "num_input_tokens_seen": 42848420, "step": 1978, "time_per_iteration": 2.631664514541626 }, { "auxiliary_loss_clip": 0.01142323, "auxiliary_loss_mlp": 0.01050585, "balance_loss_clip": 1.06037045, "balance_loss_mlp": 1.0289247, "epoch": 0.11898391702991132, "flos": 13991013498240.0, "grad_norm": 1.7170872786869797, "language_loss": 0.73256385, "learning_rate": 3.917513139065616e-06, "loss": 0.754493, "num_input_tokens_seen": 42866645, "step": 1979, "time_per_iteration": 2.7442541122436523 }, { "auxiliary_loss_clip": 0.01137516, "auxiliary_loss_mlp": 0.01051378, "balance_loss_clip": 1.0566175, "balance_loss_mlp": 1.02968168, "epoch": 0.11904404028257928, "flos": 32234567091840.0, "grad_norm": 1.876224505386343, "language_loss": 0.98293436, "learning_rate": 3.917402406600525e-06, "loss": 1.00482333, "num_input_tokens_seen": 42888515, "step": 1980, "time_per_iteration": 2.787667989730835 }, { "auxiliary_loss_clip": 0.01153629, "auxiliary_loss_mlp": 0.01053612, "balance_loss_clip": 1.05595791, "balance_loss_mlp": 1.03077161, "epoch": 0.11910416353524726, "flos": 23586272398080.0, "grad_norm": 1.7507584506289393, "language_loss": 0.86265099, "learning_rate": 3.917291601427342e-06, "loss": 0.88472342, "num_input_tokens_seen": 42909035, "step": 1981, "time_per_iteration": 2.6680359840393066 }, { "auxiliary_loss_clip": 0.01158736, "auxiliary_loss_mlp": 0.01064978, "balance_loss_clip": 1.06144083, "balance_loss_mlp": 1.04214907, "epoch": 0.11916428678791523, "flos": 25333038789120.0, "grad_norm": 1.8908045276276995, "language_loss": 0.85375237, "learning_rate": 3.91718072355027e-06, "loss": 0.87598956, "num_input_tokens_seen": 42927555, "step": 1982, "time_per_iteration": 2.732797861099243 }, { "auxiliary_loss_clip": 0.01146432, "auxiliary_loss_mlp": 0.01050259, "balance_loss_clip": 1.05539966, "balance_loss_mlp": 1.02843213, "epoch": 0.11922441004058319, "flos": 19788431592960.0, "grad_norm": 2.3856086229742877, "language_loss": 0.85202634, "learning_rate": 3.917069772973513e-06, "loss": 0.87399322, "num_input_tokens_seen": 42945300, "step": 1983, "time_per_iteration": 2.6839804649353027 }, { "auxiliary_loss_clip": 0.01126589, "auxiliary_loss_mlp": 0.01056051, "balance_loss_clip": 1.05602145, "balance_loss_mlp": 1.03399742, "epoch": 0.11928453329325117, "flos": 21536347219200.0, "grad_norm": 3.6641824085676022, "language_loss": 0.7693429, "learning_rate": 3.916958749701277e-06, "loss": 0.79116929, "num_input_tokens_seen": 42961295, "step": 1984, "time_per_iteration": 2.7008767127990723 }, { "auxiliary_loss_clip": 0.01161623, "auxiliary_loss_mlp": 0.01055251, "balance_loss_clip": 1.05752373, "balance_loss_mlp": 1.0334003, "epoch": 0.11934465654591914, "flos": 20815010294400.0, "grad_norm": 1.917528093726237, "language_loss": 0.83058321, "learning_rate": 3.9168476537377745e-06, "loss": 0.85275191, "num_input_tokens_seen": 42980330, "step": 1985, "time_per_iteration": 2.6692728996276855 }, { "auxiliary_loss_clip": 0.01151831, "auxiliary_loss_mlp": 0.01050086, "balance_loss_clip": 1.0541923, "balance_loss_mlp": 1.02835393, "epoch": 0.1194047797985871, "flos": 19060486565760.0, "grad_norm": 1.8732848573733223, "language_loss": 0.74398553, "learning_rate": 3.916736485087216e-06, "loss": 0.76600474, "num_input_tokens_seen": 42996125, "step": 1986, "time_per_iteration": 2.722013473510742 }, { "auxiliary_loss_clip": 0.01146125, "auxiliary_loss_mlp": 0.01059008, "balance_loss_clip": 1.05472732, "balance_loss_mlp": 1.03791952, "epoch": 0.11946490305125507, "flos": 27190805184000.0, "grad_norm": 2.4724436343771083, "language_loss": 0.72123617, "learning_rate": 3.916625243753819e-06, "loss": 0.74328756, "num_input_tokens_seen": 43014180, "step": 1987, "time_per_iteration": 2.814481258392334 }, { "auxiliary_loss_clip": 0.01156854, "auxiliary_loss_mlp": 0.01054644, "balance_loss_clip": 1.05747938, "balance_loss_mlp": 1.03138638, "epoch": 0.11952502630392305, "flos": 21140791672320.0, "grad_norm": 1.9246234449532542, "language_loss": 0.72007513, "learning_rate": 3.916513929741799e-06, "loss": 0.74219012, "num_input_tokens_seen": 43032120, "step": 1988, "time_per_iteration": 2.7242019176483154 }, { "auxiliary_loss_clip": 0.0116348, "auxiliary_loss_mlp": 0.01062102, "balance_loss_clip": 1.05559146, "balance_loss_mlp": 1.03913057, "epoch": 0.11958514955659101, "flos": 22124241118080.0, "grad_norm": 1.7561483239324645, "language_loss": 0.81144297, "learning_rate": 3.91640254305538e-06, "loss": 0.83369875, "num_input_tokens_seen": 43052215, "step": 1989, "time_per_iteration": 2.6259546279907227 }, { "auxiliary_loss_clip": 0.01135956, "auxiliary_loss_mlp": 0.01057689, "balance_loss_clip": 1.05254042, "balance_loss_mlp": 1.03325129, "epoch": 0.11964527280925898, "flos": 17421452040960.0, "grad_norm": 2.5516320258539795, "language_loss": 0.75881672, "learning_rate": 3.916291083698784e-06, "loss": 0.7807532, "num_input_tokens_seen": 43069720, "step": 1990, "time_per_iteration": 2.6779251098632812 }, { "auxiliary_loss_clip": 0.0105322, "auxiliary_loss_mlp": 0.01019112, "balance_loss_clip": 1.02816892, "balance_loss_mlp": 1.01647794, "epoch": 0.11970539606192696, "flos": 70679741402880.0, "grad_norm": 0.8628582727639288, "language_loss": 0.55184531, "learning_rate": 3.916179551676238e-06, "loss": 0.57256866, "num_input_tokens_seen": 43123130, "step": 1991, "time_per_iteration": 3.3713693618774414 }, { "auxiliary_loss_clip": 0.01136423, "auxiliary_loss_mlp": 0.01053959, "balance_loss_clip": 1.05748868, "balance_loss_mlp": 1.03326464, "epoch": 0.11976551931459492, "flos": 21215019127680.0, "grad_norm": 2.286300891386994, "language_loss": 0.78371406, "learning_rate": 3.916067946991971e-06, "loss": 0.80561793, "num_input_tokens_seen": 43140015, "step": 1992, "time_per_iteration": 2.6797914505004883 }, { "auxiliary_loss_clip": 0.0117949, "auxiliary_loss_mlp": 0.01056635, "balance_loss_clip": 1.05811, "balance_loss_mlp": 1.03453374, "epoch": 0.11982564256726289, "flos": 25989306226560.0, "grad_norm": 1.8481811043026504, "language_loss": 0.78911144, "learning_rate": 3.915956269650216e-06, "loss": 0.81147265, "num_input_tokens_seen": 43160105, "step": 1993, "time_per_iteration": 2.691301107406616 }, { "auxiliary_loss_clip": 0.01126423, "auxiliary_loss_mlp": 0.0106217, "balance_loss_clip": 1.05012226, "balance_loss_mlp": 1.04081941, "epoch": 0.11988576581993086, "flos": 21650866755840.0, "grad_norm": 1.644866568705103, "language_loss": 0.82088816, "learning_rate": 3.915844519655208e-06, "loss": 0.84277415, "num_input_tokens_seen": 43179835, "step": 1994, "time_per_iteration": 2.772905111312866 }, { "auxiliary_loss_clip": 0.0115068, "auxiliary_loss_mlp": 0.01063961, "balance_loss_clip": 1.05523098, "balance_loss_mlp": 1.0433259, "epoch": 0.11994588907259883, "flos": 17857407409920.0, "grad_norm": 2.0065598513575247, "language_loss": 0.88392794, "learning_rate": 3.915732697011183e-06, "loss": 0.9060744, "num_input_tokens_seen": 43197210, "step": 1995, "time_per_iteration": 4.206532716751099 }, { "auxiliary_loss_clip": 0.01153482, "auxiliary_loss_mlp": 0.01066415, "balance_loss_clip": 1.06005812, "balance_loss_mlp": 1.0441823, "epoch": 0.1200060123252668, "flos": 24462744163200.0, "grad_norm": 1.8775058007239456, "language_loss": 0.73949909, "learning_rate": 3.9156208017223825e-06, "loss": 0.76169801, "num_input_tokens_seen": 43215050, "step": 1996, "time_per_iteration": 2.7263944149017334 }, { "auxiliary_loss_clip": 0.01141484, "auxiliary_loss_mlp": 0.01060112, "balance_loss_clip": 1.05754757, "balance_loss_mlp": 1.03808212, "epoch": 0.12006613557793476, "flos": 18732191235840.0, "grad_norm": 1.976051865072764, "language_loss": 0.88125587, "learning_rate": 3.915508833793048e-06, "loss": 0.90327179, "num_input_tokens_seen": 43233900, "step": 1997, "time_per_iteration": 4.29426383972168 }, { "auxiliary_loss_clip": 0.01165634, "auxiliary_loss_mlp": 0.00779568, "balance_loss_clip": 1.05701697, "balance_loss_mlp": 1.00001049, "epoch": 0.12012625883060274, "flos": 22267739952000.0, "grad_norm": 2.1091392562336018, "language_loss": 0.79031086, "learning_rate": 3.915396793227428e-06, "loss": 0.80976284, "num_input_tokens_seen": 43252105, "step": 1998, "time_per_iteration": 4.330955266952515 }, { "auxiliary_loss_clip": 0.0116661, "auxiliary_loss_mlp": 0.00779642, "balance_loss_clip": 1.0576719, "balance_loss_mlp": 1.00002396, "epoch": 0.1201863820832707, "flos": 21758885930880.0, "grad_norm": 1.799585336659533, "language_loss": 0.73583078, "learning_rate": 3.915284680029769e-06, "loss": 0.75529337, "num_input_tokens_seen": 43270315, "step": 1999, "time_per_iteration": 2.754770040512085 }, { "auxiliary_loss_clip": 0.01178966, "auxiliary_loss_mlp": 0.01073097, "balance_loss_clip": 1.0602119, "balance_loss_mlp": 1.05115068, "epoch": 0.12024650533593867, "flos": 21907987286400.0, "grad_norm": 2.916355473014409, "language_loss": 0.74854898, "learning_rate": 3.915172494204323e-06, "loss": 0.77106953, "num_input_tokens_seen": 43289935, "step": 2000, "time_per_iteration": 4.3900322914123535 }, { "auxiliary_loss_clip": 0.01149374, "auxiliary_loss_mlp": 0.01069735, "balance_loss_clip": 1.05375695, "balance_loss_mlp": 1.04763341, "epoch": 0.12030662858860665, "flos": 21689219502720.0, "grad_norm": 1.5203973891597686, "language_loss": 0.8496564, "learning_rate": 3.915060235755344e-06, "loss": 0.87184751, "num_input_tokens_seen": 43309325, "step": 2001, "time_per_iteration": 2.6912643909454346 }, { "auxiliary_loss_clip": 0.01154057, "auxiliary_loss_mlp": 0.01063637, "balance_loss_clip": 1.05600786, "balance_loss_mlp": 1.04265642, "epoch": 0.12036675184127461, "flos": 12933228856320.0, "grad_norm": 2.932264271186656, "language_loss": 0.74711967, "learning_rate": 3.91494790468709e-06, "loss": 0.76929653, "num_input_tokens_seen": 43327010, "step": 2002, "time_per_iteration": 2.6991024017333984 }, { "auxiliary_loss_clip": 0.01129169, "auxiliary_loss_mlp": 0.01066705, "balance_loss_clip": 1.05340302, "balance_loss_mlp": 1.0429939, "epoch": 0.12042687509394258, "flos": 20851028657280.0, "grad_norm": 2.117271428042382, "language_loss": 0.78029454, "learning_rate": 3.9148355010038185e-06, "loss": 0.80225325, "num_input_tokens_seen": 43345650, "step": 2003, "time_per_iteration": 2.731381416320801 }, { "auxiliary_loss_clip": 0.01163252, "auxiliary_loss_mlp": 0.01062886, "balance_loss_clip": 1.05728662, "balance_loss_mlp": 1.04073668, "epoch": 0.12048699834661056, "flos": 23878513451520.0, "grad_norm": 1.585850552088038, "language_loss": 0.72205627, "learning_rate": 3.914723024709793e-06, "loss": 0.74431765, "num_input_tokens_seen": 43365555, "step": 2004, "time_per_iteration": 2.725092649459839 }, { "auxiliary_loss_clip": 0.01160616, "auxiliary_loss_mlp": 0.01069457, "balance_loss_clip": 1.05870187, "balance_loss_mlp": 1.04645014, "epoch": 0.12054712159927852, "flos": 19756363726080.0, "grad_norm": 1.9357732467170252, "language_loss": 0.78415942, "learning_rate": 3.914610475809279e-06, "loss": 0.8064602, "num_input_tokens_seen": 43384990, "step": 2005, "time_per_iteration": 2.7232437133789062 }, { "auxiliary_loss_clip": 0.01073016, "auxiliary_loss_mlp": 0.00758901, "balance_loss_clip": 1.02995479, "balance_loss_mlp": 1.00011683, "epoch": 0.12060724485194649, "flos": 51672763123200.0, "grad_norm": 0.9264315537536937, "language_loss": 0.58087146, "learning_rate": 3.914497854306543e-06, "loss": 0.59919059, "num_input_tokens_seen": 43436335, "step": 2006, "time_per_iteration": 2.9570157527923584 }, { "auxiliary_loss_clip": 0.01155081, "auxiliary_loss_mlp": 0.01053472, "balance_loss_clip": 1.05803597, "balance_loss_mlp": 1.03299201, "epoch": 0.12066736810461445, "flos": 18990425088000.0, "grad_norm": 1.6109316320484448, "language_loss": 0.76524282, "learning_rate": 3.9143851602058575e-06, "loss": 0.78732836, "num_input_tokens_seen": 43456495, "step": 2007, "time_per_iteration": 2.763380289077759 }, { "auxiliary_loss_clip": 0.01147254, "auxiliary_loss_mlp": 0.01064209, "balance_loss_clip": 1.05931091, "balance_loss_mlp": 1.04177368, "epoch": 0.12072749135728243, "flos": 16471973882880.0, "grad_norm": 2.449779851562752, "language_loss": 0.83023942, "learning_rate": 3.914272393511494e-06, "loss": 0.85235405, "num_input_tokens_seen": 43473085, "step": 2008, "time_per_iteration": 2.7693119049072266 }, { "auxiliary_loss_clip": 0.01176157, "auxiliary_loss_mlp": 0.01052894, "balance_loss_clip": 1.0584172, "balance_loss_mlp": 1.03135288, "epoch": 0.1207876146099504, "flos": 18077108947200.0, "grad_norm": 2.203355340521787, "language_loss": 0.83835697, "learning_rate": 3.91415955422773e-06, "loss": 0.86064744, "num_input_tokens_seen": 43491135, "step": 2009, "time_per_iteration": 2.640944242477417 }, { "auxiliary_loss_clip": 0.01180076, "auxiliary_loss_mlp": 0.01053549, "balance_loss_clip": 1.06196725, "balance_loss_mlp": 1.02994514, "epoch": 0.12084773786261836, "flos": 21871573873920.0, "grad_norm": 1.6799099601218046, "language_loss": 0.83870012, "learning_rate": 3.914046642358844e-06, "loss": 0.8610363, "num_input_tokens_seen": 43510440, "step": 2010, "time_per_iteration": 2.716127634048462 }, { "auxiliary_loss_clip": 0.01145261, "auxiliary_loss_mlp": 0.00780804, "balance_loss_clip": 1.05555713, "balance_loss_mlp": 1.0000627, "epoch": 0.12090786111528634, "flos": 18333044328960.0, "grad_norm": 1.8933604390076018, "language_loss": 0.84194541, "learning_rate": 3.9139336579091174e-06, "loss": 0.86120605, "num_input_tokens_seen": 43530145, "step": 2011, "time_per_iteration": 2.73793625831604 }, { "auxiliary_loss_clip": 0.01148418, "auxiliary_loss_mlp": 0.01060974, "balance_loss_clip": 1.05480969, "balance_loss_mlp": 1.03905129, "epoch": 0.1209679843679543, "flos": 21105850717440.0, "grad_norm": 2.0524904800028154, "language_loss": 0.96236968, "learning_rate": 3.913820600882834e-06, "loss": 0.98446357, "num_input_tokens_seen": 43549315, "step": 2012, "time_per_iteration": 2.7269980907440186 }, { "auxiliary_loss_clip": 0.01146369, "auxiliary_loss_mlp": 0.01051396, "balance_loss_clip": 1.05808425, "balance_loss_mlp": 1.0289607, "epoch": 0.12102810762062227, "flos": 29241053585280.0, "grad_norm": 1.853151366811655, "language_loss": 0.80903435, "learning_rate": 3.913707471284283e-06, "loss": 0.83101201, "num_input_tokens_seen": 43569240, "step": 2013, "time_per_iteration": 2.740489959716797 }, { "auxiliary_loss_clip": 0.01124703, "auxiliary_loss_mlp": 0.0105341, "balance_loss_clip": 1.05300117, "balance_loss_mlp": 1.02962804, "epoch": 0.12108823087329025, "flos": 17930701111680.0, "grad_norm": 5.099975898232357, "language_loss": 0.77255923, "learning_rate": 3.9135942691177515e-06, "loss": 0.79434031, "num_input_tokens_seen": 43587710, "step": 2014, "time_per_iteration": 2.7361485958099365 }, { "auxiliary_loss_clip": 0.0116607, "auxiliary_loss_mlp": 0.01051056, "balance_loss_clip": 1.05832791, "balance_loss_mlp": 1.02791715, "epoch": 0.12114835412595822, "flos": 22091850028800.0, "grad_norm": 5.8570343294144465, "language_loss": 0.87169874, "learning_rate": 3.913480994387535e-06, "loss": 0.89387, "num_input_tokens_seen": 43606000, "step": 2015, "time_per_iteration": 2.6881515979766846 }, { "auxiliary_loss_clip": 0.01170382, "auxiliary_loss_mlp": 0.01051162, "balance_loss_clip": 1.05500197, "balance_loss_mlp": 1.0289886, "epoch": 0.12120847737862618, "flos": 20412343854720.0, "grad_norm": 2.087765239068409, "language_loss": 0.69146478, "learning_rate": 3.913367647097926e-06, "loss": 0.71368027, "num_input_tokens_seen": 43624815, "step": 2016, "time_per_iteration": 2.7096211910247803 }, { "auxiliary_loss_clip": 0.01152563, "auxiliary_loss_mlp": 0.0104714, "balance_loss_clip": 1.05737591, "balance_loss_mlp": 1.02390599, "epoch": 0.12126860063129415, "flos": 22309037614080.0, "grad_norm": 2.8043603396252865, "language_loss": 0.79858959, "learning_rate": 3.913254227253225e-06, "loss": 0.82058656, "num_input_tokens_seen": 43643960, "step": 2017, "time_per_iteration": 2.7042336463928223 }, { "auxiliary_loss_clip": 0.01156022, "auxiliary_loss_mlp": 0.0105052, "balance_loss_clip": 1.05479789, "balance_loss_mlp": 1.02740538, "epoch": 0.12132872388396213, "flos": 13699275235200.0, "grad_norm": 2.8700241463026654, "language_loss": 0.68828821, "learning_rate": 3.913140734857731e-06, "loss": 0.71035373, "num_input_tokens_seen": 43662650, "step": 2018, "time_per_iteration": 2.7015058994293213 }, { "auxiliary_loss_clip": 0.01136376, "auxiliary_loss_mlp": 0.01050749, "balance_loss_clip": 1.05524123, "balance_loss_mlp": 1.02873111, "epoch": 0.12138884713663009, "flos": 26466954307200.0, "grad_norm": 1.6132330771570709, "language_loss": 0.72476816, "learning_rate": 3.91302716991575e-06, "loss": 0.74663943, "num_input_tokens_seen": 43684205, "step": 2019, "time_per_iteration": 2.8956947326660156 }, { "auxiliary_loss_clip": 0.01107167, "auxiliary_loss_mlp": 0.01057916, "balance_loss_clip": 1.05286384, "balance_loss_mlp": 1.03482556, "epoch": 0.12144897038929806, "flos": 26141603892480.0, "grad_norm": 1.853626515444831, "language_loss": 0.92125106, "learning_rate": 3.912913532431586e-06, "loss": 0.94290185, "num_input_tokens_seen": 43706320, "step": 2020, "time_per_iteration": 2.9980764389038086 }, { "auxiliary_loss_clip": 0.0114145, "auxiliary_loss_mlp": 0.01055455, "balance_loss_clip": 1.05289125, "balance_loss_mlp": 1.03360391, "epoch": 0.12150909364196603, "flos": 24717530309760.0, "grad_norm": 1.9227427415613194, "language_loss": 0.7772885, "learning_rate": 3.912799822409549e-06, "loss": 0.79925752, "num_input_tokens_seen": 43724805, "step": 2021, "time_per_iteration": 3.01798939704895 }, { "auxiliary_loss_clip": 0.0117749, "auxiliary_loss_mlp": 0.01049007, "balance_loss_clip": 1.0610733, "balance_loss_mlp": 1.0277164, "epoch": 0.121569216894634, "flos": 25186990089600.0, "grad_norm": 2.054228820960504, "language_loss": 0.80712306, "learning_rate": 3.912686039853952e-06, "loss": 0.82938808, "num_input_tokens_seen": 43742320, "step": 2022, "time_per_iteration": 2.684309244155884 }, { "auxiliary_loss_clip": 0.01144749, "auxiliary_loss_mlp": 0.0106163, "balance_loss_clip": 1.055619, "balance_loss_mlp": 1.03697765, "epoch": 0.12162934014730196, "flos": 13444094039040.0, "grad_norm": 1.734031517866852, "language_loss": 0.84842217, "learning_rate": 3.912572184769108e-06, "loss": 0.87048596, "num_input_tokens_seen": 43760665, "step": 2023, "time_per_iteration": 2.6886441707611084 }, { "auxiliary_loss_clip": 0.01139348, "auxiliary_loss_mlp": 0.01053043, "balance_loss_clip": 1.05162323, "balance_loss_mlp": 1.03081048, "epoch": 0.12168946339996994, "flos": 16946138344320.0, "grad_norm": 2.3397199529221546, "language_loss": 0.85514021, "learning_rate": 3.912458257159335e-06, "loss": 0.87706411, "num_input_tokens_seen": 43779020, "step": 2024, "time_per_iteration": 2.8043718338012695 }, { "auxiliary_loss_clip": 0.01169767, "auxiliary_loss_mlp": 0.01055534, "balance_loss_clip": 1.05277538, "balance_loss_mlp": 1.03389716, "epoch": 0.12174958665263791, "flos": 29821585196160.0, "grad_norm": 1.8432491304976684, "language_loss": 0.72088945, "learning_rate": 3.912344257028954e-06, "loss": 0.74314243, "num_input_tokens_seen": 43798850, "step": 2025, "time_per_iteration": 2.704876184463501 }, { "auxiliary_loss_clip": 0.01148564, "auxiliary_loss_mlp": 0.01047618, "balance_loss_clip": 1.05486572, "balance_loss_mlp": 1.02555275, "epoch": 0.12180970990530587, "flos": 24641902224000.0, "grad_norm": 1.4969552271445652, "language_loss": 0.76075011, "learning_rate": 3.912230184382286e-06, "loss": 0.78271192, "num_input_tokens_seen": 43820130, "step": 2026, "time_per_iteration": 2.6957921981811523 }, { "auxiliary_loss_clip": 0.01147374, "auxiliary_loss_mlp": 0.01046261, "balance_loss_clip": 1.05086374, "balance_loss_mlp": 1.02474427, "epoch": 0.12186983315797385, "flos": 20521691832960.0, "grad_norm": 2.2064263994277478, "language_loss": 0.88769746, "learning_rate": 3.912116039223659e-06, "loss": 0.90963376, "num_input_tokens_seen": 43838485, "step": 2027, "time_per_iteration": 2.6847639083862305 }, { "auxiliary_loss_clip": 0.01143778, "auxiliary_loss_mlp": 0.01056715, "balance_loss_clip": 1.05258501, "balance_loss_mlp": 1.03667617, "epoch": 0.12192995641064182, "flos": 27818344719360.0, "grad_norm": 1.5725885574076592, "language_loss": 0.75544459, "learning_rate": 3.912001821557399e-06, "loss": 0.77744961, "num_input_tokens_seen": 43859080, "step": 2028, "time_per_iteration": 2.7706027030944824 }, { "auxiliary_loss_clip": 0.01123185, "auxiliary_loss_mlp": 0.01057136, "balance_loss_clip": 1.0518471, "balance_loss_mlp": 1.03554714, "epoch": 0.12199007966330978, "flos": 22017119783040.0, "grad_norm": 2.0550419223931193, "language_loss": 0.76802504, "learning_rate": 3.911887531387839e-06, "loss": 0.78982824, "num_input_tokens_seen": 43879030, "step": 2029, "time_per_iteration": 2.732637405395508 }, { "auxiliary_loss_clip": 0.01156591, "auxiliary_loss_mlp": 0.01052355, "balance_loss_clip": 1.05253625, "balance_loss_mlp": 1.03107572, "epoch": 0.12205020291597775, "flos": 23295216493440.0, "grad_norm": 1.707195979328818, "language_loss": 0.79164296, "learning_rate": 3.911773168719313e-06, "loss": 0.81373239, "num_input_tokens_seen": 43898505, "step": 2030, "time_per_iteration": 2.7254061698913574 }, { "auxiliary_loss_clip": 0.0116997, "auxiliary_loss_mlp": 0.01051357, "balance_loss_clip": 1.05618095, "balance_loss_mlp": 1.02930319, "epoch": 0.12211032616864573, "flos": 26031609469440.0, "grad_norm": 3.038077546298312, "language_loss": 0.74411637, "learning_rate": 3.911658733556155e-06, "loss": 0.76632965, "num_input_tokens_seen": 43917945, "step": 2031, "time_per_iteration": 2.6711080074310303 }, { "auxiliary_loss_clip": 0.01174332, "auxiliary_loss_mlp": 0.01045812, "balance_loss_clip": 1.05888343, "balance_loss_mlp": 1.02545118, "epoch": 0.12217044942131369, "flos": 20410943224320.0, "grad_norm": 1.7636188348969384, "language_loss": 0.75230348, "learning_rate": 3.911544225902707e-06, "loss": 0.7745049, "num_input_tokens_seen": 43937385, "step": 2032, "time_per_iteration": 2.7134530544281006 }, { "auxiliary_loss_clip": 0.01152363, "auxiliary_loss_mlp": 0.01045735, "balance_loss_clip": 1.05129802, "balance_loss_mlp": 1.02538586, "epoch": 0.12223057267398166, "flos": 22857142222080.0, "grad_norm": 1.5809359138264147, "language_loss": 0.89502287, "learning_rate": 3.911429645763311e-06, "loss": 0.91700387, "num_input_tokens_seen": 43958130, "step": 2033, "time_per_iteration": 2.7105965614318848 }, { "auxiliary_loss_clip": 0.01155694, "auxiliary_loss_mlp": 0.01051169, "balance_loss_clip": 1.05740523, "balance_loss_mlp": 1.03005767, "epoch": 0.12229069592664964, "flos": 20047563285120.0, "grad_norm": 1.9580868921695649, "language_loss": 0.65195286, "learning_rate": 3.911314993142311e-06, "loss": 0.67402148, "num_input_tokens_seen": 43976800, "step": 2034, "time_per_iteration": 4.222668886184692 }, { "auxiliary_loss_clip": 0.01152239, "auxiliary_loss_mlp": 0.01055659, "balance_loss_clip": 1.05550218, "balance_loss_mlp": 1.0327704, "epoch": 0.1223508191793176, "flos": 22274240313600.0, "grad_norm": 1.6376942269871653, "language_loss": 0.76459455, "learning_rate": 3.911200268044055e-06, "loss": 0.78667355, "num_input_tokens_seen": 43996620, "step": 2035, "time_per_iteration": 2.7306556701660156 }, { "auxiliary_loss_clip": 0.01176703, "auxiliary_loss_mlp": 0.01050008, "balance_loss_clip": 1.0577215, "balance_loss_mlp": 1.02798975, "epoch": 0.12241094243198557, "flos": 21285978445440.0, "grad_norm": 1.8460180606974623, "language_loss": 0.71294892, "learning_rate": 3.911085470472892e-06, "loss": 0.73521602, "num_input_tokens_seen": 44016175, "step": 2036, "time_per_iteration": 2.7327258586883545 }, { "auxiliary_loss_clip": 0.01144473, "auxiliary_loss_mlp": 0.01058389, "balance_loss_clip": 1.05778408, "balance_loss_mlp": 1.03623962, "epoch": 0.12247106568465355, "flos": 17382381022080.0, "grad_norm": 1.5772021569883852, "language_loss": 0.83130831, "learning_rate": 3.910970600433178e-06, "loss": 0.85333693, "num_input_tokens_seen": 44035060, "step": 2037, "time_per_iteration": 4.248440742492676 }, { "auxiliary_loss_clip": 0.01153641, "auxiliary_loss_mlp": 0.01060257, "balance_loss_clip": 1.0556947, "balance_loss_mlp": 1.0366174, "epoch": 0.12253118893732151, "flos": 27045438842880.0, "grad_norm": 2.676780030246967, "language_loss": 0.79765236, "learning_rate": 3.910855657929267e-06, "loss": 0.81979132, "num_input_tokens_seen": 44053330, "step": 2038, "time_per_iteration": 2.7321341037750244 }, { "auxiliary_loss_clip": 0.010642, "auxiliary_loss_mlp": 0.00759248, "balance_loss_clip": 1.02961969, "balance_loss_mlp": 1.00006962, "epoch": 0.12259131218998948, "flos": 53861518368000.0, "grad_norm": 0.8248048644272604, "language_loss": 0.58659601, "learning_rate": 3.910740642965518e-06, "loss": 0.6048305, "num_input_tokens_seen": 44107575, "step": 2039, "time_per_iteration": 4.739040851593018 }, { "auxiliary_loss_clip": 0.01128, "auxiliary_loss_mlp": 0.01064411, "balance_loss_clip": 1.05292714, "balance_loss_mlp": 1.03912663, "epoch": 0.12265143544265744, "flos": 17891917401600.0, "grad_norm": 2.1548467753138136, "language_loss": 0.80099291, "learning_rate": 3.910625555546292e-06, "loss": 0.82291704, "num_input_tokens_seen": 44126075, "step": 2040, "time_per_iteration": 2.723247766494751 }, { "auxiliary_loss_clip": 0.01149343, "auxiliary_loss_mlp": 0.01058534, "balance_loss_clip": 1.05517352, "balance_loss_mlp": 1.03673029, "epoch": 0.12271155869532542, "flos": 21799932197760.0, "grad_norm": 1.8247690225218605, "language_loss": 0.82841176, "learning_rate": 3.910510395675953e-06, "loss": 0.85049051, "num_input_tokens_seen": 44145605, "step": 2041, "time_per_iteration": 2.699110984802246 }, { "auxiliary_loss_clip": 0.01136001, "auxiliary_loss_mlp": 0.01053451, "balance_loss_clip": 1.05120957, "balance_loss_mlp": 1.03061032, "epoch": 0.12277168194799339, "flos": 19828759587840.0, "grad_norm": 1.9386136063873771, "language_loss": 0.67272276, "learning_rate": 3.9103951633588694e-06, "loss": 0.69461727, "num_input_tokens_seen": 44164770, "step": 2042, "time_per_iteration": 2.7042133808135986 }, { "auxiliary_loss_clip": 0.01133115, "auxiliary_loss_mlp": 0.01056941, "balance_loss_clip": 1.05079007, "balance_loss_mlp": 1.03517294, "epoch": 0.12283180520066135, "flos": 23221024951680.0, "grad_norm": 1.912164915278887, "language_loss": 0.81765604, "learning_rate": 3.910279858599409e-06, "loss": 0.83955657, "num_input_tokens_seen": 44184025, "step": 2043, "time_per_iteration": 2.6942050457000732 }, { "auxiliary_loss_clip": 0.01146416, "auxiliary_loss_mlp": 0.01052365, "balance_loss_clip": 1.05161905, "balance_loss_mlp": 1.03040695, "epoch": 0.12289192845332933, "flos": 18588476920320.0, "grad_norm": 1.7894844734354058, "language_loss": 0.80192459, "learning_rate": 3.910164481401946e-06, "loss": 0.82391244, "num_input_tokens_seen": 44202950, "step": 2044, "time_per_iteration": 2.6227192878723145 }, { "auxiliary_loss_clip": 0.01116285, "auxiliary_loss_mlp": 0.01052013, "balance_loss_clip": 1.05284619, "balance_loss_mlp": 1.03055525, "epoch": 0.1229520517059973, "flos": 25769532862080.0, "grad_norm": 1.7152742607840916, "language_loss": 0.7794897, "learning_rate": 3.910049031770853e-06, "loss": 0.80117267, "num_input_tokens_seen": 44221115, "step": 2045, "time_per_iteration": 2.769017219543457 }, { "auxiliary_loss_clip": 0.01163545, "auxiliary_loss_mlp": 0.01060468, "balance_loss_clip": 1.05796146, "balance_loss_mlp": 1.03827095, "epoch": 0.12301217495866526, "flos": 20887154760960.0, "grad_norm": 1.852572781372854, "language_loss": 0.67284262, "learning_rate": 3.90993350971051e-06, "loss": 0.69508278, "num_input_tokens_seen": 44240575, "step": 2046, "time_per_iteration": 2.6377944946289062 }, { "auxiliary_loss_clip": 0.01173803, "auxiliary_loss_mlp": 0.01053755, "balance_loss_clip": 1.06010675, "balance_loss_mlp": 1.03202295, "epoch": 0.12307229821133324, "flos": 22378811783040.0, "grad_norm": 4.982373490718116, "language_loss": 0.72730684, "learning_rate": 3.909817915225297e-06, "loss": 0.74958241, "num_input_tokens_seen": 44257145, "step": 2047, "time_per_iteration": 2.5791239738464355 }, { "auxiliary_loss_clip": 0.01155159, "auxiliary_loss_mlp": 0.01060632, "balance_loss_clip": 1.05398846, "balance_loss_mlp": 1.03817296, "epoch": 0.1231324214640012, "flos": 23367396873600.0, "grad_norm": 1.8194194024321948, "language_loss": 0.76583183, "learning_rate": 3.909702248319597e-06, "loss": 0.78798974, "num_input_tokens_seen": 44278035, "step": 2048, "time_per_iteration": 2.6997592449188232 }, { "auxiliary_loss_clip": 0.01146796, "auxiliary_loss_mlp": 0.01047309, "balance_loss_clip": 1.05524468, "balance_loss_mlp": 1.02798486, "epoch": 0.12319254471666917, "flos": 23767154311680.0, "grad_norm": 1.8097490634569602, "language_loss": 0.85359102, "learning_rate": 3.909586508997797e-06, "loss": 0.87553203, "num_input_tokens_seen": 44296980, "step": 2049, "time_per_iteration": 2.739617109298706 }, { "auxiliary_loss_clip": 0.01120276, "auxiliary_loss_mlp": 0.01050145, "balance_loss_clip": 1.0533725, "balance_loss_mlp": 1.02887857, "epoch": 0.12325266796933713, "flos": 23550146294400.0, "grad_norm": 2.6582136339172724, "language_loss": 0.75563407, "learning_rate": 3.909470697264285e-06, "loss": 0.77733827, "num_input_tokens_seen": 44318005, "step": 2050, "time_per_iteration": 2.7814078330993652 }, { "auxiliary_loss_clip": 0.01138568, "auxiliary_loss_mlp": 0.01057939, "balance_loss_clip": 1.05428278, "balance_loss_mlp": 1.03608823, "epoch": 0.12331279122200511, "flos": 24423996366720.0, "grad_norm": 1.81408967902731, "language_loss": 0.81166679, "learning_rate": 3.909354813123452e-06, "loss": 0.83363187, "num_input_tokens_seen": 44335260, "step": 2051, "time_per_iteration": 2.7555224895477295 }, { "auxiliary_loss_clip": 0.01171646, "auxiliary_loss_mlp": 0.00779218, "balance_loss_clip": 1.05882978, "balance_loss_mlp": 0.99996465, "epoch": 0.12337291447467308, "flos": 25484294960640.0, "grad_norm": 1.8885516327307212, "language_loss": 0.80445349, "learning_rate": 3.909238856579693e-06, "loss": 0.82396215, "num_input_tokens_seen": 44355315, "step": 2052, "time_per_iteration": 2.7676405906677246 }, { "auxiliary_loss_clip": 0.01165489, "auxiliary_loss_mlp": 0.010569, "balance_loss_clip": 1.0581975, "balance_loss_mlp": 1.03537059, "epoch": 0.12343303772734104, "flos": 23550002640000.0, "grad_norm": 2.171205541070781, "language_loss": 0.73676848, "learning_rate": 3.909122827637406e-06, "loss": 0.75899243, "num_input_tokens_seen": 44373020, "step": 2053, "time_per_iteration": 2.648609161376953 }, { "auxiliary_loss_clip": 0.01168883, "auxiliary_loss_mlp": 0.00778478, "balance_loss_clip": 1.05302441, "balance_loss_mlp": 0.99995315, "epoch": 0.12349316098000902, "flos": 47557074867840.0, "grad_norm": 1.5051513438882418, "language_loss": 0.7413671, "learning_rate": 3.909006726300991e-06, "loss": 0.76084077, "num_input_tokens_seen": 44397525, "step": 2054, "time_per_iteration": 2.871469020843506 }, { "auxiliary_loss_clip": 0.01147607, "auxiliary_loss_mlp": 0.01044612, "balance_loss_clip": 1.05402803, "balance_loss_mlp": 1.02482307, "epoch": 0.12355328423267699, "flos": 25045969294080.0, "grad_norm": 4.50189877271012, "language_loss": 0.85417157, "learning_rate": 3.908890552574849e-06, "loss": 0.8760938, "num_input_tokens_seen": 44415890, "step": 2055, "time_per_iteration": 2.7136077880859375 }, { "auxiliary_loss_clip": 0.01133829, "auxiliary_loss_mlp": 0.01047458, "balance_loss_clip": 1.05999517, "balance_loss_mlp": 1.02802706, "epoch": 0.12361340748534495, "flos": 27709140395520.0, "grad_norm": 2.0629908776416688, "language_loss": 0.77506042, "learning_rate": 3.908774306463384e-06, "loss": 0.79687333, "num_input_tokens_seen": 44436625, "step": 2056, "time_per_iteration": 2.83107852935791 }, { "auxiliary_loss_clip": 0.01158234, "auxiliary_loss_mlp": 0.01055, "balance_loss_clip": 1.05444396, "balance_loss_mlp": 1.03405499, "epoch": 0.12367353073801293, "flos": 26140598311680.0, "grad_norm": 1.9893743253373262, "language_loss": 0.83361745, "learning_rate": 3.908657987971009e-06, "loss": 0.85574985, "num_input_tokens_seen": 44455265, "step": 2057, "time_per_iteration": 2.6987085342407227 }, { "auxiliary_loss_clip": 0.01141319, "auxiliary_loss_mlp": 0.01051708, "balance_loss_clip": 1.05057144, "balance_loss_mlp": 1.02991605, "epoch": 0.1237336539906809, "flos": 25156035544320.0, "grad_norm": 1.4905135493793764, "language_loss": 0.77818203, "learning_rate": 3.90854159710213e-06, "loss": 0.80011231, "num_input_tokens_seen": 44475815, "step": 2058, "time_per_iteration": 2.7149016857147217 }, { "auxiliary_loss_clip": 0.01138087, "auxiliary_loss_mlp": 0.01058134, "balance_loss_clip": 1.05117273, "balance_loss_mlp": 1.03482866, "epoch": 0.12379377724334886, "flos": 15304589867520.0, "grad_norm": 1.8387803476985631, "language_loss": 0.8342883, "learning_rate": 3.9084251338611624e-06, "loss": 0.85625052, "num_input_tokens_seen": 44494045, "step": 2059, "time_per_iteration": 2.7030091285705566 }, { "auxiliary_loss_clip": 0.01133517, "auxiliary_loss_mlp": 0.01057399, "balance_loss_clip": 1.05123472, "balance_loss_mlp": 1.03445077, "epoch": 0.12385390049601683, "flos": 21316717509120.0, "grad_norm": 2.7478129466394217, "language_loss": 0.81420219, "learning_rate": 3.908308598252523e-06, "loss": 0.83611137, "num_input_tokens_seen": 44509120, "step": 2060, "time_per_iteration": 2.738499402999878 }, { "auxiliary_loss_clip": 0.01150334, "auxiliary_loss_mlp": 0.01054424, "balance_loss_clip": 1.05367386, "balance_loss_mlp": 1.0315125, "epoch": 0.1239140237486848, "flos": 15116309752320.0, "grad_norm": 1.8699548955873522, "language_loss": 0.86224365, "learning_rate": 3.9081919902806306e-06, "loss": 0.88429129, "num_input_tokens_seen": 44525780, "step": 2061, "time_per_iteration": 2.6492960453033447 }, { "auxiliary_loss_clip": 0.0115523, "auxiliary_loss_mlp": 0.01050307, "balance_loss_clip": 1.05506253, "balance_loss_mlp": 1.03031528, "epoch": 0.12397414700135277, "flos": 21976791788160.0, "grad_norm": 2.006361909654615, "language_loss": 0.84949362, "learning_rate": 3.908075309949906e-06, "loss": 0.87154901, "num_input_tokens_seen": 44543125, "step": 2062, "time_per_iteration": 2.5925393104553223 }, { "auxiliary_loss_clip": 0.01124676, "auxiliary_loss_mlp": 0.01058304, "balance_loss_clip": 1.05198252, "balance_loss_mlp": 1.03498697, "epoch": 0.12403427025402074, "flos": 13400892956160.0, "grad_norm": 1.6181471799462952, "language_loss": 0.78765064, "learning_rate": 3.907958557264774e-06, "loss": 0.80948043, "num_input_tokens_seen": 44560275, "step": 2063, "time_per_iteration": 2.7551674842834473 }, { "auxiliary_loss_clip": 0.01124369, "auxiliary_loss_mlp": 0.01057465, "balance_loss_clip": 1.05492854, "balance_loss_mlp": 1.03450513, "epoch": 0.12409439350668872, "flos": 15304374385920.0, "grad_norm": 2.9315517002695017, "language_loss": 0.79452097, "learning_rate": 3.907841732229663e-06, "loss": 0.81633931, "num_input_tokens_seen": 44577640, "step": 2064, "time_per_iteration": 2.699711322784424 }, { "auxiliary_loss_clip": 0.01144709, "auxiliary_loss_mlp": 0.01058768, "balance_loss_clip": 1.05316699, "balance_loss_mlp": 1.03847849, "epoch": 0.12415451675935668, "flos": 25009376313600.0, "grad_norm": 2.5611248351266016, "language_loss": 0.92676973, "learning_rate": 3.907724834849002e-06, "loss": 0.9488045, "num_input_tokens_seen": 44594860, "step": 2065, "time_per_iteration": 2.7114996910095215 }, { "auxiliary_loss_clip": 0.01147841, "auxiliary_loss_mlp": 0.01052058, "balance_loss_clip": 1.05113554, "balance_loss_mlp": 1.02943158, "epoch": 0.12421464001202465, "flos": 23659673840640.0, "grad_norm": 1.7498294279318665, "language_loss": 0.80540735, "learning_rate": 3.907607865127225e-06, "loss": 0.82740629, "num_input_tokens_seen": 44614780, "step": 2066, "time_per_iteration": 2.6958389282226562 }, { "auxiliary_loss_clip": 0.01030831, "auxiliary_loss_mlp": 0.01051436, "balance_loss_clip": 1.02768898, "balance_loss_mlp": 1.04884958, "epoch": 0.12427476326469263, "flos": 65732904345600.0, "grad_norm": 0.8715885531008962, "language_loss": 0.63299954, "learning_rate": 3.907490823068766e-06, "loss": 0.6538223, "num_input_tokens_seen": 44671240, "step": 2067, "time_per_iteration": 3.200000762939453 }, { "auxiliary_loss_clip": 0.01117858, "auxiliary_loss_mlp": 0.01057985, "balance_loss_clip": 1.04878855, "balance_loss_mlp": 1.0344646, "epoch": 0.12433488651736059, "flos": 24535427333760.0, "grad_norm": 1.9218217735084064, "language_loss": 0.93783462, "learning_rate": 3.907373708678063e-06, "loss": 0.959593, "num_input_tokens_seen": 44691050, "step": 2068, "time_per_iteration": 2.7631025314331055 }, { "auxiliary_loss_clip": 0.01166393, "auxiliary_loss_mlp": 0.0105657, "balance_loss_clip": 1.05994427, "balance_loss_mlp": 1.03697169, "epoch": 0.12439500977002856, "flos": 21031659175680.0, "grad_norm": 1.8717926968048342, "language_loss": 0.80861229, "learning_rate": 3.9072565219595596e-06, "loss": 0.83084196, "num_input_tokens_seen": 44709850, "step": 2069, "time_per_iteration": 2.6630098819732666 }, { "auxiliary_loss_clip": 0.01113262, "auxiliary_loss_mlp": 0.01062592, "balance_loss_clip": 1.04863238, "balance_loss_mlp": 1.03963184, "epoch": 0.12445513302269653, "flos": 26830621555200.0, "grad_norm": 1.5649570979854035, "language_loss": 0.777978, "learning_rate": 3.907139262917696e-06, "loss": 0.79973656, "num_input_tokens_seen": 44731475, "step": 2070, "time_per_iteration": 2.7750463485717773 }, { "auxiliary_loss_clip": 0.01156875, "auxiliary_loss_mlp": 0.01052509, "balance_loss_clip": 1.05520415, "balance_loss_mlp": 1.03055048, "epoch": 0.1245152562753645, "flos": 18368919037440.0, "grad_norm": 2.2051981544638166, "language_loss": 0.80743957, "learning_rate": 3.907021931556922e-06, "loss": 0.8295334, "num_input_tokens_seen": 44749685, "step": 2071, "time_per_iteration": 2.654171943664551 }, { "auxiliary_loss_clip": 0.01154683, "auxiliary_loss_mlp": 0.01055767, "balance_loss_clip": 1.05492425, "balance_loss_mlp": 1.03405952, "epoch": 0.12457537952803246, "flos": 33107986200960.0, "grad_norm": 2.118828414072521, "language_loss": 0.78278041, "learning_rate": 3.906904527881684e-06, "loss": 0.80488491, "num_input_tokens_seen": 44772165, "step": 2072, "time_per_iteration": 2.753159284591675 }, { "auxiliary_loss_clip": 0.0114568, "auxiliary_loss_mlp": 0.01055287, "balance_loss_clip": 1.05651307, "balance_loss_mlp": 1.03381729, "epoch": 0.12463550278070043, "flos": 22270217990400.0, "grad_norm": 7.360489773093417, "language_loss": 0.752267, "learning_rate": 3.9067870518964355e-06, "loss": 0.77427667, "num_input_tokens_seen": 44790580, "step": 2073, "time_per_iteration": 2.6561899185180664 }, { "auxiliary_loss_clip": 0.01096485, "auxiliary_loss_mlp": 0.01053193, "balance_loss_clip": 1.04471385, "balance_loss_mlp": 1.03086543, "epoch": 0.12469562603336841, "flos": 14679025580160.0, "grad_norm": 1.9234955386089483, "language_loss": 0.90560025, "learning_rate": 3.906669503605631e-06, "loss": 0.92709696, "num_input_tokens_seen": 44806730, "step": 2074, "time_per_iteration": 2.7846343517303467 }, { "auxiliary_loss_clip": 0.01105332, "auxiliary_loss_mlp": 0.01056651, "balance_loss_clip": 1.04977274, "balance_loss_mlp": 1.03346491, "epoch": 0.12475574928603637, "flos": 24644775312000.0, "grad_norm": 2.8321626325497493, "language_loss": 0.83836985, "learning_rate": 3.906551883013728e-06, "loss": 0.8599897, "num_input_tokens_seen": 44825550, "step": 2075, "time_per_iteration": 4.412928342819214 }, { "auxiliary_loss_clip": 0.01107078, "auxiliary_loss_mlp": 0.01062819, "balance_loss_clip": 1.04380202, "balance_loss_mlp": 1.03972864, "epoch": 0.12481587253870434, "flos": 21762980081280.0, "grad_norm": 2.042892519020311, "language_loss": 0.73648787, "learning_rate": 3.9064341901251865e-06, "loss": 0.75818682, "num_input_tokens_seen": 44844155, "step": 2076, "time_per_iteration": 5.925223112106323 }, { "auxiliary_loss_clip": 0.01101731, "auxiliary_loss_mlp": 0.01048176, "balance_loss_clip": 1.04774427, "balance_loss_mlp": 1.02751708, "epoch": 0.12487599579137232, "flos": 21432529935360.0, "grad_norm": 1.8779339700875872, "language_loss": 0.7622484, "learning_rate": 3.906316424944469e-06, "loss": 0.78374755, "num_input_tokens_seen": 44863780, "step": 2077, "time_per_iteration": 2.70566987991333 }, { "auxiliary_loss_clip": 0.01156274, "auxiliary_loss_mlp": 0.01062042, "balance_loss_clip": 1.05365288, "balance_loss_mlp": 1.04001164, "epoch": 0.12493611904404028, "flos": 16107624276480.0, "grad_norm": 2.022280968605665, "language_loss": 0.82290226, "learning_rate": 3.906198587476043e-06, "loss": 0.84508544, "num_input_tokens_seen": 44881480, "step": 2078, "time_per_iteration": 4.302385568618774 }, { "auxiliary_loss_clip": 0.01144821, "auxiliary_loss_mlp": 0.01050482, "balance_loss_clip": 1.05281842, "balance_loss_mlp": 1.02855957, "epoch": 0.12499624229670825, "flos": 21580266574080.0, "grad_norm": 1.6413520418295044, "language_loss": 0.75195324, "learning_rate": 3.906080677724374e-06, "loss": 0.77390629, "num_input_tokens_seen": 44900390, "step": 2079, "time_per_iteration": 2.6915946006774902 }, { "auxiliary_loss_clip": 0.01166758, "auxiliary_loss_mlp": 0.01058474, "balance_loss_clip": 1.05881989, "balance_loss_mlp": 1.03696847, "epoch": 0.1250563655493762, "flos": 25699040421120.0, "grad_norm": 6.733284446627088, "language_loss": 0.83874094, "learning_rate": 3.905962695693935e-06, "loss": 0.86099327, "num_input_tokens_seen": 44920375, "step": 2080, "time_per_iteration": 2.7467572689056396 }, { "auxiliary_loss_clip": 0.01156163, "auxiliary_loss_mlp": 0.01059409, "balance_loss_clip": 1.05525088, "balance_loss_mlp": 1.03885686, "epoch": 0.12511648880204418, "flos": 16909509450240.0, "grad_norm": 1.8581885454518776, "language_loss": 0.84644079, "learning_rate": 3.9058446413892e-06, "loss": 0.86859655, "num_input_tokens_seen": 44938415, "step": 2081, "time_per_iteration": 2.685875654220581 }, { "auxiliary_loss_clip": 0.01156835, "auxiliary_loss_mlp": 0.01046398, "balance_loss_clip": 1.05375946, "balance_loss_mlp": 1.02594149, "epoch": 0.12517661205471217, "flos": 17567500740480.0, "grad_norm": 1.8191819349610059, "language_loss": 0.76739037, "learning_rate": 3.905726514814646e-06, "loss": 0.78942269, "num_input_tokens_seen": 44957135, "step": 2082, "time_per_iteration": 2.6133053302764893 }, { "auxiliary_loss_clip": 0.01152911, "auxiliary_loss_mlp": 0.0104632, "balance_loss_clip": 1.05701911, "balance_loss_mlp": 1.02463615, "epoch": 0.12523673530738014, "flos": 16033791870720.0, "grad_norm": 2.5415589476696265, "language_loss": 0.79044539, "learning_rate": 3.9056083159747495e-06, "loss": 0.81243765, "num_input_tokens_seen": 44974480, "step": 2083, "time_per_iteration": 2.6963307857513428 }, { "auxiliary_loss_clip": 0.01147874, "auxiliary_loss_mlp": 0.01047351, "balance_loss_clip": 1.05509973, "balance_loss_mlp": 1.02421284, "epoch": 0.1252968585600481, "flos": 18807747494400.0, "grad_norm": 2.1696249857299, "language_loss": 0.89831448, "learning_rate": 3.9054900448739966e-06, "loss": 0.92026675, "num_input_tokens_seen": 44990310, "step": 2084, "time_per_iteration": 2.6770403385162354 }, { "auxiliary_loss_clip": 0.01131068, "auxiliary_loss_mlp": 0.01048299, "balance_loss_clip": 1.05299771, "balance_loss_mlp": 1.02729464, "epoch": 0.12535698181271607, "flos": 27271568914560.0, "grad_norm": 1.8896331095253402, "language_loss": 0.80354226, "learning_rate": 3.905371701516869e-06, "loss": 0.82533598, "num_input_tokens_seen": 45010720, "step": 2085, "time_per_iteration": 2.749783515930176 }, { "auxiliary_loss_clip": 0.01170318, "auxiliary_loss_mlp": 0.01051018, "balance_loss_clip": 1.05725896, "balance_loss_mlp": 1.03001356, "epoch": 0.12541710506538403, "flos": 22054107813120.0, "grad_norm": 1.8300316094254767, "language_loss": 0.88228154, "learning_rate": 3.905253285907856e-06, "loss": 0.90449488, "num_input_tokens_seen": 45030360, "step": 2086, "time_per_iteration": 2.603515148162842 }, { "auxiliary_loss_clip": 0.01134598, "auxiliary_loss_mlp": 0.01044925, "balance_loss_clip": 1.05278981, "balance_loss_mlp": 1.02522027, "epoch": 0.125477228318052, "flos": 12603173760000.0, "grad_norm": 2.0471238132540344, "language_loss": 0.86819696, "learning_rate": 3.905134798051447e-06, "loss": 0.88999224, "num_input_tokens_seen": 45045085, "step": 2087, "time_per_iteration": 2.6265859603881836 }, { "auxiliary_loss_clip": 0.01146999, "auxiliary_loss_mlp": 0.01058875, "balance_loss_clip": 1.05599046, "balance_loss_mlp": 1.03651142, "epoch": 0.12553735157071996, "flos": 23878549365120.0, "grad_norm": 2.3362397674907758, "language_loss": 0.73027468, "learning_rate": 3.905016237952136e-06, "loss": 0.75233346, "num_input_tokens_seen": 45065145, "step": 2088, "time_per_iteration": 2.65324330329895 }, { "auxiliary_loss_clip": 0.01062529, "auxiliary_loss_mlp": 0.01013405, "balance_loss_clip": 1.02985716, "balance_loss_mlp": 1.01079392, "epoch": 0.12559747482338796, "flos": 69920841830400.0, "grad_norm": 0.7742255614948045, "language_loss": 0.61767036, "learning_rate": 3.904897605614418e-06, "loss": 0.6384297, "num_input_tokens_seen": 45126230, "step": 2089, "time_per_iteration": 3.1219804286956787 }, { "auxiliary_loss_clip": 0.01149606, "auxiliary_loss_mlp": 0.01060841, "balance_loss_clip": 1.05670094, "balance_loss_mlp": 1.0388943, "epoch": 0.12565759807605592, "flos": 24279563779200.0, "grad_norm": 1.817095421446176, "language_loss": 0.7781918, "learning_rate": 3.904778901042793e-06, "loss": 0.80029625, "num_input_tokens_seen": 45145545, "step": 2090, "time_per_iteration": 2.700425863265991 }, { "auxiliary_loss_clip": 0.01046946, "auxiliary_loss_mlp": 0.01013884, "balance_loss_clip": 1.03125095, "balance_loss_mlp": 1.01101136, "epoch": 0.12571772132872389, "flos": 56451180286080.0, "grad_norm": 0.760599485634597, "language_loss": 0.59434772, "learning_rate": 3.90466012424176e-06, "loss": 0.61495602, "num_input_tokens_seen": 45206845, "step": 2091, "time_per_iteration": 3.0814294815063477 }, { "auxiliary_loss_clip": 0.01159814, "auxiliary_loss_mlp": 0.01060546, "balance_loss_clip": 1.05760789, "balance_loss_mlp": 1.041067, "epoch": 0.12577784458139185, "flos": 41245846675200.0, "grad_norm": 1.6552462178493936, "language_loss": 0.62916517, "learning_rate": 3.904541275215825e-06, "loss": 0.6513688, "num_input_tokens_seen": 45228495, "step": 2092, "time_per_iteration": 2.7813880443573 }, { "auxiliary_loss_clip": 0.01147016, "auxiliary_loss_mlp": 0.01061963, "balance_loss_clip": 1.05395663, "balance_loss_mlp": 1.04069614, "epoch": 0.12583796783405982, "flos": 19755501799680.0, "grad_norm": 2.279616692029291, "language_loss": 0.80507946, "learning_rate": 3.904422353969493e-06, "loss": 0.82716924, "num_input_tokens_seen": 45245720, "step": 2093, "time_per_iteration": 2.6768014430999756 }, { "auxiliary_loss_clip": 0.01146976, "auxiliary_loss_mlp": 0.01075616, "balance_loss_clip": 1.0524025, "balance_loss_mlp": 1.05380058, "epoch": 0.12589809108672778, "flos": 22602104680320.0, "grad_norm": 1.7347385846840702, "language_loss": 0.76003867, "learning_rate": 3.904303360507276e-06, "loss": 0.78226459, "num_input_tokens_seen": 45265650, "step": 2094, "time_per_iteration": 2.6730611324310303 }, { "auxiliary_loss_clip": 0.01117887, "auxiliary_loss_mlp": 0.01069309, "balance_loss_clip": 1.0500071, "balance_loss_mlp": 1.04892457, "epoch": 0.12595821433939577, "flos": 45222845541120.0, "grad_norm": 1.5703706409155747, "language_loss": 0.76664734, "learning_rate": 3.9041842948336835e-06, "loss": 0.78851926, "num_input_tokens_seen": 45287790, "step": 2095, "time_per_iteration": 2.958367109298706 }, { "auxiliary_loss_clip": 0.01147751, "auxiliary_loss_mlp": 0.01058477, "balance_loss_clip": 1.05202031, "balance_loss_mlp": 1.03782988, "epoch": 0.12601833759206374, "flos": 14319811618560.0, "grad_norm": 2.2556524892449326, "language_loss": 0.83266854, "learning_rate": 3.904065156953232e-06, "loss": 0.85473078, "num_input_tokens_seen": 45305720, "step": 2096, "time_per_iteration": 2.7097342014312744 }, { "auxiliary_loss_clip": 0.01163652, "auxiliary_loss_mlp": 0.01056552, "balance_loss_clip": 1.05806553, "balance_loss_mlp": 1.03577375, "epoch": 0.1260784608447317, "flos": 21288241002240.0, "grad_norm": 1.7589400475615893, "language_loss": 0.75478256, "learning_rate": 3.903945946870439e-06, "loss": 0.77698463, "num_input_tokens_seen": 45325290, "step": 2097, "time_per_iteration": 2.642056703567505 }, { "auxiliary_loss_clip": 0.01156719, "auxiliary_loss_mlp": 0.01063976, "balance_loss_clip": 1.05648863, "balance_loss_mlp": 1.04527175, "epoch": 0.12613858409739967, "flos": 26251311006720.0, "grad_norm": 1.8828235460619742, "language_loss": 0.87110066, "learning_rate": 3.9038266645898246e-06, "loss": 0.89330757, "num_input_tokens_seen": 45344465, "step": 2098, "time_per_iteration": 2.63826584815979 }, { "auxiliary_loss_clip": 0.01117414, "auxiliary_loss_mlp": 0.01058025, "balance_loss_clip": 1.04983974, "balance_loss_mlp": 1.03475559, "epoch": 0.12619870735006763, "flos": 21579979265280.0, "grad_norm": 1.8855647331078333, "language_loss": 0.69494271, "learning_rate": 3.903707310115912e-06, "loss": 0.7166971, "num_input_tokens_seen": 45362465, "step": 2099, "time_per_iteration": 2.7813057899475098 }, { "auxiliary_loss_clip": 0.01142696, "auxiliary_loss_mlp": 0.01061431, "balance_loss_clip": 1.04979372, "balance_loss_mlp": 1.03923464, "epoch": 0.1262588306027356, "flos": 23367037737600.0, "grad_norm": 2.0457253500590498, "language_loss": 0.81949925, "learning_rate": 3.903587883453228e-06, "loss": 0.84154058, "num_input_tokens_seen": 45382700, "step": 2100, "time_per_iteration": 2.704871416091919 }, { "auxiliary_loss_clip": 0.01159613, "auxiliary_loss_mlp": 0.01055067, "balance_loss_clip": 1.0620985, "balance_loss_mlp": 1.03408623, "epoch": 0.12631895385540357, "flos": 23949185460480.0, "grad_norm": 1.7810176086536167, "language_loss": 0.80399859, "learning_rate": 3.903468384606302e-06, "loss": 0.82614541, "num_input_tokens_seen": 45401005, "step": 2101, "time_per_iteration": 2.7071452140808105 }, { "auxiliary_loss_clip": 0.0106985, "auxiliary_loss_mlp": 0.01010859, "balance_loss_clip": 1.02823138, "balance_loss_mlp": 1.00803375, "epoch": 0.12637907710807156, "flos": 70282138780800.0, "grad_norm": 0.7128618749962091, "language_loss": 0.57087427, "learning_rate": 3.903348813579662e-06, "loss": 0.59168136, "num_input_tokens_seen": 45466555, "step": 2102, "time_per_iteration": 3.20320987701416 }, { "auxiliary_loss_clip": 0.01140495, "auxiliary_loss_mlp": 0.01056574, "balance_loss_clip": 1.053671, "balance_loss_mlp": 1.03661788, "epoch": 0.12643920036073952, "flos": 18915084311040.0, "grad_norm": 2.0306165352193988, "language_loss": 0.93653679, "learning_rate": 3.903229170377845e-06, "loss": 0.95850742, "num_input_tokens_seen": 45485165, "step": 2103, "time_per_iteration": 2.6628894805908203 }, { "auxiliary_loss_clip": 0.01144405, "auxiliary_loss_mlp": 0.01040745, "balance_loss_clip": 1.04991472, "balance_loss_mlp": 1.02174282, "epoch": 0.1264993236134075, "flos": 27782470010880.0, "grad_norm": 1.5962316578756222, "language_loss": 0.7804662, "learning_rate": 3.903109455005387e-06, "loss": 0.80231774, "num_input_tokens_seen": 45504630, "step": 2104, "time_per_iteration": 2.6215474605560303 }, { "auxiliary_loss_clip": 0.01135927, "auxiliary_loss_mlp": 0.01056343, "balance_loss_clip": 1.05414486, "balance_loss_mlp": 1.03683996, "epoch": 0.12655944686607545, "flos": 24754697907840.0, "grad_norm": 1.7362499149688688, "language_loss": 0.80728614, "learning_rate": 3.902989667466828e-06, "loss": 0.82920885, "num_input_tokens_seen": 45524885, "step": 2105, "time_per_iteration": 2.74128794670105 }, { "auxiliary_loss_clip": 0.01162904, "auxiliary_loss_mlp": 0.01056367, "balance_loss_clip": 1.05482686, "balance_loss_mlp": 1.03514743, "epoch": 0.12661957011874342, "flos": 24133048202880.0, "grad_norm": 1.9810187943106816, "language_loss": 0.83402872, "learning_rate": 3.90286980776671e-06, "loss": 0.85622144, "num_input_tokens_seen": 45545000, "step": 2106, "time_per_iteration": 2.676694631576538 }, { "auxiliary_loss_clip": 0.01126632, "auxiliary_loss_mlp": 0.01052067, "balance_loss_clip": 1.05697966, "balance_loss_mlp": 1.03147984, "epoch": 0.12667969337141138, "flos": 24569614103040.0, "grad_norm": 1.6951691508845637, "language_loss": 0.73469931, "learning_rate": 3.902749875909578e-06, "loss": 0.7564863, "num_input_tokens_seen": 45564210, "step": 2107, "time_per_iteration": 2.7506372928619385 }, { "auxiliary_loss_clip": 0.01162931, "auxiliary_loss_mlp": 0.01044317, "balance_loss_clip": 1.05320692, "balance_loss_mlp": 1.02599406, "epoch": 0.12673981662407935, "flos": 22961677777920.0, "grad_norm": 2.0116792159666477, "language_loss": 0.79395336, "learning_rate": 3.90262987189998e-06, "loss": 0.81602579, "num_input_tokens_seen": 45583030, "step": 2108, "time_per_iteration": 2.6611146926879883 }, { "auxiliary_loss_clip": 0.01168073, "auxiliary_loss_mlp": 0.01049192, "balance_loss_clip": 1.05300844, "balance_loss_mlp": 1.02945089, "epoch": 0.12679993987674734, "flos": 17274864637440.0, "grad_norm": 1.9298328790617403, "language_loss": 0.7561394, "learning_rate": 3.902509795742467e-06, "loss": 0.77831209, "num_input_tokens_seen": 45602265, "step": 2109, "time_per_iteration": 2.5963573455810547 }, { "auxiliary_loss_clip": 0.01111025, "auxiliary_loss_mlp": 0.01053822, "balance_loss_clip": 1.04636049, "balance_loss_mlp": 1.0335331, "epoch": 0.1268600631294153, "flos": 17275080119040.0, "grad_norm": 1.6171901700648081, "language_loss": 0.82806516, "learning_rate": 3.902389647441592e-06, "loss": 0.84971368, "num_input_tokens_seen": 45620595, "step": 2110, "time_per_iteration": 2.6745550632476807 }, { "auxiliary_loss_clip": 0.01145969, "auxiliary_loss_mlp": 0.00778071, "balance_loss_clip": 1.05419564, "balance_loss_mlp": 0.99996144, "epoch": 0.12692018638208327, "flos": 24061047390720.0, "grad_norm": 1.6765217216011241, "language_loss": 0.78092968, "learning_rate": 3.90226942700191e-06, "loss": 0.80017006, "num_input_tokens_seen": 45641140, "step": 2111, "time_per_iteration": 2.65983510017395 }, { "auxiliary_loss_clip": 0.01130932, "auxiliary_loss_mlp": 0.01076547, "balance_loss_clip": 1.05490458, "balance_loss_mlp": 1.05352807, "epoch": 0.12698030963475124, "flos": 31831900652160.0, "grad_norm": 2.15738266202174, "language_loss": 0.77103376, "learning_rate": 3.902149134427982e-06, "loss": 0.79310858, "num_input_tokens_seen": 45662315, "step": 2112, "time_per_iteration": 2.870299816131592 }, { "auxiliary_loss_clip": 0.01129438, "auxiliary_loss_mlp": 0.01074863, "balance_loss_clip": 1.05213726, "balance_loss_mlp": 1.05427516, "epoch": 0.1270404328874192, "flos": 25187744275200.0, "grad_norm": 1.9191529425470424, "language_loss": 0.85806453, "learning_rate": 3.902028769724367e-06, "loss": 0.88010758, "num_input_tokens_seen": 45680335, "step": 2113, "time_per_iteration": 4.26338267326355 }, { "auxiliary_loss_clip": 0.01137468, "auxiliary_loss_mlp": 0.01078067, "balance_loss_clip": 1.05511892, "balance_loss_mlp": 1.05670488, "epoch": 0.12710055614008717, "flos": 15997342544640.0, "grad_norm": 1.9721234476704599, "language_loss": 0.74027002, "learning_rate": 3.9019083328956315e-06, "loss": 0.7624253, "num_input_tokens_seen": 45696240, "step": 2114, "time_per_iteration": 2.7573230266571045 }, { "auxiliary_loss_clip": 0.01156713, "auxiliary_loss_mlp": 0.01060574, "balance_loss_clip": 1.05770111, "balance_loss_mlp": 1.03924704, "epoch": 0.12716067939275516, "flos": 15085642515840.0, "grad_norm": 1.7921743813213327, "language_loss": 0.83240676, "learning_rate": 3.901787823946341e-06, "loss": 0.85457963, "num_input_tokens_seen": 45713695, "step": 2115, "time_per_iteration": 4.1369829177856445 }, { "auxiliary_loss_clip": 0.01154653, "auxiliary_loss_mlp": 0.01065557, "balance_loss_clip": 1.05875492, "balance_loss_mlp": 1.04476702, "epoch": 0.12722080264542313, "flos": 28366736636160.0, "grad_norm": 1.4840591347809418, "language_loss": 0.87010503, "learning_rate": 3.901667242881065e-06, "loss": 0.89230716, "num_input_tokens_seen": 45736655, "step": 2116, "time_per_iteration": 2.73896861076355 }, { "auxiliary_loss_clip": 0.01139498, "auxiliary_loss_mlp": 0.00777066, "balance_loss_clip": 1.05413389, "balance_loss_mlp": 0.99995339, "epoch": 0.1272809258980911, "flos": 32379897519360.0, "grad_norm": 1.753205985010591, "language_loss": 0.70374918, "learning_rate": 3.9015465897043775e-06, "loss": 0.72291481, "num_input_tokens_seen": 45758195, "step": 2117, "time_per_iteration": 2.783156156539917 }, { "auxiliary_loss_clip": 0.01127455, "auxiliary_loss_mlp": 0.0106424, "balance_loss_clip": 1.04978406, "balance_loss_mlp": 1.04068434, "epoch": 0.12734104915075906, "flos": 16034402401920.0, "grad_norm": 1.9957647698478755, "language_loss": 0.86237884, "learning_rate": 3.901425864420852e-06, "loss": 0.8842957, "num_input_tokens_seen": 45774280, "step": 2118, "time_per_iteration": 4.322036266326904 }, { "auxiliary_loss_clip": 0.01161417, "auxiliary_loss_mlp": 0.01049008, "balance_loss_clip": 1.05827069, "balance_loss_mlp": 1.02951694, "epoch": 0.12740117240342702, "flos": 18260325244800.0, "grad_norm": 1.705293179953873, "language_loss": 0.87577266, "learning_rate": 3.901305067035068e-06, "loss": 0.89787692, "num_input_tokens_seen": 45792760, "step": 2119, "time_per_iteration": 2.6559741497039795 }, { "auxiliary_loss_clip": 0.01145426, "auxiliary_loss_mlp": 0.0077754, "balance_loss_clip": 1.05233431, "balance_loss_mlp": 0.99984539, "epoch": 0.127461295656095, "flos": 12121790664960.0, "grad_norm": 2.05013605026053, "language_loss": 0.87824571, "learning_rate": 3.901184197551605e-06, "loss": 0.89747536, "num_input_tokens_seen": 45804300, "step": 2120, "time_per_iteration": 2.6154048442840576 }, { "auxiliary_loss_clip": 0.01170497, "auxiliary_loss_mlp": 0.01046075, "balance_loss_clip": 1.05822706, "balance_loss_mlp": 1.02626204, "epoch": 0.12752141890876295, "flos": 23149095966720.0, "grad_norm": 1.9784951602308867, "language_loss": 0.75584805, "learning_rate": 3.901063255975046e-06, "loss": 0.77801377, "num_input_tokens_seen": 45823780, "step": 2121, "time_per_iteration": 2.579265832901001 }, { "auxiliary_loss_clip": 0.0111249, "auxiliary_loss_mlp": 0.01047949, "balance_loss_clip": 1.04741263, "balance_loss_mlp": 1.02727842, "epoch": 0.12758154216143094, "flos": 21615997628160.0, "grad_norm": 2.0293629108662405, "language_loss": 0.82732606, "learning_rate": 3.900942242309978e-06, "loss": 0.84893048, "num_input_tokens_seen": 45840495, "step": 2122, "time_per_iteration": 2.793870210647583 }, { "auxiliary_loss_clip": 0.01151713, "auxiliary_loss_mlp": 0.01049724, "balance_loss_clip": 1.05901408, "balance_loss_mlp": 1.02983987, "epoch": 0.1276416654140989, "flos": 15924874855680.0, "grad_norm": 1.7660235451894624, "language_loss": 0.78699338, "learning_rate": 3.90082115656099e-06, "loss": 0.80900776, "num_input_tokens_seen": 45857735, "step": 2123, "time_per_iteration": 2.70546293258667 }, { "auxiliary_loss_clip": 0.01172823, "auxiliary_loss_mlp": 0.01055328, "balance_loss_clip": 1.05931985, "balance_loss_mlp": 1.03478789, "epoch": 0.12770178866676687, "flos": 22382690451840.0, "grad_norm": 1.5643885422181942, "language_loss": 0.78931451, "learning_rate": 3.900699998732673e-06, "loss": 0.81159604, "num_input_tokens_seen": 45876485, "step": 2124, "time_per_iteration": 2.661712408065796 }, { "auxiliary_loss_clip": 0.01160474, "auxiliary_loss_mlp": 0.00776885, "balance_loss_clip": 1.05457389, "balance_loss_mlp": 0.99987447, "epoch": 0.12776191191943484, "flos": 21652482867840.0, "grad_norm": 1.9695028631977674, "language_loss": 0.75605726, "learning_rate": 3.900578768829623e-06, "loss": 0.7754308, "num_input_tokens_seen": 45894645, "step": 2125, "time_per_iteration": 2.696021556854248 }, { "auxiliary_loss_clip": 0.01158163, "auxiliary_loss_mlp": 0.00777059, "balance_loss_clip": 1.05398965, "balance_loss_mlp": 1.00002348, "epoch": 0.1278220351721028, "flos": 25735561574400.0, "grad_norm": 3.019802885219414, "language_loss": 0.78016824, "learning_rate": 3.900457466856434e-06, "loss": 0.79952049, "num_input_tokens_seen": 45913755, "step": 2126, "time_per_iteration": 2.721435308456421 }, { "auxiliary_loss_clip": 0.01124637, "auxiliary_loss_mlp": 0.010537, "balance_loss_clip": 1.05406642, "balance_loss_mlp": 1.03504348, "epoch": 0.12788215842477077, "flos": 41243224982400.0, "grad_norm": 1.3825945270792501, "language_loss": 0.6927852, "learning_rate": 3.9003360928177085e-06, "loss": 0.71456861, "num_input_tokens_seen": 45936095, "step": 2127, "time_per_iteration": 2.902101993560791 }, { "auxiliary_loss_clip": 0.01030231, "auxiliary_loss_mlp": 0.00759051, "balance_loss_clip": 1.02830005, "balance_loss_mlp": 1.00050259, "epoch": 0.12794228167743876, "flos": 70877430881280.0, "grad_norm": 0.853491438999862, "language_loss": 0.62831402, "learning_rate": 3.900214646718047e-06, "loss": 0.64620686, "num_input_tokens_seen": 46004655, "step": 2128, "time_per_iteration": 3.3387396335601807 }, { "auxiliary_loss_clip": 0.01145823, "auxiliary_loss_mlp": 0.01047815, "balance_loss_clip": 1.05080712, "balance_loss_mlp": 1.02599955, "epoch": 0.12800240493010673, "flos": 16289727252480.0, "grad_norm": 2.066959353069841, "language_loss": 0.77626479, "learning_rate": 3.900093128562056e-06, "loss": 0.7982012, "num_input_tokens_seen": 46023610, "step": 2129, "time_per_iteration": 2.611309766769409 }, { "auxiliary_loss_clip": 0.01122914, "auxiliary_loss_mlp": 0.01052577, "balance_loss_clip": 1.05058527, "balance_loss_mlp": 1.03029668, "epoch": 0.1280625281827747, "flos": 20631542601600.0, "grad_norm": 2.1214737401843893, "language_loss": 0.79263359, "learning_rate": 3.899971538354343e-06, "loss": 0.81438851, "num_input_tokens_seen": 46041725, "step": 2130, "time_per_iteration": 2.753243923187256 }, { "auxiliary_loss_clip": 0.01139626, "auxiliary_loss_mlp": 0.01052453, "balance_loss_clip": 1.05133748, "balance_loss_mlp": 1.03147244, "epoch": 0.12812265143544266, "flos": 22638230784000.0, "grad_norm": 1.7780274650921335, "language_loss": 0.70945668, "learning_rate": 3.899849876099518e-06, "loss": 0.73137754, "num_input_tokens_seen": 46061095, "step": 2131, "time_per_iteration": 2.6809306144714355 }, { "auxiliary_loss_clip": 0.01102824, "auxiliary_loss_mlp": 0.01052393, "balance_loss_clip": 1.04982638, "balance_loss_mlp": 1.03163886, "epoch": 0.12818277468811062, "flos": 34714701463680.0, "grad_norm": 2.2916674504462655, "language_loss": 0.72298968, "learning_rate": 3.899728141802197e-06, "loss": 0.74454176, "num_input_tokens_seen": 46082670, "step": 2132, "time_per_iteration": 2.8769233226776123 }, { "auxiliary_loss_clip": 0.01102594, "auxiliary_loss_mlp": 0.01055993, "balance_loss_clip": 1.04384947, "balance_loss_mlp": 1.03348672, "epoch": 0.1282428979407786, "flos": 23112107936640.0, "grad_norm": 2.0316054281953155, "language_loss": 0.82128644, "learning_rate": 3.8996063354669935e-06, "loss": 0.84287226, "num_input_tokens_seen": 46102410, "step": 2133, "time_per_iteration": 2.766897678375244 }, { "auxiliary_loss_clip": 0.01163396, "auxiliary_loss_mlp": 0.01057069, "balance_loss_clip": 1.05397773, "balance_loss_mlp": 1.03458595, "epoch": 0.12830302119344655, "flos": 20886508316160.0, "grad_norm": 3.232115826630309, "language_loss": 0.80001891, "learning_rate": 3.899484457098528e-06, "loss": 0.82222354, "num_input_tokens_seen": 46121145, "step": 2134, "time_per_iteration": 2.6347672939300537 }, { "auxiliary_loss_clip": 0.01159056, "auxiliary_loss_mlp": 0.01046209, "balance_loss_clip": 1.05907345, "balance_loss_mlp": 1.02614641, "epoch": 0.12836314444611455, "flos": 21397768548480.0, "grad_norm": 1.731952504909339, "language_loss": 0.82657921, "learning_rate": 3.899362506701421e-06, "loss": 0.84863198, "num_input_tokens_seen": 46140740, "step": 2135, "time_per_iteration": 2.6393656730651855 }, { "auxiliary_loss_clip": 0.0114208, "auxiliary_loss_mlp": 0.0105553, "balance_loss_clip": 1.05345035, "balance_loss_mlp": 1.03411996, "epoch": 0.1284232676987825, "flos": 13662466773120.0, "grad_norm": 2.1083924470752278, "language_loss": 0.7764526, "learning_rate": 3.899240484280298e-06, "loss": 0.79842871, "num_input_tokens_seen": 46156805, "step": 2136, "time_per_iteration": 2.7195920944213867 }, { "auxiliary_loss_clip": 0.01020946, "auxiliary_loss_mlp": 0.01003991, "balance_loss_clip": 1.01967573, "balance_loss_mlp": 1.00096273, "epoch": 0.12848339095145048, "flos": 59994737735040.0, "grad_norm": 0.8964253308146478, "language_loss": 0.59152198, "learning_rate": 3.899118389839785e-06, "loss": 0.61177135, "num_input_tokens_seen": 46222085, "step": 2137, "time_per_iteration": 3.416015625 }, { "auxiliary_loss_clip": 0.01153694, "auxiliary_loss_mlp": 0.01054623, "balance_loss_clip": 1.05178177, "balance_loss_mlp": 1.03483438, "epoch": 0.12854351420411844, "flos": 13881378211200.0, "grad_norm": 3.244493357011547, "language_loss": 0.82344306, "learning_rate": 3.898996223384512e-06, "loss": 0.84552622, "num_input_tokens_seen": 46239970, "step": 2138, "time_per_iteration": 2.65515398979187 }, { "auxiliary_loss_clip": 0.01159586, "auxiliary_loss_mlp": 0.01049293, "balance_loss_clip": 1.05592752, "balance_loss_mlp": 1.02665496, "epoch": 0.1286036374567864, "flos": 22637943475200.0, "grad_norm": 2.5417837252920323, "language_loss": 0.78691363, "learning_rate": 3.898873984919113e-06, "loss": 0.8090024, "num_input_tokens_seen": 46257740, "step": 2139, "time_per_iteration": 2.651132345199585 }, { "auxiliary_loss_clip": 0.01136892, "auxiliary_loss_mlp": 0.01045928, "balance_loss_clip": 1.05267286, "balance_loss_mlp": 1.02582908, "epoch": 0.12866376070945437, "flos": 16324775948160.0, "grad_norm": 1.9541049485452633, "language_loss": 0.85289955, "learning_rate": 3.8987516744482215e-06, "loss": 0.87472773, "num_input_tokens_seen": 46275445, "step": 2140, "time_per_iteration": 2.730156183242798 }, { "auxiliary_loss_clip": 0.01143134, "auxiliary_loss_mlp": 0.01044337, "balance_loss_clip": 1.05203128, "balance_loss_mlp": 1.02482224, "epoch": 0.12872388396212234, "flos": 11874546374400.0, "grad_norm": 1.8185491602156885, "language_loss": 0.86268306, "learning_rate": 3.898629291976476e-06, "loss": 0.88455778, "num_input_tokens_seen": 46291710, "step": 2141, "time_per_iteration": 2.62223482131958 }, { "auxiliary_loss_clip": 0.01146971, "auxiliary_loss_mlp": 0.01045813, "balance_loss_clip": 1.0528295, "balance_loss_mlp": 1.02548814, "epoch": 0.12878400721479033, "flos": 28366700722560.0, "grad_norm": 3.1267362471736684, "language_loss": 0.68282312, "learning_rate": 3.898506837508518e-06, "loss": 0.70475101, "num_input_tokens_seen": 46311335, "step": 2142, "time_per_iteration": 2.71232271194458 }, { "auxiliary_loss_clip": 0.01165678, "auxiliary_loss_mlp": 0.0077895, "balance_loss_clip": 1.05764627, "balance_loss_mlp": 0.99990749, "epoch": 0.1288441304674583, "flos": 25885632597120.0, "grad_norm": 2.373838274123079, "language_loss": 0.83479214, "learning_rate": 3.89838431104899e-06, "loss": 0.85423845, "num_input_tokens_seen": 46330985, "step": 2143, "time_per_iteration": 2.677692174911499 }, { "auxiliary_loss_clip": 0.01175134, "auxiliary_loss_mlp": 0.00777405, "balance_loss_clip": 1.0598439, "balance_loss_mlp": 0.99994075, "epoch": 0.12890425372012626, "flos": 20813789232000.0, "grad_norm": 1.5662270309624111, "language_loss": 0.81703234, "learning_rate": 3.898261712602539e-06, "loss": 0.83655775, "num_input_tokens_seen": 46351295, "step": 2144, "time_per_iteration": 2.712620496749878 }, { "auxiliary_loss_clip": 0.01130321, "auxiliary_loss_mlp": 0.01053521, "balance_loss_clip": 1.04658103, "balance_loss_mlp": 1.03145528, "epoch": 0.12896437697279423, "flos": 22565870835840.0, "grad_norm": 1.8026346290528672, "language_loss": 0.78304374, "learning_rate": 3.898139042173813e-06, "loss": 0.80488217, "num_input_tokens_seen": 46368600, "step": 2145, "time_per_iteration": 2.6766605377197266 }, { "auxiliary_loss_clip": 0.01170585, "auxiliary_loss_mlp": 0.01047893, "balance_loss_clip": 1.0543592, "balance_loss_mlp": 1.02662635, "epoch": 0.1290245002254622, "flos": 17493776075520.0, "grad_norm": 2.147087506474235, "language_loss": 0.82865375, "learning_rate": 3.898016299767465e-06, "loss": 0.85083848, "num_input_tokens_seen": 46387370, "step": 2146, "time_per_iteration": 2.5860395431518555 }, { "auxiliary_loss_clip": 0.01141916, "auxiliary_loss_mlp": 0.0105138, "balance_loss_clip": 1.05367482, "balance_loss_mlp": 1.03062606, "epoch": 0.12908462347813016, "flos": 36315957859200.0, "grad_norm": 2.344626501147968, "language_loss": 0.71275079, "learning_rate": 3.897893485388149e-06, "loss": 0.73468375, "num_input_tokens_seen": 46409570, "step": 2147, "time_per_iteration": 2.7870359420776367 }, { "auxiliary_loss_clip": 0.01147238, "auxiliary_loss_mlp": 0.01052291, "balance_loss_clip": 1.05527067, "balance_loss_mlp": 1.03297925, "epoch": 0.12914474673079815, "flos": 22528703237760.0, "grad_norm": 2.120275205230366, "language_loss": 0.71432978, "learning_rate": 3.897770599040521e-06, "loss": 0.73632509, "num_input_tokens_seen": 46429320, "step": 2148, "time_per_iteration": 2.6865081787109375 }, { "auxiliary_loss_clip": 0.01168479, "auxiliary_loss_mlp": 0.01049575, "balance_loss_clip": 1.05762172, "balance_loss_mlp": 1.03016782, "epoch": 0.12920486998346611, "flos": 21471888263040.0, "grad_norm": 1.6388902851592406, "language_loss": 0.79064089, "learning_rate": 3.897647640729242e-06, "loss": 0.81282145, "num_input_tokens_seen": 46450155, "step": 2149, "time_per_iteration": 2.6041862964630127 }, { "auxiliary_loss_clip": 0.01159527, "auxiliary_loss_mlp": 0.01046069, "balance_loss_clip": 1.05377793, "balance_loss_mlp": 1.02531469, "epoch": 0.12926499323613408, "flos": 27308556944640.0, "grad_norm": 2.034796374339078, "language_loss": 0.75976646, "learning_rate": 3.897524610458975e-06, "loss": 0.78182244, "num_input_tokens_seen": 46470280, "step": 2150, "time_per_iteration": 2.647224187850952 }, { "auxiliary_loss_clip": 0.01155787, "auxiliary_loss_mlp": 0.01055192, "balance_loss_clip": 1.05445433, "balance_loss_mlp": 1.03491461, "epoch": 0.12932511648880204, "flos": 22091131756800.0, "grad_norm": 2.3830500835005592, "language_loss": 0.70986372, "learning_rate": 3.8974015082343835e-06, "loss": 0.73197353, "num_input_tokens_seen": 46487605, "step": 2151, "time_per_iteration": 2.7008492946624756 }, { "auxiliary_loss_clip": 0.01167835, "auxiliary_loss_mlp": 0.0104951, "balance_loss_clip": 1.05603719, "balance_loss_mlp": 1.03017378, "epoch": 0.12938523974147, "flos": 20302780394880.0, "grad_norm": 2.058334480733051, "language_loss": 0.83964819, "learning_rate": 3.897278334060137e-06, "loss": 0.86182165, "num_input_tokens_seen": 46505100, "step": 2152, "time_per_iteration": 2.6467373371124268 }, { "auxiliary_loss_clip": 0.01158553, "auxiliary_loss_mlp": 0.01058416, "balance_loss_clip": 1.05283821, "balance_loss_mlp": 1.03888893, "epoch": 0.12944536299413797, "flos": 19499961467520.0, "grad_norm": 1.5624811365269535, "language_loss": 0.78585124, "learning_rate": 3.897155087940906e-06, "loss": 0.80802095, "num_input_tokens_seen": 46524020, "step": 2153, "time_per_iteration": 4.286921262741089 }, { "auxiliary_loss_clip": 0.01113716, "auxiliary_loss_mlp": 0.00777812, "balance_loss_clip": 1.04707122, "balance_loss_mlp": 0.99989671, "epoch": 0.12950548624680594, "flos": 27707919333120.0, "grad_norm": 1.6189787343362376, "language_loss": 0.80253434, "learning_rate": 3.897031769881364e-06, "loss": 0.82144964, "num_input_tokens_seen": 46544640, "step": 2154, "time_per_iteration": 2.7602338790893555 }, { "auxiliary_loss_clip": 0.01149958, "auxiliary_loss_mlp": 0.0105188, "balance_loss_clip": 1.05262971, "balance_loss_mlp": 1.03099442, "epoch": 0.12956560949947393, "flos": 17565740974080.0, "grad_norm": 1.8080432584650143, "language_loss": 0.83717728, "learning_rate": 3.896908379886188e-06, "loss": 0.85919571, "num_input_tokens_seen": 46561395, "step": 2155, "time_per_iteration": 5.696707010269165 }, { "auxiliary_loss_clip": 0.01161999, "auxiliary_loss_mlp": 0.01056273, "balance_loss_clip": 1.05426383, "balance_loss_mlp": 1.03611445, "epoch": 0.1296257327521419, "flos": 20740711011840.0, "grad_norm": 2.4972858828122666, "language_loss": 0.76114857, "learning_rate": 3.896784917960055e-06, "loss": 0.78333133, "num_input_tokens_seen": 46579395, "step": 2156, "time_per_iteration": 2.6279313564300537 }, { "auxiliary_loss_clip": 0.01105089, "auxiliary_loss_mlp": 0.01056603, "balance_loss_clip": 1.0510118, "balance_loss_mlp": 1.03679013, "epoch": 0.12968585600480986, "flos": 16395735265920.0, "grad_norm": 1.6652476704410177, "language_loss": 0.86493659, "learning_rate": 3.896661384107648e-06, "loss": 0.88655347, "num_input_tokens_seen": 46597090, "step": 2157, "time_per_iteration": 4.4089202880859375 }, { "auxiliary_loss_clip": 0.01170107, "auxiliary_loss_mlp": 0.01055814, "balance_loss_clip": 1.05253935, "balance_loss_mlp": 1.0349642, "epoch": 0.12974597925747783, "flos": 28329533124480.0, "grad_norm": 2.5240136552338956, "language_loss": 0.80393612, "learning_rate": 3.896537778333651e-06, "loss": 0.8261953, "num_input_tokens_seen": 46617355, "step": 2158, "time_per_iteration": 2.702765703201294 }, { "auxiliary_loss_clip": 0.01177017, "auxiliary_loss_mlp": 0.01060365, "balance_loss_clip": 1.05905974, "balance_loss_mlp": 1.04050517, "epoch": 0.1298061025101458, "flos": 9683025782400.0, "grad_norm": 2.5307604694159607, "language_loss": 0.74881256, "learning_rate": 3.896414100642752e-06, "loss": 0.77118635, "num_input_tokens_seen": 46633130, "step": 2159, "time_per_iteration": 2.534163475036621 }, { "auxiliary_loss_clip": 0.01122909, "auxiliary_loss_mlp": 0.01058309, "balance_loss_clip": 1.04594469, "balance_loss_mlp": 1.03471708, "epoch": 0.12986622576281376, "flos": 27709535445120.0, "grad_norm": 1.954419432637739, "language_loss": 0.8259204, "learning_rate": 3.89629035103964e-06, "loss": 0.84773254, "num_input_tokens_seen": 46650575, "step": 2160, "time_per_iteration": 2.7358646392822266 }, { "auxiliary_loss_clip": 0.01154348, "auxiliary_loss_mlp": 0.01047243, "balance_loss_clip": 1.05873609, "balance_loss_mlp": 1.02732301, "epoch": 0.12992634901548175, "flos": 18802719590400.0, "grad_norm": 1.7252123805741888, "language_loss": 0.82310414, "learning_rate": 3.896166529529008e-06, "loss": 0.84512007, "num_input_tokens_seen": 46668780, "step": 2161, "time_per_iteration": 2.7029623985290527 }, { "auxiliary_loss_clip": 0.01145886, "auxiliary_loss_mlp": 0.01060381, "balance_loss_clip": 1.05145073, "balance_loss_mlp": 1.03911448, "epoch": 0.12998647226814972, "flos": 29127575543040.0, "grad_norm": 2.0780374068601253, "language_loss": 0.82668459, "learning_rate": 3.896042636115551e-06, "loss": 0.84874725, "num_input_tokens_seen": 46687550, "step": 2162, "time_per_iteration": 2.674825668334961 }, { "auxiliary_loss_clip": 0.0113921, "auxiliary_loss_mlp": 0.0105953, "balance_loss_clip": 1.05468941, "balance_loss_mlp": 1.03957474, "epoch": 0.13004659552081768, "flos": 19573686132480.0, "grad_norm": 3.928222506771022, "language_loss": 0.72579277, "learning_rate": 3.895918670803968e-06, "loss": 0.7477802, "num_input_tokens_seen": 46706730, "step": 2163, "time_per_iteration": 2.678394079208374 }, { "auxiliary_loss_clip": 0.01173873, "auxiliary_loss_mlp": 0.00778662, "balance_loss_clip": 1.05635965, "balance_loss_mlp": 0.99994016, "epoch": 0.13010671877348565, "flos": 22490709626880.0, "grad_norm": 2.0196348424542827, "language_loss": 0.81330699, "learning_rate": 3.895794633598958e-06, "loss": 0.83283234, "num_input_tokens_seen": 46724250, "step": 2164, "time_per_iteration": 2.6116931438446045 }, { "auxiliary_loss_clip": 0.01119834, "auxiliary_loss_mlp": 0.01050661, "balance_loss_clip": 1.04808033, "balance_loss_mlp": 1.03061032, "epoch": 0.1301668420261536, "flos": 23878226142720.0, "grad_norm": 2.274563635903502, "language_loss": 0.72262049, "learning_rate": 3.8956705245052256e-06, "loss": 0.74432552, "num_input_tokens_seen": 46744105, "step": 2165, "time_per_iteration": 2.7646515369415283 }, { "auxiliary_loss_clip": 0.01109832, "auxiliary_loss_mlp": 0.01048351, "balance_loss_clip": 1.05059505, "balance_loss_mlp": 1.02707219, "epoch": 0.13022696527882158, "flos": 23150065633920.0, "grad_norm": 2.8383873988269217, "language_loss": 0.74749964, "learning_rate": 3.8955463435274765e-06, "loss": 0.76908153, "num_input_tokens_seen": 46764250, "step": 2166, "time_per_iteration": 2.7939398288726807 }, { "auxiliary_loss_clip": 0.01170298, "auxiliary_loss_mlp": 0.01048037, "balance_loss_clip": 1.05364752, "balance_loss_mlp": 1.02827251, "epoch": 0.13028708853148954, "flos": 26908548111360.0, "grad_norm": 1.5379857106114436, "language_loss": 0.83098066, "learning_rate": 3.895422090670421e-06, "loss": 0.85316396, "num_input_tokens_seen": 46786865, "step": 2167, "time_per_iteration": 2.700505495071411 }, { "auxiliary_loss_clip": 0.01108628, "auxiliary_loss_mlp": 0.01059921, "balance_loss_clip": 1.04567361, "balance_loss_mlp": 1.03841531, "epoch": 0.13034721178415754, "flos": 21251468453760.0, "grad_norm": 1.6054044551173634, "language_loss": 0.83578718, "learning_rate": 3.89529776593877e-06, "loss": 0.85747266, "num_input_tokens_seen": 46807030, "step": 2168, "time_per_iteration": 2.839285135269165 }, { "auxiliary_loss_clip": 0.01079188, "auxiliary_loss_mlp": 0.01063413, "balance_loss_clip": 1.04247975, "balance_loss_mlp": 1.03861713, "epoch": 0.1304073350368255, "flos": 18767239931520.0, "grad_norm": 1.950315007602454, "language_loss": 0.79910588, "learning_rate": 3.8951733693372375e-06, "loss": 0.8205319, "num_input_tokens_seen": 46826280, "step": 2169, "time_per_iteration": 2.8150076866149902 }, { "auxiliary_loss_clip": 0.01174566, "auxiliary_loss_mlp": 0.01044893, "balance_loss_clip": 1.05822575, "balance_loss_mlp": 1.02339983, "epoch": 0.13046745828949347, "flos": 28364653647360.0, "grad_norm": 2.4117618540057766, "language_loss": 0.66804767, "learning_rate": 3.8950489008705406e-06, "loss": 0.69024229, "num_input_tokens_seen": 46846505, "step": 2170, "time_per_iteration": 2.722769021987915 }, { "auxiliary_loss_clip": 0.0114216, "auxiliary_loss_mlp": 0.01046684, "balance_loss_clip": 1.05424142, "balance_loss_mlp": 1.02637053, "epoch": 0.13052758154216143, "flos": 29605044055680.0, "grad_norm": 1.9089846415842238, "language_loss": 0.66768706, "learning_rate": 3.8949243605434e-06, "loss": 0.68957549, "num_input_tokens_seen": 46867380, "step": 2171, "time_per_iteration": 2.7474682331085205 }, { "auxiliary_loss_clip": 0.01157431, "auxiliary_loss_mlp": 0.01049079, "balance_loss_clip": 1.05283058, "balance_loss_mlp": 1.02701378, "epoch": 0.1305877047948294, "flos": 19390864884480.0, "grad_norm": 2.103440896006443, "language_loss": 0.72157478, "learning_rate": 3.894799748360537e-06, "loss": 0.74363995, "num_input_tokens_seen": 46886810, "step": 2172, "time_per_iteration": 2.8062691688537598 }, { "auxiliary_loss_clip": 0.01131178, "auxiliary_loss_mlp": 0.01045812, "balance_loss_clip": 1.05676126, "balance_loss_mlp": 1.0248909, "epoch": 0.13064782804749736, "flos": 16873527000960.0, "grad_norm": 1.8662964619330822, "language_loss": 0.75331408, "learning_rate": 3.894675064326678e-06, "loss": 0.77508402, "num_input_tokens_seen": 46905620, "step": 2173, "time_per_iteration": 2.749630928039551 }, { "auxiliary_loss_clip": 0.01132129, "auxiliary_loss_mlp": 0.01056024, "balance_loss_clip": 1.05241716, "balance_loss_mlp": 1.03388715, "epoch": 0.13070795130016533, "flos": 24499085748480.0, "grad_norm": 2.8034072456055426, "language_loss": 0.70175481, "learning_rate": 3.894550308446551e-06, "loss": 0.72363639, "num_input_tokens_seen": 46925120, "step": 2174, "time_per_iteration": 2.723314046859741 }, { "auxiliary_loss_clip": 0.01047643, "auxiliary_loss_mlp": 0.01015006, "balance_loss_clip": 1.02629197, "balance_loss_mlp": 1.01260972, "epoch": 0.13076807455283332, "flos": 71054505953280.0, "grad_norm": 0.7998489021914615, "language_loss": 0.59026134, "learning_rate": 3.894425480724886e-06, "loss": 0.61088777, "num_input_tokens_seen": 46988195, "step": 2175, "time_per_iteration": 3.318049192428589 }, { "auxiliary_loss_clip": 0.01159762, "auxiliary_loss_mlp": 0.01053929, "balance_loss_clip": 1.05441868, "balance_loss_mlp": 1.03342521, "epoch": 0.13082819780550128, "flos": 20264499475200.0, "grad_norm": 2.2309284705459707, "language_loss": 0.80365628, "learning_rate": 3.894300581166417e-06, "loss": 0.82579315, "num_input_tokens_seen": 47004720, "step": 2176, "time_per_iteration": 2.631732702255249 }, { "auxiliary_loss_clip": 0.01169648, "auxiliary_loss_mlp": 0.01047517, "balance_loss_clip": 1.05513525, "balance_loss_mlp": 1.02529645, "epoch": 0.13088832105816925, "flos": 34203441231360.0, "grad_norm": 1.6906214681317566, "language_loss": 0.74661696, "learning_rate": 3.894175609775881e-06, "loss": 0.76878858, "num_input_tokens_seen": 47024255, "step": 2177, "time_per_iteration": 2.701422691345215 }, { "auxiliary_loss_clip": 0.01131124, "auxiliary_loss_mlp": 0.0105144, "balance_loss_clip": 1.051373, "balance_loss_mlp": 1.02905297, "epoch": 0.13094844431083721, "flos": 17894970057600.0, "grad_norm": 1.8043513019060269, "language_loss": 0.82266748, "learning_rate": 3.894050566558015e-06, "loss": 0.84449303, "num_input_tokens_seen": 47042465, "step": 2178, "time_per_iteration": 2.6934497356414795 }, { "auxiliary_loss_clip": 0.01170524, "auxiliary_loss_mlp": 0.01047895, "balance_loss_clip": 1.05729508, "balance_loss_mlp": 1.02705729, "epoch": 0.13100856756350518, "flos": 17311313963520.0, "grad_norm": 2.9251611149508276, "language_loss": 0.74291968, "learning_rate": 3.893925451517562e-06, "loss": 0.76510382, "num_input_tokens_seen": 47060370, "step": 2179, "time_per_iteration": 2.6111502647399902 }, { "auxiliary_loss_clip": 0.01128297, "auxiliary_loss_mlp": 0.01052407, "balance_loss_clip": 1.04917574, "balance_loss_mlp": 1.03184354, "epoch": 0.13106869081617314, "flos": 22200551562240.0, "grad_norm": 1.9805514150688242, "language_loss": 0.84366202, "learning_rate": 3.893800264659266e-06, "loss": 0.8654691, "num_input_tokens_seen": 47081415, "step": 2180, "time_per_iteration": 2.731229543685913 }, { "auxiliary_loss_clip": 0.01162028, "auxiliary_loss_mlp": 0.0105845, "balance_loss_clip": 1.05875921, "balance_loss_mlp": 1.03757644, "epoch": 0.13112881406884114, "flos": 21763123735680.0, "grad_norm": 1.8389866248015785, "language_loss": 0.89840436, "learning_rate": 3.8936750059878746e-06, "loss": 0.92060918, "num_input_tokens_seen": 47099860, "step": 2181, "time_per_iteration": 2.643890380859375 }, { "auxiliary_loss_clip": 0.01153771, "auxiliary_loss_mlp": 0.01051982, "balance_loss_clip": 1.05222976, "balance_loss_mlp": 1.03126323, "epoch": 0.1311889373215091, "flos": 23331091201920.0, "grad_norm": 2.117586475019142, "language_loss": 0.68813586, "learning_rate": 3.893549675508137e-06, "loss": 0.7101934, "num_input_tokens_seen": 47118540, "step": 2182, "time_per_iteration": 2.6198863983154297 }, { "auxiliary_loss_clip": 0.01123039, "auxiliary_loss_mlp": 0.01051411, "balance_loss_clip": 1.0502702, "balance_loss_mlp": 1.0292381, "epoch": 0.13124906057417707, "flos": 21467363149440.0, "grad_norm": 1.787500136217105, "language_loss": 0.78694725, "learning_rate": 3.893424273224806e-06, "loss": 0.8086918, "num_input_tokens_seen": 47136710, "step": 2183, "time_per_iteration": 2.715517520904541 }, { "auxiliary_loss_clip": 0.01169106, "auxiliary_loss_mlp": 0.01047098, "balance_loss_clip": 1.05452895, "balance_loss_mlp": 1.02586675, "epoch": 0.13130918382684503, "flos": 23255319461760.0, "grad_norm": 26.753588494231124, "language_loss": 0.85792655, "learning_rate": 3.893298799142636e-06, "loss": 0.88008863, "num_input_tokens_seen": 47157155, "step": 2184, "time_per_iteration": 2.632539987564087 }, { "auxiliary_loss_clip": 0.01138714, "auxiliary_loss_mlp": 0.01054657, "balance_loss_clip": 1.05349112, "balance_loss_mlp": 1.03230524, "epoch": 0.131369307079513, "flos": 20850274471680.0, "grad_norm": 2.50466124454056, "language_loss": 0.82703435, "learning_rate": 3.893173253266387e-06, "loss": 0.84896809, "num_input_tokens_seen": 47176820, "step": 2185, "time_per_iteration": 2.6809136867523193 }, { "auxiliary_loss_clip": 0.01144077, "auxiliary_loss_mlp": 0.01054121, "balance_loss_clip": 1.05262399, "balance_loss_mlp": 1.03236496, "epoch": 0.13142943033218096, "flos": 17858341163520.0, "grad_norm": 1.8949462712827352, "language_loss": 0.72956109, "learning_rate": 3.893047635600818e-06, "loss": 0.75154305, "num_input_tokens_seen": 47195855, "step": 2186, "time_per_iteration": 2.628096342086792 }, { "auxiliary_loss_clip": 0.01157778, "auxiliary_loss_mlp": 0.01050695, "balance_loss_clip": 1.05436552, "balance_loss_mlp": 1.02783096, "epoch": 0.13148955358484893, "flos": 20996035862400.0, "grad_norm": 1.9822444068613732, "language_loss": 0.80363685, "learning_rate": 3.892921946150693e-06, "loss": 0.82572162, "num_input_tokens_seen": 47214535, "step": 2187, "time_per_iteration": 2.762223720550537 }, { "auxiliary_loss_clip": 0.01027324, "auxiliary_loss_mlp": 0.0101023, "balance_loss_clip": 1.02364707, "balance_loss_mlp": 1.00792885, "epoch": 0.13154967683751692, "flos": 70172467580160.0, "grad_norm": 0.8471850380496847, "language_loss": 0.59082437, "learning_rate": 3.892796184920778e-06, "loss": 0.61119986, "num_input_tokens_seen": 47270300, "step": 2188, "time_per_iteration": 3.302457571029663 }, { "auxiliary_loss_clip": 0.01095126, "auxiliary_loss_mlp": 0.01059346, "balance_loss_clip": 1.04827487, "balance_loss_mlp": 1.03676724, "epoch": 0.1316098000901849, "flos": 20376145923840.0, "grad_norm": 1.7340345041340466, "language_loss": 0.74211109, "learning_rate": 3.892670351915842e-06, "loss": 0.76365584, "num_input_tokens_seen": 47290720, "step": 2189, "time_per_iteration": 2.7990496158599854 }, { "auxiliary_loss_clip": 0.01160124, "auxiliary_loss_mlp": 0.01049098, "balance_loss_clip": 1.05551052, "balance_loss_mlp": 1.02799821, "epoch": 0.13166992334285285, "flos": 23221132692480.0, "grad_norm": 1.8160574809616576, "language_loss": 0.73152113, "learning_rate": 3.892544447140657e-06, "loss": 0.75361335, "num_input_tokens_seen": 47311820, "step": 2190, "time_per_iteration": 2.6485326290130615 }, { "auxiliary_loss_clip": 0.01160351, "auxiliary_loss_mlp": 0.01058461, "balance_loss_clip": 1.05671644, "balance_loss_mlp": 1.03811169, "epoch": 0.13173004659552082, "flos": 23330947547520.0, "grad_norm": 1.8825588242208007, "language_loss": 0.74617779, "learning_rate": 3.892418470599996e-06, "loss": 0.76836598, "num_input_tokens_seen": 47331605, "step": 2191, "time_per_iteration": 2.644484281539917 }, { "auxiliary_loss_clip": 0.0112783, "auxiliary_loss_mlp": 0.01054712, "balance_loss_clip": 1.05129039, "balance_loss_mlp": 1.03356445, "epoch": 0.13179016984818878, "flos": 21251504367360.0, "grad_norm": 1.8823393822145031, "language_loss": 0.79093283, "learning_rate": 3.892292422298637e-06, "loss": 0.81275827, "num_input_tokens_seen": 47350455, "step": 2192, "time_per_iteration": 2.735225200653076 }, { "auxiliary_loss_clip": 0.0111282, "auxiliary_loss_mlp": 0.01051113, "balance_loss_clip": 1.04457211, "balance_loss_mlp": 1.02936912, "epoch": 0.13185029310085675, "flos": 17778690754560.0, "grad_norm": 1.7242105632860862, "language_loss": 0.85350716, "learning_rate": 3.892166302241361e-06, "loss": 0.87514639, "num_input_tokens_seen": 47368225, "step": 2193, "time_per_iteration": 4.262877941131592 }, { "auxiliary_loss_clip": 0.0104173, "auxiliary_loss_mlp": 0.01015651, "balance_loss_clip": 1.02609122, "balance_loss_mlp": 1.01280212, "epoch": 0.1319104163535247, "flos": 69851785933440.0, "grad_norm": 0.7746813180799224, "language_loss": 0.54112649, "learning_rate": 3.8920401104329475e-06, "loss": 0.56170022, "num_input_tokens_seen": 47427125, "step": 2194, "time_per_iteration": 6.223008394241333 }, { "auxiliary_loss_clip": 0.01168022, "auxiliary_loss_mlp": 0.01048581, "balance_loss_clip": 1.05420566, "balance_loss_mlp": 1.02828002, "epoch": 0.1319705396061927, "flos": 25193095401600.0, "grad_norm": 2.1079865649821925, "language_loss": 0.72433972, "learning_rate": 3.891913846878185e-06, "loss": 0.74650574, "num_input_tokens_seen": 47450275, "step": 2195, "time_per_iteration": 2.6357345581054688 }, { "auxiliary_loss_clip": 0.01136503, "auxiliary_loss_mlp": 0.00778731, "balance_loss_clip": 1.05176425, "balance_loss_mlp": 0.99996454, "epoch": 0.13203066285886067, "flos": 20740459616640.0, "grad_norm": 1.5737174748369949, "language_loss": 0.78126895, "learning_rate": 3.891787511581859e-06, "loss": 0.8004213, "num_input_tokens_seen": 47469155, "step": 2196, "time_per_iteration": 2.7118594646453857 }, { "auxiliary_loss_clip": 0.01162447, "auxiliary_loss_mlp": 0.010526, "balance_loss_clip": 1.05453539, "balance_loss_mlp": 1.03210831, "epoch": 0.13209078611152864, "flos": 22054395121920.0, "grad_norm": 1.9385650447291836, "language_loss": 0.74632496, "learning_rate": 3.89166110454876e-06, "loss": 0.76847541, "num_input_tokens_seen": 47488405, "step": 2197, "time_per_iteration": 4.270530939102173 }, { "auxiliary_loss_clip": 0.01173786, "auxiliary_loss_mlp": 0.01050846, "balance_loss_clip": 1.05440533, "balance_loss_mlp": 1.02947164, "epoch": 0.1321509093641966, "flos": 16284950743680.0, "grad_norm": 1.785688190112577, "language_loss": 0.79566747, "learning_rate": 3.891534625783685e-06, "loss": 0.81791383, "num_input_tokens_seen": 47505650, "step": 2198, "time_per_iteration": 2.6145474910736084 }, { "auxiliary_loss_clip": 0.01170264, "auxiliary_loss_mlp": 0.01057159, "balance_loss_clip": 1.05536175, "balance_loss_mlp": 1.03647637, "epoch": 0.13221103261686457, "flos": 16983018633600.0, "grad_norm": 2.56313218775589, "language_loss": 0.82932216, "learning_rate": 3.891408075291425e-06, "loss": 0.85159647, "num_input_tokens_seen": 47521540, "step": 2199, "time_per_iteration": 2.5715503692626953 }, { "auxiliary_loss_clip": 0.01122554, "auxiliary_loss_mlp": 0.01052148, "balance_loss_clip": 1.05047798, "balance_loss_mlp": 1.03045249, "epoch": 0.13227115586953253, "flos": 34233605677440.0, "grad_norm": 1.8710902505917797, "language_loss": 0.69579422, "learning_rate": 3.8912814530767826e-06, "loss": 0.71754128, "num_input_tokens_seen": 47543625, "step": 2200, "time_per_iteration": 2.8001365661621094 }, { "auxiliary_loss_clip": 0.01167798, "auxiliary_loss_mlp": 0.01058155, "balance_loss_clip": 1.05345917, "balance_loss_mlp": 1.03618431, "epoch": 0.13233127912220052, "flos": 20704656735360.0, "grad_norm": 1.647659287704997, "language_loss": 0.84624702, "learning_rate": 3.891154759144557e-06, "loss": 0.86850655, "num_input_tokens_seen": 47563740, "step": 2201, "time_per_iteration": 2.6485981941223145 }, { "auxiliary_loss_clip": 0.0117188, "auxiliary_loss_mlp": 0.01055627, "balance_loss_clip": 1.05427861, "balance_loss_mlp": 1.03431273, "epoch": 0.1323914023748685, "flos": 25805048434560.0, "grad_norm": 1.7446392584198542, "language_loss": 0.87088037, "learning_rate": 3.891027993499554e-06, "loss": 0.8931554, "num_input_tokens_seen": 47582655, "step": 2202, "time_per_iteration": 2.5921456813812256 }, { "auxiliary_loss_clip": 0.01139991, "auxiliary_loss_mlp": 0.01053413, "balance_loss_clip": 1.05299544, "balance_loss_mlp": 1.03267026, "epoch": 0.13245152562753645, "flos": 21251540280960.0, "grad_norm": 2.405254380671628, "language_loss": 0.72801507, "learning_rate": 3.89090115614658e-06, "loss": 0.7499491, "num_input_tokens_seen": 47600875, "step": 2203, "time_per_iteration": 2.6257405281066895 }, { "auxiliary_loss_clip": 0.01124508, "auxiliary_loss_mlp": 0.0105959, "balance_loss_clip": 1.05080879, "balance_loss_mlp": 1.03916979, "epoch": 0.13251164888020442, "flos": 26610955931520.0, "grad_norm": 2.044348475010678, "language_loss": 0.73170948, "learning_rate": 3.890774247090444e-06, "loss": 0.75355047, "num_input_tokens_seen": 47619250, "step": 2204, "time_per_iteration": 2.753830909729004 }, { "auxiliary_loss_clip": 0.01160826, "auxiliary_loss_mlp": 0.01054406, "balance_loss_clip": 1.05474758, "balance_loss_mlp": 1.03225708, "epoch": 0.13257177213287238, "flos": 29826541272960.0, "grad_norm": 2.094172729236468, "language_loss": 0.78377104, "learning_rate": 3.89064726633596e-06, "loss": 0.80592328, "num_input_tokens_seen": 47639445, "step": 2205, "time_per_iteration": 2.730682134628296 }, { "auxiliary_loss_clip": 0.01125154, "auxiliary_loss_mlp": 0.01048818, "balance_loss_clip": 1.04975629, "balance_loss_mlp": 1.02782559, "epoch": 0.13263189538554035, "flos": 21288456483840.0, "grad_norm": 1.8609089802832188, "language_loss": 0.78638101, "learning_rate": 3.890520213887941e-06, "loss": 0.80812073, "num_input_tokens_seen": 47658740, "step": 2206, "time_per_iteration": 2.691962718963623 }, { "auxiliary_loss_clip": 0.01124965, "auxiliary_loss_mlp": 0.01045957, "balance_loss_clip": 1.04958403, "balance_loss_mlp": 1.02649069, "epoch": 0.13269201863820831, "flos": 16874101618560.0, "grad_norm": 2.2777192787220066, "language_loss": 0.74672282, "learning_rate": 3.890393089751208e-06, "loss": 0.76843208, "num_input_tokens_seen": 47676880, "step": 2207, "time_per_iteration": 2.7062454223632812 }, { "auxiliary_loss_clip": 0.01143208, "auxiliary_loss_mlp": 0.01047941, "balance_loss_clip": 1.05257845, "balance_loss_mlp": 1.02672219, "epoch": 0.1327521418908763, "flos": 23768914078080.0, "grad_norm": 1.692212064021935, "language_loss": 0.84061795, "learning_rate": 3.890265893930578e-06, "loss": 0.8625294, "num_input_tokens_seen": 47696635, "step": 2208, "time_per_iteration": 2.687717914581299 }, { "auxiliary_loss_clip": 0.01152573, "auxiliary_loss_mlp": 0.0105274, "balance_loss_clip": 1.05847478, "balance_loss_mlp": 1.03411973, "epoch": 0.13281226514354427, "flos": 26505594362880.0, "grad_norm": 1.7032258459750478, "language_loss": 0.85587811, "learning_rate": 3.890138626430876e-06, "loss": 0.8779313, "num_input_tokens_seen": 47717760, "step": 2209, "time_per_iteration": 2.646015167236328 }, { "auxiliary_loss_clip": 0.01138084, "auxiliary_loss_mlp": 0.00778828, "balance_loss_clip": 1.05316806, "balance_loss_mlp": 1.00002563, "epoch": 0.13287238839621224, "flos": 24498762526080.0, "grad_norm": 2.237247968175465, "language_loss": 0.81797457, "learning_rate": 3.890011287256929e-06, "loss": 0.83714366, "num_input_tokens_seen": 47737685, "step": 2210, "time_per_iteration": 2.676262378692627 }, { "auxiliary_loss_clip": 0.0104445, "auxiliary_loss_mlp": 0.00757817, "balance_loss_clip": 1.03801322, "balance_loss_mlp": 1.00007725, "epoch": 0.1329325116488802, "flos": 67694344369920.0, "grad_norm": 0.7515252652740232, "language_loss": 0.58031559, "learning_rate": 3.889883876413563e-06, "loss": 0.59833825, "num_input_tokens_seen": 47802415, "step": 2211, "time_per_iteration": 3.3914146423339844 }, { "auxiliary_loss_clip": 0.01064712, "auxiliary_loss_mlp": 0.01012978, "balance_loss_clip": 1.04205871, "balance_loss_mlp": 1.01083231, "epoch": 0.13299263490154817, "flos": 72261894741120.0, "grad_norm": 0.8012428422082742, "language_loss": 0.55299425, "learning_rate": 3.889756393905611e-06, "loss": 0.57377112, "num_input_tokens_seen": 47871485, "step": 2212, "time_per_iteration": 3.2910914421081543 }, { "auxiliary_loss_clip": 0.01132433, "auxiliary_loss_mlp": 0.01054299, "balance_loss_clip": 1.05107963, "balance_loss_mlp": 1.0331986, "epoch": 0.13305275815421613, "flos": 17931275729280.0, "grad_norm": 2.484635795733661, "language_loss": 0.74228692, "learning_rate": 3.889628839737908e-06, "loss": 0.7641542, "num_input_tokens_seen": 47888315, "step": 2213, "time_per_iteration": 2.755777597427368 }, { "auxiliary_loss_clip": 0.01114671, "auxiliary_loss_mlp": 0.01051459, "balance_loss_clip": 1.04682255, "balance_loss_mlp": 1.03231359, "epoch": 0.13311288140688413, "flos": 22340889999360.0, "grad_norm": 1.850943077435394, "language_loss": 0.79699469, "learning_rate": 3.889501213915291e-06, "loss": 0.81865597, "num_input_tokens_seen": 47906600, "step": 2214, "time_per_iteration": 2.702603340148926 }, { "auxiliary_loss_clip": 0.01143494, "auxiliary_loss_mlp": 0.01052411, "balance_loss_clip": 1.05555344, "balance_loss_mlp": 1.03171659, "epoch": 0.1331730046595521, "flos": 31868888682240.0, "grad_norm": 1.8782588426913054, "language_loss": 0.69341159, "learning_rate": 3.889373516442597e-06, "loss": 0.71537066, "num_input_tokens_seen": 47927630, "step": 2215, "time_per_iteration": 2.769237518310547 }, { "auxiliary_loss_clip": 0.01167307, "auxiliary_loss_mlp": 0.01051423, "balance_loss_clip": 1.06098068, "balance_loss_mlp": 1.03132463, "epoch": 0.13323312791222006, "flos": 22566589107840.0, "grad_norm": 1.884566493826098, "language_loss": 0.81262428, "learning_rate": 3.889245747324671e-06, "loss": 0.83481157, "num_input_tokens_seen": 47947935, "step": 2216, "time_per_iteration": 2.7427120208740234 }, { "auxiliary_loss_clip": 0.01163681, "auxiliary_loss_mlp": 0.01056545, "balance_loss_clip": 1.06198788, "balance_loss_mlp": 1.03631544, "epoch": 0.13329325116488802, "flos": 15085319293440.0, "grad_norm": 3.783334161704178, "language_loss": 0.87299347, "learning_rate": 3.889117906566356e-06, "loss": 0.89519572, "num_input_tokens_seen": 47965515, "step": 2217, "time_per_iteration": 2.709527015686035 }, { "auxiliary_loss_clip": 0.01152703, "auxiliary_loss_mlp": 0.01056364, "balance_loss_clip": 1.06054497, "balance_loss_mlp": 1.0343225, "epoch": 0.133353374417556, "flos": 27453671890560.0, "grad_norm": 4.412823416345162, "language_loss": 0.73105222, "learning_rate": 3.888989994172501e-06, "loss": 0.75314289, "num_input_tokens_seen": 47985675, "step": 2218, "time_per_iteration": 2.697733163833618 }, { "auxiliary_loss_clip": 0.01129106, "auxiliary_loss_mlp": 0.01051151, "balance_loss_clip": 1.0535965, "balance_loss_mlp": 1.02993202, "epoch": 0.13341349767022395, "flos": 24094695456000.0, "grad_norm": 1.7935349411013712, "language_loss": 0.86911142, "learning_rate": 3.8888620101479565e-06, "loss": 0.89091408, "num_input_tokens_seen": 48004985, "step": 2219, "time_per_iteration": 2.7641642093658447 }, { "auxiliary_loss_clip": 0.01141172, "auxiliary_loss_mlp": 0.0106326, "balance_loss_clip": 1.05751657, "balance_loss_mlp": 1.04406714, "epoch": 0.13347362092289192, "flos": 24133335511680.0, "grad_norm": 1.8604531362737113, "language_loss": 0.77244747, "learning_rate": 3.888733954497574e-06, "loss": 0.79449183, "num_input_tokens_seen": 48024965, "step": 2220, "time_per_iteration": 2.732160806655884 }, { "auxiliary_loss_clip": 0.01146487, "auxiliary_loss_mlp": 0.01048662, "balance_loss_clip": 1.05399704, "balance_loss_mlp": 1.03001785, "epoch": 0.1335337441755599, "flos": 18436538390400.0, "grad_norm": 2.3004113327688955, "language_loss": 0.79467338, "learning_rate": 3.888605827226212e-06, "loss": 0.81662482, "num_input_tokens_seen": 48040890, "step": 2221, "time_per_iteration": 2.685612440109253 }, { "auxiliary_loss_clip": 0.01062777, "auxiliary_loss_mlp": 0.01021711, "balance_loss_clip": 1.03293467, "balance_loss_mlp": 1.0194701, "epoch": 0.13359386742822787, "flos": 50611997652480.0, "grad_norm": 0.9755051104211709, "language_loss": 0.68938822, "learning_rate": 3.8884776283387275e-06, "loss": 0.71023309, "num_input_tokens_seen": 48091855, "step": 2222, "time_per_iteration": 3.0336835384368896 }, { "auxiliary_loss_clip": 0.01130152, "auxiliary_loss_mlp": 0.01058574, "balance_loss_clip": 1.05544209, "balance_loss_mlp": 1.03940475, "epoch": 0.13365399068089584, "flos": 22778569221120.0, "grad_norm": 2.1295993667823416, "language_loss": 0.67389107, "learning_rate": 3.888349357839982e-06, "loss": 0.69577825, "num_input_tokens_seen": 48111350, "step": 2223, "time_per_iteration": 2.7134146690368652 }, { "auxiliary_loss_clip": 0.01161386, "auxiliary_loss_mlp": 0.01060571, "balance_loss_clip": 1.05785358, "balance_loss_mlp": 1.04010296, "epoch": 0.1337141139335638, "flos": 12531603911040.0, "grad_norm": 4.277142483609355, "language_loss": 0.82505226, "learning_rate": 3.88822101573484e-06, "loss": 0.84727186, "num_input_tokens_seen": 48129840, "step": 2224, "time_per_iteration": 2.608372926712036 }, { "auxiliary_loss_clip": 0.01173412, "auxiliary_loss_mlp": 0.01050086, "balance_loss_clip": 1.0573926, "balance_loss_mlp": 1.0290221, "epoch": 0.13377423718623177, "flos": 23038957889280.0, "grad_norm": 1.9890294619132924, "language_loss": 0.66270435, "learning_rate": 3.888092602028167e-06, "loss": 0.68493932, "num_input_tokens_seen": 48149240, "step": 2225, "time_per_iteration": 2.6304945945739746 }, { "auxiliary_loss_clip": 0.01153626, "auxiliary_loss_mlp": 0.01051637, "balance_loss_clip": 1.05233717, "balance_loss_mlp": 1.03180075, "epoch": 0.13383436043889974, "flos": 16216397637120.0, "grad_norm": 2.2915668787246997, "language_loss": 0.89469218, "learning_rate": 3.887964116724835e-06, "loss": 0.91674477, "num_input_tokens_seen": 48166330, "step": 2226, "time_per_iteration": 2.6002328395843506 }, { "auxiliary_loss_clip": 0.01150395, "auxiliary_loss_mlp": 0.01054296, "balance_loss_clip": 1.0549798, "balance_loss_mlp": 1.03423262, "epoch": 0.1338944836915677, "flos": 24279671520000.0, "grad_norm": 1.7271512115821777, "language_loss": 0.73209751, "learning_rate": 3.887835559829712e-06, "loss": 0.75414443, "num_input_tokens_seen": 48187600, "step": 2227, "time_per_iteration": 2.706193447113037 }, { "auxiliary_loss_clip": 0.01157707, "auxiliary_loss_mlp": 0.01047387, "balance_loss_clip": 1.05518484, "balance_loss_mlp": 1.02683568, "epoch": 0.1339546069442357, "flos": 17598742594560.0, "grad_norm": 2.848999829625599, "language_loss": 0.85160232, "learning_rate": 3.8877069313476764e-06, "loss": 0.87365323, "num_input_tokens_seen": 48204400, "step": 2228, "time_per_iteration": 2.689209222793579 }, { "auxiliary_loss_clip": 0.01132803, "auxiliary_loss_mlp": 0.01052829, "balance_loss_clip": 1.04935181, "balance_loss_mlp": 1.03126431, "epoch": 0.13401473019690366, "flos": 18990065952000.0, "grad_norm": 1.909679794697233, "language_loss": 0.81460214, "learning_rate": 3.8875782312836054e-06, "loss": 0.83645844, "num_input_tokens_seen": 48222180, "step": 2229, "time_per_iteration": 2.6380228996276855 }, { "auxiliary_loss_clip": 0.0110557, "auxiliary_loss_mlp": 0.01052684, "balance_loss_clip": 1.04774594, "balance_loss_mlp": 1.03233457, "epoch": 0.13407485344957162, "flos": 26943812288640.0, "grad_norm": 1.7464076089691416, "language_loss": 0.73822236, "learning_rate": 3.887449459642378e-06, "loss": 0.7598049, "num_input_tokens_seen": 48243245, "step": 2230, "time_per_iteration": 2.7332983016967773 }, { "auxiliary_loss_clip": 0.01125236, "auxiliary_loss_mlp": 0.01058977, "balance_loss_clip": 1.05213606, "balance_loss_mlp": 1.03890252, "epoch": 0.1341349767022396, "flos": 20339373375360.0, "grad_norm": 1.6827882777998602, "language_loss": 0.80133682, "learning_rate": 3.8873206164288785e-06, "loss": 0.82317901, "num_input_tokens_seen": 48262600, "step": 2231, "time_per_iteration": 2.6759045124053955 }, { "auxiliary_loss_clip": 0.01111387, "auxiliary_loss_mlp": 0.01057582, "balance_loss_clip": 1.04997492, "balance_loss_mlp": 1.03499198, "epoch": 0.13419509995490755, "flos": 29862020931840.0, "grad_norm": 1.746756846769887, "language_loss": 0.72152746, "learning_rate": 3.887191701647992e-06, "loss": 0.74321723, "num_input_tokens_seen": 48285075, "step": 2232, "time_per_iteration": 4.391890048980713 }, { "auxiliary_loss_clip": 0.0112104, "auxiliary_loss_mlp": 0.01051805, "balance_loss_clip": 1.0481019, "balance_loss_mlp": 1.03039551, "epoch": 0.13425522320757552, "flos": 26942986275840.0, "grad_norm": 2.4719586176391686, "language_loss": 0.65116024, "learning_rate": 3.8870627153046066e-06, "loss": 0.67288864, "num_input_tokens_seen": 48301285, "step": 2233, "time_per_iteration": 4.234508037567139 }, { "auxiliary_loss_clip": 0.01167005, "auxiliary_loss_mlp": 0.0104461, "balance_loss_clip": 1.05189967, "balance_loss_mlp": 1.02421367, "epoch": 0.1343153464602435, "flos": 15777281871360.0, "grad_norm": 2.4864430088666656, "language_loss": 0.80878961, "learning_rate": 3.886933657403615e-06, "loss": 0.8309058, "num_input_tokens_seen": 48317835, "step": 2234, "time_per_iteration": 4.175215005874634 }, { "auxiliary_loss_clip": 0.01140761, "auxiliary_loss_mlp": 0.01054039, "balance_loss_clip": 1.05052733, "balance_loss_mlp": 1.03268874, "epoch": 0.13437546971291148, "flos": 24314756129280.0, "grad_norm": 2.0569321713284827, "language_loss": 0.82114553, "learning_rate": 3.886804527949909e-06, "loss": 0.84309351, "num_input_tokens_seen": 48335670, "step": 2235, "time_per_iteration": 2.6588025093078613 }, { "auxiliary_loss_clip": 0.01149093, "auxiliary_loss_mlp": 0.01052015, "balance_loss_clip": 1.05040097, "balance_loss_mlp": 1.02983022, "epoch": 0.13443559296557944, "flos": 26650673395200.0, "grad_norm": 1.6363146905087136, "language_loss": 0.86092007, "learning_rate": 3.8866753269483864e-06, "loss": 0.88293117, "num_input_tokens_seen": 48357805, "step": 2236, "time_per_iteration": 4.349383592605591 }, { "auxiliary_loss_clip": 0.01166751, "auxiliary_loss_mlp": 0.01047925, "balance_loss_clip": 1.05288053, "balance_loss_mlp": 1.02724242, "epoch": 0.1344957162182474, "flos": 21796197183360.0, "grad_norm": 1.82135056053112, "language_loss": 0.77258497, "learning_rate": 3.886546054403946e-06, "loss": 0.79473174, "num_input_tokens_seen": 48377845, "step": 2237, "time_per_iteration": 2.6398766040802 }, { "auxiliary_loss_clip": 0.01145425, "auxiliary_loss_mlp": 0.01051006, "balance_loss_clip": 1.05016851, "balance_loss_mlp": 1.02919102, "epoch": 0.13455583947091537, "flos": 19865568049920.0, "grad_norm": 2.440947698046141, "language_loss": 0.78772336, "learning_rate": 3.886416710321491e-06, "loss": 0.80968761, "num_input_tokens_seen": 48394735, "step": 2238, "time_per_iteration": 2.6556923389434814 }, { "auxiliary_loss_clip": 0.01141594, "auxiliary_loss_mlp": 0.01050085, "balance_loss_clip": 1.05123293, "balance_loss_mlp": 1.02878201, "epoch": 0.13461596272358334, "flos": 30846835094400.0, "grad_norm": 2.9136729194949735, "language_loss": 0.68486369, "learning_rate": 3.886287294705924e-06, "loss": 0.70678043, "num_input_tokens_seen": 48414200, "step": 2239, "time_per_iteration": 2.6778814792633057 }, { "auxiliary_loss_clip": 0.01147129, "auxiliary_loss_mlp": 0.01052633, "balance_loss_clip": 1.0515976, "balance_loss_mlp": 1.03197384, "epoch": 0.1346760859762513, "flos": 12494436312960.0, "grad_norm": 2.3763106012672925, "language_loss": 0.81277847, "learning_rate": 3.8861578075621555e-06, "loss": 0.8347761, "num_input_tokens_seen": 48431065, "step": 2240, "time_per_iteration": 2.5920939445495605 }, { "auxiliary_loss_clip": 0.01107793, "auxiliary_loss_mlp": 0.01049909, "balance_loss_clip": 1.04459488, "balance_loss_mlp": 1.02884459, "epoch": 0.1347362092289193, "flos": 21836022387840.0, "grad_norm": 1.7269080191231387, "language_loss": 0.77183759, "learning_rate": 3.886028248895093e-06, "loss": 0.79341465, "num_input_tokens_seen": 48450335, "step": 2241, "time_per_iteration": 2.7224419116973877 }, { "auxiliary_loss_clip": 0.0116331, "auxiliary_loss_mlp": 0.01041419, "balance_loss_clip": 1.05439126, "balance_loss_mlp": 1.02324009, "epoch": 0.13479633248158726, "flos": 23509459163520.0, "grad_norm": 2.0305903786470743, "language_loss": 0.83062387, "learning_rate": 3.88589861870965e-06, "loss": 0.85267115, "num_input_tokens_seen": 48468555, "step": 2242, "time_per_iteration": 2.5794169902801514 }, { "auxiliary_loss_clip": 0.01170048, "auxiliary_loss_mlp": 0.01056609, "balance_loss_clip": 1.05504107, "balance_loss_mlp": 1.03469825, "epoch": 0.13485645573425523, "flos": 29344332165120.0, "grad_norm": 2.465549548535016, "language_loss": 0.6498239, "learning_rate": 3.885768917010744e-06, "loss": 0.67209053, "num_input_tokens_seen": 48488515, "step": 2243, "time_per_iteration": 2.6709110736846924 }, { "auxiliary_loss_clip": 0.01125086, "auxiliary_loss_mlp": 0.01046786, "balance_loss_clip": 1.04593956, "balance_loss_mlp": 1.02618706, "epoch": 0.1349165789869232, "flos": 28037112503040.0, "grad_norm": 1.7770524512670738, "language_loss": 0.72633034, "learning_rate": 3.8856391438032895e-06, "loss": 0.74804902, "num_input_tokens_seen": 48510515, "step": 2244, "time_per_iteration": 2.713803768157959 }, { "auxiliary_loss_clip": 0.0115377, "auxiliary_loss_mlp": 0.0105148, "balance_loss_clip": 1.05312431, "balance_loss_mlp": 1.03209639, "epoch": 0.13497670223959116, "flos": 22853730430080.0, "grad_norm": 1.7564166456764931, "language_loss": 0.86023217, "learning_rate": 3.88550929909221e-06, "loss": 0.88228464, "num_input_tokens_seen": 48529940, "step": 2245, "time_per_iteration": 2.626560926437378 }, { "auxiliary_loss_clip": 0.01149467, "auxiliary_loss_mlp": 0.0105327, "balance_loss_clip": 1.05035663, "balance_loss_mlp": 1.03346968, "epoch": 0.13503682549225912, "flos": 16504580453760.0, "grad_norm": 1.7861449859595755, "language_loss": 0.78912753, "learning_rate": 3.88537938288243e-06, "loss": 0.8111549, "num_input_tokens_seen": 48548190, "step": 2246, "time_per_iteration": 2.6543703079223633 }, { "auxiliary_loss_clip": 0.010304, "auxiliary_loss_mlp": 0.01015407, "balance_loss_clip": 1.03666449, "balance_loss_mlp": 1.01285601, "epoch": 0.1350969487449271, "flos": 70756303242240.0, "grad_norm": 0.7509256694227144, "language_loss": 0.6054731, "learning_rate": 3.885249395178874e-06, "loss": 0.62593114, "num_input_tokens_seen": 48613165, "step": 2247, "time_per_iteration": 3.3349809646606445 }, { "auxiliary_loss_clip": 0.01162017, "auxiliary_loss_mlp": 0.01056869, "balance_loss_clip": 1.05492628, "balance_loss_mlp": 1.03470767, "epoch": 0.13515707199759508, "flos": 23075981832960.0, "grad_norm": 2.562042993856578, "language_loss": 0.80841738, "learning_rate": 3.885119335986473e-06, "loss": 0.83060622, "num_input_tokens_seen": 48631705, "step": 2248, "time_per_iteration": 2.6279287338256836 }, { "auxiliary_loss_clip": 0.0114073, "auxiliary_loss_mlp": 0.01049128, "balance_loss_clip": 1.05086231, "balance_loss_mlp": 1.03054309, "epoch": 0.13521719525026304, "flos": 23186371305600.0, "grad_norm": 1.9247838227480492, "language_loss": 0.77108699, "learning_rate": 3.884989205310157e-06, "loss": 0.79298556, "num_input_tokens_seen": 48649740, "step": 2249, "time_per_iteration": 2.7100210189819336 }, { "auxiliary_loss_clip": 0.0112733, "auxiliary_loss_mlp": 0.01057649, "balance_loss_clip": 1.05325472, "balance_loss_mlp": 1.03863478, "epoch": 0.135277318502931, "flos": 24790931752320.0, "grad_norm": 1.7403695434994237, "language_loss": 0.84457541, "learning_rate": 3.884859003154862e-06, "loss": 0.86642522, "num_input_tokens_seen": 48671565, "step": 2250, "time_per_iteration": 2.789350986480713 }, { "auxiliary_loss_clip": 0.01155547, "auxiliary_loss_mlp": 0.0105348, "balance_loss_clip": 1.05310512, "balance_loss_mlp": 1.03243995, "epoch": 0.13533744175559898, "flos": 21908525990400.0, "grad_norm": 3.018154510939524, "language_loss": 0.81796515, "learning_rate": 3.884728729525524e-06, "loss": 0.84005541, "num_input_tokens_seen": 48690425, "step": 2251, "time_per_iteration": 2.685617208480835 }, { "auxiliary_loss_clip": 0.01165433, "auxiliary_loss_mlp": 0.01060257, "balance_loss_clip": 1.05235004, "balance_loss_mlp": 1.03888273, "epoch": 0.13539756500826694, "flos": 21211643249280.0, "grad_norm": 1.7680273527580506, "language_loss": 0.86173487, "learning_rate": 3.884598384427084e-06, "loss": 0.88399172, "num_input_tokens_seen": 48707505, "step": 2252, "time_per_iteration": 2.597219467163086 }, { "auxiliary_loss_clip": 0.01052296, "auxiliary_loss_mlp": 0.01018557, "balance_loss_clip": 1.02446079, "balance_loss_mlp": 1.01632786, "epoch": 0.1354576882609349, "flos": 63242103634560.0, "grad_norm": 0.8028920055572067, "language_loss": 0.61837333, "learning_rate": 3.884467967864485e-06, "loss": 0.6390819, "num_input_tokens_seen": 48775895, "step": 2253, "time_per_iteration": 3.25115704536438 }, { "auxiliary_loss_clip": 0.01155107, "auxiliary_loss_mlp": 0.01055639, "balance_loss_clip": 1.0539906, "balance_loss_mlp": 1.03587449, "epoch": 0.1355178115136029, "flos": 25483037984640.0, "grad_norm": 1.6376691715964824, "language_loss": 0.89441288, "learning_rate": 3.884337479842671e-06, "loss": 0.91652036, "num_input_tokens_seen": 48798370, "step": 2254, "time_per_iteration": 2.6803932189941406 }, { "auxiliary_loss_clip": 0.01131786, "auxiliary_loss_mlp": 0.01063066, "balance_loss_clip": 1.04506016, "balance_loss_mlp": 1.03872383, "epoch": 0.13557793476627086, "flos": 21616967295360.0, "grad_norm": 2.1104776784573787, "language_loss": 0.84626925, "learning_rate": 3.884206920366591e-06, "loss": 0.86821771, "num_input_tokens_seen": 48817955, "step": 2255, "time_per_iteration": 2.7074074745178223 }, { "auxiliary_loss_clip": 0.01165481, "auxiliary_loss_mlp": 0.01058458, "balance_loss_clip": 1.05211091, "balance_loss_mlp": 1.03767991, "epoch": 0.13563805801893883, "flos": 24928253447040.0, "grad_norm": 4.791676738707355, "language_loss": 0.74684238, "learning_rate": 3.884076289441196e-06, "loss": 0.76908177, "num_input_tokens_seen": 48836330, "step": 2256, "time_per_iteration": 2.590178966522217 }, { "auxiliary_loss_clip": 0.01127027, "auxiliary_loss_mlp": 0.01054317, "balance_loss_clip": 1.04977024, "balance_loss_mlp": 1.03338361, "epoch": 0.1356981812716068, "flos": 14750272206720.0, "grad_norm": 5.890843360804152, "language_loss": 0.8309083, "learning_rate": 3.88394558707144e-06, "loss": 0.85272169, "num_input_tokens_seen": 48851890, "step": 2257, "time_per_iteration": 2.642096519470215 }, { "auxiliary_loss_clip": 0.0114984, "auxiliary_loss_mlp": 0.00780177, "balance_loss_clip": 1.05128407, "balance_loss_mlp": 1.00013828, "epoch": 0.13575830452427476, "flos": 11108571822720.0, "grad_norm": 2.1957250492246505, "language_loss": 0.82045269, "learning_rate": 3.883814813262277e-06, "loss": 0.83975297, "num_input_tokens_seen": 48865510, "step": 2258, "time_per_iteration": 2.6279473304748535 }, { "auxiliary_loss_clip": 0.01155515, "auxiliary_loss_mlp": 0.01054519, "balance_loss_clip": 1.05172098, "balance_loss_mlp": 1.03152323, "epoch": 0.13581842777694272, "flos": 17960290940160.0, "grad_norm": 2.6364031487830464, "language_loss": 0.82694167, "learning_rate": 3.883683968018669e-06, "loss": 0.849042, "num_input_tokens_seen": 48882360, "step": 2259, "time_per_iteration": 2.677804708480835 }, { "auxiliary_loss_clip": 0.01127201, "auxiliary_loss_mlp": 0.01054646, "balance_loss_clip": 1.0495683, "balance_loss_mlp": 1.03547728, "epoch": 0.1358785510296107, "flos": 22857142222080.0, "grad_norm": 2.0790748617118853, "language_loss": 0.73916006, "learning_rate": 3.8835530513455755e-06, "loss": 0.76097858, "num_input_tokens_seen": 48902700, "step": 2260, "time_per_iteration": 2.7416799068450928 }, { "auxiliary_loss_clip": 0.01144177, "auxiliary_loss_mlp": 0.01056881, "balance_loss_clip": 1.05196047, "balance_loss_mlp": 1.03691387, "epoch": 0.13593867428227868, "flos": 25739404329600.0, "grad_norm": 3.546593987683097, "language_loss": 0.74799728, "learning_rate": 3.883422063247961e-06, "loss": 0.77000785, "num_input_tokens_seen": 48922525, "step": 2261, "time_per_iteration": 2.675342559814453 }, { "auxiliary_loss_clip": 0.01170469, "auxiliary_loss_mlp": 0.01050986, "balance_loss_clip": 1.05486035, "balance_loss_mlp": 1.03043413, "epoch": 0.13599879753494665, "flos": 31249214225280.0, "grad_norm": 2.967396076139427, "language_loss": 0.63602281, "learning_rate": 3.883291003730794e-06, "loss": 0.65823734, "num_input_tokens_seen": 48942510, "step": 2262, "time_per_iteration": 2.660538911819458 }, { "auxiliary_loss_clip": 0.01148004, "auxiliary_loss_mlp": 0.01052118, "balance_loss_clip": 1.0516696, "balance_loss_mlp": 1.03216195, "epoch": 0.1360589207876146, "flos": 23915034604800.0, "grad_norm": 2.301949377353301, "language_loss": 0.81810403, "learning_rate": 3.883159872799043e-06, "loss": 0.84010524, "num_input_tokens_seen": 48962625, "step": 2263, "time_per_iteration": 2.840043783187866 }, { "auxiliary_loss_clip": 0.01098888, "auxiliary_loss_mlp": 0.01064302, "balance_loss_clip": 1.04875195, "balance_loss_mlp": 1.0410558, "epoch": 0.13611904404028258, "flos": 19974197756160.0, "grad_norm": 1.7561035968690553, "language_loss": 0.87737143, "learning_rate": 3.8830286704576815e-06, "loss": 0.89900339, "num_input_tokens_seen": 48982525, "step": 2264, "time_per_iteration": 2.784648895263672 }, { "auxiliary_loss_clip": 0.01157618, "auxiliary_loss_mlp": 0.01049521, "balance_loss_clip": 1.05161715, "balance_loss_mlp": 1.02709746, "epoch": 0.13617916729295054, "flos": 15340644144000.0, "grad_norm": 3.151792845640157, "language_loss": 0.7115528, "learning_rate": 3.882897396711683e-06, "loss": 0.7336241, "num_input_tokens_seen": 48997605, "step": 2265, "time_per_iteration": 2.6108245849609375 }, { "auxiliary_loss_clip": 0.01111831, "auxiliary_loss_mlp": 0.01042545, "balance_loss_clip": 1.05199265, "balance_loss_mlp": 1.02256525, "epoch": 0.1362392905456185, "flos": 27451445247360.0, "grad_norm": 4.918827494175735, "language_loss": 0.6671263, "learning_rate": 3.882766051566027e-06, "loss": 0.68867004, "num_input_tokens_seen": 49018535, "step": 2266, "time_per_iteration": 2.7810373306274414 }, { "auxiliary_loss_clip": 0.01127539, "auxiliary_loss_mlp": 0.01057589, "balance_loss_clip": 1.05683684, "balance_loss_mlp": 1.03739524, "epoch": 0.1362994137982865, "flos": 25009017177600.0, "grad_norm": 1.707924588861666, "language_loss": 0.7634865, "learning_rate": 3.882634635025694e-06, "loss": 0.78533769, "num_input_tokens_seen": 49038865, "step": 2267, "time_per_iteration": 2.7682721614837646 }, { "auxiliary_loss_clip": 0.01133448, "auxiliary_loss_mlp": 0.01048207, "balance_loss_clip": 1.04668903, "balance_loss_mlp": 1.02641535, "epoch": 0.13635953705095447, "flos": 20303031790080.0, "grad_norm": 2.9531688260339934, "language_loss": 0.81653506, "learning_rate": 3.882503147095667e-06, "loss": 0.83835161, "num_input_tokens_seen": 49058010, "step": 2268, "time_per_iteration": 2.645081043243408 }, { "auxiliary_loss_clip": 0.01155147, "auxiliary_loss_mlp": 0.01048448, "balance_loss_clip": 1.05424881, "balance_loss_mlp": 1.02738333, "epoch": 0.13641966030362243, "flos": 31358418549120.0, "grad_norm": 1.9923150848418427, "language_loss": 0.75975174, "learning_rate": 3.882371587780931e-06, "loss": 0.78178769, "num_input_tokens_seen": 49080330, "step": 2269, "time_per_iteration": 2.6764814853668213 }, { "auxiliary_loss_clip": 0.0113465, "auxiliary_loss_mlp": 0.01049702, "balance_loss_clip": 1.04941857, "balance_loss_mlp": 1.02844727, "epoch": 0.1364797835562904, "flos": 20478095700480.0, "grad_norm": 2.1475090354855473, "language_loss": 0.81328762, "learning_rate": 3.882239957086477e-06, "loss": 0.83513117, "num_input_tokens_seen": 49097035, "step": 2270, "time_per_iteration": 2.6801655292510986 }, { "auxiliary_loss_clip": 0.01142111, "auxiliary_loss_mlp": 0.010594, "balance_loss_clip": 1.04989171, "balance_loss_mlp": 1.03773928, "epoch": 0.13653990680895836, "flos": 13078343802240.0, "grad_norm": 3.2227070482893976, "language_loss": 0.75812757, "learning_rate": 3.882108255017295e-06, "loss": 0.78014266, "num_input_tokens_seen": 49113945, "step": 2271, "time_per_iteration": 4.197805166244507 }, { "auxiliary_loss_clip": 0.01156913, "auxiliary_loss_mlp": 0.01061846, "balance_loss_clip": 1.05097795, "balance_loss_mlp": 1.03921962, "epoch": 0.13660003006162633, "flos": 16946712961920.0, "grad_norm": 2.2800716885469754, "language_loss": 0.80251753, "learning_rate": 3.881976481578379e-06, "loss": 0.82470512, "num_input_tokens_seen": 49132855, "step": 2272, "time_per_iteration": 4.1461029052734375 }, { "auxiliary_loss_clip": 0.01055091, "auxiliary_loss_mlp": 0.01042701, "balance_loss_clip": 1.02539539, "balance_loss_mlp": 1.04001904, "epoch": 0.1366601533142943, "flos": 68682749892480.0, "grad_norm": 0.7097054685047118, "language_loss": 0.60739923, "learning_rate": 3.8818446367747255e-06, "loss": 0.62837708, "num_input_tokens_seen": 49198310, "step": 2273, "time_per_iteration": 4.731219530105591 }, { "auxiliary_loss_clip": 0.01165514, "auxiliary_loss_mlp": 0.00780474, "balance_loss_clip": 1.0523783, "balance_loss_mlp": 1.00008452, "epoch": 0.13672027656696228, "flos": 19244241567360.0, "grad_norm": 2.4844725334882583, "language_loss": 0.77506429, "learning_rate": 3.881712720611336e-06, "loss": 0.79452413, "num_input_tokens_seen": 49217250, "step": 2274, "time_per_iteration": 2.7122738361358643 }, { "auxiliary_loss_clip": 0.01154937, "auxiliary_loss_mlp": 0.01054542, "balance_loss_clip": 1.05082417, "balance_loss_mlp": 1.03271496, "epoch": 0.13678039981963025, "flos": 24534924543360.0, "grad_norm": 2.391437383339344, "language_loss": 0.78256011, "learning_rate": 3.881580733093211e-06, "loss": 0.8046549, "num_input_tokens_seen": 49236615, "step": 2275, "time_per_iteration": 2.6674444675445557 }, { "auxiliary_loss_clip": 0.01154585, "auxiliary_loss_mlp": 0.01044634, "balance_loss_clip": 1.05220842, "balance_loss_mlp": 1.02449977, "epoch": 0.13684052307229821, "flos": 15669334523520.0, "grad_norm": 2.271072834476717, "language_loss": 0.81682789, "learning_rate": 3.881448674225356e-06, "loss": 0.83882004, "num_input_tokens_seen": 49253935, "step": 2276, "time_per_iteration": 4.202202558517456 }, { "auxiliary_loss_clip": 0.01164941, "auxiliary_loss_mlp": 0.01060078, "balance_loss_clip": 1.05228245, "balance_loss_mlp": 1.03604531, "epoch": 0.13690064632496618, "flos": 28364689560960.0, "grad_norm": 5.063053962589045, "language_loss": 0.69948691, "learning_rate": 3.881316544012779e-06, "loss": 0.72173715, "num_input_tokens_seen": 49273605, "step": 2277, "time_per_iteration": 2.708591938018799 }, { "auxiliary_loss_clip": 0.01160044, "auxiliary_loss_mlp": 0.00780297, "balance_loss_clip": 1.05169702, "balance_loss_mlp": 1.00017083, "epoch": 0.13696076957763414, "flos": 23404779953280.0, "grad_norm": 2.062701620585305, "language_loss": 0.80197465, "learning_rate": 3.88118434246049e-06, "loss": 0.82137805, "num_input_tokens_seen": 49291785, "step": 2278, "time_per_iteration": 2.6916158199310303 }, { "auxiliary_loss_clip": 0.01159146, "auxiliary_loss_mlp": 0.01060686, "balance_loss_clip": 1.05954766, "balance_loss_mlp": 1.03925228, "epoch": 0.1370208928303021, "flos": 37196595601920.0, "grad_norm": 7.088344486179519, "language_loss": 0.75048816, "learning_rate": 3.881052069573502e-06, "loss": 0.77268648, "num_input_tokens_seen": 49311405, "step": 2279, "time_per_iteration": 2.7316977977752686 }, { "auxiliary_loss_clip": 0.01101952, "auxiliary_loss_mlp": 0.01066685, "balance_loss_clip": 1.04605758, "balance_loss_mlp": 1.04485774, "epoch": 0.13708101608297008, "flos": 26976311118720.0, "grad_norm": 2.5293116992138223, "language_loss": 0.76743513, "learning_rate": 3.880919725356831e-06, "loss": 0.78912151, "num_input_tokens_seen": 49331835, "step": 2280, "time_per_iteration": 2.813720941543579 }, { "auxiliary_loss_clip": 0.01108594, "auxiliary_loss_mlp": 0.01060805, "balance_loss_clip": 1.04457331, "balance_loss_mlp": 1.04022956, "epoch": 0.13714113933563807, "flos": 32556864850560.0, "grad_norm": 2.0597640944890325, "language_loss": 0.79657966, "learning_rate": 3.880787309815496e-06, "loss": 0.81827366, "num_input_tokens_seen": 49352290, "step": 2281, "time_per_iteration": 2.8325345516204834 }, { "auxiliary_loss_clip": 0.0117656, "auxiliary_loss_mlp": 0.0107773, "balance_loss_clip": 1.05715084, "balance_loss_mlp": 1.05671358, "epoch": 0.13720126258830603, "flos": 16101267569280.0, "grad_norm": 2.0769142230572877, "language_loss": 0.83383757, "learning_rate": 3.880654822954518e-06, "loss": 0.85638046, "num_input_tokens_seen": 49370285, "step": 2282, "time_per_iteration": 2.5988755226135254 }, { "auxiliary_loss_clip": 0.01142098, "auxiliary_loss_mlp": 0.01075909, "balance_loss_clip": 1.04898703, "balance_loss_mlp": 1.05583453, "epoch": 0.137261385840974, "flos": 18953544798720.0, "grad_norm": 1.5269487193470777, "language_loss": 0.73526621, "learning_rate": 3.8805222647789195e-06, "loss": 0.75744629, "num_input_tokens_seen": 49389610, "step": 2283, "time_per_iteration": 2.7099714279174805 }, { "auxiliary_loss_clip": 0.01160178, "auxiliary_loss_mlp": 0.01062577, "balance_loss_clip": 1.05577087, "balance_loss_mlp": 1.04173923, "epoch": 0.13732150909364196, "flos": 23295360147840.0, "grad_norm": 2.2306012559941455, "language_loss": 0.83934438, "learning_rate": 3.880389635293729e-06, "loss": 0.86157191, "num_input_tokens_seen": 49408390, "step": 2284, "time_per_iteration": 2.7315831184387207 }, { "auxiliary_loss_clip": 0.01151427, "auxiliary_loss_mlp": 0.01070288, "balance_loss_clip": 1.05204272, "balance_loss_mlp": 1.04779351, "epoch": 0.13738163234630993, "flos": 29351263489920.0, "grad_norm": 2.0900141273659223, "language_loss": 0.7557056, "learning_rate": 3.880256934503974e-06, "loss": 0.77792281, "num_input_tokens_seen": 49427725, "step": 2285, "time_per_iteration": 2.7257747650146484 }, { "auxiliary_loss_clip": 0.01144078, "auxiliary_loss_mlp": 0.01064539, "balance_loss_clip": 1.05233073, "balance_loss_mlp": 1.04392731, "epoch": 0.1374417555989779, "flos": 26651319840000.0, "grad_norm": 2.727019945657865, "language_loss": 0.74521589, "learning_rate": 3.880124162414689e-06, "loss": 0.76730204, "num_input_tokens_seen": 49449000, "step": 2286, "time_per_iteration": 2.742582082748413 }, { "auxiliary_loss_clip": 0.0112541, "auxiliary_loss_mlp": 0.01059198, "balance_loss_clip": 1.04906356, "balance_loss_mlp": 1.03659606, "epoch": 0.1375018788516459, "flos": 28403401443840.0, "grad_norm": 2.2168449035378357, "language_loss": 0.86683542, "learning_rate": 3.879991319030908e-06, "loss": 0.88868147, "num_input_tokens_seen": 49468360, "step": 2287, "time_per_iteration": 2.802088499069214 }, { "auxiliary_loss_clip": 0.01124712, "auxiliary_loss_mlp": 0.01064517, "balance_loss_clip": 1.04803944, "balance_loss_mlp": 1.04207003, "epoch": 0.13756200210431385, "flos": 37413783187200.0, "grad_norm": 2.0592152854463106, "language_loss": 0.68410838, "learning_rate": 3.879858404357666e-06, "loss": 0.70600063, "num_input_tokens_seen": 49493450, "step": 2288, "time_per_iteration": 2.861175537109375 }, { "auxiliary_loss_clip": 0.01112106, "auxiliary_loss_mlp": 0.01071262, "balance_loss_clip": 1.05062151, "balance_loss_mlp": 1.04666936, "epoch": 0.13762212535698182, "flos": 22711021695360.0, "grad_norm": 2.3933568244149357, "language_loss": 0.87090456, "learning_rate": 3.879725418400005e-06, "loss": 0.89273822, "num_input_tokens_seen": 49511220, "step": 2289, "time_per_iteration": 2.7185773849487305 }, { "auxiliary_loss_clip": 0.01130193, "auxiliary_loss_mlp": 0.00781167, "balance_loss_clip": 1.0480957, "balance_loss_mlp": 1.00019848, "epoch": 0.13768224860964978, "flos": 23952130375680.0, "grad_norm": 1.8106848287624444, "language_loss": 0.74668044, "learning_rate": 3.879592361162969e-06, "loss": 0.76579404, "num_input_tokens_seen": 49529820, "step": 2290, "time_per_iteration": 2.6751222610473633 }, { "auxiliary_loss_clip": 0.01039657, "auxiliary_loss_mlp": 0.01081332, "balance_loss_clip": 1.03094769, "balance_loss_mlp": 1.07881641, "epoch": 0.13774237186231775, "flos": 63590438753280.0, "grad_norm": 0.7179159366671727, "language_loss": 0.51597112, "learning_rate": 3.8794592326516015e-06, "loss": 0.53718102, "num_input_tokens_seen": 49595325, "step": 2291, "time_per_iteration": 3.2823359966278076 }, { "auxiliary_loss_clip": 0.01157406, "auxiliary_loss_mlp": 0.01052846, "balance_loss_clip": 1.05224037, "balance_loss_mlp": 1.03123331, "epoch": 0.1378024951149857, "flos": 24279456038400.0, "grad_norm": 1.9326408617769533, "language_loss": 0.71273667, "learning_rate": 3.879326032870952e-06, "loss": 0.7348392, "num_input_tokens_seen": 49615850, "step": 2292, "time_per_iteration": 2.74045729637146 }, { "auxiliary_loss_clip": 0.01156871, "auxiliary_loss_mlp": 0.01049315, "balance_loss_clip": 1.05427122, "balance_loss_mlp": 1.02931166, "epoch": 0.13786261836765368, "flos": 14021537080320.0, "grad_norm": 6.592759889378346, "language_loss": 0.8047784, "learning_rate": 3.879192761826071e-06, "loss": 0.82684022, "num_input_tokens_seen": 49631860, "step": 2293, "time_per_iteration": 2.587576389312744 }, { "auxiliary_loss_clip": 0.0115787, "auxiliary_loss_mlp": 0.0104972, "balance_loss_clip": 1.0554558, "balance_loss_mlp": 1.02921653, "epoch": 0.13792274162032167, "flos": 28878679226880.0, "grad_norm": 1.9082895606463517, "language_loss": 0.78440171, "learning_rate": 3.879059419522011e-06, "loss": 0.80647767, "num_input_tokens_seen": 49652145, "step": 2294, "time_per_iteration": 2.7152793407440186 }, { "auxiliary_loss_clip": 0.01126374, "auxiliary_loss_mlp": 0.01050648, "balance_loss_clip": 1.05281758, "balance_loss_mlp": 1.03104973, "epoch": 0.13798286487298964, "flos": 21141150808320.0, "grad_norm": 1.991103290125302, "language_loss": 0.80339509, "learning_rate": 3.878926005963831e-06, "loss": 0.82516527, "num_input_tokens_seen": 49669880, "step": 2295, "time_per_iteration": 2.7026021480560303 }, { "auxiliary_loss_clip": 0.01154693, "auxiliary_loss_mlp": 0.01052186, "balance_loss_clip": 1.05239046, "balance_loss_mlp": 1.03102624, "epoch": 0.1380429881256576, "flos": 22487477402880.0, "grad_norm": 1.7450624966187134, "language_loss": 0.78661883, "learning_rate": 3.878792521156588e-06, "loss": 0.80868757, "num_input_tokens_seen": 49687255, "step": 2296, "time_per_iteration": 2.566929340362549 }, { "auxiliary_loss_clip": 0.01153425, "auxiliary_loss_mlp": 0.01069343, "balance_loss_clip": 1.05437231, "balance_loss_mlp": 1.04811132, "epoch": 0.13810311137832557, "flos": 21393674398080.0, "grad_norm": 1.7434096141785573, "language_loss": 0.78663194, "learning_rate": 3.8786589651053446e-06, "loss": 0.80885959, "num_input_tokens_seen": 49706650, "step": 2297, "time_per_iteration": 2.6254489421844482 }, { "auxiliary_loss_clip": 0.01110905, "auxiliary_loss_mlp": 0.01059754, "balance_loss_clip": 1.05296302, "balance_loss_mlp": 1.03871369, "epoch": 0.13816323463099353, "flos": 25989844930560.0, "grad_norm": 1.929043788877404, "language_loss": 0.69199705, "learning_rate": 3.878525337815164e-06, "loss": 0.71370363, "num_input_tokens_seen": 49725715, "step": 2298, "time_per_iteration": 2.791301965713501 }, { "auxiliary_loss_clip": 0.01137772, "auxiliary_loss_mlp": 0.01061768, "balance_loss_clip": 1.0517292, "balance_loss_mlp": 1.04059684, "epoch": 0.1382233578836615, "flos": 19244313394560.0, "grad_norm": 1.7910922430646712, "language_loss": 0.86382294, "learning_rate": 3.878391639291116e-06, "loss": 0.88581836, "num_input_tokens_seen": 49744710, "step": 2299, "time_per_iteration": 2.6075453758239746 }, { "auxiliary_loss_clip": 0.01166817, "auxiliary_loss_mlp": 0.01054863, "balance_loss_clip": 1.05378175, "balance_loss_mlp": 1.03292871, "epoch": 0.1382834811363295, "flos": 25666290195840.0, "grad_norm": 2.2378660690879606, "language_loss": 0.75468475, "learning_rate": 3.878257869538267e-06, "loss": 0.77690154, "num_input_tokens_seen": 49764300, "step": 2300, "time_per_iteration": 2.663328170776367 }, { "auxiliary_loss_clip": 0.01130608, "auxiliary_loss_mlp": 0.01047248, "balance_loss_clip": 1.05274105, "balance_loss_mlp": 1.02664876, "epoch": 0.13834360438899745, "flos": 19784193788160.0, "grad_norm": 2.5571861214345963, "language_loss": 0.82463622, "learning_rate": 3.878124028561692e-06, "loss": 0.8464148, "num_input_tokens_seen": 49778380, "step": 2301, "time_per_iteration": 2.6705129146575928 }, { "auxiliary_loss_clip": 0.0113862, "auxiliary_loss_mlp": 0.00777879, "balance_loss_clip": 1.05323792, "balance_loss_mlp": 1.00021625, "epoch": 0.13840372764166542, "flos": 26651858544000.0, "grad_norm": 1.9612043619218924, "language_loss": 0.85957694, "learning_rate": 3.877990116366466e-06, "loss": 0.87874192, "num_input_tokens_seen": 49797460, "step": 2302, "time_per_iteration": 2.679797410964966 }, { "auxiliary_loss_clip": 0.01059341, "auxiliary_loss_mlp": 0.01025212, "balance_loss_clip": 1.03226125, "balance_loss_mlp": 1.02244604, "epoch": 0.13846385089433338, "flos": 70510998286080.0, "grad_norm": 0.7598813547967705, "language_loss": 0.65591633, "learning_rate": 3.877856132957667e-06, "loss": 0.67676187, "num_input_tokens_seen": 49868005, "step": 2303, "time_per_iteration": 3.3249399662017822 }, { "auxiliary_loss_clip": 0.01151443, "auxiliary_loss_mlp": 0.01046478, "balance_loss_clip": 1.05337632, "balance_loss_mlp": 1.02655792, "epoch": 0.13852397414700135, "flos": 17348732956800.0, "grad_norm": 3.141207945865242, "language_loss": 0.78663635, "learning_rate": 3.877722078340374e-06, "loss": 0.80861557, "num_input_tokens_seen": 49885825, "step": 2304, "time_per_iteration": 2.7364001274108887 }, { "auxiliary_loss_clip": 0.01157514, "auxiliary_loss_mlp": 0.01043253, "balance_loss_clip": 1.05566275, "balance_loss_mlp": 1.02385736, "epoch": 0.13858409739966931, "flos": 21543781334400.0, "grad_norm": 1.7487365854034607, "language_loss": 0.77559888, "learning_rate": 3.877587952519672e-06, "loss": 0.79760659, "num_input_tokens_seen": 49905975, "step": 2305, "time_per_iteration": 2.7814202308654785 }, { "auxiliary_loss_clip": 0.01074766, "auxiliary_loss_mlp": 0.01055718, "balance_loss_clip": 1.04160607, "balance_loss_mlp": 1.03473723, "epoch": 0.13864422065233728, "flos": 21579907438080.0, "grad_norm": 1.8207477060355044, "language_loss": 0.87737936, "learning_rate": 3.877453755500647e-06, "loss": 0.89868426, "num_input_tokens_seen": 49925800, "step": 2306, "time_per_iteration": 2.917616605758667 }, { "auxiliary_loss_clip": 0.01064826, "auxiliary_loss_mlp": 0.0101208, "balance_loss_clip": 1.02692199, "balance_loss_mlp": 1.0094099, "epoch": 0.13870434390500527, "flos": 53371156872960.0, "grad_norm": 0.8728538231155298, "language_loss": 0.59008431, "learning_rate": 3.877319487288387e-06, "loss": 0.61085337, "num_input_tokens_seen": 49977620, "step": 2307, "time_per_iteration": 3.4345149993896484 }, { "auxiliary_loss_clip": 0.01169624, "auxiliary_loss_mlp": 0.00778134, "balance_loss_clip": 1.05528641, "balance_loss_mlp": 1.00021303, "epoch": 0.13876446715767324, "flos": 22565906749440.0, "grad_norm": 1.8467673932802395, "language_loss": 0.79483795, "learning_rate": 3.877185147887984e-06, "loss": 0.81431556, "num_input_tokens_seen": 49996650, "step": 2308, "time_per_iteration": 2.7137296199798584 }, { "auxiliary_loss_clip": 0.01131024, "auxiliary_loss_mlp": 0.01050332, "balance_loss_clip": 1.05118585, "balance_loss_mlp": 1.03054297, "epoch": 0.1388245904103412, "flos": 20705231352960.0, "grad_norm": 2.352128383160346, "language_loss": 0.78101134, "learning_rate": 3.877050737304533e-06, "loss": 0.80282485, "num_input_tokens_seen": 50015640, "step": 2309, "time_per_iteration": 2.9259471893310547 }, { "auxiliary_loss_clip": 0.01128109, "auxiliary_loss_mlp": 0.01057348, "balance_loss_clip": 1.04979932, "balance_loss_mlp": 1.03620028, "epoch": 0.13888471366300917, "flos": 20554729367040.0, "grad_norm": 3.914796791761399, "language_loss": 0.68133545, "learning_rate": 3.876916255543129e-06, "loss": 0.70318997, "num_input_tokens_seen": 50033500, "step": 2310, "time_per_iteration": 4.27877140045166 }, { "auxiliary_loss_clip": 0.01164985, "auxiliary_loss_mlp": 0.01062516, "balance_loss_clip": 1.05356944, "balance_loss_mlp": 1.04021168, "epoch": 0.13894483691567713, "flos": 13838033473920.0, "grad_norm": 1.934954545600412, "language_loss": 0.84295756, "learning_rate": 3.8767817026088725e-06, "loss": 0.86523259, "num_input_tokens_seen": 50050075, "step": 2311, "time_per_iteration": 2.5612359046936035 }, { "auxiliary_loss_clip": 0.01173749, "auxiliary_loss_mlp": 0.01055474, "balance_loss_clip": 1.05752683, "balance_loss_mlp": 1.0350771, "epoch": 0.1390049601683451, "flos": 28031186759040.0, "grad_norm": 2.9213009430481143, "language_loss": 0.82358992, "learning_rate": 3.876647078506866e-06, "loss": 0.84588212, "num_input_tokens_seen": 50070080, "step": 2312, "time_per_iteration": 5.737139701843262 }, { "auxiliary_loss_clip": 0.01129781, "auxiliary_loss_mlp": 0.00778347, "balance_loss_clip": 1.05464363, "balance_loss_mlp": 1.00023031, "epoch": 0.13906508342101306, "flos": 26756860976640.0, "grad_norm": 2.109799495913242, "language_loss": 0.86732674, "learning_rate": 3.876512383242215e-06, "loss": 0.88640809, "num_input_tokens_seen": 50090040, "step": 2313, "time_per_iteration": 2.8402304649353027 }, { "auxiliary_loss_clip": 0.01168088, "auxiliary_loss_mlp": 0.01061738, "balance_loss_clip": 1.05670547, "balance_loss_mlp": 1.04115057, "epoch": 0.13912520667368106, "flos": 24535104111360.0, "grad_norm": 1.784990717237318, "language_loss": 0.79935932, "learning_rate": 3.876377616820024e-06, "loss": 0.8216576, "num_input_tokens_seen": 50110595, "step": 2314, "time_per_iteration": 2.683448076248169 }, { "auxiliary_loss_clip": 0.01124732, "auxiliary_loss_mlp": 0.01061041, "balance_loss_clip": 1.04845023, "balance_loss_mlp": 1.04103708, "epoch": 0.13918532992634902, "flos": 19383215287680.0, "grad_norm": 2.585875079553688, "language_loss": 0.85367405, "learning_rate": 3.876242779245409e-06, "loss": 0.87553179, "num_input_tokens_seen": 50125430, "step": 2315, "time_per_iteration": 4.332594394683838 }, { "auxiliary_loss_clip": 0.01156122, "auxiliary_loss_mlp": 0.01058532, "balance_loss_clip": 1.05397022, "balance_loss_mlp": 1.0372889, "epoch": 0.139245453179017, "flos": 21323756574720.0, "grad_norm": 2.333331492160627, "language_loss": 0.77170396, "learning_rate": 3.876107870523477e-06, "loss": 0.79385042, "num_input_tokens_seen": 50144120, "step": 2316, "time_per_iteration": 2.654604911804199 }, { "auxiliary_loss_clip": 0.01163967, "auxiliary_loss_mlp": 0.00780027, "balance_loss_clip": 1.05353916, "balance_loss_mlp": 1.00024533, "epoch": 0.13930557643168495, "flos": 19500607912320.0, "grad_norm": 2.1485284032262086, "language_loss": 0.76820493, "learning_rate": 3.875972890659349e-06, "loss": 0.78764486, "num_input_tokens_seen": 50162500, "step": 2317, "time_per_iteration": 2.6501235961914062 }, { "auxiliary_loss_clip": 0.01144052, "auxiliary_loss_mlp": 0.01061042, "balance_loss_clip": 1.05156648, "balance_loss_mlp": 1.04074025, "epoch": 0.13936569968435292, "flos": 25410821690880.0, "grad_norm": 1.7797832869421444, "language_loss": 0.80185997, "learning_rate": 3.875837839658139e-06, "loss": 0.82391089, "num_input_tokens_seen": 50182415, "step": 2318, "time_per_iteration": 2.7097995281219482 }, { "auxiliary_loss_clip": 0.01049096, "auxiliary_loss_mlp": 0.01048478, "balance_loss_clip": 1.03358936, "balance_loss_mlp": 1.04518783, "epoch": 0.13942582293702088, "flos": 70771063731840.0, "grad_norm": 0.854553938374386, "language_loss": 0.59004617, "learning_rate": 3.87570271752497e-06, "loss": 0.61102188, "num_input_tokens_seen": 50245160, "step": 2319, "time_per_iteration": 3.2631640434265137 }, { "auxiliary_loss_clip": 0.0111484, "auxiliary_loss_mlp": 0.01055367, "balance_loss_clip": 1.04508984, "balance_loss_mlp": 1.03437412, "epoch": 0.13948594618968888, "flos": 35590885920000.0, "grad_norm": 2.3313836691947722, "language_loss": 0.64993447, "learning_rate": 3.875567524264967e-06, "loss": 0.67163646, "num_input_tokens_seen": 50268215, "step": 2320, "time_per_iteration": 2.8668782711029053 }, { "auxiliary_loss_clip": 0.01096421, "auxiliary_loss_mlp": 0.01056652, "balance_loss_clip": 1.04400086, "balance_loss_mlp": 1.03521848, "epoch": 0.13954606944235684, "flos": 21105204272640.0, "grad_norm": 2.285151015895421, "language_loss": 0.70708811, "learning_rate": 3.875432259883256e-06, "loss": 0.72861886, "num_input_tokens_seen": 50288575, "step": 2321, "time_per_iteration": 2.8273603916168213 }, { "auxiliary_loss_clip": 0.01117698, "auxiliary_loss_mlp": 0.01061754, "balance_loss_clip": 1.04603076, "balance_loss_mlp": 1.03698206, "epoch": 0.1396061926950248, "flos": 25044425009280.0, "grad_norm": 1.7926270181208543, "language_loss": 0.85931206, "learning_rate": 3.875296924384965e-06, "loss": 0.88110662, "num_input_tokens_seen": 50308735, "step": 2322, "time_per_iteration": 2.833807945251465 }, { "auxiliary_loss_clip": 0.01120545, "auxiliary_loss_mlp": 0.01055036, "balance_loss_clip": 1.04616976, "balance_loss_mlp": 1.03568828, "epoch": 0.13966631594769277, "flos": 37634023428480.0, "grad_norm": 1.5963293576391182, "language_loss": 0.67159557, "learning_rate": 3.875161517775226e-06, "loss": 0.69335139, "num_input_tokens_seen": 50331025, "step": 2323, "time_per_iteration": 2.875265121459961 }, { "auxiliary_loss_clip": 0.01127992, "auxiliary_loss_mlp": 0.01055173, "balance_loss_clip": 1.04900301, "balance_loss_mlp": 1.03432369, "epoch": 0.13972643920036074, "flos": 16690993061760.0, "grad_norm": 2.0757452253793485, "language_loss": 0.88878977, "learning_rate": 3.875026040059175e-06, "loss": 0.9106214, "num_input_tokens_seen": 50349725, "step": 2324, "time_per_iteration": 2.6841063499450684 }, { "auxiliary_loss_clip": 0.01154799, "auxiliary_loss_mlp": 0.01056834, "balance_loss_clip": 1.05145955, "balance_loss_mlp": 1.03541231, "epoch": 0.1397865624530287, "flos": 23331055288320.0, "grad_norm": 2.8450589371660526, "language_loss": 0.70621002, "learning_rate": 3.8748904912419485e-06, "loss": 0.72832638, "num_input_tokens_seen": 50367965, "step": 2325, "time_per_iteration": 2.694218397140503 }, { "auxiliary_loss_clip": 0.01134393, "auxiliary_loss_mlp": 0.00778751, "balance_loss_clip": 1.05273592, "balance_loss_mlp": 1.00028229, "epoch": 0.13984668570569667, "flos": 22778317825920.0, "grad_norm": 2.230299294128946, "language_loss": 0.81657004, "learning_rate": 3.874754871328688e-06, "loss": 0.83570141, "num_input_tokens_seen": 50385605, "step": 2326, "time_per_iteration": 2.715306282043457 }, { "auxiliary_loss_clip": 0.01151297, "auxiliary_loss_mlp": 0.01045813, "balance_loss_clip": 1.05490732, "balance_loss_mlp": 1.02745473, "epoch": 0.13990680895836466, "flos": 19464553635840.0, "grad_norm": 1.729713540462037, "language_loss": 0.89241689, "learning_rate": 3.874619180324534e-06, "loss": 0.91438794, "num_input_tokens_seen": 50403985, "step": 2327, "time_per_iteration": 2.679626941680908 }, { "auxiliary_loss_clip": 0.01119996, "auxiliary_loss_mlp": 0.01057397, "balance_loss_clip": 1.04873121, "balance_loss_mlp": 1.0352242, "epoch": 0.13996693221103262, "flos": 20303283185280.0, "grad_norm": 2.9217951598838363, "language_loss": 0.84760427, "learning_rate": 3.874483418234632e-06, "loss": 0.86937821, "num_input_tokens_seen": 50421590, "step": 2328, "time_per_iteration": 2.7277352809906006 }, { "auxiliary_loss_clip": 0.01151775, "auxiliary_loss_mlp": 0.0104443, "balance_loss_clip": 1.05300856, "balance_loss_mlp": 1.02421176, "epoch": 0.1400270554637006, "flos": 26617707688320.0, "grad_norm": 1.6116398320348613, "language_loss": 0.73835862, "learning_rate": 3.874347585064131e-06, "loss": 0.76032066, "num_input_tokens_seen": 50443945, "step": 2329, "time_per_iteration": 2.6911025047302246 }, { "auxiliary_loss_clip": 0.01153137, "auxiliary_loss_mlp": 0.01046755, "balance_loss_clip": 1.05254042, "balance_loss_mlp": 1.02644169, "epoch": 0.14008717871636855, "flos": 19391475415680.0, "grad_norm": 2.565670250114109, "language_loss": 0.78373277, "learning_rate": 3.874211680818183e-06, "loss": 0.80573165, "num_input_tokens_seen": 50462065, "step": 2330, "time_per_iteration": 2.703225612640381 }, { "auxiliary_loss_clip": 0.01144455, "auxiliary_loss_mlp": 0.01046085, "balance_loss_clip": 1.05247569, "balance_loss_mlp": 1.02692819, "epoch": 0.14014730196903652, "flos": 15304266645120.0, "grad_norm": 2.2215524337864143, "language_loss": 0.72115719, "learning_rate": 3.87407570550194e-06, "loss": 0.74306256, "num_input_tokens_seen": 50479565, "step": 2331, "time_per_iteration": 2.7044217586517334 }, { "auxiliary_loss_clip": 0.01159691, "auxiliary_loss_mlp": 0.01051771, "balance_loss_clip": 1.0558939, "balance_loss_mlp": 1.03234017, "epoch": 0.14020742522170448, "flos": 14939701557120.0, "grad_norm": 1.5806705357110964, "language_loss": 0.72634697, "learning_rate": 3.873939659120557e-06, "loss": 0.7484616, "num_input_tokens_seen": 50497305, "step": 2332, "time_per_iteration": 2.647564649581909 }, { "auxiliary_loss_clip": 0.01063058, "auxiliary_loss_mlp": 0.01022564, "balance_loss_clip": 1.03391051, "balance_loss_mlp": 1.01944101, "epoch": 0.14026754847437245, "flos": 48824580044160.0, "grad_norm": 0.8445516092095569, "language_loss": 0.56185365, "learning_rate": 3.873803541679196e-06, "loss": 0.58270991, "num_input_tokens_seen": 50549735, "step": 2333, "time_per_iteration": 3.038390636444092 }, { "auxiliary_loss_clip": 0.01127793, "auxiliary_loss_mlp": 0.01045888, "balance_loss_clip": 1.05246043, "balance_loss_mlp": 1.02587318, "epoch": 0.14032767172704044, "flos": 25773267876480.0, "grad_norm": 1.7702774265545234, "language_loss": 0.82728767, "learning_rate": 3.873667353183016e-06, "loss": 0.84902453, "num_input_tokens_seen": 50570100, "step": 2334, "time_per_iteration": 2.7205803394317627 }, { "auxiliary_loss_clip": 0.01129244, "auxiliary_loss_mlp": 0.01044663, "balance_loss_clip": 1.05110407, "balance_loss_mlp": 1.02593565, "epoch": 0.1403877949797084, "flos": 21216312017280.0, "grad_norm": 1.7790720657464538, "language_loss": 0.80958998, "learning_rate": 3.8735310936371825e-06, "loss": 0.83132899, "num_input_tokens_seen": 50589185, "step": 2335, "time_per_iteration": 2.7844314575195312 }, { "auxiliary_loss_clip": 0.01108373, "auxiliary_loss_mlp": 0.0104374, "balance_loss_clip": 1.04802513, "balance_loss_mlp": 1.02160311, "epoch": 0.14044791823237637, "flos": 22747973811840.0, "grad_norm": 1.739505291070366, "language_loss": 0.81987065, "learning_rate": 3.873394763046862e-06, "loss": 0.84139174, "num_input_tokens_seen": 50609645, "step": 2336, "time_per_iteration": 2.7787351608276367 }, { "auxiliary_loss_clip": 0.01150445, "auxiliary_loss_mlp": 0.01046319, "balance_loss_clip": 1.05603921, "balance_loss_mlp": 1.02709103, "epoch": 0.14050804148504434, "flos": 22964443125120.0, "grad_norm": 1.7584048007565314, "language_loss": 0.80606967, "learning_rate": 3.873258361417225e-06, "loss": 0.82803738, "num_input_tokens_seen": 50628385, "step": 2337, "time_per_iteration": 2.6119275093078613 }, { "auxiliary_loss_clip": 0.01150898, "auxiliary_loss_mlp": 0.01051074, "balance_loss_clip": 1.05363941, "balance_loss_mlp": 1.03202438, "epoch": 0.1405681647377123, "flos": 22200336080640.0, "grad_norm": 2.383737065589604, "language_loss": 0.78994334, "learning_rate": 3.873121888753442e-06, "loss": 0.81196302, "num_input_tokens_seen": 50647260, "step": 2338, "time_per_iteration": 2.672427177429199 }, { "auxiliary_loss_clip": 0.01158377, "auxiliary_loss_mlp": 0.01050168, "balance_loss_clip": 1.05894089, "balance_loss_mlp": 1.02919865, "epoch": 0.14062828799038027, "flos": 23732787974400.0, "grad_norm": 2.117725014058833, "language_loss": 0.79766536, "learning_rate": 3.87298534506069e-06, "loss": 0.81975079, "num_input_tokens_seen": 50666130, "step": 2339, "time_per_iteration": 2.68635892868042 }, { "auxiliary_loss_clip": 0.01097095, "auxiliary_loss_mlp": 0.01065327, "balance_loss_clip": 1.04686952, "balance_loss_mlp": 1.04463232, "epoch": 0.14068841124304826, "flos": 39202493685120.0, "grad_norm": 2.0269377249156793, "language_loss": 0.65632963, "learning_rate": 3.872848730344146e-06, "loss": 0.67795384, "num_input_tokens_seen": 50687440, "step": 2340, "time_per_iteration": 2.9426286220550537 }, { "auxiliary_loss_clip": 0.0114865, "auxiliary_loss_mlp": 0.01050723, "balance_loss_clip": 1.05418086, "balance_loss_mlp": 1.0310297, "epoch": 0.14074853449571623, "flos": 20192283181440.0, "grad_norm": 2.8518792803213917, "language_loss": 0.78760445, "learning_rate": 3.87271204460899e-06, "loss": 0.80959821, "num_input_tokens_seen": 50704030, "step": 2341, "time_per_iteration": 2.8814899921417236 }, { "auxiliary_loss_clip": 0.01162758, "auxiliary_loss_mlp": 0.01057334, "balance_loss_clip": 1.0554986, "balance_loss_mlp": 1.03876162, "epoch": 0.1408086577483842, "flos": 18405871153920.0, "grad_norm": 2.2693198584224454, "language_loss": 0.80322361, "learning_rate": 3.8725752878604066e-06, "loss": 0.82542449, "num_input_tokens_seen": 50723305, "step": 2342, "time_per_iteration": 2.604814291000366 }, { "auxiliary_loss_clip": 0.01152048, "auxiliary_loss_mlp": 0.01056552, "balance_loss_clip": 1.05776191, "balance_loss_mlp": 1.03858757, "epoch": 0.14086878100105216, "flos": 25264593423360.0, "grad_norm": 2.4727499245104343, "language_loss": 0.77686632, "learning_rate": 3.87243846010358e-06, "loss": 0.79895234, "num_input_tokens_seen": 50743270, "step": 2343, "time_per_iteration": 2.676823854446411 }, { "auxiliary_loss_clip": 0.0105659, "auxiliary_loss_mlp": 0.01037584, "balance_loss_clip": 1.03650093, "balance_loss_mlp": 1.03438878, "epoch": 0.14092890425372012, "flos": 65978388869760.0, "grad_norm": 0.8521752699932517, "language_loss": 0.61553669, "learning_rate": 3.872301561343699e-06, "loss": 0.63647842, "num_input_tokens_seen": 50802710, "step": 2344, "time_per_iteration": 3.156792402267456 }, { "auxiliary_loss_clip": 0.01147637, "auxiliary_loss_mlp": 0.01049362, "balance_loss_clip": 1.05167484, "balance_loss_mlp": 1.03121877, "epoch": 0.1409890275063881, "flos": 23694973931520.0, "grad_norm": 1.558783678159347, "language_loss": 0.64331692, "learning_rate": 3.872164591585956e-06, "loss": 0.6652869, "num_input_tokens_seen": 50822625, "step": 2345, "time_per_iteration": 2.654100179672241 }, { "auxiliary_loss_clip": 0.01154879, "auxiliary_loss_mlp": 0.0104633, "balance_loss_clip": 1.05009735, "balance_loss_mlp": 1.02562308, "epoch": 0.14104915075905605, "flos": 23623152687360.0, "grad_norm": 2.26337760563351, "language_loss": 0.73892581, "learning_rate": 3.8720275508355435e-06, "loss": 0.76093793, "num_input_tokens_seen": 50842330, "step": 2346, "time_per_iteration": 2.7032830715179443 }, { "auxiliary_loss_clip": 0.0115447, "auxiliary_loss_mlp": 0.01048793, "balance_loss_clip": 1.0572027, "balance_loss_mlp": 1.02929008, "epoch": 0.14110927401172405, "flos": 20595165102720.0, "grad_norm": 1.7675181118684058, "language_loss": 0.7727294, "learning_rate": 3.8718904390976585e-06, "loss": 0.79476202, "num_input_tokens_seen": 50861035, "step": 2347, "time_per_iteration": 2.678647518157959 }, { "auxiliary_loss_clip": 0.01164131, "auxiliary_loss_mlp": 0.01052088, "balance_loss_clip": 1.05490732, "balance_loss_mlp": 1.03370619, "epoch": 0.141169397264392, "flos": 28548049512960.0, "grad_norm": 2.592464695784388, "language_loss": 0.76753062, "learning_rate": 3.8717532563775e-06, "loss": 0.78969282, "num_input_tokens_seen": 50880105, "step": 2348, "time_per_iteration": 2.7450597286224365 }, { "auxiliary_loss_clip": 0.01147264, "auxiliary_loss_mlp": 0.01042525, "balance_loss_clip": 1.05267334, "balance_loss_mlp": 1.02295136, "epoch": 0.14122952051705998, "flos": 17092258871040.0, "grad_norm": 1.8617784303344698, "language_loss": 0.86794335, "learning_rate": 3.871616002680272e-06, "loss": 0.8898412, "num_input_tokens_seen": 50897720, "step": 2349, "time_per_iteration": 2.662508964538574 }, { "auxiliary_loss_clip": 0.01150971, "auxiliary_loss_mlp": 0.01048616, "balance_loss_clip": 1.05632985, "balance_loss_mlp": 1.02897048, "epoch": 0.14128964376972794, "flos": 28946801370240.0, "grad_norm": 2.650060051711467, "language_loss": 0.88758218, "learning_rate": 3.871478678011177e-06, "loss": 0.90957808, "num_input_tokens_seen": 50918385, "step": 2350, "time_per_iteration": 4.1697962284088135 }, { "auxiliary_loss_clip": 0.01142704, "auxiliary_loss_mlp": 0.01045134, "balance_loss_clip": 1.05369377, "balance_loss_mlp": 1.02442729, "epoch": 0.1413497670223959, "flos": 18989778643200.0, "grad_norm": 1.801090232061166, "language_loss": 0.8094542, "learning_rate": 3.871341282375423e-06, "loss": 0.83133256, "num_input_tokens_seen": 50938270, "step": 2351, "time_per_iteration": 2.6769907474517822 }, { "auxiliary_loss_clip": 0.01149546, "auxiliary_loss_mlp": 0.01040141, "balance_loss_clip": 1.05100775, "balance_loss_mlp": 1.02096045, "epoch": 0.14140989027506387, "flos": 29862236413440.0, "grad_norm": 2.590933181784672, "language_loss": 0.82796198, "learning_rate": 3.871203815778219e-06, "loss": 0.84985888, "num_input_tokens_seen": 50958155, "step": 2352, "time_per_iteration": 5.713203430175781 }, { "auxiliary_loss_clip": 0.01063742, "auxiliary_loss_mlp": 0.01009803, "balance_loss_clip": 1.03462291, "balance_loss_mlp": 1.0060122, "epoch": 0.14147001352773186, "flos": 62079532041600.0, "grad_norm": 0.9118003008214054, "language_loss": 0.61876011, "learning_rate": 3.87106627822478e-06, "loss": 0.63949555, "num_input_tokens_seen": 51020705, "step": 2353, "time_per_iteration": 3.1698319911956787 }, { "auxiliary_loss_clip": 0.01134069, "auxiliary_loss_mlp": 0.01049094, "balance_loss_clip": 1.0536828, "balance_loss_mlp": 1.03039002, "epoch": 0.14153013678039983, "flos": 22017514832640.0, "grad_norm": 1.5909284402791886, "language_loss": 0.87075388, "learning_rate": 3.8709286697203196e-06, "loss": 0.89258552, "num_input_tokens_seen": 51039995, "step": 2354, "time_per_iteration": 2.6781272888183594 }, { "auxiliary_loss_clip": 0.01124592, "auxiliary_loss_mlp": 0.0104583, "balance_loss_clip": 1.0527302, "balance_loss_mlp": 1.02562428, "epoch": 0.1415902600330678, "flos": 19720093968000.0, "grad_norm": 2.035812967878614, "language_loss": 0.74701214, "learning_rate": 3.870790990270057e-06, "loss": 0.76871634, "num_input_tokens_seen": 51059075, "step": 2355, "time_per_iteration": 4.464852571487427 }, { "auxiliary_loss_clip": 0.01062228, "auxiliary_loss_mlp": 0.01003337, "balance_loss_clip": 1.03320074, "balance_loss_mlp": 0.99947417, "epoch": 0.14165038328573576, "flos": 65900929190400.0, "grad_norm": 0.6801443738216844, "language_loss": 0.51819825, "learning_rate": 3.870653239879212e-06, "loss": 0.53885388, "num_input_tokens_seen": 51120380, "step": 2356, "time_per_iteration": 3.094026803970337 }, { "auxiliary_loss_clip": 0.01165635, "auxiliary_loss_mlp": 0.01057535, "balance_loss_clip": 1.05662966, "balance_loss_mlp": 1.0379492, "epoch": 0.14171050653840372, "flos": 12130158533760.0, "grad_norm": 1.9928903491175036, "language_loss": 0.70598352, "learning_rate": 3.8705154185530095e-06, "loss": 0.72821522, "num_input_tokens_seen": 51136950, "step": 2357, "time_per_iteration": 2.569486141204834 }, { "auxiliary_loss_clip": 0.01117022, "auxiliary_loss_mlp": 0.01054948, "balance_loss_clip": 1.04706419, "balance_loss_mlp": 1.0355413, "epoch": 0.1417706297910717, "flos": 20412487509120.0, "grad_norm": 2.1046358800035234, "language_loss": 0.82020235, "learning_rate": 3.870377526296674e-06, "loss": 0.84192204, "num_input_tokens_seen": 51155175, "step": 2358, "time_per_iteration": 2.719344139099121 }, { "auxiliary_loss_clip": 0.01145283, "auxiliary_loss_mlp": 0.01050239, "balance_loss_clip": 1.05257189, "balance_loss_mlp": 1.02932954, "epoch": 0.14183075304373965, "flos": 22380607463040.0, "grad_norm": 2.2336131404929787, "language_loss": 0.71575904, "learning_rate": 3.870239563115436e-06, "loss": 0.73771417, "num_input_tokens_seen": 51174500, "step": 2359, "time_per_iteration": 2.6914820671081543 }, { "auxiliary_loss_clip": 0.0111529, "auxiliary_loss_mlp": 0.007787, "balance_loss_clip": 1.0526464, "balance_loss_mlp": 1.00033379, "epoch": 0.14189087629640765, "flos": 21580913018880.0, "grad_norm": 2.4314273775499906, "language_loss": 0.7541784, "learning_rate": 3.870101529014526e-06, "loss": 0.77311832, "num_input_tokens_seen": 51194270, "step": 2360, "time_per_iteration": 2.803493022918701 }, { "auxiliary_loss_clip": 0.01108644, "auxiliary_loss_mlp": 0.01053684, "balance_loss_clip": 1.0491271, "balance_loss_mlp": 1.03136814, "epoch": 0.1419509995490756, "flos": 20008564093440.0, "grad_norm": 2.374719540518049, "language_loss": 0.81920552, "learning_rate": 3.869963423999178e-06, "loss": 0.84082878, "num_input_tokens_seen": 51211850, "step": 2361, "time_per_iteration": 2.8039920330047607 }, { "auxiliary_loss_clip": 0.0115065, "auxiliary_loss_mlp": 0.01057946, "balance_loss_clip": 1.05230403, "balance_loss_mlp": 1.03802609, "epoch": 0.14201112280174358, "flos": 31941464112000.0, "grad_norm": 1.9397979109407166, "language_loss": 0.74081504, "learning_rate": 3.86982524807463e-06, "loss": 0.76290095, "num_input_tokens_seen": 51233545, "step": 2362, "time_per_iteration": 2.7272114753723145 }, { "auxiliary_loss_clip": 0.0115354, "auxiliary_loss_mlp": 0.01048321, "balance_loss_clip": 1.05355787, "balance_loss_mlp": 1.02861547, "epoch": 0.14207124605441154, "flos": 41464147582080.0, "grad_norm": 1.7489521991344694, "language_loss": 0.74221587, "learning_rate": 3.869687001246122e-06, "loss": 0.76423442, "num_input_tokens_seen": 51257615, "step": 2363, "time_per_iteration": 2.789802312850952 }, { "auxiliary_loss_clip": 0.01128802, "auxiliary_loss_mlp": 0.0105205, "balance_loss_clip": 1.04769099, "balance_loss_mlp": 1.03180885, "epoch": 0.1421313693070795, "flos": 31905086613120.0, "grad_norm": 1.7832713632097879, "language_loss": 0.73034167, "learning_rate": 3.8695486835188946e-06, "loss": 0.75215018, "num_input_tokens_seen": 51279645, "step": 2364, "time_per_iteration": 2.8508312702178955 }, { "auxiliary_loss_clip": 0.01142769, "auxiliary_loss_mlp": 0.01049829, "balance_loss_clip": 1.05160844, "balance_loss_mlp": 1.03207827, "epoch": 0.14219149255974747, "flos": 26871165031680.0, "grad_norm": 1.875477198706701, "language_loss": 0.90395916, "learning_rate": 3.869410294898195e-06, "loss": 0.92588514, "num_input_tokens_seen": 51299775, "step": 2365, "time_per_iteration": 2.6807806491851807 }, { "auxiliary_loss_clip": 0.01127252, "auxiliary_loss_mlp": 0.01054912, "balance_loss_clip": 1.04759967, "balance_loss_mlp": 1.03394318, "epoch": 0.14225161581241544, "flos": 27454426076160.0, "grad_norm": 1.719218863067841, "language_loss": 0.65305161, "learning_rate": 3.869271835389268e-06, "loss": 0.67487329, "num_input_tokens_seen": 51319430, "step": 2366, "time_per_iteration": 2.7293641567230225 }, { "auxiliary_loss_clip": 0.01143576, "auxiliary_loss_mlp": 0.01051629, "balance_loss_clip": 1.05218709, "balance_loss_mlp": 1.03058839, "epoch": 0.14231173906508343, "flos": 10561436881920.0, "grad_norm": 2.3740196514966256, "language_loss": 0.80331928, "learning_rate": 3.8691333049973665e-06, "loss": 0.82527137, "num_input_tokens_seen": 51336045, "step": 2367, "time_per_iteration": 2.67529296875 }, { "auxiliary_loss_clip": 0.01138517, "auxiliary_loss_mlp": 0.01062653, "balance_loss_clip": 1.05117869, "balance_loss_mlp": 1.0402534, "epoch": 0.1423718623177514, "flos": 28360882719360.0, "grad_norm": 2.0081973718426283, "language_loss": 0.82346755, "learning_rate": 3.868994703727742e-06, "loss": 0.84547925, "num_input_tokens_seen": 51357030, "step": 2368, "time_per_iteration": 2.7447288036346436 }, { "auxiliary_loss_clip": 0.01122755, "auxiliary_loss_mlp": 0.01052229, "balance_loss_clip": 1.05180073, "balance_loss_mlp": 1.03065228, "epoch": 0.14243198557041936, "flos": 19354235990400.0, "grad_norm": 2.6586279461428144, "language_loss": 0.8711772, "learning_rate": 3.868856031585652e-06, "loss": 0.89292705, "num_input_tokens_seen": 51374890, "step": 2369, "time_per_iteration": 2.736872673034668 }, { "auxiliary_loss_clip": 0.01127301, "auxiliary_loss_mlp": 0.0104182, "balance_loss_clip": 1.05011857, "balance_loss_mlp": 1.02170992, "epoch": 0.14249210882308733, "flos": 28806857982720.0, "grad_norm": 1.7900856007188275, "language_loss": 0.75828248, "learning_rate": 3.868717288576354e-06, "loss": 0.77997375, "num_input_tokens_seen": 51398100, "step": 2370, "time_per_iteration": 2.762603998184204 }, { "auxiliary_loss_clip": 0.01158195, "auxiliary_loss_mlp": 0.00781098, "balance_loss_clip": 1.05268764, "balance_loss_mlp": 1.00028419, "epoch": 0.1425522320757553, "flos": 21835016807040.0, "grad_norm": 1.7770434161065212, "language_loss": 0.82934797, "learning_rate": 3.868578474705109e-06, "loss": 0.84874088, "num_input_tokens_seen": 51418745, "step": 2371, "time_per_iteration": 2.6224656105041504 }, { "auxiliary_loss_clip": 0.01173447, "auxiliary_loss_mlp": 0.0105718, "balance_loss_clip": 1.05837953, "balance_loss_mlp": 1.03638947, "epoch": 0.14261235532842326, "flos": 17311457617920.0, "grad_norm": 2.0431625041319825, "language_loss": 0.82982123, "learning_rate": 3.868439589977181e-06, "loss": 0.85212755, "num_input_tokens_seen": 51437455, "step": 2372, "time_per_iteration": 2.575690269470215 }, { "auxiliary_loss_clip": 0.01172196, "auxiliary_loss_mlp": 0.0105022, "balance_loss_clip": 1.0581125, "balance_loss_mlp": 1.0285356, "epoch": 0.14267247858109125, "flos": 18806741913600.0, "grad_norm": 3.3704326167450582, "language_loss": 0.8438468, "learning_rate": 3.868300634397836e-06, "loss": 0.86607099, "num_input_tokens_seen": 51455710, "step": 2373, "time_per_iteration": 2.7160356044769287 }, { "auxiliary_loss_clip": 0.01141742, "auxiliary_loss_mlp": 0.01055295, "balance_loss_clip": 1.05160809, "balance_loss_mlp": 1.03598261, "epoch": 0.14273260183375922, "flos": 11358904682880.0, "grad_norm": 3.5035356392631836, "language_loss": 0.86027539, "learning_rate": 3.8681616079723445e-06, "loss": 0.88224572, "num_input_tokens_seen": 51471270, "step": 2374, "time_per_iteration": 2.6845595836639404 }, { "auxiliary_loss_clip": 0.01164623, "auxiliary_loss_mlp": 0.01061957, "balance_loss_clip": 1.05515146, "balance_loss_mlp": 1.03996301, "epoch": 0.14279272508642718, "flos": 27567688636800.0, "grad_norm": 1.6059368749673757, "language_loss": 0.79169822, "learning_rate": 3.868022510705977e-06, "loss": 0.81396401, "num_input_tokens_seen": 51492705, "step": 2375, "time_per_iteration": 2.738156795501709 }, { "auxiliary_loss_clip": 0.01163115, "auxiliary_loss_mlp": 0.01058224, "balance_loss_clip": 1.05641222, "balance_loss_mlp": 1.0368259, "epoch": 0.14285284833909515, "flos": 16252559654400.0, "grad_norm": 2.559097553272684, "language_loss": 0.76907504, "learning_rate": 3.867883342604009e-06, "loss": 0.79128844, "num_input_tokens_seen": 51510780, "step": 2376, "time_per_iteration": 2.751178741455078 }, { "auxiliary_loss_clip": 0.01160115, "auxiliary_loss_mlp": 0.0105168, "balance_loss_clip": 1.054515, "balance_loss_mlp": 1.03040111, "epoch": 0.1429129715917631, "flos": 19755609540480.0, "grad_norm": 2.7331999261828592, "language_loss": 0.92795181, "learning_rate": 3.867744103671717e-06, "loss": 0.95006979, "num_input_tokens_seen": 51531400, "step": 2377, "time_per_iteration": 2.6584725379943848 }, { "auxiliary_loss_clip": 0.01147246, "auxiliary_loss_mlp": 0.01061419, "balance_loss_clip": 1.05362535, "balance_loss_mlp": 1.03793442, "epoch": 0.14297309484443108, "flos": 21137092571520.0, "grad_norm": 2.9252003733204894, "language_loss": 0.91754365, "learning_rate": 3.867604793914382e-06, "loss": 0.93963027, "num_input_tokens_seen": 51548215, "step": 2378, "time_per_iteration": 2.8107075691223145 }, { "auxiliary_loss_clip": 0.01164153, "auxiliary_loss_mlp": 0.0105303, "balance_loss_clip": 1.05712187, "balance_loss_mlp": 1.03092849, "epoch": 0.14303321809709904, "flos": 23586667447680.0, "grad_norm": 2.1292902842232966, "language_loss": 0.73961306, "learning_rate": 3.8674654133372864e-06, "loss": 0.76178491, "num_input_tokens_seen": 51566820, "step": 2379, "time_per_iteration": 2.7029881477355957 }, { "auxiliary_loss_clip": 0.01137551, "auxiliary_loss_mlp": 0.01055012, "balance_loss_clip": 1.05204058, "balance_loss_mlp": 1.0330174, "epoch": 0.14309334134976703, "flos": 15888281875200.0, "grad_norm": 2.1898245228218784, "language_loss": 0.78818595, "learning_rate": 3.867325961945714e-06, "loss": 0.81011152, "num_input_tokens_seen": 51585075, "step": 2380, "time_per_iteration": 2.7213294506073 }, { "auxiliary_loss_clip": 0.01126442, "auxiliary_loss_mlp": 0.01057409, "balance_loss_clip": 1.05457354, "balance_loss_mlp": 1.03580785, "epoch": 0.143153464602435, "flos": 16325601960960.0, "grad_norm": 4.699041640805274, "language_loss": 0.87895483, "learning_rate": 3.867186439744955e-06, "loss": 0.90079331, "num_input_tokens_seen": 51603185, "step": 2381, "time_per_iteration": 2.7144110202789307 }, { "auxiliary_loss_clip": 0.01141327, "auxiliary_loss_mlp": 0.01052708, "balance_loss_clip": 1.05200005, "balance_loss_mlp": 1.03088117, "epoch": 0.14321358785510296, "flos": 17092079303040.0, "grad_norm": 2.47508592106904, "language_loss": 0.76396096, "learning_rate": 3.867046846740299e-06, "loss": 0.78590137, "num_input_tokens_seen": 51620880, "step": 2382, "time_per_iteration": 2.6185953617095947 }, { "auxiliary_loss_clip": 0.01132222, "auxiliary_loss_mlp": 0.01054019, "balance_loss_clip": 1.05162048, "balance_loss_mlp": 1.03319359, "epoch": 0.14327371110777093, "flos": 26322916769280.0, "grad_norm": 4.3017095308344375, "language_loss": 0.76636785, "learning_rate": 3.866907182937039e-06, "loss": 0.7882303, "num_input_tokens_seen": 51640170, "step": 2383, "time_per_iteration": 2.7408525943756104 }, { "auxiliary_loss_clip": 0.01139698, "auxiliary_loss_mlp": 0.01052888, "balance_loss_clip": 1.05078864, "balance_loss_mlp": 1.02926064, "epoch": 0.1433338343604389, "flos": 18076462502400.0, "grad_norm": 2.3526544982502284, "language_loss": 0.87649417, "learning_rate": 3.866767448340471e-06, "loss": 0.8984201, "num_input_tokens_seen": 51656580, "step": 2384, "time_per_iteration": 2.6798789501190186 }, { "auxiliary_loss_clip": 0.01164805, "auxiliary_loss_mlp": 0.01053206, "balance_loss_clip": 1.05644679, "balance_loss_mlp": 1.02985239, "epoch": 0.14339395761310686, "flos": 15522783033600.0, "grad_norm": 2.6134761315069284, "language_loss": 0.79340684, "learning_rate": 3.866627642955895e-06, "loss": 0.81558692, "num_input_tokens_seen": 51674645, "step": 2385, "time_per_iteration": 2.5856544971466064 }, { "auxiliary_loss_clip": 0.01156607, "auxiliary_loss_mlp": 0.01042784, "balance_loss_clip": 1.05148256, "balance_loss_mlp": 1.02182722, "epoch": 0.14345408086577485, "flos": 28548767784960.0, "grad_norm": 2.6990187663653247, "language_loss": 0.74960196, "learning_rate": 3.866487766788612e-06, "loss": 0.77159584, "num_input_tokens_seen": 51695770, "step": 2386, "time_per_iteration": 2.6670751571655273 }, { "auxiliary_loss_clip": 0.01171639, "auxiliary_loss_mlp": 0.01048096, "balance_loss_clip": 1.05699563, "balance_loss_mlp": 1.02733016, "epoch": 0.14351420411844282, "flos": 20230061310720.0, "grad_norm": 2.299870083842227, "language_loss": 0.78659731, "learning_rate": 3.866347819843925e-06, "loss": 0.80879462, "num_input_tokens_seen": 51714165, "step": 2387, "time_per_iteration": 2.5805532932281494 }, { "auxiliary_loss_clip": 0.01140581, "auxiliary_loss_mlp": 0.01055299, "balance_loss_clip": 1.05355716, "balance_loss_mlp": 1.03317428, "epoch": 0.14357432737111078, "flos": 19865029345920.0, "grad_norm": 6.554164509194222, "language_loss": 0.82492924, "learning_rate": 3.866207802127143e-06, "loss": 0.84688807, "num_input_tokens_seen": 51734440, "step": 2388, "time_per_iteration": 2.656609058380127 }, { "auxiliary_loss_clip": 0.01155007, "auxiliary_loss_mlp": 0.01047154, "balance_loss_clip": 1.0537287, "balance_loss_mlp": 1.02674508, "epoch": 0.14363445062377875, "flos": 28256814040320.0, "grad_norm": 2.5973624291758655, "language_loss": 0.82025754, "learning_rate": 3.866067713643573e-06, "loss": 0.84227914, "num_input_tokens_seen": 51753730, "step": 2389, "time_per_iteration": 4.21793794631958 }, { "auxiliary_loss_clip": 0.01145665, "auxiliary_loss_mlp": 0.01046852, "balance_loss_clip": 1.05107975, "balance_loss_mlp": 1.02513266, "epoch": 0.1436945738764467, "flos": 18186672407040.0, "grad_norm": 3.7970835440683097, "language_loss": 0.83056784, "learning_rate": 3.8659275543985285e-06, "loss": 0.85249299, "num_input_tokens_seen": 51771195, "step": 2390, "time_per_iteration": 2.6859514713287354 }, { "auxiliary_loss_clip": 0.01152608, "auxiliary_loss_mlp": 0.01054404, "balance_loss_clip": 1.05400729, "balance_loss_mlp": 1.0334475, "epoch": 0.14375469712911468, "flos": 27307910499840.0, "grad_norm": 1.8176612067028404, "language_loss": 0.75018179, "learning_rate": 3.865787324397324e-06, "loss": 0.77225184, "num_input_tokens_seen": 51792290, "step": 2391, "time_per_iteration": 5.726900577545166 }, { "auxiliary_loss_clip": 0.01045505, "auxiliary_loss_mlp": 0.01033342, "balance_loss_clip": 1.03226101, "balance_loss_mlp": 1.0303973, "epoch": 0.14381482038178264, "flos": 56891445287040.0, "grad_norm": 0.8787809928903102, "language_loss": 0.61848003, "learning_rate": 3.865647023645277e-06, "loss": 0.63926852, "num_input_tokens_seen": 51843675, "step": 2392, "time_per_iteration": 3.113558053970337 }, { "auxiliary_loss_clip": 0.01158698, "auxiliary_loss_mlp": 0.01058807, "balance_loss_clip": 1.05467868, "balance_loss_mlp": 1.03608608, "epoch": 0.14387494363445064, "flos": 14282177143680.0, "grad_norm": 2.718376715006273, "language_loss": 0.77346605, "learning_rate": 3.865506652147709e-06, "loss": 0.79564106, "num_input_tokens_seen": 51860285, "step": 2393, "time_per_iteration": 2.6578521728515625 }, { "auxiliary_loss_clip": 0.0116951, "auxiliary_loss_mlp": 0.01052986, "balance_loss_clip": 1.05671048, "balance_loss_mlp": 1.03287578, "epoch": 0.1439350668871186, "flos": 26761493831040.0, "grad_norm": 5.715284956255472, "language_loss": 0.76301813, "learning_rate": 3.865366209909941e-06, "loss": 0.78524309, "num_input_tokens_seen": 51880105, "step": 2394, "time_per_iteration": 4.345217943191528 }, { "auxiliary_loss_clip": 0.01165266, "auxiliary_loss_mlp": 0.01053501, "balance_loss_clip": 1.05325842, "balance_loss_mlp": 1.03365326, "epoch": 0.14399519013978657, "flos": 40700040537600.0, "grad_norm": 2.2496244390836893, "language_loss": 0.85859704, "learning_rate": 3.8652256969372994e-06, "loss": 0.88078463, "num_input_tokens_seen": 51905175, "step": 2395, "time_per_iteration": 2.739717483520508 }, { "auxiliary_loss_clip": 0.0112523, "auxiliary_loss_mlp": 0.01051092, "balance_loss_clip": 1.04946184, "balance_loss_mlp": 1.028669, "epoch": 0.14405531339245453, "flos": 20557530627840.0, "grad_norm": 4.117082508421602, "language_loss": 0.82894099, "learning_rate": 3.865085113235113e-06, "loss": 0.85070425, "num_input_tokens_seen": 51924490, "step": 2396, "time_per_iteration": 2.686732053756714 }, { "auxiliary_loss_clip": 0.01126754, "auxiliary_loss_mlp": 0.00779833, "balance_loss_clip": 1.04752374, "balance_loss_mlp": 1.00036597, "epoch": 0.1441154366451225, "flos": 19572931946880.0, "grad_norm": 6.956399779275871, "language_loss": 0.82801461, "learning_rate": 3.864944458808712e-06, "loss": 0.84708053, "num_input_tokens_seen": 51940490, "step": 2397, "time_per_iteration": 2.742809534072876 }, { "auxiliary_loss_clip": 0.01168871, "auxiliary_loss_mlp": 0.0104994, "balance_loss_clip": 1.05485702, "balance_loss_mlp": 1.02892387, "epoch": 0.14417555989779046, "flos": 18515721922560.0, "grad_norm": 8.355198005975433, "language_loss": 0.8001197, "learning_rate": 3.86480373366343e-06, "loss": 0.82230783, "num_input_tokens_seen": 51957910, "step": 2398, "time_per_iteration": 2.573267936706543 }, { "auxiliary_loss_clip": 0.01152449, "auxiliary_loss_mlp": 0.01053407, "balance_loss_clip": 1.05287588, "balance_loss_mlp": 1.03336823, "epoch": 0.14423568315045843, "flos": 26031681296640.0, "grad_norm": 3.294581575970509, "language_loss": 0.64690518, "learning_rate": 3.864662937804603e-06, "loss": 0.66896379, "num_input_tokens_seen": 51978010, "step": 2399, "time_per_iteration": 2.6831774711608887 }, { "auxiliary_loss_clip": 0.01134916, "auxiliary_loss_mlp": 0.01052493, "balance_loss_clip": 1.04998159, "balance_loss_mlp": 1.03119016, "epoch": 0.14429580640312642, "flos": 21288743792640.0, "grad_norm": 3.586256880371596, "language_loss": 0.82207137, "learning_rate": 3.864522071237571e-06, "loss": 0.84394544, "num_input_tokens_seen": 51998515, "step": 2400, "time_per_iteration": 2.6812663078308105 }, { "auxiliary_loss_clip": 0.01149983, "auxiliary_loss_mlp": 0.01051884, "balance_loss_clip": 1.0567503, "balance_loss_mlp": 1.02954376, "epoch": 0.14435592965579438, "flos": 25627865621760.0, "grad_norm": 2.3908005596579165, "language_loss": 0.74217784, "learning_rate": 3.864381133967676e-06, "loss": 0.76419652, "num_input_tokens_seen": 52019270, "step": 2401, "time_per_iteration": 2.773838520050049 }, { "auxiliary_loss_clip": 0.01137207, "auxiliary_loss_mlp": 0.01047592, "balance_loss_clip": 1.05065656, "balance_loss_mlp": 1.02671885, "epoch": 0.14441605290846235, "flos": 22965053656320.0, "grad_norm": 2.616063077702737, "language_loss": 0.80771816, "learning_rate": 3.86424012600026e-06, "loss": 0.82956612, "num_input_tokens_seen": 52039315, "step": 2402, "time_per_iteration": 2.786031723022461 }, { "auxiliary_loss_clip": 0.01120897, "auxiliary_loss_mlp": 0.01052115, "balance_loss_clip": 1.04718328, "balance_loss_mlp": 1.02988231, "epoch": 0.14447617616113032, "flos": 17347655548800.0, "grad_norm": 2.397935571801219, "language_loss": 0.84159613, "learning_rate": 3.864099047340673e-06, "loss": 0.86332625, "num_input_tokens_seen": 52056555, "step": 2403, "time_per_iteration": 2.8113911151885986 }, { "auxiliary_loss_clip": 0.01129082, "auxiliary_loss_mlp": 0.00783127, "balance_loss_clip": 1.04854488, "balance_loss_mlp": 1.00030184, "epoch": 0.14453629941379828, "flos": 24060185464320.0, "grad_norm": 2.224282169770823, "language_loss": 0.70142806, "learning_rate": 3.863957897994262e-06, "loss": 0.72055018, "num_input_tokens_seen": 52075800, "step": 2404, "time_per_iteration": 2.7748003005981445 }, { "auxiliary_loss_clip": 0.01144289, "auxiliary_loss_mlp": 0.01051404, "balance_loss_clip": 1.05279732, "balance_loss_mlp": 1.03099549, "epoch": 0.14459642266646625, "flos": 14429554646400.0, "grad_norm": 2.429117427076043, "language_loss": 0.73179376, "learning_rate": 3.863816677966381e-06, "loss": 0.75375068, "num_input_tokens_seen": 52092585, "step": 2405, "time_per_iteration": 2.7927868366241455 }, { "auxiliary_loss_clip": 0.01108387, "auxiliary_loss_mlp": 0.01054584, "balance_loss_clip": 1.04661417, "balance_loss_mlp": 1.0326612, "epoch": 0.14465654591913424, "flos": 9867032179200.0, "grad_norm": 7.089523066959408, "language_loss": 0.73039794, "learning_rate": 3.863675387262386e-06, "loss": 0.75202763, "num_input_tokens_seen": 52108990, "step": 2406, "time_per_iteration": 2.742253303527832 }, { "auxiliary_loss_clip": 0.01157268, "auxiliary_loss_mlp": 0.01054465, "balance_loss_clip": 1.05420268, "balance_loss_mlp": 1.03198171, "epoch": 0.1447166691718022, "flos": 24972926987520.0, "grad_norm": 5.383630788916188, "language_loss": 0.75570732, "learning_rate": 3.8635340258876325e-06, "loss": 0.77782464, "num_input_tokens_seen": 52125385, "step": 2407, "time_per_iteration": 2.654636859893799 }, { "auxiliary_loss_clip": 0.0116674, "auxiliary_loss_mlp": 0.01054642, "balance_loss_clip": 1.05440819, "balance_loss_mlp": 1.03392315, "epoch": 0.14477679242447017, "flos": 21908023200000.0, "grad_norm": 2.0240540465866146, "language_loss": 0.79426706, "learning_rate": 3.8633925938474826e-06, "loss": 0.81648088, "num_input_tokens_seen": 52144985, "step": 2408, "time_per_iteration": 2.663611650466919 }, { "auxiliary_loss_clip": 0.01155332, "auxiliary_loss_mlp": 0.01053557, "balance_loss_clip": 1.05411625, "balance_loss_mlp": 1.03107429, "epoch": 0.14483691567713813, "flos": 20740746925440.0, "grad_norm": 2.249858190268702, "language_loss": 0.82188261, "learning_rate": 3.863251091147299e-06, "loss": 0.84397143, "num_input_tokens_seen": 52163885, "step": 2409, "time_per_iteration": 2.6218342781066895 }, { "auxiliary_loss_clip": 0.01116852, "auxiliary_loss_mlp": 0.01065498, "balance_loss_clip": 1.04859877, "balance_loss_mlp": 1.04340839, "epoch": 0.1448970389298061, "flos": 35407705536000.0, "grad_norm": 3.918408886138166, "language_loss": 0.74477464, "learning_rate": 3.863109517792446e-06, "loss": 0.76659817, "num_input_tokens_seen": 52184325, "step": 2410, "time_per_iteration": 2.8525002002716064 }, { "auxiliary_loss_clip": 0.01166422, "auxiliary_loss_mlp": 0.0105028, "balance_loss_clip": 1.05447876, "balance_loss_mlp": 1.0300622, "epoch": 0.14495716218247406, "flos": 15414368808960.0, "grad_norm": 2.976325973684052, "language_loss": 0.81616414, "learning_rate": 3.8629678737882945e-06, "loss": 0.8383311, "num_input_tokens_seen": 52202740, "step": 2411, "time_per_iteration": 2.580059051513672 }, { "auxiliary_loss_clip": 0.01143671, "auxiliary_loss_mlp": 0.01055066, "balance_loss_clip": 1.05553794, "balance_loss_mlp": 1.03366852, "epoch": 0.14501728543514203, "flos": 33693222493440.0, "grad_norm": 2.049708152728223, "language_loss": 0.69947547, "learning_rate": 3.862826159140214e-06, "loss": 0.72146285, "num_input_tokens_seen": 52223100, "step": 2412, "time_per_iteration": 2.792389392852783 }, { "auxiliary_loss_clip": 0.01153861, "auxiliary_loss_mlp": 0.01047504, "balance_loss_clip": 1.05600309, "balance_loss_mlp": 1.02669024, "epoch": 0.14507740868781002, "flos": 15596112648960.0, "grad_norm": 1.9741671649406984, "language_loss": 0.76655865, "learning_rate": 3.862684373853579e-06, "loss": 0.78857231, "num_input_tokens_seen": 52239690, "step": 2413, "time_per_iteration": 2.6535370349884033 }, { "auxiliary_loss_clip": 0.01072879, "auxiliary_loss_mlp": 0.01028499, "balance_loss_clip": 1.04041791, "balance_loss_mlp": 1.0252564, "epoch": 0.145137531940478, "flos": 66675343438080.0, "grad_norm": 0.9047547971056389, "language_loss": 0.58883119, "learning_rate": 3.8625425179337656e-06, "loss": 0.60984492, "num_input_tokens_seen": 52296705, "step": 2414, "time_per_iteration": 3.1230342388153076 }, { "auxiliary_loss_clip": 0.01059489, "auxiliary_loss_mlp": 0.01009718, "balance_loss_clip": 1.03874373, "balance_loss_mlp": 1.00692892, "epoch": 0.14519765519314595, "flos": 67521578929920.0, "grad_norm": 0.8422279258983576, "language_loss": 0.62171185, "learning_rate": 3.862400591386154e-06, "loss": 0.64240396, "num_input_tokens_seen": 52361830, "step": 2415, "time_per_iteration": 3.1932270526885986 }, { "auxiliary_loss_clip": 0.01151643, "auxiliary_loss_mlp": 0.01046675, "balance_loss_clip": 1.05383611, "balance_loss_mlp": 1.02500319, "epoch": 0.14525777844581392, "flos": 17198913329280.0, "grad_norm": 2.2913061581681036, "language_loss": 0.71468806, "learning_rate": 3.8622585942161245e-06, "loss": 0.73667121, "num_input_tokens_seen": 52379420, "step": 2416, "time_per_iteration": 2.5892374515533447 }, { "auxiliary_loss_clip": 0.01050816, "auxiliary_loss_mlp": 0.010049, "balance_loss_clip": 1.03675056, "balance_loss_mlp": 1.00211036, "epoch": 0.14531790169848188, "flos": 65404609015680.0, "grad_norm": 0.7147623603004897, "language_loss": 0.6037569, "learning_rate": 3.8621165264290635e-06, "loss": 0.62431407, "num_input_tokens_seen": 52446290, "step": 2417, "time_per_iteration": 3.3065359592437744 }, { "auxiliary_loss_clip": 0.01168766, "auxiliary_loss_mlp": 0.01053548, "balance_loss_clip": 1.05357766, "balance_loss_mlp": 1.03275824, "epoch": 0.14537802495114985, "flos": 32562467372160.0, "grad_norm": 3.7032433533234346, "language_loss": 0.78014368, "learning_rate": 3.861974388030356e-06, "loss": 0.80236679, "num_input_tokens_seen": 52467295, "step": 2418, "time_per_iteration": 2.887986183166504 }, { "auxiliary_loss_clip": 0.01114137, "auxiliary_loss_mlp": 0.01049779, "balance_loss_clip": 1.04354823, "balance_loss_mlp": 1.02911985, "epoch": 0.1454381482038178, "flos": 20226685432320.0, "grad_norm": 2.096300480609688, "language_loss": 0.71208847, "learning_rate": 3.861832179025394e-06, "loss": 0.73372757, "num_input_tokens_seen": 52487295, "step": 2419, "time_per_iteration": 2.764268636703491 }, { "auxiliary_loss_clip": 0.01142427, "auxiliary_loss_mlp": 0.01054976, "balance_loss_clip": 1.05351484, "balance_loss_mlp": 1.03300607, "epoch": 0.1454982714564858, "flos": 22893124671360.0, "grad_norm": 2.414673655978061, "language_loss": 0.89847761, "learning_rate": 3.861689899419569e-06, "loss": 0.92045164, "num_input_tokens_seen": 52504220, "step": 2420, "time_per_iteration": 2.7500016689300537 }, { "auxiliary_loss_clip": 0.01155004, "auxiliary_loss_mlp": 0.01060929, "balance_loss_clip": 1.05202007, "balance_loss_mlp": 1.04072309, "epoch": 0.14555839470915377, "flos": 20229845829120.0, "grad_norm": 2.0953123539002383, "language_loss": 0.82278717, "learning_rate": 3.861547549218276e-06, "loss": 0.8449465, "num_input_tokens_seen": 52521900, "step": 2421, "time_per_iteration": 2.672722816467285 }, { "auxiliary_loss_clip": 0.01099277, "auxiliary_loss_mlp": 0.01056793, "balance_loss_clip": 1.04282439, "balance_loss_mlp": 1.03507352, "epoch": 0.14561851796182174, "flos": 22236282616320.0, "grad_norm": 1.667429152986229, "language_loss": 0.81741488, "learning_rate": 3.861405128426914e-06, "loss": 0.83897555, "num_input_tokens_seen": 52540495, "step": 2422, "time_per_iteration": 2.739992141723633 }, { "auxiliary_loss_clip": 0.01031842, "auxiliary_loss_mlp": 0.00760413, "balance_loss_clip": 1.0271318, "balance_loss_mlp": 1.00019872, "epoch": 0.1456786412144897, "flos": 52636786289280.0, "grad_norm": 0.9102961670465963, "language_loss": 0.63342595, "learning_rate": 3.861262637050883e-06, "loss": 0.65134847, "num_input_tokens_seen": 52603305, "step": 2423, "time_per_iteration": 3.2704036235809326 }, { "auxiliary_loss_clip": 0.01112855, "auxiliary_loss_mlp": 0.00780065, "balance_loss_clip": 1.05457556, "balance_loss_mlp": 1.00038898, "epoch": 0.14573876446715767, "flos": 23221671396480.0, "grad_norm": 2.2239460229896206, "language_loss": 0.82163274, "learning_rate": 3.861120075095585e-06, "loss": 0.84056193, "num_input_tokens_seen": 52623435, "step": 2424, "time_per_iteration": 2.7993249893188477 }, { "auxiliary_loss_clip": 0.01141208, "auxiliary_loss_mlp": 0.01069468, "balance_loss_clip": 1.0535512, "balance_loss_mlp": 1.0496788, "epoch": 0.14579888771982563, "flos": 18114384286080.0, "grad_norm": 2.769336045727131, "language_loss": 0.78602695, "learning_rate": 3.860977442566429e-06, "loss": 0.80813372, "num_input_tokens_seen": 52642255, "step": 2425, "time_per_iteration": 2.698594093322754 }, { "auxiliary_loss_clip": 0.01156078, "auxiliary_loss_mlp": 0.01062133, "balance_loss_clip": 1.05603778, "balance_loss_mlp": 1.04148602, "epoch": 0.14585901097249362, "flos": 23001107932800.0, "grad_norm": 50.77412231982301, "language_loss": 0.83184898, "learning_rate": 3.860834739468821e-06, "loss": 0.85403109, "num_input_tokens_seen": 52658700, "step": 2426, "time_per_iteration": 2.6948676109313965 }, { "auxiliary_loss_clip": 0.01166642, "auxiliary_loss_mlp": 0.01060596, "balance_loss_clip": 1.05706, "balance_loss_mlp": 1.04040194, "epoch": 0.1459191342251616, "flos": 21908669644800.0, "grad_norm": 3.7420612082917475, "language_loss": 0.87215799, "learning_rate": 3.860691965808173e-06, "loss": 0.8944304, "num_input_tokens_seen": 52678140, "step": 2427, "time_per_iteration": 2.6479666233062744 }, { "auxiliary_loss_clip": 0.01128634, "auxiliary_loss_mlp": 0.01064346, "balance_loss_clip": 1.04835391, "balance_loss_mlp": 1.0405997, "epoch": 0.14597925747782955, "flos": 14975504438400.0, "grad_norm": 1.9221483903926033, "language_loss": 0.66815829, "learning_rate": 3.8605491215899e-06, "loss": 0.69008809, "num_input_tokens_seen": 52696825, "step": 2428, "time_per_iteration": 2.6971306800842285 }, { "auxiliary_loss_clip": 0.01155557, "auxiliary_loss_mlp": 0.01059343, "balance_loss_clip": 1.05335426, "balance_loss_mlp": 1.03842235, "epoch": 0.14603938073049752, "flos": 21068898600960.0, "grad_norm": 2.0918238083564242, "language_loss": 0.83231717, "learning_rate": 3.860406206819417e-06, "loss": 0.8544662, "num_input_tokens_seen": 52715125, "step": 2429, "time_per_iteration": 4.283279895782471 }, { "auxiliary_loss_clip": 0.01120809, "auxiliary_loss_mlp": 0.01053505, "balance_loss_clip": 1.04625869, "balance_loss_mlp": 1.03446746, "epoch": 0.14609950398316549, "flos": 19864777950720.0, "grad_norm": 2.4559042296603746, "language_loss": 0.79087842, "learning_rate": 3.860263221502145e-06, "loss": 0.81262159, "num_input_tokens_seen": 52734015, "step": 2430, "time_per_iteration": 4.197890758514404 }, { "auxiliary_loss_clip": 0.01170782, "auxiliary_loss_mlp": 0.01061965, "balance_loss_clip": 1.05820751, "balance_loss_mlp": 1.04179525, "epoch": 0.14615962723583345, "flos": 22418852469120.0, "grad_norm": 2.4376691278662506, "language_loss": 0.82910693, "learning_rate": 3.860120165643504e-06, "loss": 0.85143435, "num_input_tokens_seen": 52753025, "step": 2431, "time_per_iteration": 4.162708282470703 }, { "auxiliary_loss_clip": 0.011607, "auxiliary_loss_mlp": 0.01060112, "balance_loss_clip": 1.05553937, "balance_loss_mlp": 1.03853524, "epoch": 0.14621975048850142, "flos": 22346241125760.0, "grad_norm": 2.881661839068268, "language_loss": 0.78330141, "learning_rate": 3.859977039248921e-06, "loss": 0.80550951, "num_input_tokens_seen": 52773420, "step": 2432, "time_per_iteration": 2.6907777786254883 }, { "auxiliary_loss_clip": 0.01165399, "auxiliary_loss_mlp": 0.00782861, "balance_loss_clip": 1.05517077, "balance_loss_mlp": 1.00040507, "epoch": 0.1462798737411694, "flos": 24389163152640.0, "grad_norm": 2.3488382544651887, "language_loss": 0.79515982, "learning_rate": 3.859833842323822e-06, "loss": 0.81464243, "num_input_tokens_seen": 52792870, "step": 2433, "time_per_iteration": 2.719841241836548 }, { "auxiliary_loss_clip": 0.01124303, "auxiliary_loss_mlp": 0.01055776, "balance_loss_clip": 1.05385411, "balance_loss_mlp": 1.03484273, "epoch": 0.14633999699383737, "flos": 19244672530560.0, "grad_norm": 2.0782880949269926, "language_loss": 0.77905983, "learning_rate": 3.859690574873638e-06, "loss": 0.80086064, "num_input_tokens_seen": 52811615, "step": 2434, "time_per_iteration": 4.371506929397583 }, { "auxiliary_loss_clip": 0.01066282, "auxiliary_loss_mlp": 0.01033141, "balance_loss_clip": 1.05327988, "balance_loss_mlp": 1.03022039, "epoch": 0.14640012024650534, "flos": 62660638270080.0, "grad_norm": 0.8566726319617045, "language_loss": 0.58453119, "learning_rate": 3.8595472369038e-06, "loss": 0.60552537, "num_input_tokens_seen": 52873230, "step": 2435, "time_per_iteration": 3.229882001876831 }, { "auxiliary_loss_clip": 0.01160087, "auxiliary_loss_mlp": 0.01045043, "balance_loss_clip": 1.05263698, "balance_loss_mlp": 1.0257076, "epoch": 0.1464602434991733, "flos": 12276243146880.0, "grad_norm": 3.775553645712452, "language_loss": 0.88436592, "learning_rate": 3.859403828419744e-06, "loss": 0.90641725, "num_input_tokens_seen": 52889325, "step": 2436, "time_per_iteration": 2.568624973297119 }, { "auxiliary_loss_clip": 0.011561, "auxiliary_loss_mlp": 0.00780257, "balance_loss_clip": 1.05587268, "balance_loss_mlp": 1.00041819, "epoch": 0.14652036675184127, "flos": 20922311197440.0, "grad_norm": 2.028718201913856, "language_loss": 0.74904168, "learning_rate": 3.85926034942691e-06, "loss": 0.7684052, "num_input_tokens_seen": 52909705, "step": 2437, "time_per_iteration": 2.6361188888549805 }, { "auxiliary_loss_clip": 0.01165187, "auxiliary_loss_mlp": 0.01050068, "balance_loss_clip": 1.05295086, "balance_loss_mlp": 1.02729869, "epoch": 0.14658049000450923, "flos": 27703681528320.0, "grad_norm": 3.0822234004311033, "language_loss": 0.73914421, "learning_rate": 3.859116799930736e-06, "loss": 0.76129669, "num_input_tokens_seen": 52930300, "step": 2438, "time_per_iteration": 2.7590928077697754 }, { "auxiliary_loss_clip": 0.01154571, "auxiliary_loss_mlp": 0.01046509, "balance_loss_clip": 1.05747688, "balance_loss_mlp": 1.02708936, "epoch": 0.14664061325717723, "flos": 24936513575040.0, "grad_norm": 4.476318678757457, "language_loss": 0.74410725, "learning_rate": 3.858973179936668e-06, "loss": 0.76611805, "num_input_tokens_seen": 52949955, "step": 2439, "time_per_iteration": 2.627037763595581 }, { "auxiliary_loss_clip": 0.01152452, "auxiliary_loss_mlp": 0.01051294, "balance_loss_clip": 1.05477583, "balance_loss_mlp": 1.0309453, "epoch": 0.1467007365098452, "flos": 40297661406720.0, "grad_norm": 2.1583973700525343, "language_loss": 0.74123728, "learning_rate": 3.85882948945015e-06, "loss": 0.76327467, "num_input_tokens_seen": 52972905, "step": 2440, "time_per_iteration": 2.79715633392334 }, { "auxiliary_loss_clip": 0.01160843, "auxiliary_loss_mlp": 0.01044034, "balance_loss_clip": 1.05471611, "balance_loss_mlp": 1.02493691, "epoch": 0.14676085976251316, "flos": 26541074021760.0, "grad_norm": 1.9756103236146798, "language_loss": 0.82730794, "learning_rate": 3.85868572847663e-06, "loss": 0.84935671, "num_input_tokens_seen": 52994850, "step": 2441, "time_per_iteration": 2.6505653858184814 }, { "auxiliary_loss_clip": 0.01152605, "auxiliary_loss_mlp": 0.01049175, "balance_loss_clip": 1.05408478, "balance_loss_mlp": 1.02796757, "epoch": 0.14682098301518112, "flos": 23550110380800.0, "grad_norm": 2.582118236216862, "language_loss": 0.71455544, "learning_rate": 3.858541897021563e-06, "loss": 0.73657322, "num_input_tokens_seen": 53014740, "step": 2442, "time_per_iteration": 2.772648572921753 }, { "auxiliary_loss_clip": 0.0113053, "auxiliary_loss_mlp": 0.0104246, "balance_loss_clip": 1.05283213, "balance_loss_mlp": 1.02224207, "epoch": 0.1468811062678491, "flos": 11651073909120.0, "grad_norm": 3.6780587187273155, "language_loss": 0.81992352, "learning_rate": 3.8583979950904e-06, "loss": 0.84165335, "num_input_tokens_seen": 53029780, "step": 2443, "time_per_iteration": 2.6979780197143555 }, { "auxiliary_loss_clip": 0.01147138, "auxiliary_loss_mlp": 0.0105693, "balance_loss_clip": 1.05402422, "balance_loss_mlp": 1.03474557, "epoch": 0.14694122952051705, "flos": 23002616304000.0, "grad_norm": 3.190851099873364, "language_loss": 0.83093917, "learning_rate": 3.858254022688599e-06, "loss": 0.85297978, "num_input_tokens_seen": 53048620, "step": 2444, "time_per_iteration": 2.7177255153656006 }, { "auxiliary_loss_clip": 0.01134628, "auxiliary_loss_mlp": 0.01051986, "balance_loss_clip": 1.05385137, "balance_loss_mlp": 1.03213811, "epoch": 0.14700135277318502, "flos": 26502972670080.0, "grad_norm": 3.1425569240832414, "language_loss": 0.71183646, "learning_rate": 3.85810997982162e-06, "loss": 0.7337026, "num_input_tokens_seen": 53070055, "step": 2445, "time_per_iteration": 2.735361099243164 }, { "auxiliary_loss_clip": 0.01095177, "auxiliary_loss_mlp": 0.01023118, "balance_loss_clip": 1.05335557, "balance_loss_mlp": 1.01999438, "epoch": 0.147061476025853, "flos": 59449434387840.0, "grad_norm": 0.824990401786658, "language_loss": 0.63083708, "learning_rate": 3.857965866494923e-06, "loss": 0.65202004, "num_input_tokens_seen": 53126945, "step": 2446, "time_per_iteration": 3.0853025913238525 }, { "auxiliary_loss_clip": 0.01120664, "auxiliary_loss_mlp": 0.01045249, "balance_loss_clip": 1.05621576, "balance_loss_mlp": 1.02491164, "epoch": 0.14712159927852098, "flos": 28330897841280.0, "grad_norm": 2.813052009295296, "language_loss": 0.74895924, "learning_rate": 3.857821682713975e-06, "loss": 0.77061838, "num_input_tokens_seen": 53149130, "step": 2447, "time_per_iteration": 2.858643054962158 }, { "auxiliary_loss_clip": 0.01168929, "auxiliary_loss_mlp": 0.01042907, "balance_loss_clip": 1.0604012, "balance_loss_mlp": 1.02383327, "epoch": 0.14718172253118894, "flos": 27089825074560.0, "grad_norm": 2.2427639286159367, "language_loss": 0.8528471, "learning_rate": 3.857677428484242e-06, "loss": 0.87496543, "num_input_tokens_seen": 53167120, "step": 2448, "time_per_iteration": 2.699781894683838 }, { "auxiliary_loss_clip": 0.01092169, "auxiliary_loss_mlp": 0.01019616, "balance_loss_clip": 1.05051064, "balance_loss_mlp": 1.01654005, "epoch": 0.1472418457838569, "flos": 66706764860160.0, "grad_norm": 0.7683837313264128, "language_loss": 0.56829578, "learning_rate": 3.857533103811195e-06, "loss": 0.58941364, "num_input_tokens_seen": 53227945, "step": 2449, "time_per_iteration": 3.1478211879730225 }, { "auxiliary_loss_clip": 0.01135016, "auxiliary_loss_mlp": 0.01050801, "balance_loss_clip": 1.05464292, "balance_loss_mlp": 1.03023791, "epoch": 0.14730196903652487, "flos": 19573578391680.0, "grad_norm": 1.9048653074507311, "language_loss": 0.85067344, "learning_rate": 3.857388708700307e-06, "loss": 0.87253165, "num_input_tokens_seen": 53244615, "step": 2450, "time_per_iteration": 2.726008653640747 }, { "auxiliary_loss_clip": 0.01158708, "auxiliary_loss_mlp": 0.01049735, "balance_loss_clip": 1.05984712, "balance_loss_mlp": 1.02994645, "epoch": 0.14736209228919284, "flos": 16071031296000.0, "grad_norm": 2.306043539040143, "language_loss": 0.74523091, "learning_rate": 3.857244243157052e-06, "loss": 0.76731533, "num_input_tokens_seen": 53262205, "step": 2451, "time_per_iteration": 2.641082286834717 }, { "auxiliary_loss_clip": 0.01133915, "auxiliary_loss_mlp": 0.01038458, "balance_loss_clip": 1.05399728, "balance_loss_mlp": 1.02031422, "epoch": 0.1474222155418608, "flos": 23039460679680.0, "grad_norm": 1.8026547738986978, "language_loss": 0.82384264, "learning_rate": 3.85709970718691e-06, "loss": 0.84556639, "num_input_tokens_seen": 53282445, "step": 2452, "time_per_iteration": 2.7810096740722656 }, { "auxiliary_loss_clip": 0.01101553, "auxiliary_loss_mlp": 0.01041864, "balance_loss_clip": 1.05924153, "balance_loss_mlp": 1.0238874, "epoch": 0.1474823387945288, "flos": 17018641946880.0, "grad_norm": 1.6675065143572472, "language_loss": 0.74075705, "learning_rate": 3.856955100795361e-06, "loss": 0.76219124, "num_input_tokens_seen": 53299060, "step": 2453, "time_per_iteration": 2.7913167476654053 }, { "auxiliary_loss_clip": 0.01141798, "auxiliary_loss_mlp": 0.0104607, "balance_loss_clip": 1.05557632, "balance_loss_mlp": 1.026353, "epoch": 0.14754246204719676, "flos": 17895041884800.0, "grad_norm": 1.9958141581621542, "language_loss": 0.7558704, "learning_rate": 3.856810423987889e-06, "loss": 0.77774906, "num_input_tokens_seen": 53315970, "step": 2454, "time_per_iteration": 2.7199089527130127 }, { "auxiliary_loss_clip": 0.01147348, "auxiliary_loss_mlp": 0.01038134, "balance_loss_clip": 1.05733335, "balance_loss_mlp": 1.01864362, "epoch": 0.14760258529986472, "flos": 13079097987840.0, "grad_norm": 2.0858167958418674, "language_loss": 0.83077228, "learning_rate": 3.856665676769979e-06, "loss": 0.85262716, "num_input_tokens_seen": 53332940, "step": 2455, "time_per_iteration": 2.75616192817688 }, { "auxiliary_loss_clip": 0.01130504, "auxiliary_loss_mlp": 0.01042951, "balance_loss_clip": 1.05704689, "balance_loss_mlp": 1.02452159, "epoch": 0.1476627085525327, "flos": 30806399358720.0, "grad_norm": 2.3702229998953976, "language_loss": 0.83881497, "learning_rate": 3.85652085914712e-06, "loss": 0.86054951, "num_input_tokens_seen": 53353295, "step": 2456, "time_per_iteration": 2.7914254665374756 }, { "auxiliary_loss_clip": 0.01154014, "auxiliary_loss_mlp": 0.01043715, "balance_loss_clip": 1.05863023, "balance_loss_mlp": 1.02514231, "epoch": 0.14772283180520066, "flos": 21689434984320.0, "grad_norm": 2.4172359629848996, "language_loss": 0.84154665, "learning_rate": 3.856375971124805e-06, "loss": 0.86352402, "num_input_tokens_seen": 53373410, "step": 2457, "time_per_iteration": 2.688265323638916 }, { "auxiliary_loss_clip": 0.01155788, "auxiliary_loss_mlp": 0.01042903, "balance_loss_clip": 1.06250155, "balance_loss_mlp": 1.02529585, "epoch": 0.14778295505786862, "flos": 18770400328320.0, "grad_norm": 6.310680797376285, "language_loss": 0.75692672, "learning_rate": 3.856231012708527e-06, "loss": 0.77891362, "num_input_tokens_seen": 53391430, "step": 2458, "time_per_iteration": 2.698697805404663 }, { "auxiliary_loss_clip": 0.01117404, "auxiliary_loss_mlp": 0.01047753, "balance_loss_clip": 1.05451179, "balance_loss_mlp": 1.02718902, "epoch": 0.1478430783105366, "flos": 22893555634560.0, "grad_norm": 3.1268711361266393, "language_loss": 0.83348328, "learning_rate": 3.856085983903782e-06, "loss": 0.85513484, "num_input_tokens_seen": 53409960, "step": 2459, "time_per_iteration": 2.790552854537964 }, { "auxiliary_loss_clip": 0.01126767, "auxiliary_loss_mlp": 0.01042293, "balance_loss_clip": 1.05070424, "balance_loss_mlp": 1.02435231, "epoch": 0.14790320156320458, "flos": 15085319293440.0, "grad_norm": 3.1203941208753534, "language_loss": 0.7554391, "learning_rate": 3.855940884716071e-06, "loss": 0.77712965, "num_input_tokens_seen": 53426160, "step": 2460, "time_per_iteration": 2.815455675125122 }, { "auxiliary_loss_clip": 0.01134117, "auxiliary_loss_mlp": 0.01056838, "balance_loss_clip": 1.05845904, "balance_loss_mlp": 1.03770471, "epoch": 0.14796332481587254, "flos": 26504768350080.0, "grad_norm": 3.59241393994, "language_loss": 0.81227219, "learning_rate": 3.855795715150896e-06, "loss": 0.83418173, "num_input_tokens_seen": 53448530, "step": 2461, "time_per_iteration": 2.785569190979004 }, { "auxiliary_loss_clip": 0.01156748, "auxiliary_loss_mlp": 0.01051178, "balance_loss_clip": 1.05812359, "balance_loss_mlp": 1.03044713, "epoch": 0.1480234480685405, "flos": 17563191108480.0, "grad_norm": 3.2910626990147183, "language_loss": 0.66117477, "learning_rate": 3.855650475213761e-06, "loss": 0.683254, "num_input_tokens_seen": 53465915, "step": 2462, "time_per_iteration": 2.7222983837127686 }, { "auxiliary_loss_clip": 0.01136035, "auxiliary_loss_mlp": 0.01049537, "balance_loss_clip": 1.05622339, "balance_loss_mlp": 1.02965331, "epoch": 0.14808357132120847, "flos": 53582203232640.0, "grad_norm": 1.8120706772856114, "language_loss": 0.67226064, "learning_rate": 3.8555051649101745e-06, "loss": 0.69411635, "num_input_tokens_seen": 53496055, "step": 2463, "time_per_iteration": 3.0344398021698 }, { "auxiliary_loss_clip": 0.01153077, "auxiliary_loss_mlp": 0.01050435, "balance_loss_clip": 1.05550933, "balance_loss_mlp": 1.0307889, "epoch": 0.14814369457387644, "flos": 19829190551040.0, "grad_norm": 1.9881580745750587, "language_loss": 0.76870739, "learning_rate": 3.855359784245646e-06, "loss": 0.79074258, "num_input_tokens_seen": 53513790, "step": 2464, "time_per_iteration": 2.69480037689209 }, { "auxiliary_loss_clip": 0.01133748, "auxiliary_loss_mlp": 0.01057139, "balance_loss_clip": 1.05392432, "balance_loss_mlp": 1.03769565, "epoch": 0.1482038178265444, "flos": 23914962777600.0, "grad_norm": 1.8401367705559406, "language_loss": 0.79628456, "learning_rate": 3.855214333225688e-06, "loss": 0.81819344, "num_input_tokens_seen": 53533410, "step": 2465, "time_per_iteration": 2.6989939212799072 }, { "auxiliary_loss_clip": 0.01170385, "auxiliary_loss_mlp": 0.01054925, "balance_loss_clip": 1.06119514, "balance_loss_mlp": 1.03568494, "epoch": 0.1482639410792124, "flos": 24170503109760.0, "grad_norm": 2.005541134809237, "language_loss": 0.76272273, "learning_rate": 3.855068811855817e-06, "loss": 0.78497583, "num_input_tokens_seen": 53554775, "step": 2466, "time_per_iteration": 2.646245002746582 }, { "auxiliary_loss_clip": 0.01018939, "auxiliary_loss_mlp": 0.0114331, "balance_loss_clip": 1.03313899, "balance_loss_mlp": 1.14004362, "epoch": 0.14832406433188036, "flos": 66191051341440.0, "grad_norm": 0.8320983618395327, "language_loss": 0.6004858, "learning_rate": 3.854923220141551e-06, "loss": 0.62210834, "num_input_tokens_seen": 53609675, "step": 2467, "time_per_iteration": 3.33776593208313 }, { "auxiliary_loss_clip": 0.01141854, "auxiliary_loss_mlp": 0.01044026, "balance_loss_clip": 1.05437851, "balance_loss_mlp": 1.02509522, "epoch": 0.14838418758454833, "flos": 25411252654080.0, "grad_norm": 2.92694776694492, "language_loss": 0.87666196, "learning_rate": 3.85477755808841e-06, "loss": 0.89852077, "num_input_tokens_seen": 53626950, "step": 2468, "time_per_iteration": 4.266207456588745 }, { "auxiliary_loss_clip": 0.01130189, "auxiliary_loss_mlp": 0.01048186, "balance_loss_clip": 1.05255163, "balance_loss_mlp": 1.02782488, "epoch": 0.1484443108372163, "flos": 23289901280640.0, "grad_norm": 2.2284173124426223, "language_loss": 0.7598694, "learning_rate": 3.854631825701919e-06, "loss": 0.78165317, "num_input_tokens_seen": 53644200, "step": 2469, "time_per_iteration": 4.217481851577759 }, { "auxiliary_loss_clip": 0.01126269, "auxiliary_loss_mlp": 0.0104139, "balance_loss_clip": 1.05208421, "balance_loss_mlp": 1.02251911, "epoch": 0.14850443408988426, "flos": 14647675985280.0, "grad_norm": 6.591244267451795, "language_loss": 0.75895017, "learning_rate": 3.854486022987603e-06, "loss": 0.78062677, "num_input_tokens_seen": 53659650, "step": 2470, "time_per_iteration": 2.7157187461853027 }, { "auxiliary_loss_clip": 0.01161157, "auxiliary_loss_mlp": 0.01044729, "balance_loss_clip": 1.05831027, "balance_loss_mlp": 1.02571499, "epoch": 0.14856455734255222, "flos": 23548314700800.0, "grad_norm": 1.8610043660805562, "language_loss": 0.7215873, "learning_rate": 3.8543401499509905e-06, "loss": 0.74364614, "num_input_tokens_seen": 53680275, "step": 2471, "time_per_iteration": 4.162387132644653 }, { "auxiliary_loss_clip": 0.01135244, "auxiliary_loss_mlp": 0.01047611, "balance_loss_clip": 1.05438995, "balance_loss_mlp": 1.02717888, "epoch": 0.1486246805952202, "flos": 18077288515200.0, "grad_norm": 1.979025280241548, "language_loss": 0.89558828, "learning_rate": 3.854194206597615e-06, "loss": 0.91741687, "num_input_tokens_seen": 53698270, "step": 2472, "time_per_iteration": 2.739457607269287 }, { "auxiliary_loss_clip": 0.01134625, "auxiliary_loss_mlp": 0.01049109, "balance_loss_clip": 1.06334805, "balance_loss_mlp": 1.02964163, "epoch": 0.14868480384788818, "flos": 19353625459200.0, "grad_norm": 2.6029609251362764, "language_loss": 0.80801564, "learning_rate": 3.854048192933008e-06, "loss": 0.82985294, "num_input_tokens_seen": 53716845, "step": 2473, "time_per_iteration": 4.412883758544922 }, { "auxiliary_loss_clip": 0.01161034, "auxiliary_loss_mlp": 0.01051306, "balance_loss_clip": 1.0626657, "balance_loss_mlp": 1.03267312, "epoch": 0.14874492710055615, "flos": 22200192426240.0, "grad_norm": 3.426519274325147, "language_loss": 0.77372944, "learning_rate": 3.853902108962709e-06, "loss": 0.79585278, "num_input_tokens_seen": 53734970, "step": 2474, "time_per_iteration": 2.6879520416259766 }, { "auxiliary_loss_clip": 0.01124216, "auxiliary_loss_mlp": 0.01059785, "balance_loss_clip": 1.05597806, "balance_loss_mlp": 1.04041362, "epoch": 0.1488050503532241, "flos": 21103444506240.0, "grad_norm": 2.4771626433268734, "language_loss": 0.82151824, "learning_rate": 3.853755954692255e-06, "loss": 0.84335828, "num_input_tokens_seen": 53753415, "step": 2475, "time_per_iteration": 2.7828469276428223 }, { "auxiliary_loss_clip": 0.01115855, "auxiliary_loss_mlp": 0.01052322, "balance_loss_clip": 1.0614953, "balance_loss_mlp": 1.03341544, "epoch": 0.14886517360589208, "flos": 12786569625600.0, "grad_norm": 1.9349243252831771, "language_loss": 0.80917645, "learning_rate": 3.85360973012719e-06, "loss": 0.83085823, "num_input_tokens_seen": 53770305, "step": 2476, "time_per_iteration": 2.7227590084075928 }, { "auxiliary_loss_clip": 0.01156019, "auxiliary_loss_mlp": 0.0105036, "balance_loss_clip": 1.06338036, "balance_loss_mlp": 1.03216898, "epoch": 0.14892529685856004, "flos": 29022860419200.0, "grad_norm": 2.0032169897498346, "language_loss": 0.77659523, "learning_rate": 3.853463435273058e-06, "loss": 0.79865897, "num_input_tokens_seen": 53788895, "step": 2477, "time_per_iteration": 2.740241765975952 }, { "auxiliary_loss_clip": 0.0110234, "auxiliary_loss_mlp": 0.01092005, "balance_loss_clip": 1.07879949, "balance_loss_mlp": 1.08730817, "epoch": 0.148985420111228, "flos": 61926121054080.0, "grad_norm": 0.8188153224748298, "language_loss": 0.60153681, "learning_rate": 3.853317070135407e-06, "loss": 0.62348026, "num_input_tokens_seen": 53850260, "step": 2478, "time_per_iteration": 3.2467947006225586 }, { "auxiliary_loss_clip": 0.01107417, "auxiliary_loss_mlp": 0.01048452, "balance_loss_clip": 1.0516423, "balance_loss_mlp": 1.03041577, "epoch": 0.149045543363896, "flos": 23915106432000.0, "grad_norm": 2.666109649137694, "language_loss": 0.7139731, "learning_rate": 3.853170634719787e-06, "loss": 0.73553181, "num_input_tokens_seen": 53867520, "step": 2479, "time_per_iteration": 2.7973475456237793 }, { "auxiliary_loss_clip": 0.01140551, "auxiliary_loss_mlp": 0.01043104, "balance_loss_clip": 1.05563831, "balance_loss_mlp": 1.02407789, "epoch": 0.14910566661656396, "flos": 23654394541440.0, "grad_norm": 1.7687137634424535, "language_loss": 0.80758464, "learning_rate": 3.853024129031751e-06, "loss": 0.82942122, "num_input_tokens_seen": 53886620, "step": 2480, "time_per_iteration": 2.7238829135894775 }, { "auxiliary_loss_clip": 0.01138106, "auxiliary_loss_mlp": 0.0104537, "balance_loss_clip": 1.0584991, "balance_loss_mlp": 1.02627277, "epoch": 0.14916578986923193, "flos": 20515299212160.0, "grad_norm": 4.65741826395702, "language_loss": 0.84375542, "learning_rate": 3.852877553076854e-06, "loss": 0.86559021, "num_input_tokens_seen": 53902230, "step": 2481, "time_per_iteration": 2.791550874710083 }, { "auxiliary_loss_clip": 0.01149484, "auxiliary_loss_mlp": 0.01050268, "balance_loss_clip": 1.05772805, "balance_loss_mlp": 1.02948999, "epoch": 0.1492259131218999, "flos": 22491822948480.0, "grad_norm": 8.035113387353048, "language_loss": 0.77703977, "learning_rate": 3.8527309068606546e-06, "loss": 0.79903734, "num_input_tokens_seen": 53919475, "step": 2482, "time_per_iteration": 2.7310593128204346 }, { "auxiliary_loss_clip": 0.01133163, "auxiliary_loss_mlp": 0.01040426, "balance_loss_clip": 1.05452228, "balance_loss_mlp": 1.02032781, "epoch": 0.14928603637456786, "flos": 23185868515200.0, "grad_norm": 2.207731010812049, "language_loss": 0.78967929, "learning_rate": 3.852584190388713e-06, "loss": 0.81141514, "num_input_tokens_seen": 53939150, "step": 2483, "time_per_iteration": 2.749671220779419 }, { "auxiliary_loss_clip": 0.01154122, "auxiliary_loss_mlp": 0.00776708, "balance_loss_clip": 1.06144214, "balance_loss_mlp": 1.00029397, "epoch": 0.14934615962723582, "flos": 21653237053440.0, "grad_norm": 2.020127706544282, "language_loss": 0.70361555, "learning_rate": 3.852437403666595e-06, "loss": 0.72292387, "num_input_tokens_seen": 53958735, "step": 2484, "time_per_iteration": 2.737781524658203 }, { "auxiliary_loss_clip": 0.01141919, "auxiliary_loss_mlp": 0.00778215, "balance_loss_clip": 1.05718136, "balance_loss_mlp": 1.00030363, "epoch": 0.1494062828799038, "flos": 27010066924800.0, "grad_norm": 2.165877689982274, "language_loss": 0.84666765, "learning_rate": 3.852290546699863e-06, "loss": 0.86586899, "num_input_tokens_seen": 53975065, "step": 2485, "time_per_iteration": 2.697976589202881 }, { "auxiliary_loss_clip": 0.01145272, "auxiliary_loss_mlp": 0.0104224, "balance_loss_clip": 1.05639958, "balance_loss_mlp": 1.02257001, "epoch": 0.14946640613257178, "flos": 21214947300480.0, "grad_norm": 2.5229241908443023, "language_loss": 0.8476423, "learning_rate": 3.8521436194940894e-06, "loss": 0.86951739, "num_input_tokens_seen": 53993330, "step": 2486, "time_per_iteration": 2.6799628734588623 }, { "auxiliary_loss_clip": 0.01149031, "auxiliary_loss_mlp": 0.01039312, "balance_loss_clip": 1.05667424, "balance_loss_mlp": 1.0230875, "epoch": 0.14952652938523975, "flos": 13370872164480.0, "grad_norm": 2.1822908802725203, "language_loss": 0.74762607, "learning_rate": 3.851996622054842e-06, "loss": 0.76950949, "num_input_tokens_seen": 54010515, "step": 2487, "time_per_iteration": 2.8037290573120117 }, { "auxiliary_loss_clip": 0.01153097, "auxiliary_loss_mlp": 0.01044274, "balance_loss_clip": 1.05934322, "balance_loss_mlp": 1.02611899, "epoch": 0.1495866526379077, "flos": 35517699959040.0, "grad_norm": 16.320028017118723, "language_loss": 0.72210175, "learning_rate": 3.8518495543877e-06, "loss": 0.74407548, "num_input_tokens_seen": 54031315, "step": 2488, "time_per_iteration": 2.8031094074249268 }, { "auxiliary_loss_clip": 0.01137536, "auxiliary_loss_mlp": 0.01054916, "balance_loss_clip": 1.05569518, "balance_loss_mlp": 1.03636682, "epoch": 0.14964677589057568, "flos": 17632749795840.0, "grad_norm": 3.2458980886023143, "language_loss": 0.71352434, "learning_rate": 3.851702416498235e-06, "loss": 0.73544884, "num_input_tokens_seen": 54045965, "step": 2489, "time_per_iteration": 2.648883819580078 }, { "auxiliary_loss_clip": 0.0113767, "auxiliary_loss_mlp": 0.01052603, "balance_loss_clip": 1.05376494, "balance_loss_mlp": 1.03357768, "epoch": 0.14970689914324364, "flos": 20185280029440.0, "grad_norm": 3.893198448080141, "language_loss": 0.81559736, "learning_rate": 3.8515552083920295e-06, "loss": 0.8375001, "num_input_tokens_seen": 54059960, "step": 2490, "time_per_iteration": 2.702808380126953 }, { "auxiliary_loss_clip": 0.01125097, "auxiliary_loss_mlp": 0.01055928, "balance_loss_clip": 1.05606139, "balance_loss_mlp": 1.03803492, "epoch": 0.1497670223959116, "flos": 37228699382400.0, "grad_norm": 1.9071281232744548, "language_loss": 0.80057055, "learning_rate": 3.851407930074666e-06, "loss": 0.82238084, "num_input_tokens_seen": 54079330, "step": 2491, "time_per_iteration": 2.833272933959961 }, { "auxiliary_loss_clip": 0.01143407, "auxiliary_loss_mlp": 0.01052558, "balance_loss_clip": 1.05301452, "balance_loss_mlp": 1.03195894, "epoch": 0.1498271456485796, "flos": 24455848752000.0, "grad_norm": 2.3105790695512294, "language_loss": 0.90820229, "learning_rate": 3.851260581551727e-06, "loss": 0.93016195, "num_input_tokens_seen": 54097555, "step": 2492, "time_per_iteration": 2.684178352355957 }, { "auxiliary_loss_clip": 0.01152331, "auxiliary_loss_mlp": 0.01063543, "balance_loss_clip": 1.05835843, "balance_loss_mlp": 1.04508913, "epoch": 0.14988726890124757, "flos": 16253601148800.0, "grad_norm": 6.881290297472923, "language_loss": 0.79406559, "learning_rate": 3.851113162828802e-06, "loss": 0.81622434, "num_input_tokens_seen": 54115600, "step": 2493, "time_per_iteration": 2.6558918952941895 }, { "auxiliary_loss_clip": 0.0114858, "auxiliary_loss_mlp": 0.01052018, "balance_loss_clip": 1.05345511, "balance_loss_mlp": 1.03258693, "epoch": 0.14994739215391553, "flos": 20666555383680.0, "grad_norm": 2.3431247769189967, "language_loss": 0.79894584, "learning_rate": 3.85096567391148e-06, "loss": 0.82095182, "num_input_tokens_seen": 54135220, "step": 2494, "time_per_iteration": 2.6774168014526367 }, { "auxiliary_loss_clip": 0.01137216, "auxiliary_loss_mlp": 0.01050857, "balance_loss_clip": 1.05474579, "balance_loss_mlp": 1.03212965, "epoch": 0.1500075154065835, "flos": 70652375239680.0, "grad_norm": 1.928941284350508, "language_loss": 0.66480517, "learning_rate": 3.850818114805354e-06, "loss": 0.68668592, "num_input_tokens_seen": 54161065, "step": 2495, "time_per_iteration": 3.1090729236602783 }, { "auxiliary_loss_clip": 0.01103374, "auxiliary_loss_mlp": 0.01038654, "balance_loss_clip": 1.06896818, "balance_loss_mlp": 1.03560257, "epoch": 0.15006763865925146, "flos": 68011937447040.0, "grad_norm": 0.9030283421527312, "language_loss": 0.59524739, "learning_rate": 3.850670485516019e-06, "loss": 0.61666763, "num_input_tokens_seen": 54225095, "step": 2496, "time_per_iteration": 3.2250726222991943 }, { "auxiliary_loss_clip": 0.01163934, "auxiliary_loss_mlp": 0.01055725, "balance_loss_clip": 1.05690169, "balance_loss_mlp": 1.0360074, "epoch": 0.15012776191191943, "flos": 18916269459840.0, "grad_norm": 3.063784198565679, "language_loss": 0.65276247, "learning_rate": 3.850522786049075e-06, "loss": 0.67495906, "num_input_tokens_seen": 54243750, "step": 2497, "time_per_iteration": 2.619946002960205 }, { "auxiliary_loss_clip": 0.01125657, "auxiliary_loss_mlp": 0.01054091, "balance_loss_clip": 1.05308235, "balance_loss_mlp": 1.03316998, "epoch": 0.1501878851645874, "flos": 23701330638720.0, "grad_norm": 1.5552670947231086, "language_loss": 0.75182658, "learning_rate": 3.850375016410121e-06, "loss": 0.77362406, "num_input_tokens_seen": 54266185, "step": 2498, "time_per_iteration": 2.778163433074951 }, { "auxiliary_loss_clip": 0.01132738, "auxiliary_loss_mlp": 0.01046919, "balance_loss_clip": 1.05919099, "balance_loss_mlp": 1.02701163, "epoch": 0.15024800841725539, "flos": 20412523422720.0, "grad_norm": 3.357364003851319, "language_loss": 0.71821117, "learning_rate": 3.850227176604761e-06, "loss": 0.74000776, "num_input_tokens_seen": 54283940, "step": 2499, "time_per_iteration": 2.6929259300231934 }, { "auxiliary_loss_clip": 0.01134239, "auxiliary_loss_mlp": 0.01051817, "balance_loss_clip": 1.0547812, "balance_loss_mlp": 1.03236222, "epoch": 0.15030813166992335, "flos": 31831002812160.0, "grad_norm": 2.1406696998963652, "language_loss": 0.7206136, "learning_rate": 3.850079266638601e-06, "loss": 0.7424742, "num_input_tokens_seen": 54304830, "step": 2500, "time_per_iteration": 2.769988536834717 }, { "auxiliary_loss_clip": 0.01134021, "auxiliary_loss_mlp": 0.0105021, "balance_loss_clip": 1.06063724, "balance_loss_mlp": 1.03181624, "epoch": 0.15036825492259132, "flos": 35657822914560.0, "grad_norm": 2.0251881980439306, "language_loss": 0.65127194, "learning_rate": 3.849931286517249e-06, "loss": 0.6731143, "num_input_tokens_seen": 54325595, "step": 2501, "time_per_iteration": 2.810945510864258 }, { "auxiliary_loss_clip": 0.01137877, "auxiliary_loss_mlp": 0.01055223, "balance_loss_clip": 1.0541079, "balance_loss_mlp": 1.03511274, "epoch": 0.15042837817525928, "flos": 18838163335680.0, "grad_norm": 2.209666371186328, "language_loss": 0.83401144, "learning_rate": 3.849783236246318e-06, "loss": 0.85594243, "num_input_tokens_seen": 54342180, "step": 2502, "time_per_iteration": 2.6780545711517334 }, { "auxiliary_loss_clip": 0.01122961, "auxiliary_loss_mlp": 0.01049887, "balance_loss_clip": 1.05318308, "balance_loss_mlp": 1.0323875, "epoch": 0.15048850142792725, "flos": 19535548867200.0, "grad_norm": 2.0319272128830947, "language_loss": 0.77134645, "learning_rate": 3.849635115831421e-06, "loss": 0.79307491, "num_input_tokens_seen": 54360255, "step": 2503, "time_per_iteration": 2.7579123973846436 }, { "auxiliary_loss_clip": 0.01159116, "auxiliary_loss_mlp": 0.01044094, "balance_loss_clip": 1.05766046, "balance_loss_mlp": 1.02692807, "epoch": 0.1505486246805952, "flos": 22017550746240.0, "grad_norm": 1.9852139459946199, "language_loss": 0.85514295, "learning_rate": 3.849486925278176e-06, "loss": 0.87717503, "num_input_tokens_seen": 54378260, "step": 2504, "time_per_iteration": 2.631882905960083 }, { "auxiliary_loss_clip": 0.01146113, "auxiliary_loss_mlp": 0.01048035, "balance_loss_clip": 1.05622697, "balance_loss_mlp": 1.03098798, "epoch": 0.15060874793326318, "flos": 20743153136640.0, "grad_norm": 1.8222645508164372, "language_loss": 0.83178544, "learning_rate": 3.8493386645922e-06, "loss": 0.85372692, "num_input_tokens_seen": 54399745, "step": 2505, "time_per_iteration": 2.7706007957458496 }, { "auxiliary_loss_clip": 0.01125699, "auxiliary_loss_mlp": 0.01053819, "balance_loss_clip": 1.05586648, "balance_loss_mlp": 1.03590202, "epoch": 0.15066887118593117, "flos": 16471902055680.0, "grad_norm": 2.0148067518000445, "language_loss": 0.76044405, "learning_rate": 3.849190333779117e-06, "loss": 0.7822392, "num_input_tokens_seen": 54417105, "step": 2506, "time_per_iteration": 2.70989990234375 }, { "auxiliary_loss_clip": 0.01165314, "auxiliary_loss_mlp": 0.01041911, "balance_loss_clip": 1.05785728, "balance_loss_mlp": 1.02305174, "epoch": 0.15072899443859913, "flos": 19859319083520.0, "grad_norm": 2.823460856599666, "language_loss": 0.76220375, "learning_rate": 3.849041932844552e-06, "loss": 0.78427601, "num_input_tokens_seen": 54433920, "step": 2507, "time_per_iteration": 2.5367634296417236 }, { "auxiliary_loss_clip": 0.01144479, "auxiliary_loss_mlp": 0.01041094, "balance_loss_clip": 1.05261898, "balance_loss_mlp": 1.02306986, "epoch": 0.1507891176912671, "flos": 20776226584320.0, "grad_norm": 2.5197772895304906, "language_loss": 0.68633789, "learning_rate": 3.848893461794131e-06, "loss": 0.70819366, "num_input_tokens_seen": 54451540, "step": 2508, "time_per_iteration": 4.303388833999634 }, { "auxiliary_loss_clip": 0.01130299, "auxiliary_loss_mlp": 0.01046507, "balance_loss_clip": 1.05477214, "balance_loss_mlp": 1.02835178, "epoch": 0.15084924094393506, "flos": 23586631534080.0, "grad_norm": 2.840517748098311, "language_loss": 0.77994299, "learning_rate": 3.8487449206334845e-06, "loss": 0.80171108, "num_input_tokens_seen": 54470800, "step": 2509, "time_per_iteration": 4.380200147628784 }, { "auxiliary_loss_clip": 0.01141335, "auxiliary_loss_mlp": 0.00776843, "balance_loss_clip": 1.05463386, "balance_loss_mlp": 1.00027037, "epoch": 0.15090936419660303, "flos": 18911313383040.0, "grad_norm": 2.53406994590866, "language_loss": 0.79959804, "learning_rate": 3.848596309368246e-06, "loss": 0.81877983, "num_input_tokens_seen": 54486525, "step": 2510, "time_per_iteration": 4.219487428665161 }, { "auxiliary_loss_clip": 0.01150641, "auxiliary_loss_mlp": 0.01047345, "balance_loss_clip": 1.05529225, "balance_loss_mlp": 1.02794981, "epoch": 0.150969487449271, "flos": 17928223073280.0, "grad_norm": 1.8628702139594306, "language_loss": 0.73398602, "learning_rate": 3.8484476280040495e-06, "loss": 0.75596589, "num_input_tokens_seen": 54503795, "step": 2511, "time_per_iteration": 2.62237811088562 }, { "auxiliary_loss_clip": 0.01094269, "auxiliary_loss_mlp": 0.0104236, "balance_loss_clip": 1.04747009, "balance_loss_mlp": 1.02365553, "epoch": 0.151029610701939, "flos": 24243078539520.0, "grad_norm": 2.20399257021602, "language_loss": 0.68716824, "learning_rate": 3.848298876546534e-06, "loss": 0.70853454, "num_input_tokens_seen": 54523025, "step": 2512, "time_per_iteration": 2.823359489440918 }, { "auxiliary_loss_clip": 0.01149398, "auxiliary_loss_mlp": 0.01043296, "balance_loss_clip": 1.05574036, "balance_loss_mlp": 1.02615356, "epoch": 0.15108973395460695, "flos": 30262496641920.0, "grad_norm": 2.6278607305338877, "language_loss": 0.73833561, "learning_rate": 3.84815005500134e-06, "loss": 0.76026255, "num_input_tokens_seen": 54545025, "step": 2513, "time_per_iteration": 4.386258602142334 }, { "auxiliary_loss_clip": 0.01059691, "auxiliary_loss_mlp": 0.01109321, "balance_loss_clip": 1.0685482, "balance_loss_mlp": 1.10529137, "epoch": 0.15114985720727492, "flos": 60437624428800.0, "grad_norm": 0.9017688875456507, "language_loss": 0.64720047, "learning_rate": 3.84800116337411e-06, "loss": 0.6688906, "num_input_tokens_seen": 54604545, "step": 2514, "time_per_iteration": 3.254983425140381 }, { "auxiliary_loss_clip": 0.01146323, "auxiliary_loss_mlp": 0.0104352, "balance_loss_clip": 1.05674648, "balance_loss_mlp": 1.02584124, "epoch": 0.15120998045994288, "flos": 20521691832960.0, "grad_norm": 3.178381755435586, "language_loss": 0.72995645, "learning_rate": 3.8478522016704916e-06, "loss": 0.7518549, "num_input_tokens_seen": 54620590, "step": 2515, "time_per_iteration": 2.67921781539917 }, { "auxiliary_loss_clip": 0.01133382, "auxiliary_loss_mlp": 0.01040315, "balance_loss_clip": 1.05675673, "balance_loss_mlp": 1.02120531, "epoch": 0.15127010371261085, "flos": 21178893024000.0, "grad_norm": 2.0712989062813243, "language_loss": 0.7773214, "learning_rate": 3.8477031698961325e-06, "loss": 0.79905832, "num_input_tokens_seen": 54640410, "step": 2516, "time_per_iteration": 2.763467788696289 }, { "auxiliary_loss_clip": 0.01087601, "auxiliary_loss_mlp": 0.01004779, "balance_loss_clip": 1.05344796, "balance_loss_mlp": 1.00160813, "epoch": 0.1513302269652788, "flos": 65320648974720.0, "grad_norm": 0.7270407819118658, "language_loss": 0.54622567, "learning_rate": 3.8475540680566835e-06, "loss": 0.56714946, "num_input_tokens_seen": 54701430, "step": 2517, "time_per_iteration": 3.2293660640716553 }, { "auxiliary_loss_clip": 0.01110142, "auxiliary_loss_mlp": 0.0104362, "balance_loss_clip": 1.04499209, "balance_loss_mlp": 1.02427244, "epoch": 0.15139035021794678, "flos": 19135827342720.0, "grad_norm": 3.035771526476276, "language_loss": 0.78264821, "learning_rate": 3.8474048961577995e-06, "loss": 0.80418587, "num_input_tokens_seen": 54720845, "step": 2518, "time_per_iteration": 2.8154754638671875 }, { "auxiliary_loss_clip": 0.01147342, "auxiliary_loss_mlp": 0.01056368, "balance_loss_clip": 1.05279088, "balance_loss_mlp": 1.03681803, "epoch": 0.15145047347061477, "flos": 26578564842240.0, "grad_norm": 2.1881526177791097, "language_loss": 0.70480245, "learning_rate": 3.847255654205137e-06, "loss": 0.72683954, "num_input_tokens_seen": 54740495, "step": 2519, "time_per_iteration": 2.7098515033721924 }, { "auxiliary_loss_clip": 0.01152463, "auxiliary_loss_mlp": 0.01056975, "balance_loss_clip": 1.05683672, "balance_loss_mlp": 1.03802037, "epoch": 0.15151059672328274, "flos": 20302959962880.0, "grad_norm": 1.9048594994100874, "language_loss": 0.78681207, "learning_rate": 3.847106342204354e-06, "loss": 0.80890644, "num_input_tokens_seen": 54758415, "step": 2520, "time_per_iteration": 2.664187431335449 }, { "auxiliary_loss_clip": 0.01140573, "auxiliary_loss_mlp": 0.01071607, "balance_loss_clip": 1.05435348, "balance_loss_mlp": 1.05244994, "epoch": 0.1515707199759507, "flos": 27228367831680.0, "grad_norm": 3.950911503454746, "language_loss": 0.74849677, "learning_rate": 3.846956960161114e-06, "loss": 0.77061862, "num_input_tokens_seen": 54779355, "step": 2521, "time_per_iteration": 2.7900772094726562 }, { "auxiliary_loss_clip": 0.01132038, "auxiliary_loss_mlp": 0.01055874, "balance_loss_clip": 1.05052209, "balance_loss_mlp": 1.0360136, "epoch": 0.15163084322861867, "flos": 23587349806080.0, "grad_norm": 4.620979243079986, "language_loss": 0.8253814, "learning_rate": 3.84680750808108e-06, "loss": 0.84726053, "num_input_tokens_seen": 54799465, "step": 2522, "time_per_iteration": 2.7216525077819824 }, { "auxiliary_loss_clip": 0.01051858, "auxiliary_loss_mlp": 0.01048797, "balance_loss_clip": 1.05645704, "balance_loss_mlp": 1.04595995, "epoch": 0.15169096648128663, "flos": 66889622021760.0, "grad_norm": 0.8362305181264502, "language_loss": 0.57885599, "learning_rate": 3.846657985969922e-06, "loss": 0.59986252, "num_input_tokens_seen": 54857665, "step": 2523, "time_per_iteration": 3.2375056743621826 }, { "auxiliary_loss_clip": 0.0114147, "auxiliary_loss_mlp": 0.01057964, "balance_loss_clip": 1.05213499, "balance_loss_mlp": 1.0368042, "epoch": 0.1517510897339546, "flos": 29095435848960.0, "grad_norm": 1.8054087157705183, "language_loss": 0.74795163, "learning_rate": 3.8465083938333066e-06, "loss": 0.76994598, "num_input_tokens_seen": 54879895, "step": 2524, "time_per_iteration": 2.711557388305664 }, { "auxiliary_loss_clip": 0.01138185, "auxiliary_loss_mlp": 0.01057236, "balance_loss_clip": 1.05304718, "balance_loss_mlp": 1.03865099, "epoch": 0.1518112129866226, "flos": 18406553512320.0, "grad_norm": 1.8255227790100423, "language_loss": 0.74631184, "learning_rate": 3.8463587316769085e-06, "loss": 0.76826608, "num_input_tokens_seen": 54898245, "step": 2525, "time_per_iteration": 2.6936984062194824 }, { "auxiliary_loss_clip": 0.01144047, "auxiliary_loss_mlp": 0.01057009, "balance_loss_clip": 1.05403006, "balance_loss_mlp": 1.03747034, "epoch": 0.15187133623929056, "flos": 19425410789760.0, "grad_norm": 1.8907352833287865, "language_loss": 0.79600316, "learning_rate": 3.846208999506402e-06, "loss": 0.81801373, "num_input_tokens_seen": 54917060, "step": 2526, "time_per_iteration": 2.651494264602661 }, { "auxiliary_loss_clip": 0.01135228, "auxiliary_loss_mlp": 0.01047798, "balance_loss_clip": 1.05538774, "balance_loss_mlp": 1.03056002, "epoch": 0.15193145949195852, "flos": 17566207850880.0, "grad_norm": 1.7677336965262924, "language_loss": 0.8443349, "learning_rate": 3.846059197327466e-06, "loss": 0.86616516, "num_input_tokens_seen": 54936365, "step": 2527, "time_per_iteration": 2.702683448791504 }, { "auxiliary_loss_clip": 0.01124925, "auxiliary_loss_mlp": 0.01049207, "balance_loss_clip": 1.04976487, "balance_loss_mlp": 1.02985954, "epoch": 0.15199158274462649, "flos": 36176265866880.0, "grad_norm": 1.85678489681458, "language_loss": 0.69361663, "learning_rate": 3.845909325145779e-06, "loss": 0.7153579, "num_input_tokens_seen": 54961365, "step": 2528, "time_per_iteration": 2.9250690937042236 }, { "auxiliary_loss_clip": 0.01134092, "auxiliary_loss_mlp": 0.01055056, "balance_loss_clip": 1.05266535, "balance_loss_mlp": 1.03587484, "epoch": 0.15205170599729445, "flos": 23074042498560.0, "grad_norm": 2.004144148858156, "language_loss": 0.86482549, "learning_rate": 3.845759382967026e-06, "loss": 0.88671696, "num_input_tokens_seen": 54980750, "step": 2529, "time_per_iteration": 2.7277863025665283 }, { "auxiliary_loss_clip": 0.01124798, "auxiliary_loss_mlp": 0.01041651, "balance_loss_clip": 1.05046487, "balance_loss_mlp": 1.02297091, "epoch": 0.15211182924996242, "flos": 21908382336000.0, "grad_norm": 2.544775548600603, "language_loss": 0.83399373, "learning_rate": 3.845609370796893e-06, "loss": 0.85565823, "num_input_tokens_seen": 54999675, "step": 2530, "time_per_iteration": 2.8717291355133057 }, { "auxiliary_loss_clip": 0.01125761, "auxiliary_loss_mlp": 0.01048121, "balance_loss_clip": 1.05035281, "balance_loss_mlp": 1.02940559, "epoch": 0.15217195250263038, "flos": 13881521865600.0, "grad_norm": 2.1410437006568723, "language_loss": 0.80404246, "learning_rate": 3.845459288641066e-06, "loss": 0.82578129, "num_input_tokens_seen": 55018295, "step": 2531, "time_per_iteration": 2.8444995880126953 }, { "auxiliary_loss_clip": 0.01143114, "auxiliary_loss_mlp": 0.01043494, "balance_loss_clip": 1.05216551, "balance_loss_mlp": 1.02613723, "epoch": 0.15223207575529837, "flos": 24535319592960.0, "grad_norm": 1.7922494378130023, "language_loss": 0.78874445, "learning_rate": 3.8453091365052394e-06, "loss": 0.81061059, "num_input_tokens_seen": 55037975, "step": 2532, "time_per_iteration": 2.9122390747070312 }, { "auxiliary_loss_clip": 0.01149502, "auxiliary_loss_mlp": 0.0104596, "balance_loss_clip": 1.05737543, "balance_loss_mlp": 1.02676702, "epoch": 0.15229219900796634, "flos": 25556798563200.0, "grad_norm": 1.9533698136575197, "language_loss": 0.87679356, "learning_rate": 3.845158914395105e-06, "loss": 0.89874816, "num_input_tokens_seen": 55057135, "step": 2533, "time_per_iteration": 2.7987985610961914 }, { "auxiliary_loss_clip": 0.01117955, "auxiliary_loss_mlp": 0.01048672, "balance_loss_clip": 1.05235386, "balance_loss_mlp": 1.02983665, "epoch": 0.1523523222606343, "flos": 18217806520320.0, "grad_norm": 2.391026063452041, "language_loss": 0.78886449, "learning_rate": 3.84500862231636e-06, "loss": 0.81053078, "num_input_tokens_seen": 55075525, "step": 2534, "time_per_iteration": 2.7587406635284424 }, { "auxiliary_loss_clip": 0.01164218, "auxiliary_loss_mlp": 0.0104722, "balance_loss_clip": 1.05609345, "balance_loss_mlp": 1.0270381, "epoch": 0.15241244551330227, "flos": 13260087642240.0, "grad_norm": 2.689732363294508, "language_loss": 0.76809752, "learning_rate": 3.844858260274702e-06, "loss": 0.79021192, "num_input_tokens_seen": 55090845, "step": 2535, "time_per_iteration": 2.7494406700134277 }, { "auxiliary_loss_clip": 0.01142628, "auxiliary_loss_mlp": 0.01042905, "balance_loss_clip": 1.05345285, "balance_loss_mlp": 1.02401042, "epoch": 0.15247256876597023, "flos": 19715568854400.0, "grad_norm": 2.2235871255319446, "language_loss": 0.78301942, "learning_rate": 3.844707828275835e-06, "loss": 0.80487478, "num_input_tokens_seen": 55108750, "step": 2536, "time_per_iteration": 2.738638401031494 }, { "auxiliary_loss_clip": 0.01128919, "auxiliary_loss_mlp": 0.0105368, "balance_loss_clip": 1.05349088, "balance_loss_mlp": 1.03497589, "epoch": 0.1525326920186382, "flos": 20375858615040.0, "grad_norm": 2.311649941233105, "language_loss": 0.75824189, "learning_rate": 3.844557326325461e-06, "loss": 0.78006792, "num_input_tokens_seen": 55126750, "step": 2537, "time_per_iteration": 2.632373809814453 }, { "auxiliary_loss_clip": 0.0114911, "auxiliary_loss_mlp": 0.01041421, "balance_loss_clip": 1.05675745, "balance_loss_mlp": 1.02331281, "epoch": 0.15259281527130616, "flos": 13589963170560.0, "grad_norm": 2.193148723631548, "language_loss": 0.77737647, "learning_rate": 3.8444067544292896e-06, "loss": 0.79928178, "num_input_tokens_seen": 55144690, "step": 2538, "time_per_iteration": 2.6835639476776123 }, { "auxiliary_loss_clip": 0.01109367, "auxiliary_loss_mlp": 0.01042256, "balance_loss_clip": 1.05477905, "balance_loss_mlp": 1.02480412, "epoch": 0.15265293852397416, "flos": 22860374446080.0, "grad_norm": 2.951423477379744, "language_loss": 0.89502335, "learning_rate": 3.844256112593029e-06, "loss": 0.91653961, "num_input_tokens_seen": 55166055, "step": 2539, "time_per_iteration": 2.7825794219970703 }, { "auxiliary_loss_clip": 0.01142581, "auxiliary_loss_mlp": 0.01045856, "balance_loss_clip": 1.05367279, "balance_loss_mlp": 1.02721143, "epoch": 0.15271306177664212, "flos": 29238108670080.0, "grad_norm": 2.1073423273657044, "language_loss": 0.93423879, "learning_rate": 3.844105400822391e-06, "loss": 0.95612311, "num_input_tokens_seen": 55186285, "step": 2540, "time_per_iteration": 2.717541456222534 }, { "auxiliary_loss_clip": 0.01131603, "auxiliary_loss_mlp": 0.01041863, "balance_loss_clip": 1.05122495, "balance_loss_mlp": 1.0240885, "epoch": 0.1527731850293101, "flos": 31246269310080.0, "grad_norm": 2.084754505375857, "language_loss": 0.75217843, "learning_rate": 3.843954619123092e-06, "loss": 0.77391309, "num_input_tokens_seen": 55207915, "step": 2541, "time_per_iteration": 2.8376123905181885 }, { "auxiliary_loss_clip": 0.01116303, "auxiliary_loss_mlp": 0.01045227, "balance_loss_clip": 1.04877007, "balance_loss_mlp": 1.0268805, "epoch": 0.15283330828197805, "flos": 22382079920640.0, "grad_norm": 2.037290364787748, "language_loss": 0.80996066, "learning_rate": 3.84380376750085e-06, "loss": 0.83157599, "num_input_tokens_seen": 55227860, "step": 2542, "time_per_iteration": 2.7110376358032227 }, { "auxiliary_loss_clip": 0.01160331, "auxiliary_loss_mlp": 0.01048661, "balance_loss_clip": 1.0566076, "balance_loss_mlp": 1.02992105, "epoch": 0.15289343153464602, "flos": 25520133755520.0, "grad_norm": 3.2152362880248857, "language_loss": 0.77796149, "learning_rate": 3.843652845961383e-06, "loss": 0.80005145, "num_input_tokens_seen": 55247330, "step": 2543, "time_per_iteration": 2.674131155014038 }, { "auxiliary_loss_clip": 0.01145565, "auxiliary_loss_mlp": 0.01042133, "balance_loss_clip": 1.05380869, "balance_loss_mlp": 1.02388239, "epoch": 0.15295355478731398, "flos": 22710016114560.0, "grad_norm": 2.4890924021550918, "language_loss": 0.85898137, "learning_rate": 3.843501854510416e-06, "loss": 0.88085836, "num_input_tokens_seen": 55266195, "step": 2544, "time_per_iteration": 2.685840606689453 }, { "auxiliary_loss_clip": 0.01149904, "auxiliary_loss_mlp": 0.01051141, "balance_loss_clip": 1.05162692, "balance_loss_mlp": 1.03061318, "epoch": 0.15301367803998198, "flos": 23251907669760.0, "grad_norm": 1.9817931887295275, "language_loss": 0.83159137, "learning_rate": 3.843350793153673e-06, "loss": 0.85360181, "num_input_tokens_seen": 55283305, "step": 2545, "time_per_iteration": 2.7415812015533447 }, { "auxiliary_loss_clip": 0.01158976, "auxiliary_loss_mlp": 0.01040888, "balance_loss_clip": 1.05556524, "balance_loss_mlp": 1.02257705, "epoch": 0.15307380129264994, "flos": 25886279041920.0, "grad_norm": 6.0131413628182, "language_loss": 0.71669161, "learning_rate": 3.843199661896884e-06, "loss": 0.73869026, "num_input_tokens_seen": 55303035, "step": 2546, "time_per_iteration": 2.6626265048980713 }, { "auxiliary_loss_clip": 0.01130357, "auxiliary_loss_mlp": 0.01047635, "balance_loss_clip": 1.05013335, "balance_loss_mlp": 1.02688098, "epoch": 0.1531339245453179, "flos": 46973239205760.0, "grad_norm": 1.6563553629779504, "language_loss": 0.77438712, "learning_rate": 3.843048460745779e-06, "loss": 0.79616702, "num_input_tokens_seen": 55327570, "step": 2547, "time_per_iteration": 4.451423168182373 }, { "auxiliary_loss_clip": 0.01107553, "auxiliary_loss_mlp": 0.01044692, "balance_loss_clip": 1.04845536, "balance_loss_mlp": 1.02517736, "epoch": 0.15319404779798587, "flos": 35882049565440.0, "grad_norm": 2.3544675813743834, "language_loss": 0.74357474, "learning_rate": 3.842897189706092e-06, "loss": 0.7650972, "num_input_tokens_seen": 55351090, "step": 2548, "time_per_iteration": 2.846991539001465 }, { "auxiliary_loss_clip": 0.01138346, "auxiliary_loss_mlp": 0.0105294, "balance_loss_clip": 1.05340147, "balance_loss_mlp": 1.03304434, "epoch": 0.15325417105065384, "flos": 25664638170240.0, "grad_norm": 1.446042531021912, "language_loss": 0.80296385, "learning_rate": 3.842745848783558e-06, "loss": 0.82487667, "num_input_tokens_seen": 55371050, "step": 2549, "time_per_iteration": 5.8849101066589355 }, { "auxiliary_loss_clip": 0.01144858, "auxiliary_loss_mlp": 0.01041292, "balance_loss_clip": 1.05108786, "balance_loss_mlp": 1.02255249, "epoch": 0.1533142943033218, "flos": 18770831291520.0, "grad_norm": 1.6149920159034452, "language_loss": 0.74602014, "learning_rate": 3.842594437983917e-06, "loss": 0.76788169, "num_input_tokens_seen": 55390375, "step": 2550, "time_per_iteration": 2.684868812561035 }, { "auxiliary_loss_clip": 0.01149823, "auxiliary_loss_mlp": 0.01040743, "balance_loss_clip": 1.05212283, "balance_loss_mlp": 1.02129996, "epoch": 0.15337441755598977, "flos": 23107367341440.0, "grad_norm": 2.33086854575276, "language_loss": 0.76910275, "learning_rate": 3.8424429573129115e-06, "loss": 0.79100841, "num_input_tokens_seen": 55408890, "step": 2551, "time_per_iteration": 4.415414333343506 }, { "auxiliary_loss_clip": 0.01086721, "auxiliary_loss_mlp": 0.01054065, "balance_loss_clip": 1.05333817, "balance_loss_mlp": 1.05116868, "epoch": 0.15343454080865776, "flos": 59861079227520.0, "grad_norm": 0.9493148205555214, "language_loss": 0.5665558, "learning_rate": 3.842291406776283e-06, "loss": 0.5879637, "num_input_tokens_seen": 55463815, "step": 2552, "time_per_iteration": 3.1105730533599854 }, { "auxiliary_loss_clip": 0.011128, "auxiliary_loss_mlp": 0.01039619, "balance_loss_clip": 1.05131924, "balance_loss_mlp": 1.0204618, "epoch": 0.15349466406132573, "flos": 11910887959680.0, "grad_norm": 2.183188616823757, "language_loss": 0.88550794, "learning_rate": 3.84213978637978e-06, "loss": 0.90703207, "num_input_tokens_seen": 55481050, "step": 2553, "time_per_iteration": 2.748298406600952 }, { "auxiliary_loss_clip": 0.01147024, "auxiliary_loss_mlp": 0.01042929, "balance_loss_clip": 1.05247378, "balance_loss_mlp": 1.0232954, "epoch": 0.1535547873139937, "flos": 24096922099200.0, "grad_norm": 1.8094820084348213, "language_loss": 0.7800495, "learning_rate": 3.841988096129152e-06, "loss": 0.80194902, "num_input_tokens_seen": 55500050, "step": 2554, "time_per_iteration": 2.6555569171905518 }, { "auxiliary_loss_clip": 0.01094445, "auxiliary_loss_mlp": 0.01053684, "balance_loss_clip": 1.04876757, "balance_loss_mlp": 1.03291798, "epoch": 0.15361491056666166, "flos": 17566459246080.0, "grad_norm": 2.372022486587551, "language_loss": 0.77472258, "learning_rate": 3.841836336030151e-06, "loss": 0.79620385, "num_input_tokens_seen": 55518125, "step": 2555, "time_per_iteration": 2.7507212162017822 }, { "auxiliary_loss_clip": 0.01129555, "auxiliary_loss_mlp": 0.01046723, "balance_loss_clip": 1.05400753, "balance_loss_mlp": 1.02873409, "epoch": 0.15367503381932962, "flos": 25046041121280.0, "grad_norm": 1.5517643759455655, "language_loss": 0.77453947, "learning_rate": 3.8416845060885305e-06, "loss": 0.79630232, "num_input_tokens_seen": 55540960, "step": 2556, "time_per_iteration": 2.7947654724121094 }, { "auxiliary_loss_clip": 0.01140725, "auxiliary_loss_mlp": 0.0077646, "balance_loss_clip": 1.05336452, "balance_loss_mlp": 1.00054574, "epoch": 0.15373515707199759, "flos": 21507332008320.0, "grad_norm": 1.8786460244833383, "language_loss": 0.90098578, "learning_rate": 3.84153260631005e-06, "loss": 0.92015761, "num_input_tokens_seen": 55559210, "step": 2557, "time_per_iteration": 2.702029228210449 }, { "auxiliary_loss_clip": 0.01137441, "auxiliary_loss_mlp": 0.01048546, "balance_loss_clip": 1.05146766, "balance_loss_mlp": 1.02862656, "epoch": 0.15379528032466555, "flos": 25994729180160.0, "grad_norm": 2.4046585493240102, "language_loss": 0.7092281, "learning_rate": 3.841380636700468e-06, "loss": 0.73108798, "num_input_tokens_seen": 55578925, "step": 2558, "time_per_iteration": 2.815653085708618 }, { "auxiliary_loss_clip": 0.01131603, "auxiliary_loss_mlp": 0.01045983, "balance_loss_clip": 1.04937947, "balance_loss_mlp": 1.02659965, "epoch": 0.15385540357733354, "flos": 19277315015040.0, "grad_norm": 2.1050139676488535, "language_loss": 0.92165422, "learning_rate": 3.841228597265548e-06, "loss": 0.94343007, "num_input_tokens_seen": 55597255, "step": 2559, "time_per_iteration": 2.7363967895507812 }, { "auxiliary_loss_clip": 0.011375, "auxiliary_loss_mlp": 0.01057878, "balance_loss_clip": 1.05492043, "balance_loss_mlp": 1.03711152, "epoch": 0.1539155268300015, "flos": 28549126920960.0, "grad_norm": 2.149412909113977, "language_loss": 0.63330692, "learning_rate": 3.841076488011055e-06, "loss": 0.65526068, "num_input_tokens_seen": 55619515, "step": 2560, "time_per_iteration": 2.811800003051758 }, { "auxiliary_loss_clip": 0.01132154, "auxiliary_loss_mlp": 0.01043974, "balance_loss_clip": 1.04914606, "balance_loss_mlp": 1.02416182, "epoch": 0.15397565008266947, "flos": 23547883737600.0, "grad_norm": 2.066473237183783, "language_loss": 0.88155699, "learning_rate": 3.8409243089427574e-06, "loss": 0.90331829, "num_input_tokens_seen": 55640050, "step": 2561, "time_per_iteration": 2.7991089820861816 }, { "auxiliary_loss_clip": 0.0114054, "auxiliary_loss_mlp": 0.01041879, "balance_loss_clip": 1.05085099, "balance_loss_mlp": 1.02380693, "epoch": 0.15403577333533744, "flos": 17129821518720.0, "grad_norm": 1.906051405357337, "language_loss": 0.83117974, "learning_rate": 3.840772060066425e-06, "loss": 0.85300398, "num_input_tokens_seen": 55658695, "step": 2562, "time_per_iteration": 2.6410810947418213 }, { "auxiliary_loss_clip": 0.01128756, "auxiliary_loss_mlp": 0.00778205, "balance_loss_clip": 1.04988563, "balance_loss_mlp": 1.00058532, "epoch": 0.1540958965880054, "flos": 17894503180800.0, "grad_norm": 2.3547297997270906, "language_loss": 0.74647415, "learning_rate": 3.840619741387832e-06, "loss": 0.76554382, "num_input_tokens_seen": 55676340, "step": 2563, "time_per_iteration": 2.6813745498657227 }, { "auxiliary_loss_clip": 0.01116857, "auxiliary_loss_mlp": 0.0104411, "balance_loss_clip": 1.05126941, "balance_loss_mlp": 1.02444029, "epoch": 0.15415601984067337, "flos": 32161057908480.0, "grad_norm": 2.842824767177756, "language_loss": 0.7609179, "learning_rate": 3.8404673529127534e-06, "loss": 0.78252757, "num_input_tokens_seen": 55698890, "step": 2564, "time_per_iteration": 2.832885265350342 }, { "auxiliary_loss_clip": 0.01133461, "auxiliary_loss_mlp": 0.01052887, "balance_loss_clip": 1.05174518, "balance_loss_mlp": 1.03443313, "epoch": 0.15421614309334136, "flos": 24024418496640.0, "grad_norm": 2.0125869911748575, "language_loss": 0.70960921, "learning_rate": 3.840314894646969e-06, "loss": 0.73147273, "num_input_tokens_seen": 55718535, "step": 2565, "time_per_iteration": 2.7352514266967773 }, { "auxiliary_loss_clip": 0.01137766, "auxiliary_loss_mlp": 0.01046908, "balance_loss_clip": 1.04731965, "balance_loss_mlp": 1.02787066, "epoch": 0.15427626634600933, "flos": 24386290064640.0, "grad_norm": 2.1021891280826965, "language_loss": 0.71605748, "learning_rate": 3.840162366596259e-06, "loss": 0.73790431, "num_input_tokens_seen": 55738970, "step": 2566, "time_per_iteration": 2.681710720062256 }, { "auxiliary_loss_clip": 0.01150619, "auxiliary_loss_mlp": 0.01040725, "balance_loss_clip": 1.04834008, "balance_loss_mlp": 1.02271223, "epoch": 0.1543363895986773, "flos": 23331522165120.0, "grad_norm": 1.7167104030167524, "language_loss": 0.84746087, "learning_rate": 3.840009768766408e-06, "loss": 0.86937428, "num_input_tokens_seen": 55759585, "step": 2567, "time_per_iteration": 2.6413686275482178 }, { "auxiliary_loss_clip": 0.01104646, "auxiliary_loss_mlp": 0.01050344, "balance_loss_clip": 1.04447246, "balance_loss_mlp": 1.03164053, "epoch": 0.15439651285134526, "flos": 24274284480000.0, "grad_norm": 2.9101336164483014, "language_loss": 0.78074998, "learning_rate": 3.839857101163202e-06, "loss": 0.80229992, "num_input_tokens_seen": 55779250, "step": 2568, "time_per_iteration": 2.7385261058807373 }, { "auxiliary_loss_clip": 0.01121993, "auxiliary_loss_mlp": 0.01037084, "balance_loss_clip": 1.04715753, "balance_loss_mlp": 1.01684201, "epoch": 0.15445663610401322, "flos": 22456163721600.0, "grad_norm": 1.852436867559063, "language_loss": 0.6991998, "learning_rate": 3.83970436379243e-06, "loss": 0.72079051, "num_input_tokens_seen": 55800470, "step": 2569, "time_per_iteration": 2.746974229812622 }, { "auxiliary_loss_clip": 0.01124209, "auxiliary_loss_mlp": 0.01040299, "balance_loss_clip": 1.04695952, "balance_loss_mlp": 1.02178574, "epoch": 0.1545167593566812, "flos": 22049510872320.0, "grad_norm": 1.7212875994527412, "language_loss": 0.76482332, "learning_rate": 3.839551556659884e-06, "loss": 0.78646845, "num_input_tokens_seen": 55817795, "step": 2570, "time_per_iteration": 2.7470619678497314 }, { "auxiliary_loss_clip": 0.01137702, "auxiliary_loss_mlp": 0.01038561, "balance_loss_clip": 1.04993737, "balance_loss_mlp": 1.0192852, "epoch": 0.15457688260934915, "flos": 19318253541120.0, "grad_norm": 2.5033166184578066, "language_loss": 0.77997506, "learning_rate": 3.839398679771359e-06, "loss": 0.80173767, "num_input_tokens_seen": 55836125, "step": 2571, "time_per_iteration": 2.692863702774048 }, { "auxiliary_loss_clip": 0.0113208, "auxiliary_loss_mlp": 0.0104519, "balance_loss_clip": 1.0498451, "balance_loss_mlp": 1.02704597, "epoch": 0.15463700586201715, "flos": 24133981956480.0, "grad_norm": 4.3242380509309015, "language_loss": 0.82932413, "learning_rate": 3.839245733132652e-06, "loss": 0.85109681, "num_input_tokens_seen": 55855280, "step": 2572, "time_per_iteration": 2.8341822624206543 }, { "auxiliary_loss_clip": 0.01156188, "auxiliary_loss_mlp": 0.01042592, "balance_loss_clip": 1.05181205, "balance_loss_mlp": 1.02383995, "epoch": 0.1546971291146851, "flos": 22420935457920.0, "grad_norm": 1.5874704718869805, "language_loss": 0.90373385, "learning_rate": 3.839092716749563e-06, "loss": 0.92572165, "num_input_tokens_seen": 55875695, "step": 2573, "time_per_iteration": 2.740121364593506 }, { "auxiliary_loss_clip": 0.01088424, "auxiliary_loss_mlp": 0.01049893, "balance_loss_clip": 1.04328668, "balance_loss_mlp": 1.03003311, "epoch": 0.15475725236735308, "flos": 17530225401600.0, "grad_norm": 1.596795561637076, "language_loss": 0.70298707, "learning_rate": 3.838939630627893e-06, "loss": 0.72437024, "num_input_tokens_seen": 55894575, "step": 2574, "time_per_iteration": 2.7629144191741943 }, { "auxiliary_loss_clip": 0.01127537, "auxiliary_loss_mlp": 0.01045732, "balance_loss_clip": 1.04714394, "balance_loss_mlp": 1.02509642, "epoch": 0.15481737562002104, "flos": 22561740771840.0, "grad_norm": 6.018921028505516, "language_loss": 0.82426423, "learning_rate": 3.838786474773448e-06, "loss": 0.84599686, "num_input_tokens_seen": 55912855, "step": 2575, "time_per_iteration": 2.656783103942871 }, { "auxiliary_loss_clip": 0.01127415, "auxiliary_loss_mlp": 0.01043354, "balance_loss_clip": 1.04681587, "balance_loss_mlp": 1.02584219, "epoch": 0.154877498872689, "flos": 24900567039360.0, "grad_norm": 1.8376318938002576, "language_loss": 0.85038638, "learning_rate": 3.838633249192036e-06, "loss": 0.87209404, "num_input_tokens_seen": 55932375, "step": 2576, "time_per_iteration": 2.648484230041504 }, { "auxiliary_loss_clip": 0.01152547, "auxiliary_loss_mlp": 0.01043401, "balance_loss_clip": 1.04872847, "balance_loss_mlp": 1.02499545, "epoch": 0.15493762212535697, "flos": 28147501975680.0, "grad_norm": 1.8027999188827728, "language_loss": 0.82271254, "learning_rate": 3.838479953889465e-06, "loss": 0.84467208, "num_input_tokens_seen": 55953970, "step": 2577, "time_per_iteration": 2.6355643272399902 }, { "auxiliary_loss_clip": 0.01126009, "auxiliary_loss_mlp": 0.01049018, "balance_loss_clip": 1.05147958, "balance_loss_mlp": 1.02984881, "epoch": 0.15499774537802496, "flos": 25411073086080.0, "grad_norm": 2.1677069711314463, "language_loss": 0.76556361, "learning_rate": 3.8383265888715525e-06, "loss": 0.78731394, "num_input_tokens_seen": 55973120, "step": 2578, "time_per_iteration": 2.649043560028076 }, { "auxiliary_loss_clip": 0.01123677, "auxiliary_loss_mlp": 0.01044461, "balance_loss_clip": 1.05155993, "balance_loss_mlp": 1.0253042, "epoch": 0.15505786863069293, "flos": 22091562720000.0, "grad_norm": 1.9614380224881987, "language_loss": 0.82443559, "learning_rate": 3.83817315414411e-06, "loss": 0.8461169, "num_input_tokens_seen": 55993260, "step": 2579, "time_per_iteration": 2.62631893157959 }, { "auxiliary_loss_clip": 0.01143904, "auxiliary_loss_mlp": 0.01044324, "balance_loss_clip": 1.05856657, "balance_loss_mlp": 1.02556014, "epoch": 0.1551179918833609, "flos": 18917131386240.0, "grad_norm": 2.610374735790095, "language_loss": 0.80465376, "learning_rate": 3.838019649712958e-06, "loss": 0.82653606, "num_input_tokens_seen": 56012130, "step": 2580, "time_per_iteration": 2.6512253284454346 }, { "auxiliary_loss_clip": 0.0107737, "auxiliary_loss_mlp": 0.01006304, "balance_loss_clip": 1.04551053, "balance_loss_mlp": 1.00360954, "epoch": 0.15517811513602886, "flos": 66239172587520.0, "grad_norm": 0.842131683094019, "language_loss": 0.58823448, "learning_rate": 3.8378660755839166e-06, "loss": 0.60907125, "num_input_tokens_seen": 56079045, "step": 2581, "time_per_iteration": 3.357855796813965 }, { "auxiliary_loss_clip": 0.01108206, "auxiliary_loss_mlp": 0.01047031, "balance_loss_clip": 1.04392648, "balance_loss_mlp": 1.0249418, "epoch": 0.15523823838869683, "flos": 24021078531840.0, "grad_norm": 1.9584677228939371, "language_loss": 0.84773678, "learning_rate": 3.8377124317628095e-06, "loss": 0.86928916, "num_input_tokens_seen": 56098745, "step": 2582, "time_per_iteration": 2.727062702178955 }, { "auxiliary_loss_clip": 0.01144131, "auxiliary_loss_mlp": 0.01051911, "balance_loss_clip": 1.05233002, "balance_loss_mlp": 1.03175235, "epoch": 0.1552983616413648, "flos": 20485062938880.0, "grad_norm": 2.466663791870015, "language_loss": 0.79050052, "learning_rate": 3.8375587182554625e-06, "loss": 0.81246096, "num_input_tokens_seen": 56117655, "step": 2583, "time_per_iteration": 2.664794683456421 }, { "auxiliary_loss_clip": 0.01139818, "auxiliary_loss_mlp": 0.01054771, "balance_loss_clip": 1.04957032, "balance_loss_mlp": 1.03252697, "epoch": 0.15535848489403276, "flos": 32123710742400.0, "grad_norm": 1.8743170599575527, "language_loss": 0.76320136, "learning_rate": 3.837404935067705e-06, "loss": 0.78514719, "num_input_tokens_seen": 56141960, "step": 2584, "time_per_iteration": 2.757392168045044 }, { "auxiliary_loss_clip": 0.01137324, "auxiliary_loss_mlp": 0.01042496, "balance_loss_clip": 1.04884958, "balance_loss_mlp": 1.02302885, "epoch": 0.15541860814670075, "flos": 19098444263040.0, "grad_norm": 1.6493041410587026, "language_loss": 0.75269651, "learning_rate": 3.837251082205368e-06, "loss": 0.77449471, "num_input_tokens_seen": 56161430, "step": 2585, "time_per_iteration": 2.6497461795806885 }, { "auxiliary_loss_clip": 0.01116144, "auxiliary_loss_mlp": 0.01042356, "balance_loss_clip": 1.04862189, "balance_loss_mlp": 1.02321053, "epoch": 0.1554787313993687, "flos": 19172097100800.0, "grad_norm": 2.068989677221064, "language_loss": 0.61187196, "learning_rate": 3.837097159674286e-06, "loss": 0.63345695, "num_input_tokens_seen": 56179390, "step": 2586, "time_per_iteration": 2.697852373123169 }, { "auxiliary_loss_clip": 0.01129408, "auxiliary_loss_mlp": 0.01042187, "balance_loss_clip": 1.04842281, "balance_loss_mlp": 1.02341127, "epoch": 0.15553885465203668, "flos": 16143822207360.0, "grad_norm": 1.8484108176722505, "language_loss": 0.81318939, "learning_rate": 3.836943167480296e-06, "loss": 0.83490539, "num_input_tokens_seen": 56198020, "step": 2587, "time_per_iteration": 4.212551593780518 }, { "auxiliary_loss_clip": 0.01160891, "auxiliary_loss_mlp": 0.01054822, "balance_loss_clip": 1.05309868, "balance_loss_mlp": 1.03325701, "epoch": 0.15559897790470464, "flos": 25337779384320.0, "grad_norm": 1.866779523391448, "language_loss": 0.88716942, "learning_rate": 3.836789105629236e-06, "loss": 0.90932655, "num_input_tokens_seen": 56218165, "step": 2588, "time_per_iteration": 4.192267894744873 }, { "auxiliary_loss_clip": 0.01094981, "auxiliary_loss_mlp": 0.01052123, "balance_loss_clip": 1.04558384, "balance_loss_mlp": 1.03164268, "epoch": 0.1556591011573726, "flos": 23148772744320.0, "grad_norm": 2.018423224363699, "language_loss": 0.64624381, "learning_rate": 3.83663497412695e-06, "loss": 0.66771483, "num_input_tokens_seen": 56237160, "step": 2589, "time_per_iteration": 4.303871154785156 }, { "auxiliary_loss_clip": 0.01104407, "auxiliary_loss_mlp": 0.01041976, "balance_loss_clip": 1.04520249, "balance_loss_mlp": 1.02123344, "epoch": 0.15571922441004057, "flos": 25370888745600.0, "grad_norm": 1.784618480549341, "language_loss": 0.82832813, "learning_rate": 3.836480772979281e-06, "loss": 0.84979194, "num_input_tokens_seen": 56257610, "step": 2590, "time_per_iteration": 4.460350751876831 }, { "auxiliary_loss_clip": 0.011248, "auxiliary_loss_mlp": 0.01047287, "balance_loss_clip": 1.05032134, "balance_loss_mlp": 1.02694952, "epoch": 0.15577934766270854, "flos": 14501375890560.0, "grad_norm": 2.6687659077907484, "language_loss": 0.78766, "learning_rate": 3.836326502192077e-06, "loss": 0.80938083, "num_input_tokens_seen": 56275215, "step": 2591, "time_per_iteration": 2.73305606842041 }, { "auxiliary_loss_clip": 0.01143879, "auxiliary_loss_mlp": 0.01049015, "balance_loss_clip": 1.05174232, "balance_loss_mlp": 1.03137255, "epoch": 0.15583947091537653, "flos": 37414537372800.0, "grad_norm": 2.0331558547393054, "language_loss": 0.65025747, "learning_rate": 3.836172161771189e-06, "loss": 0.67218637, "num_input_tokens_seen": 56297130, "step": 2592, "time_per_iteration": 2.8582632541656494 }, { "auxiliary_loss_clip": 0.01136043, "auxiliary_loss_mlp": 0.01052096, "balance_loss_clip": 1.05417228, "balance_loss_mlp": 1.0322001, "epoch": 0.1558995941680445, "flos": 21834729498240.0, "grad_norm": 2.311634250072179, "language_loss": 0.82506329, "learning_rate": 3.836017751722467e-06, "loss": 0.84694475, "num_input_tokens_seen": 56314995, "step": 2593, "time_per_iteration": 2.7230453491210938 }, { "auxiliary_loss_clip": 0.01142565, "auxiliary_loss_mlp": 0.01046037, "balance_loss_clip": 1.05237365, "balance_loss_mlp": 1.02676034, "epoch": 0.15595971742071246, "flos": 19792633484160.0, "grad_norm": 2.778410683125911, "language_loss": 0.73220694, "learning_rate": 3.8358632720517695e-06, "loss": 0.75409293, "num_input_tokens_seen": 56334005, "step": 2594, "time_per_iteration": 2.708063840866089 }, { "auxiliary_loss_clip": 0.01117989, "auxiliary_loss_mlp": 0.01040106, "balance_loss_clip": 1.0453043, "balance_loss_mlp": 1.02077007, "epoch": 0.15601984067338043, "flos": 26722135503360.0, "grad_norm": 2.1444704922101105, "language_loss": 0.81569934, "learning_rate": 3.835708722764952e-06, "loss": 0.83728027, "num_input_tokens_seen": 56353795, "step": 2595, "time_per_iteration": 2.716334581375122 }, { "auxiliary_loss_clip": 0.01155359, "auxiliary_loss_mlp": 0.01043269, "balance_loss_clip": 1.05093551, "balance_loss_mlp": 1.0238502, "epoch": 0.1560799639260484, "flos": 18369278173440.0, "grad_norm": 1.8943501893042642, "language_loss": 0.86674929, "learning_rate": 3.835554103867876e-06, "loss": 0.88873553, "num_input_tokens_seen": 56373195, "step": 2596, "time_per_iteration": 2.5947446823120117 }, { "auxiliary_loss_clip": 0.01144729, "auxiliary_loss_mlp": 0.01042109, "balance_loss_clip": 1.05225515, "balance_loss_mlp": 1.02360725, "epoch": 0.15614008717871636, "flos": 22598980197120.0, "grad_norm": 1.8059460934517404, "language_loss": 0.68772388, "learning_rate": 3.835399415366404e-06, "loss": 0.70959222, "num_input_tokens_seen": 56391525, "step": 2597, "time_per_iteration": 2.8101041316986084 }, { "auxiliary_loss_clip": 0.01130069, "auxiliary_loss_mlp": 0.01050835, "balance_loss_clip": 1.05409336, "balance_loss_mlp": 1.03165436, "epoch": 0.15620021043138435, "flos": 22746860490240.0, "grad_norm": 1.9103744906429732, "language_loss": 0.79860938, "learning_rate": 3.8352446572664035e-06, "loss": 0.82041842, "num_input_tokens_seen": 56410715, "step": 2598, "time_per_iteration": 2.695117950439453 }, { "auxiliary_loss_clip": 0.0112861, "auxiliary_loss_mlp": 0.00776118, "balance_loss_clip": 1.04750216, "balance_loss_mlp": 1.0006249, "epoch": 0.15626033368405232, "flos": 13114936782720.0, "grad_norm": 3.1104681024188827, "language_loss": 0.83092594, "learning_rate": 3.8350898295737405e-06, "loss": 0.84997326, "num_input_tokens_seen": 56429170, "step": 2599, "time_per_iteration": 2.665703773498535 }, { "auxiliary_loss_clip": 0.01160593, "auxiliary_loss_mlp": 0.0105002, "balance_loss_clip": 1.05274248, "balance_loss_mlp": 1.02924192, "epoch": 0.15632045693672028, "flos": 16472297105280.0, "grad_norm": 2.2910683048406266, "language_loss": 0.81530893, "learning_rate": 3.834934932294287e-06, "loss": 0.83741504, "num_input_tokens_seen": 56445685, "step": 2600, "time_per_iteration": 2.615651845932007 }, { "auxiliary_loss_clip": 0.01161023, "auxiliary_loss_mlp": 0.00776671, "balance_loss_clip": 1.05562234, "balance_loss_mlp": 1.00063944, "epoch": 0.15638058018938825, "flos": 20850346298880.0, "grad_norm": 1.7832591469657297, "language_loss": 0.88511437, "learning_rate": 3.834779965433917e-06, "loss": 0.90449131, "num_input_tokens_seen": 56465900, "step": 2601, "time_per_iteration": 2.6833529472351074 }, { "auxiliary_loss_clip": 0.0116257, "auxiliary_loss_mlp": 0.0106307, "balance_loss_clip": 1.05569744, "balance_loss_mlp": 1.04120743, "epoch": 0.1564407034420562, "flos": 21872220318720.0, "grad_norm": 1.9421054688538308, "language_loss": 0.78707534, "learning_rate": 3.834624928998508e-06, "loss": 0.80933177, "num_input_tokens_seen": 56485020, "step": 2602, "time_per_iteration": 2.6296608448028564 }, { "auxiliary_loss_clip": 0.01126653, "auxiliary_loss_mlp": 0.01043676, "balance_loss_clip": 1.05035329, "balance_loss_mlp": 1.02419758, "epoch": 0.15650082669472418, "flos": 21834549930240.0, "grad_norm": 1.8230718276715763, "language_loss": 0.74029547, "learning_rate": 3.8344698229939376e-06, "loss": 0.76199877, "num_input_tokens_seen": 56505205, "step": 2603, "time_per_iteration": 2.744508743286133 }, { "auxiliary_loss_clip": 0.01143305, "auxiliary_loss_mlp": 0.01051047, "balance_loss_clip": 1.04820418, "balance_loss_mlp": 1.03112721, "epoch": 0.15656094994739214, "flos": 13800542653440.0, "grad_norm": 4.041164356714064, "language_loss": 0.87723601, "learning_rate": 3.8343146474260865e-06, "loss": 0.89917958, "num_input_tokens_seen": 56521495, "step": 2604, "time_per_iteration": 2.682457447052002 }, { "auxiliary_loss_clip": 0.01145351, "auxiliary_loss_mlp": 0.01044759, "balance_loss_clip": 1.04976749, "balance_loss_mlp": 1.0256021, "epoch": 0.15662107320006013, "flos": 27308197808640.0, "grad_norm": 2.260429022209425, "language_loss": 0.8573193, "learning_rate": 3.834159402300841e-06, "loss": 0.87922043, "num_input_tokens_seen": 56540665, "step": 2605, "time_per_iteration": 2.7724974155426025 }, { "auxiliary_loss_clip": 0.0115108, "auxiliary_loss_mlp": 0.01047256, "balance_loss_clip": 1.05181313, "balance_loss_mlp": 1.02676356, "epoch": 0.1566811964527281, "flos": 26685075646080.0, "grad_norm": 1.7309636492693905, "language_loss": 0.73101914, "learning_rate": 3.834004087624087e-06, "loss": 0.75300246, "num_input_tokens_seen": 56560805, "step": 2606, "time_per_iteration": 2.7490081787109375 }, { "auxiliary_loss_clip": 0.01158388, "auxiliary_loss_mlp": 0.01049752, "balance_loss_clip": 1.0552665, "balance_loss_mlp": 1.03165627, "epoch": 0.15674131970539606, "flos": 16103422385280.0, "grad_norm": 2.968092109370304, "language_loss": 0.76497948, "learning_rate": 3.8338487034017145e-06, "loss": 0.78706092, "num_input_tokens_seen": 56576335, "step": 2607, "time_per_iteration": 2.6597230434417725 }, { "auxiliary_loss_clip": 0.01120645, "auxiliary_loss_mlp": 0.01047174, "balance_loss_clip": 1.05131412, "balance_loss_mlp": 1.0284934, "epoch": 0.15680144295806403, "flos": 19169690889600.0, "grad_norm": 1.7981763092074996, "language_loss": 0.82107675, "learning_rate": 3.833693249639615e-06, "loss": 0.84275496, "num_input_tokens_seen": 56595880, "step": 2608, "time_per_iteration": 2.7072103023529053 }, { "auxiliary_loss_clip": 0.0112834, "auxiliary_loss_mlp": 0.01045106, "balance_loss_clip": 1.04685056, "balance_loss_mlp": 1.02436399, "epoch": 0.156861566210732, "flos": 20813430096000.0, "grad_norm": 1.6817301031159713, "language_loss": 0.72335941, "learning_rate": 3.833537726343684e-06, "loss": 0.74509382, "num_input_tokens_seen": 56615130, "step": 2609, "time_per_iteration": 2.690690755844116 }, { "auxiliary_loss_clip": 0.01143972, "auxiliary_loss_mlp": 0.01036718, "balance_loss_clip": 1.04901087, "balance_loss_mlp": 1.01756072, "epoch": 0.15692168946339996, "flos": 20047922421120.0, "grad_norm": 5.132438477880424, "language_loss": 0.72317064, "learning_rate": 3.833382133519818e-06, "loss": 0.74497753, "num_input_tokens_seen": 56634005, "step": 2610, "time_per_iteration": 2.6515614986419678 }, { "auxiliary_loss_clip": 0.01159588, "auxiliary_loss_mlp": 0.01051513, "balance_loss_clip": 1.05216432, "balance_loss_mlp": 1.03063977, "epoch": 0.15698181271606793, "flos": 21398019943680.0, "grad_norm": 2.0600295188113935, "language_loss": 0.72915608, "learning_rate": 3.833226471173919e-06, "loss": 0.75126708, "num_input_tokens_seen": 56653480, "step": 2611, "time_per_iteration": 2.630988359451294 }, { "auxiliary_loss_clip": 0.01141924, "auxiliary_loss_mlp": 0.01042538, "balance_loss_clip": 1.04917872, "balance_loss_mlp": 1.0231905, "epoch": 0.15704193596873592, "flos": 20845785271680.0, "grad_norm": 2.0339762399532186, "language_loss": 0.70766544, "learning_rate": 3.833070739311887e-06, "loss": 0.72951007, "num_input_tokens_seen": 56672270, "step": 2612, "time_per_iteration": 2.6569461822509766 }, { "auxiliary_loss_clip": 0.01116284, "auxiliary_loss_mlp": 0.01051299, "balance_loss_clip": 1.04844582, "balance_loss_mlp": 1.03221321, "epoch": 0.15710205922140388, "flos": 21762908254080.0, "grad_norm": 1.9704781930994688, "language_loss": 0.76294881, "learning_rate": 3.83291493793963e-06, "loss": 0.78462464, "num_input_tokens_seen": 56691510, "step": 2613, "time_per_iteration": 2.7188539505004883 }, { "auxiliary_loss_clip": 0.01115155, "auxiliary_loss_mlp": 0.01049301, "balance_loss_clip": 1.04504919, "balance_loss_mlp": 1.02956033, "epoch": 0.15716218247407185, "flos": 25007760201600.0, "grad_norm": 2.137998057111896, "language_loss": 0.65944499, "learning_rate": 3.832759067063055e-06, "loss": 0.68108952, "num_input_tokens_seen": 56712230, "step": 2614, "time_per_iteration": 2.7550084590911865 }, { "auxiliary_loss_clip": 0.01151987, "auxiliary_loss_mlp": 0.01044173, "balance_loss_clip": 1.05387104, "balance_loss_mlp": 1.02374101, "epoch": 0.1572223057267398, "flos": 20191780391040.0, "grad_norm": 2.2755662506820915, "language_loss": 0.75204211, "learning_rate": 3.832603126688072e-06, "loss": 0.77400374, "num_input_tokens_seen": 56727490, "step": 2615, "time_per_iteration": 2.683225154876709 }, { "auxiliary_loss_clip": 0.01138545, "auxiliary_loss_mlp": 0.01050891, "balance_loss_clip": 1.05209839, "balance_loss_mlp": 1.03078008, "epoch": 0.15728242897940778, "flos": 20959514709120.0, "grad_norm": 2.581872009488739, "language_loss": 0.73064095, "learning_rate": 3.832447116820594e-06, "loss": 0.75253528, "num_input_tokens_seen": 56747385, "step": 2616, "time_per_iteration": 2.6660919189453125 }, { "auxiliary_loss_clip": 0.01130717, "auxiliary_loss_mlp": 0.01047511, "balance_loss_clip": 1.04999971, "balance_loss_mlp": 1.02794933, "epoch": 0.15734255223207574, "flos": 23038275530880.0, "grad_norm": 2.813587490853999, "language_loss": 0.72425079, "learning_rate": 3.832291037466539e-06, "loss": 0.74603307, "num_input_tokens_seen": 56768055, "step": 2617, "time_per_iteration": 2.768561363220215 }, { "auxiliary_loss_clip": 0.01138315, "auxiliary_loss_mlp": 0.0104637, "balance_loss_clip": 1.04947805, "balance_loss_mlp": 1.02548432, "epoch": 0.15740267548474374, "flos": 20551281661440.0, "grad_norm": 2.3222819484870016, "language_loss": 0.74358094, "learning_rate": 3.8321348886318235e-06, "loss": 0.76542777, "num_input_tokens_seen": 56785110, "step": 2618, "time_per_iteration": 2.66121768951416 }, { "auxiliary_loss_clip": 0.01162954, "auxiliary_loss_mlp": 0.01046178, "balance_loss_clip": 1.05417252, "balance_loss_mlp": 1.02526867, "epoch": 0.1574627987374117, "flos": 22666922772480.0, "grad_norm": 1.8808629075569874, "language_loss": 0.78896272, "learning_rate": 3.8319786703223695e-06, "loss": 0.81105405, "num_input_tokens_seen": 56804975, "step": 2619, "time_per_iteration": 2.6743338108062744 }, { "auxiliary_loss_clip": 0.01126081, "auxiliary_loss_mlp": 0.01055551, "balance_loss_clip": 1.05046356, "balance_loss_mlp": 1.03576207, "epoch": 0.15752292199007967, "flos": 16800664262400.0, "grad_norm": 1.9082963728737496, "language_loss": 0.76517296, "learning_rate": 3.831822382544101e-06, "loss": 0.78698927, "num_input_tokens_seen": 56822470, "step": 2620, "time_per_iteration": 2.6481080055236816 }, { "auxiliary_loss_clip": 0.01136128, "auxiliary_loss_mlp": 0.0104575, "balance_loss_clip": 1.05097985, "balance_loss_mlp": 1.02488887, "epoch": 0.15758304524274763, "flos": 29826002568960.0, "grad_norm": 1.6603432400664486, "language_loss": 0.7136035, "learning_rate": 3.831666025302944e-06, "loss": 0.73542225, "num_input_tokens_seen": 56842100, "step": 2621, "time_per_iteration": 2.70985746383667 }, { "auxiliary_loss_clip": 0.01103274, "auxiliary_loss_mlp": 0.01052522, "balance_loss_clip": 1.04624665, "balance_loss_mlp": 1.02921629, "epoch": 0.1576431684954156, "flos": 53577426723840.0, "grad_norm": 2.1843515622778624, "language_loss": 0.72136736, "learning_rate": 3.831509598604828e-06, "loss": 0.74292529, "num_input_tokens_seen": 56865920, "step": 2622, "time_per_iteration": 3.024561643600464 }, { "auxiliary_loss_clip": 0.01095163, "auxiliary_loss_mlp": 0.01043948, "balance_loss_clip": 1.04474711, "balance_loss_mlp": 1.02464843, "epoch": 0.15770329174808356, "flos": 20813609664000.0, "grad_norm": 1.6586715789846178, "language_loss": 0.87637675, "learning_rate": 3.831353102455684e-06, "loss": 0.8977679, "num_input_tokens_seen": 56885265, "step": 2623, "time_per_iteration": 2.9600114822387695 }, { "auxiliary_loss_clip": 0.01158714, "auxiliary_loss_mlp": 0.01044337, "balance_loss_clip": 1.05476475, "balance_loss_mlp": 1.02564478, "epoch": 0.15776341500075153, "flos": 24974004395520.0, "grad_norm": 1.6915331173398198, "language_loss": 0.81600082, "learning_rate": 3.831196536861448e-06, "loss": 0.83803129, "num_input_tokens_seen": 56906710, "step": 2624, "time_per_iteration": 2.6621103286743164 }, { "auxiliary_loss_clip": 0.01122344, "auxiliary_loss_mlp": 0.01049423, "balance_loss_clip": 1.04776418, "balance_loss_mlp": 1.02990842, "epoch": 0.15782353825341952, "flos": 21907915459200.0, "grad_norm": 2.879465237309773, "language_loss": 0.79977828, "learning_rate": 3.831039901828054e-06, "loss": 0.82149595, "num_input_tokens_seen": 56924275, "step": 2625, "time_per_iteration": 2.7291064262390137 }, { "auxiliary_loss_clip": 0.01157938, "auxiliary_loss_mlp": 0.01046203, "balance_loss_clip": 1.05403268, "balance_loss_mlp": 1.02857196, "epoch": 0.15788366150608749, "flos": 26177191292160.0, "grad_norm": 2.133783972400447, "language_loss": 0.80332482, "learning_rate": 3.830883197361445e-06, "loss": 0.8253662, "num_input_tokens_seen": 56941525, "step": 2626, "time_per_iteration": 4.252760171890259 }, { "auxiliary_loss_clip": 0.01102762, "auxiliary_loss_mlp": 0.01057658, "balance_loss_clip": 1.05214024, "balance_loss_mlp": 1.03512752, "epoch": 0.15794378475875545, "flos": 27709822753920.0, "grad_norm": 3.9802810067864045, "language_loss": 0.73636395, "learning_rate": 3.830726423467561e-06, "loss": 0.75796819, "num_input_tokens_seen": 56962145, "step": 2627, "time_per_iteration": 4.328871250152588 }, { "auxiliary_loss_clip": 0.01117433, "auxiliary_loss_mlp": 0.01055032, "balance_loss_clip": 1.0503006, "balance_loss_mlp": 1.0351001, "epoch": 0.15800390801142342, "flos": 12130158533760.0, "grad_norm": 2.0211273696228216, "language_loss": 0.84589541, "learning_rate": 3.830569580152348e-06, "loss": 0.86762005, "num_input_tokens_seen": 56977505, "step": 2628, "time_per_iteration": 2.6785013675689697 }, { "auxiliary_loss_clip": 0.01129476, "auxiliary_loss_mlp": 0.01040858, "balance_loss_clip": 1.05065978, "balance_loss_mlp": 1.02308416, "epoch": 0.15806403126409138, "flos": 20704728562560.0, "grad_norm": 1.897214582222077, "language_loss": 0.76437485, "learning_rate": 3.830412667421752e-06, "loss": 0.78607821, "num_input_tokens_seen": 56996770, "step": 2629, "time_per_iteration": 4.2878499031066895 }, { "auxiliary_loss_clip": 0.01143973, "auxiliary_loss_mlp": 0.01046449, "balance_loss_clip": 1.0529623, "balance_loss_mlp": 1.02675569, "epoch": 0.15812415451675935, "flos": 17821712269440.0, "grad_norm": 2.252423233454998, "language_loss": 0.73337436, "learning_rate": 3.8302556852817245e-06, "loss": 0.75527859, "num_input_tokens_seen": 57014970, "step": 2630, "time_per_iteration": 4.253108263015747 }, { "auxiliary_loss_clip": 0.01156261, "auxiliary_loss_mlp": 0.01045602, "balance_loss_clip": 1.05644512, "balance_loss_mlp": 1.02615929, "epoch": 0.15818427776942734, "flos": 20084048524800.0, "grad_norm": 2.390369083551665, "language_loss": 0.83678091, "learning_rate": 3.8300986337382184e-06, "loss": 0.85879952, "num_input_tokens_seen": 57034045, "step": 2631, "time_per_iteration": 2.6145882606506348 }, { "auxiliary_loss_clip": 0.01159092, "auxiliary_loss_mlp": 0.01045772, "balance_loss_clip": 1.05313432, "balance_loss_mlp": 1.02746117, "epoch": 0.1582444010220953, "flos": 21214911386880.0, "grad_norm": 1.8755653224160422, "language_loss": 0.78415525, "learning_rate": 3.8299415127971895e-06, "loss": 0.80620384, "num_input_tokens_seen": 57053695, "step": 2632, "time_per_iteration": 2.656691551208496 }, { "auxiliary_loss_clip": 0.01151481, "auxiliary_loss_mlp": 0.01057283, "balance_loss_clip": 1.05574381, "balance_loss_mlp": 1.03769732, "epoch": 0.15830452427476327, "flos": 17858341163520.0, "grad_norm": 2.079450153413421, "language_loss": 0.8301838, "learning_rate": 3.829784322464594e-06, "loss": 0.85227144, "num_input_tokens_seen": 57071290, "step": 2633, "time_per_iteration": 2.622725248336792 }, { "auxiliary_loss_clip": 0.01165069, "auxiliary_loss_mlp": 0.01041545, "balance_loss_clip": 1.05761647, "balance_loss_mlp": 1.02223265, "epoch": 0.15836464752743123, "flos": 24534960456960.0, "grad_norm": 2.1719104392782813, "language_loss": 0.77448404, "learning_rate": 3.829627062746394e-06, "loss": 0.79655015, "num_input_tokens_seen": 57091465, "step": 2634, "time_per_iteration": 2.6383235454559326 }, { "auxiliary_loss_clip": 0.01127407, "auxiliary_loss_mlp": 0.00777775, "balance_loss_clip": 1.05277348, "balance_loss_mlp": 1.00136137, "epoch": 0.1584247707800992, "flos": 20120821073280.0, "grad_norm": 3.5133527254089087, "language_loss": 0.88479185, "learning_rate": 3.829469733648552e-06, "loss": 0.90384364, "num_input_tokens_seen": 57110075, "step": 2635, "time_per_iteration": 2.725924491882324 }, { "auxiliary_loss_clip": 0.01096223, "auxiliary_loss_mlp": 0.01058885, "balance_loss_clip": 1.04816198, "balance_loss_mlp": 1.03847599, "epoch": 0.15848489403276717, "flos": 20375966355840.0, "grad_norm": 2.8627721083207627, "language_loss": 0.75762677, "learning_rate": 3.829312335177034e-06, "loss": 0.77917778, "num_input_tokens_seen": 57128945, "step": 2636, "time_per_iteration": 2.775310516357422 }, { "auxiliary_loss_clip": 0.01120174, "auxiliary_loss_mlp": 0.01043834, "balance_loss_clip": 1.05117822, "balance_loss_mlp": 1.02350879, "epoch": 0.15854501728543513, "flos": 39346890359040.0, "grad_norm": 2.388418559522659, "language_loss": 0.71977961, "learning_rate": 3.82915486733781e-06, "loss": 0.74141967, "num_input_tokens_seen": 57152385, "step": 2637, "time_per_iteration": 2.8375279903411865 }, { "auxiliary_loss_clip": 0.0115052, "auxiliary_loss_mlp": 0.01044842, "balance_loss_clip": 1.05661607, "balance_loss_mlp": 1.02640057, "epoch": 0.15860514053810312, "flos": 24864225454080.0, "grad_norm": 2.1640345554565057, "language_loss": 0.78352648, "learning_rate": 3.82899733013685e-06, "loss": 0.80548006, "num_input_tokens_seen": 57172620, "step": 2638, "time_per_iteration": 2.7298176288604736 }, { "auxiliary_loss_clip": 0.01129706, "auxiliary_loss_mlp": 0.01057375, "balance_loss_clip": 1.05311394, "balance_loss_mlp": 1.03715718, "epoch": 0.1586652637907711, "flos": 26177694082560.0, "grad_norm": 2.325769963269074, "language_loss": 0.75845039, "learning_rate": 3.828839723580128e-06, "loss": 0.78032124, "num_input_tokens_seen": 57194680, "step": 2639, "time_per_iteration": 2.7731449604034424 }, { "auxiliary_loss_clip": 0.01104856, "auxiliary_loss_mlp": 0.01057283, "balance_loss_clip": 1.05350864, "balance_loss_mlp": 1.03772068, "epoch": 0.15872538704343905, "flos": 19792058866560.0, "grad_norm": 2.173238447343554, "language_loss": 0.81319505, "learning_rate": 3.82868204767362e-06, "loss": 0.83481646, "num_input_tokens_seen": 57214675, "step": 2640, "time_per_iteration": 2.8024139404296875 }, { "auxiliary_loss_clip": 0.01135166, "auxiliary_loss_mlp": 0.01054673, "balance_loss_clip": 1.05492401, "balance_loss_mlp": 1.03426492, "epoch": 0.15878551029610702, "flos": 28475366342400.0, "grad_norm": 2.013499020988034, "language_loss": 0.66893363, "learning_rate": 3.828524302423306e-06, "loss": 0.69083202, "num_input_tokens_seen": 57235830, "step": 2641, "time_per_iteration": 2.7519116401672363 }, { "auxiliary_loss_clip": 0.01149448, "auxiliary_loss_mlp": 0.01051949, "balance_loss_clip": 1.05758858, "balance_loss_mlp": 1.0326376, "epoch": 0.15884563354877498, "flos": 24206701040640.0, "grad_norm": 2.139760259286454, "language_loss": 0.7552591, "learning_rate": 3.828366487835167e-06, "loss": 0.77727306, "num_input_tokens_seen": 57255970, "step": 2642, "time_per_iteration": 2.706136465072632 }, { "auxiliary_loss_clip": 0.01156917, "auxiliary_loss_mlp": 0.01042142, "balance_loss_clip": 1.06263423, "balance_loss_mlp": 1.02323556, "epoch": 0.15890575680144295, "flos": 23949795991680.0, "grad_norm": 1.9419610036505286, "language_loss": 0.70564604, "learning_rate": 3.828208603915186e-06, "loss": 0.72763658, "num_input_tokens_seen": 57274435, "step": 2643, "time_per_iteration": 2.682015895843506 }, { "auxiliary_loss_clip": 0.01161783, "auxiliary_loss_mlp": 0.01041643, "balance_loss_clip": 1.05891204, "balance_loss_mlp": 1.02389312, "epoch": 0.15896588005411091, "flos": 21215019127680.0, "grad_norm": 1.846517711414915, "language_loss": 0.78057045, "learning_rate": 3.828050650669353e-06, "loss": 0.80260473, "num_input_tokens_seen": 57293115, "step": 2644, "time_per_iteration": 2.683790922164917 }, { "auxiliary_loss_clip": 0.01151239, "auxiliary_loss_mlp": 0.01050105, "balance_loss_clip": 1.05701637, "balance_loss_mlp": 1.03154373, "epoch": 0.1590260033067789, "flos": 24352390604160.0, "grad_norm": 3.757920662841351, "language_loss": 0.81961924, "learning_rate": 3.827892628103657e-06, "loss": 0.84163266, "num_input_tokens_seen": 57312565, "step": 2645, "time_per_iteration": 2.698085069656372 }, { "auxiliary_loss_clip": 0.01162748, "auxiliary_loss_mlp": 0.01048492, "balance_loss_clip": 1.05487716, "balance_loss_mlp": 1.02854836, "epoch": 0.15908612655944687, "flos": 32048944583040.0, "grad_norm": 2.056693785790565, "language_loss": 0.69412929, "learning_rate": 3.827734536224087e-06, "loss": 0.71624172, "num_input_tokens_seen": 57333360, "step": 2646, "time_per_iteration": 2.7166528701782227 }, { "auxiliary_loss_clip": 0.01135067, "auxiliary_loss_mlp": 0.01040314, "balance_loss_clip": 1.05435526, "balance_loss_mlp": 1.02223015, "epoch": 0.15914624981211484, "flos": 17785370684160.0, "grad_norm": 2.5975497323405055, "language_loss": 0.62932581, "learning_rate": 3.827576375036642e-06, "loss": 0.65107965, "num_input_tokens_seen": 57350575, "step": 2647, "time_per_iteration": 2.7405354976654053 }, { "auxiliary_loss_clip": 0.01160144, "auxiliary_loss_mlp": 0.01047955, "balance_loss_clip": 1.05654776, "balance_loss_mlp": 1.02896523, "epoch": 0.1592063730647828, "flos": 17712507945600.0, "grad_norm": 2.2161421076431025, "language_loss": 0.89490473, "learning_rate": 3.827418144547318e-06, "loss": 0.91698575, "num_input_tokens_seen": 57367570, "step": 2648, "time_per_iteration": 2.6193346977233887 }, { "auxiliary_loss_clip": 0.01158791, "auxiliary_loss_mlp": 0.01048086, "balance_loss_clip": 1.05630398, "balance_loss_mlp": 1.03072906, "epoch": 0.15926649631745077, "flos": 18803545603200.0, "grad_norm": 1.9960039108301237, "language_loss": 0.91307199, "learning_rate": 3.827259844762114e-06, "loss": 0.93514073, "num_input_tokens_seen": 57383980, "step": 2649, "time_per_iteration": 2.6137378215789795 }, { "auxiliary_loss_clip": 0.01099661, "auxiliary_loss_mlp": 0.01044384, "balance_loss_clip": 1.05474401, "balance_loss_mlp": 1.02439272, "epoch": 0.15932661957011873, "flos": 17566243764480.0, "grad_norm": 2.3504548368335767, "language_loss": 0.71782613, "learning_rate": 3.827101475687033e-06, "loss": 0.73926663, "num_input_tokens_seen": 57400840, "step": 2650, "time_per_iteration": 2.8883376121520996 }, { "auxiliary_loss_clip": 0.01146809, "auxiliary_loss_mlp": 0.01041815, "balance_loss_clip": 1.05386841, "balance_loss_mlp": 1.02476835, "epoch": 0.15938674282278673, "flos": 13334351011200.0, "grad_norm": 1.8238326955956992, "language_loss": 0.71427429, "learning_rate": 3.826943037328082e-06, "loss": 0.73616046, "num_input_tokens_seen": 57419230, "step": 2651, "time_per_iteration": 2.607879638671875 }, { "auxiliary_loss_clip": 0.01118842, "auxiliary_loss_mlp": 0.00777496, "balance_loss_clip": 1.05154157, "balance_loss_mlp": 1.00132799, "epoch": 0.1594468660754547, "flos": 22488842119680.0, "grad_norm": 1.8928974850955373, "language_loss": 0.80185902, "learning_rate": 3.8267845296912674e-06, "loss": 0.82082248, "num_input_tokens_seen": 57439315, "step": 2652, "time_per_iteration": 2.718695640563965 }, { "auxiliary_loss_clip": 0.01138048, "auxiliary_loss_mlp": 0.00775, "balance_loss_clip": 1.0567826, "balance_loss_mlp": 1.00124729, "epoch": 0.15950698932812266, "flos": 15007320910080.0, "grad_norm": 2.6116065834427387, "language_loss": 0.69539076, "learning_rate": 3.826625952782601e-06, "loss": 0.71452117, "num_input_tokens_seen": 57454635, "step": 2653, "time_per_iteration": 2.7088639736175537 }, { "auxiliary_loss_clip": 0.01144826, "auxiliary_loss_mlp": 0.01038735, "balance_loss_clip": 1.05257821, "balance_loss_mlp": 1.02050805, "epoch": 0.15956711258079062, "flos": 30155052084480.0, "grad_norm": 2.1937273620657307, "language_loss": 0.76670635, "learning_rate": 3.826467306608095e-06, "loss": 0.78854191, "num_input_tokens_seen": 57476805, "step": 2654, "time_per_iteration": 2.79425048828125 }, { "auxiliary_loss_clip": 0.01114313, "auxiliary_loss_mlp": 0.01041134, "balance_loss_clip": 1.04714727, "balance_loss_mlp": 1.02248931, "epoch": 0.1596272358334586, "flos": 21032700670080.0, "grad_norm": 2.0572535633716247, "language_loss": 0.81873977, "learning_rate": 3.826308591173765e-06, "loss": 0.84029424, "num_input_tokens_seen": 57496400, "step": 2655, "time_per_iteration": 2.6990878582000732 }, { "auxiliary_loss_clip": 0.01112525, "auxiliary_loss_mlp": 0.01046346, "balance_loss_clip": 1.04670715, "balance_loss_mlp": 1.02849984, "epoch": 0.15968735908612655, "flos": 15268032800640.0, "grad_norm": 2.0964800101687486, "language_loss": 0.73768878, "learning_rate": 3.826149806485631e-06, "loss": 0.75927746, "num_input_tokens_seen": 57513700, "step": 2656, "time_per_iteration": 2.7409873008728027 }, { "auxiliary_loss_clip": 0.01111218, "auxiliary_loss_mlp": 0.01039948, "balance_loss_clip": 1.04749918, "balance_loss_mlp": 1.02220988, "epoch": 0.15974748233879452, "flos": 52665726695040.0, "grad_norm": 2.516351978408242, "language_loss": 0.77637637, "learning_rate": 3.825990952549713e-06, "loss": 0.79788804, "num_input_tokens_seen": 57536180, "step": 2657, "time_per_iteration": 2.984161376953125 }, { "auxiliary_loss_clip": 0.01142397, "auxiliary_loss_mlp": 0.01048058, "balance_loss_clip": 1.05276513, "balance_loss_mlp": 1.02984321, "epoch": 0.1598076055914625, "flos": 18733232730240.0, "grad_norm": 2.1741432296797303, "language_loss": 0.74654955, "learning_rate": 3.825832029372035e-06, "loss": 0.76845407, "num_input_tokens_seen": 57555025, "step": 2658, "time_per_iteration": 2.6795172691345215 }, { "auxiliary_loss_clip": 0.01137294, "auxiliary_loss_mlp": 0.01047097, "balance_loss_clip": 1.05887127, "balance_loss_mlp": 1.02581763, "epoch": 0.15986772884413047, "flos": 34349238535680.0, "grad_norm": 2.2676743120149916, "language_loss": 0.75164986, "learning_rate": 3.825673036958624e-06, "loss": 0.77349377, "num_input_tokens_seen": 57577660, "step": 2659, "time_per_iteration": 2.885744094848633 }, { "auxiliary_loss_clip": 0.01122752, "auxiliary_loss_mlp": 0.0105323, "balance_loss_clip": 1.0512991, "balance_loss_mlp": 1.0334295, "epoch": 0.15992785209679844, "flos": 22054969739520.0, "grad_norm": 2.181311046841435, "language_loss": 0.90998709, "learning_rate": 3.825513975315508e-06, "loss": 0.93174696, "num_input_tokens_seen": 57596335, "step": 2660, "time_per_iteration": 2.7562267780303955 }, { "auxiliary_loss_clip": 0.01114547, "auxiliary_loss_mlp": 0.01058378, "balance_loss_clip": 1.05538487, "balance_loss_mlp": 1.03590751, "epoch": 0.1599879753494664, "flos": 33066652625280.0, "grad_norm": 1.746468400789071, "language_loss": 0.77724659, "learning_rate": 3.82535484444872e-06, "loss": 0.79897583, "num_input_tokens_seen": 57616830, "step": 2661, "time_per_iteration": 2.9896914958953857 }, { "auxiliary_loss_clip": 0.0113781, "auxiliary_loss_mlp": 0.00777461, "balance_loss_clip": 1.05382478, "balance_loss_mlp": 1.00132632, "epoch": 0.16004809860213437, "flos": 28038010343040.0, "grad_norm": 2.0483033922540086, "language_loss": 0.74442393, "learning_rate": 3.825195644364292e-06, "loss": 0.76357663, "num_input_tokens_seen": 57635515, "step": 2662, "time_per_iteration": 2.7993714809417725 }, { "auxiliary_loss_clip": 0.01135674, "auxiliary_loss_mlp": 0.00780783, "balance_loss_clip": 1.05392313, "balance_loss_mlp": 1.0016191, "epoch": 0.16010822185480234, "flos": 22780113505920.0, "grad_norm": 2.9903694104875984, "language_loss": 0.82515085, "learning_rate": 3.825036375068263e-06, "loss": 0.84431541, "num_input_tokens_seen": 57654250, "step": 2663, "time_per_iteration": 2.678490161895752 }, { "auxiliary_loss_clip": 0.01112205, "auxiliary_loss_mlp": 0.01044917, "balance_loss_clip": 1.05182636, "balance_loss_mlp": 1.02574801, "epoch": 0.16016834510747033, "flos": 20084012611200.0, "grad_norm": 2.06786422122115, "language_loss": 0.7951405, "learning_rate": 3.824877036566672e-06, "loss": 0.81671166, "num_input_tokens_seen": 57672645, "step": 2664, "time_per_iteration": 2.819880962371826 }, { "auxiliary_loss_clip": 0.01151449, "auxiliary_loss_mlp": 0.01048023, "balance_loss_clip": 1.05374622, "balance_loss_mlp": 1.02886605, "epoch": 0.1602284683601383, "flos": 21173829206400.0, "grad_norm": 1.6697703441146605, "language_loss": 0.93748474, "learning_rate": 3.824717628865561e-06, "loss": 0.95947945, "num_input_tokens_seen": 57691055, "step": 2665, "time_per_iteration": 2.697660446166992 }, { "auxiliary_loss_clip": 0.01127607, "auxiliary_loss_mlp": 0.01047415, "balance_loss_clip": 1.05185676, "balance_loss_mlp": 1.02774525, "epoch": 0.16028859161280626, "flos": 14647568244480.0, "grad_norm": 2.9655602739253095, "language_loss": 0.85237324, "learning_rate": 3.824558151970974e-06, "loss": 0.87412339, "num_input_tokens_seen": 57707235, "step": 2666, "time_per_iteration": 4.282273530960083 }, { "auxiliary_loss_clip": 0.01129818, "auxiliary_loss_mlp": 0.00777125, "balance_loss_clip": 1.05257225, "balance_loss_mlp": 1.00145936, "epoch": 0.16034871486547422, "flos": 20990325600000.0, "grad_norm": 1.8366839898970433, "language_loss": 0.81284773, "learning_rate": 3.8243986058889595e-06, "loss": 0.83191717, "num_input_tokens_seen": 57724190, "step": 2667, "time_per_iteration": 2.69508695602417 }, { "auxiliary_loss_clip": 0.0116556, "auxiliary_loss_mlp": 0.01046526, "balance_loss_clip": 1.06089485, "balance_loss_mlp": 1.02643883, "epoch": 0.1604088381181422, "flos": 21397732634880.0, "grad_norm": 1.958935842080623, "language_loss": 0.74031079, "learning_rate": 3.824238990625567e-06, "loss": 0.76243162, "num_input_tokens_seen": 57743620, "step": 2668, "time_per_iteration": 4.2559425830841064 }, { "auxiliary_loss_clip": 0.01148853, "auxiliary_loss_mlp": 0.01051992, "balance_loss_clip": 1.05547619, "balance_loss_mlp": 1.03240585, "epoch": 0.16046896137081015, "flos": 23877040993920.0, "grad_norm": 1.7737626564305047, "language_loss": 0.77495629, "learning_rate": 3.824079306186848e-06, "loss": 0.7969647, "num_input_tokens_seen": 57764810, "step": 2669, "time_per_iteration": 2.6424050331115723 }, { "auxiliary_loss_clip": 0.01097339, "auxiliary_loss_mlp": 0.01012737, "balance_loss_clip": 1.06351233, "balance_loss_mlp": 1.00986385, "epoch": 0.16052908462347812, "flos": 59806709015040.0, "grad_norm": 0.8041290684345284, "language_loss": 0.5549804, "learning_rate": 3.823919552578861e-06, "loss": 0.57608116, "num_input_tokens_seen": 57824390, "step": 2670, "time_per_iteration": 4.765664100646973 }, { "auxiliary_loss_clip": 0.01149639, "auxiliary_loss_mlp": 0.01043383, "balance_loss_clip": 1.05322218, "balance_loss_mlp": 1.02430916, "epoch": 0.1605892078761461, "flos": 18296559089280.0, "grad_norm": 2.6306224128650464, "language_loss": 0.77778888, "learning_rate": 3.82375972980766e-06, "loss": 0.7997191, "num_input_tokens_seen": 57843665, "step": 2671, "time_per_iteration": 2.6876416206359863 }, { "auxiliary_loss_clip": 0.01151164, "auxiliary_loss_mlp": 0.01043962, "balance_loss_clip": 1.05529547, "balance_loss_mlp": 1.02503204, "epoch": 0.16064933112881408, "flos": 32160734686080.0, "grad_norm": 1.9167251889277674, "language_loss": 0.64766788, "learning_rate": 3.8235998378793086e-06, "loss": 0.66961908, "num_input_tokens_seen": 57863305, "step": 2672, "time_per_iteration": 2.7102553844451904 }, { "auxiliary_loss_clip": 0.01150206, "auxiliary_loss_mlp": 0.01046785, "balance_loss_clip": 1.05674481, "balance_loss_mlp": 1.02554154, "epoch": 0.16070945438148204, "flos": 19828795501440.0, "grad_norm": 2.045175098484539, "language_loss": 0.85708207, "learning_rate": 3.8234398767998675e-06, "loss": 0.87905198, "num_input_tokens_seen": 57883025, "step": 2673, "time_per_iteration": 2.656360626220703 }, { "auxiliary_loss_clip": 0.01125542, "auxiliary_loss_mlp": 0.01055838, "balance_loss_clip": 1.05366015, "balance_loss_mlp": 1.03716969, "epoch": 0.16076957763415, "flos": 18913144976640.0, "grad_norm": 2.339006860757087, "language_loss": 0.7289716, "learning_rate": 3.823279846575403e-06, "loss": 0.75078535, "num_input_tokens_seen": 57901430, "step": 2674, "time_per_iteration": 2.7122414112091064 }, { "auxiliary_loss_clip": 0.01150063, "auxiliary_loss_mlp": 0.01045468, "balance_loss_clip": 1.05416465, "balance_loss_mlp": 1.02464211, "epoch": 0.16082970088681797, "flos": 16764358590720.0, "grad_norm": 1.9341682597436423, "language_loss": 0.84438515, "learning_rate": 3.823119747211986e-06, "loss": 0.86634052, "num_input_tokens_seen": 57919550, "step": 2675, "time_per_iteration": 2.6646435260772705 }, { "auxiliary_loss_clip": 0.01116221, "auxiliary_loss_mlp": 0.01049343, "balance_loss_clip": 1.05220723, "balance_loss_mlp": 1.02823126, "epoch": 0.16088982413948594, "flos": 35150261783040.0, "grad_norm": 1.871909119220515, "language_loss": 0.82216591, "learning_rate": 3.822959578715685e-06, "loss": 0.84382153, "num_input_tokens_seen": 57939890, "step": 2676, "time_per_iteration": 2.8457534313201904 }, { "auxiliary_loss_clip": 0.01151157, "auxiliary_loss_mlp": 0.01049874, "balance_loss_clip": 1.05746996, "balance_loss_mlp": 1.03162253, "epoch": 0.1609499473921539, "flos": 18625105814400.0, "grad_norm": 2.1166154816193923, "language_loss": 0.73485494, "learning_rate": 3.822799341092573e-06, "loss": 0.75686526, "num_input_tokens_seen": 57957410, "step": 2677, "time_per_iteration": 2.65387225151062 }, { "auxiliary_loss_clip": 0.01138188, "auxiliary_loss_mlp": 0.01044363, "balance_loss_clip": 1.05438483, "balance_loss_mlp": 1.02537322, "epoch": 0.1610100706448219, "flos": 33145728416640.0, "grad_norm": 3.229282061984371, "language_loss": 0.76305777, "learning_rate": 3.822639034348728e-06, "loss": 0.78488332, "num_input_tokens_seen": 57977900, "step": 2678, "time_per_iteration": 2.836071014404297 }, { "auxiliary_loss_clip": 0.01148252, "auxiliary_loss_mlp": 0.01047887, "balance_loss_clip": 1.05379987, "balance_loss_mlp": 1.02789569, "epoch": 0.16107019389748986, "flos": 34676707852800.0, "grad_norm": 8.295814069484678, "language_loss": 0.70340431, "learning_rate": 3.822478658490228e-06, "loss": 0.7253657, "num_input_tokens_seen": 57998210, "step": 2679, "time_per_iteration": 2.771185874938965 }, { "auxiliary_loss_clip": 0.01059502, "auxiliary_loss_mlp": 0.00758644, "balance_loss_clip": 1.04695845, "balance_loss_mlp": 1.00150955, "epoch": 0.16113031715015783, "flos": 65713403260800.0, "grad_norm": 0.7819629653273137, "language_loss": 0.51843339, "learning_rate": 3.822318213523154e-06, "loss": 0.53661484, "num_input_tokens_seen": 58059420, "step": 2680, "time_per_iteration": 3.3107378482818604 }, { "auxiliary_loss_clip": 0.01144342, "auxiliary_loss_mlp": 0.01047358, "balance_loss_clip": 1.05360317, "balance_loss_mlp": 1.02632904, "epoch": 0.1611904404028258, "flos": 20810413353600.0, "grad_norm": 1.6718368455031125, "language_loss": 0.8028667, "learning_rate": 3.8221576994535925e-06, "loss": 0.82478368, "num_input_tokens_seen": 58078370, "step": 2681, "time_per_iteration": 2.6986513137817383 }, { "auxiliary_loss_clip": 0.01139192, "auxiliary_loss_mlp": 0.01055518, "balance_loss_clip": 1.05603266, "balance_loss_mlp": 1.03602743, "epoch": 0.16125056365549376, "flos": 27013335062400.0, "grad_norm": 2.154781054673542, "language_loss": 0.68957973, "learning_rate": 3.821997116287627e-06, "loss": 0.71152687, "num_input_tokens_seen": 58097395, "step": 2682, "time_per_iteration": 2.794686794281006 }, { "auxiliary_loss_clip": 0.01139216, "auxiliary_loss_mlp": 0.01052349, "balance_loss_clip": 1.05670619, "balance_loss_mlp": 1.03195262, "epoch": 0.16131068690816172, "flos": 19276524915840.0, "grad_norm": 1.9802191055590168, "language_loss": 0.87362224, "learning_rate": 3.821836464031348e-06, "loss": 0.89553785, "num_input_tokens_seen": 58115630, "step": 2683, "time_per_iteration": 2.703634262084961 }, { "auxiliary_loss_clip": 0.01165497, "auxiliary_loss_mlp": 0.0105575, "balance_loss_clip": 1.05714059, "balance_loss_mlp": 1.03491259, "epoch": 0.16137081016082971, "flos": 35337931367040.0, "grad_norm": 1.939499216066865, "language_loss": 0.74143028, "learning_rate": 3.821675742690849e-06, "loss": 0.76364273, "num_input_tokens_seen": 58138655, "step": 2684, "time_per_iteration": 2.7890264987945557 }, { "auxiliary_loss_clip": 0.01136683, "auxiliary_loss_mlp": 0.00778989, "balance_loss_clip": 1.05435085, "balance_loss_mlp": 1.00176883, "epoch": 0.16143093341349768, "flos": 34235257703040.0, "grad_norm": 1.9009911635557044, "language_loss": 0.70506597, "learning_rate": 3.821514952272223e-06, "loss": 0.72422272, "num_input_tokens_seen": 58157440, "step": 2685, "time_per_iteration": 2.803942918777466 }, { "auxiliary_loss_clip": 0.01116315, "auxiliary_loss_mlp": 0.01059092, "balance_loss_clip": 1.05291295, "balance_loss_mlp": 1.03757524, "epoch": 0.16149105666616564, "flos": 27999262546560.0, "grad_norm": 2.295686008167468, "language_loss": 0.72060591, "learning_rate": 3.821354092781567e-06, "loss": 0.74236, "num_input_tokens_seen": 58176660, "step": 2686, "time_per_iteration": 2.850309133529663 }, { "auxiliary_loss_clip": 0.01153803, "auxiliary_loss_mlp": 0.01048887, "balance_loss_clip": 1.05603862, "balance_loss_mlp": 1.02922952, "epoch": 0.1615511799188336, "flos": 19422214479360.0, "grad_norm": 2.056921120199424, "language_loss": 0.81720114, "learning_rate": 3.821193164224981e-06, "loss": 0.83922803, "num_input_tokens_seen": 58195085, "step": 2687, "time_per_iteration": 2.7077832221984863 }, { "auxiliary_loss_clip": 0.01154388, "auxiliary_loss_mlp": 0.01050682, "balance_loss_clip": 1.05335689, "balance_loss_mlp": 1.02910483, "epoch": 0.16161130317150157, "flos": 22854915578880.0, "grad_norm": 1.6747986106054085, "language_loss": 0.71680355, "learning_rate": 3.821032166608568e-06, "loss": 0.73885429, "num_input_tokens_seen": 58213540, "step": 2688, "time_per_iteration": 2.700073480606079 }, { "auxiliary_loss_clip": 0.0112226, "auxiliary_loss_mlp": 0.0105252, "balance_loss_clip": 1.0517168, "balance_loss_mlp": 1.03330338, "epoch": 0.16167142642416954, "flos": 26110577520000.0, "grad_norm": 2.2887064413695253, "language_loss": 0.76168394, "learning_rate": 3.8208710999384325e-06, "loss": 0.78343177, "num_input_tokens_seen": 58236995, "step": 2689, "time_per_iteration": 2.846964120864868 }, { "auxiliary_loss_clip": 0.01166324, "auxiliary_loss_mlp": 0.01052979, "balance_loss_clip": 1.05979431, "balance_loss_mlp": 1.03308284, "epoch": 0.1617315496768375, "flos": 22779646629120.0, "grad_norm": 2.045037041298705, "language_loss": 0.87211925, "learning_rate": 3.820709964220683e-06, "loss": 0.89431226, "num_input_tokens_seen": 58257230, "step": 2690, "time_per_iteration": 2.704497814178467 }, { "auxiliary_loss_clip": 0.01143898, "auxiliary_loss_mlp": 0.01046571, "balance_loss_clip": 1.05318451, "balance_loss_mlp": 1.02890396, "epoch": 0.1617916729295055, "flos": 22017299351040.0, "grad_norm": 1.7518031225399346, "language_loss": 0.87899524, "learning_rate": 3.8205487594614284e-06, "loss": 0.90089989, "num_input_tokens_seen": 58277080, "step": 2691, "time_per_iteration": 2.6763153076171875 }, { "auxiliary_loss_clip": 0.01150265, "auxiliary_loss_mlp": 0.01053114, "balance_loss_clip": 1.05237532, "balance_loss_mlp": 1.03142977, "epoch": 0.16185179618217346, "flos": 23438248450560.0, "grad_norm": 2.1723450057475313, "language_loss": 0.81989783, "learning_rate": 3.820387485666784e-06, "loss": 0.84193164, "num_input_tokens_seen": 58294815, "step": 2692, "time_per_iteration": 2.6381001472473145 }, { "auxiliary_loss_clip": 0.01167881, "auxiliary_loss_mlp": 0.0104606, "balance_loss_clip": 1.05555534, "balance_loss_mlp": 1.02499604, "epoch": 0.16191191943484143, "flos": 25666110627840.0, "grad_norm": 2.194958172554253, "language_loss": 0.81381011, "learning_rate": 3.820226142842862e-06, "loss": 0.83594954, "num_input_tokens_seen": 58313215, "step": 2693, "time_per_iteration": 2.6366944313049316 }, { "auxiliary_loss_clip": 0.01164466, "auxiliary_loss_mlp": 0.01058298, "balance_loss_clip": 1.0587461, "balance_loss_mlp": 1.03991616, "epoch": 0.1619720426875094, "flos": 23477355383040.0, "grad_norm": 2.778189532536263, "language_loss": 0.83837044, "learning_rate": 3.820064730995783e-06, "loss": 0.86059809, "num_input_tokens_seen": 58333215, "step": 2694, "time_per_iteration": 2.7802140712738037 }, { "auxiliary_loss_clip": 0.01116209, "auxiliary_loss_mlp": 0.0105764, "balance_loss_clip": 1.04927421, "balance_loss_mlp": 1.0366354, "epoch": 0.16203216594017736, "flos": 24133658734080.0, "grad_norm": 1.8201511645490482, "language_loss": 0.69709098, "learning_rate": 3.819903250131667e-06, "loss": 0.71882945, "num_input_tokens_seen": 58351160, "step": 2695, "time_per_iteration": 2.756904125213623 }, { "auxiliary_loss_clip": 0.01155526, "auxiliary_loss_mlp": 0.01050837, "balance_loss_clip": 1.05799723, "balance_loss_mlp": 1.03026128, "epoch": 0.16209228919284532, "flos": 22340889999360.0, "grad_norm": 2.1550523064219487, "language_loss": 0.82986331, "learning_rate": 3.819741700256637e-06, "loss": 0.85192692, "num_input_tokens_seen": 58368505, "step": 2696, "time_per_iteration": 2.651510238647461 }, { "auxiliary_loss_clip": 0.01174193, "auxiliary_loss_mlp": 0.01052819, "balance_loss_clip": 1.05826569, "balance_loss_mlp": 1.03095615, "epoch": 0.1621524124455133, "flos": 15815131827840.0, "grad_norm": 2.9267990143146503, "language_loss": 0.8862049, "learning_rate": 3.8195800813768194e-06, "loss": 0.90847504, "num_input_tokens_seen": 58385085, "step": 2697, "time_per_iteration": 2.5935380458831787 }, { "auxiliary_loss_clip": 0.01158945, "auxiliary_loss_mlp": 0.01045471, "balance_loss_clip": 1.0552485, "balance_loss_mlp": 1.02719641, "epoch": 0.16221253569818128, "flos": 30186688988160.0, "grad_norm": 1.7480298293719791, "language_loss": 0.80844599, "learning_rate": 3.819418393498343e-06, "loss": 0.83049017, "num_input_tokens_seen": 58406985, "step": 2698, "time_per_iteration": 2.6685965061187744 }, { "auxiliary_loss_clip": 0.01151678, "auxiliary_loss_mlp": 0.01050084, "balance_loss_clip": 1.05785704, "balance_loss_mlp": 1.03060579, "epoch": 0.16227265895084925, "flos": 24605991601920.0, "grad_norm": 1.590231062064763, "language_loss": 0.77499473, "learning_rate": 3.819256636627339e-06, "loss": 0.79701245, "num_input_tokens_seen": 58426205, "step": 2699, "time_per_iteration": 2.7206287384033203 }, { "auxiliary_loss_clip": 0.01134482, "auxiliary_loss_mlp": 0.01043888, "balance_loss_clip": 1.0504272, "balance_loss_mlp": 1.02510071, "epoch": 0.1623327822035172, "flos": 19573326996480.0, "grad_norm": 2.299083669251571, "language_loss": 0.85903585, "learning_rate": 3.81909481076994e-06, "loss": 0.88081944, "num_input_tokens_seen": 58443830, "step": 2700, "time_per_iteration": 2.6440224647521973 }, { "auxiliary_loss_clip": 0.01150266, "auxiliary_loss_mlp": 0.00778348, "balance_loss_clip": 1.05360484, "balance_loss_mlp": 1.00180686, "epoch": 0.16239290545618518, "flos": 26468462678400.0, "grad_norm": 1.7679372116400307, "language_loss": 0.80424523, "learning_rate": 3.818932915932284e-06, "loss": 0.82353133, "num_input_tokens_seen": 58464405, "step": 2701, "time_per_iteration": 2.6943976879119873 }, { "auxiliary_loss_clip": 0.01144477, "auxiliary_loss_mlp": 0.01046291, "balance_loss_clip": 1.05771017, "balance_loss_mlp": 1.02664542, "epoch": 0.16245302870885314, "flos": 15851940289920.0, "grad_norm": 1.6539412057050027, "language_loss": 0.72777367, "learning_rate": 3.818770952120511e-06, "loss": 0.74968135, "num_input_tokens_seen": 58483295, "step": 2702, "time_per_iteration": 2.6914141178131104 }, { "auxiliary_loss_clip": 0.01156069, "auxiliary_loss_mlp": 0.01050141, "balance_loss_clip": 1.05802381, "balance_loss_mlp": 1.02896905, "epoch": 0.1625131519615211, "flos": 14756521173120.0, "grad_norm": 1.8265391375227176, "language_loss": 0.7273894, "learning_rate": 3.81860891934076e-06, "loss": 0.74945152, "num_input_tokens_seen": 58501205, "step": 2703, "time_per_iteration": 2.6301820278167725 }, { "auxiliary_loss_clip": 0.01165642, "auxiliary_loss_mlp": 0.01050857, "balance_loss_clip": 1.0553968, "balance_loss_mlp": 1.02942359, "epoch": 0.1625732752141891, "flos": 28220508368640.0, "grad_norm": 3.0329584489902666, "language_loss": 0.70018482, "learning_rate": 3.818446817599176e-06, "loss": 0.72234988, "num_input_tokens_seen": 58522315, "step": 2704, "time_per_iteration": 2.6667227745056152 }, { "auxiliary_loss_clip": 0.01034679, "auxiliary_loss_mlp": 0.01001657, "balance_loss_clip": 1.03343439, "balance_loss_mlp": 0.99865305, "epoch": 0.16263339846685707, "flos": 67327947688320.0, "grad_norm": 0.7801109588151329, "language_loss": 0.5336051, "learning_rate": 3.818284646901907e-06, "loss": 0.55396849, "num_input_tokens_seen": 58586695, "step": 2705, "time_per_iteration": 4.808594465255737 }, { "auxiliary_loss_clip": 0.01138628, "auxiliary_loss_mlp": 0.00781324, "balance_loss_clip": 1.0539608, "balance_loss_mlp": 1.00171995, "epoch": 0.16269352171952503, "flos": 14319165173760.0, "grad_norm": 2.3827832530074455, "language_loss": 0.7536028, "learning_rate": 3.818122407255102e-06, "loss": 0.77280229, "num_input_tokens_seen": 58602435, "step": 2706, "time_per_iteration": 4.126614570617676 }, { "auxiliary_loss_clip": 0.01130684, "auxiliary_loss_mlp": 0.01047489, "balance_loss_clip": 1.0523324, "balance_loss_mlp": 1.02859437, "epoch": 0.162753644972193, "flos": 28361205941760.0, "grad_norm": 2.2272392184651038, "language_loss": 0.72203928, "learning_rate": 3.817960098664914e-06, "loss": 0.74382102, "num_input_tokens_seen": 58621275, "step": 2707, "time_per_iteration": 4.2739410400390625 }, { "auxiliary_loss_clip": 0.01142142, "auxiliary_loss_mlp": 0.01047652, "balance_loss_clip": 1.05433679, "balance_loss_mlp": 1.02898431, "epoch": 0.16281376822486096, "flos": 19937856170880.0, "grad_norm": 3.192481802987827, "language_loss": 0.83481139, "learning_rate": 3.817797721137495e-06, "loss": 0.85670936, "num_input_tokens_seen": 58637550, "step": 2708, "time_per_iteration": 2.7163965702056885 }, { "auxiliary_loss_clip": 0.01101561, "auxiliary_loss_mlp": 0.00781217, "balance_loss_clip": 1.04896522, "balance_loss_mlp": 1.00177419, "epoch": 0.16287389147752893, "flos": 21251719848960.0, "grad_norm": 2.2850459718507654, "language_loss": 0.86162847, "learning_rate": 3.817635274679006e-06, "loss": 0.88045627, "num_input_tokens_seen": 58654135, "step": 2709, "time_per_iteration": 4.474989652633667 }, { "auxiliary_loss_clip": 0.0114031, "auxiliary_loss_mlp": 0.00777602, "balance_loss_clip": 1.05267572, "balance_loss_mlp": 1.00172114, "epoch": 0.1629340147301969, "flos": 19244672530560.0, "grad_norm": 2.581053296112052, "language_loss": 0.91410124, "learning_rate": 3.817472759295605e-06, "loss": 0.93328035, "num_input_tokens_seen": 58674320, "step": 2710, "time_per_iteration": 2.6951892375946045 }, { "auxiliary_loss_clip": 0.01118597, "auxiliary_loss_mlp": 0.01054854, "balance_loss_clip": 1.05254805, "balance_loss_mlp": 1.03451669, "epoch": 0.16299413798286488, "flos": 21249816428160.0, "grad_norm": 2.4322540773438437, "language_loss": 0.81690979, "learning_rate": 3.817310174993453e-06, "loss": 0.83864427, "num_input_tokens_seen": 58691000, "step": 2711, "time_per_iteration": 2.7854437828063965 }, { "auxiliary_loss_clip": 0.01146056, "auxiliary_loss_mlp": 0.01040648, "balance_loss_clip": 1.04954815, "balance_loss_mlp": 1.02107334, "epoch": 0.16305426123553285, "flos": 18770579896320.0, "grad_norm": 3.73256798888747, "language_loss": 0.8091476, "learning_rate": 3.817147521778719e-06, "loss": 0.83101463, "num_input_tokens_seen": 58710230, "step": 2712, "time_per_iteration": 2.834291458129883 }, { "auxiliary_loss_clip": 0.01171211, "auxiliary_loss_mlp": 0.01053015, "balance_loss_clip": 1.0590024, "balance_loss_mlp": 1.03273714, "epoch": 0.16311438448820081, "flos": 22087648137600.0, "grad_norm": 2.3460895846171996, "language_loss": 0.7681579, "learning_rate": 3.816984799657568e-06, "loss": 0.79040015, "num_input_tokens_seen": 58728610, "step": 2713, "time_per_iteration": 2.6188278198242188 }, { "auxiliary_loss_clip": 0.01156539, "auxiliary_loss_mlp": 0.0105792, "balance_loss_clip": 1.06240916, "balance_loss_mlp": 1.03832221, "epoch": 0.16317450774086878, "flos": 16467700164480.0, "grad_norm": 2.543173325075216, "language_loss": 0.79012156, "learning_rate": 3.8168220086361715e-06, "loss": 0.81226611, "num_input_tokens_seen": 58744385, "step": 2714, "time_per_iteration": 2.6534018516540527 }, { "auxiliary_loss_clip": 0.01149567, "auxiliary_loss_mlp": 0.01056152, "balance_loss_clip": 1.05467987, "balance_loss_mlp": 1.03724504, "epoch": 0.16323463099353674, "flos": 24352929308160.0, "grad_norm": 1.614702766215493, "language_loss": 0.77693665, "learning_rate": 3.816659148720702e-06, "loss": 0.79899377, "num_input_tokens_seen": 58763905, "step": 2715, "time_per_iteration": 2.856006383895874 }, { "auxiliary_loss_clip": 0.01129437, "auxiliary_loss_mlp": 0.01044046, "balance_loss_clip": 1.04810584, "balance_loss_mlp": 1.02525854, "epoch": 0.1632947542462047, "flos": 24900782520960.0, "grad_norm": 2.374975046722651, "language_loss": 0.81513858, "learning_rate": 3.816496219917336e-06, "loss": 0.83687335, "num_input_tokens_seen": 58785580, "step": 2716, "time_per_iteration": 2.6750845909118652 }, { "auxiliary_loss_clip": 0.01144393, "auxiliary_loss_mlp": 0.01055927, "balance_loss_clip": 1.05851114, "balance_loss_mlp": 1.03703237, "epoch": 0.1633548774988727, "flos": 24900279730560.0, "grad_norm": 1.8186679286330678, "language_loss": 0.86522418, "learning_rate": 3.816333222232251e-06, "loss": 0.88722742, "num_input_tokens_seen": 58806075, "step": 2717, "time_per_iteration": 2.761622428894043 }, { "auxiliary_loss_clip": 0.01135377, "auxiliary_loss_mlp": 0.01045964, "balance_loss_clip": 1.05334044, "balance_loss_mlp": 1.0274632, "epoch": 0.16341500075154067, "flos": 30441798357120.0, "grad_norm": 1.8799656187942837, "language_loss": 0.76924133, "learning_rate": 3.816170155671629e-06, "loss": 0.79105473, "num_input_tokens_seen": 58827405, "step": 2718, "time_per_iteration": 2.7946770191192627 }, { "auxiliary_loss_clip": 0.01145146, "auxiliary_loss_mlp": 0.01043682, "balance_loss_clip": 1.05553615, "balance_loss_mlp": 1.02566922, "epoch": 0.16347512400420863, "flos": 22784530878720.0, "grad_norm": 2.2449478392049906, "language_loss": 0.73827291, "learning_rate": 3.816007020241652e-06, "loss": 0.76016116, "num_input_tokens_seen": 58847205, "step": 2719, "time_per_iteration": 2.719980478286743 }, { "auxiliary_loss_clip": 0.01128361, "auxiliary_loss_mlp": 0.01045887, "balance_loss_clip": 1.04900515, "balance_loss_mlp": 1.02732563, "epoch": 0.1635352472568766, "flos": 22633274707200.0, "grad_norm": 1.7092252575708884, "language_loss": 0.72267497, "learning_rate": 3.815843815948507e-06, "loss": 0.74441749, "num_input_tokens_seen": 58866865, "step": 2720, "time_per_iteration": 2.8737292289733887 }, { "auxiliary_loss_clip": 0.01109456, "auxiliary_loss_mlp": 0.01049703, "balance_loss_clip": 1.05004287, "balance_loss_mlp": 1.02840054, "epoch": 0.16359537050954456, "flos": 15522998515200.0, "grad_norm": 2.1621365878543153, "language_loss": 0.75120997, "learning_rate": 3.8156805427983824e-06, "loss": 0.77280164, "num_input_tokens_seen": 58885200, "step": 2721, "time_per_iteration": 2.785296678543091 }, { "auxiliary_loss_clip": 0.01110342, "auxiliary_loss_mlp": 0.01059955, "balance_loss_clip": 1.04597676, "balance_loss_mlp": 1.03734064, "epoch": 0.16365549376221253, "flos": 22090162089600.0, "grad_norm": 1.9032438792006017, "language_loss": 0.79073942, "learning_rate": 3.8155172007974695e-06, "loss": 0.81244236, "num_input_tokens_seen": 58906385, "step": 2722, "time_per_iteration": 2.7850708961486816 }, { "auxiliary_loss_clip": 0.01149809, "auxiliary_loss_mlp": 0.00778798, "balance_loss_clip": 1.05395257, "balance_loss_mlp": 1.00171757, "epoch": 0.1637156170148805, "flos": 24060400945920.0, "grad_norm": 2.3019049903761215, "language_loss": 0.84954333, "learning_rate": 3.8153537899519624e-06, "loss": 0.86882937, "num_input_tokens_seen": 58925040, "step": 2723, "time_per_iteration": 2.7268764972686768 }, { "auxiliary_loss_clip": 0.01108328, "auxiliary_loss_mlp": 0.01044851, "balance_loss_clip": 1.04805517, "balance_loss_mlp": 1.02493143, "epoch": 0.1637757402675485, "flos": 26685362954880.0, "grad_norm": 1.8985615531712963, "language_loss": 0.71018666, "learning_rate": 3.815190310268058e-06, "loss": 0.73171842, "num_input_tokens_seen": 58944790, "step": 2724, "time_per_iteration": 2.7691783905029297 }, { "auxiliary_loss_clip": 0.01118053, "auxiliary_loss_mlp": 0.01041883, "balance_loss_clip": 1.05226958, "balance_loss_mlp": 1.02364373, "epoch": 0.16383586352021645, "flos": 16106941918080.0, "grad_norm": 2.1059770262776136, "language_loss": 0.70552838, "learning_rate": 3.815026761751955e-06, "loss": 0.72712779, "num_input_tokens_seen": 58962500, "step": 2725, "time_per_iteration": 2.6936957836151123 }, { "auxiliary_loss_clip": 0.01112368, "auxiliary_loss_mlp": 0.01046594, "balance_loss_clip": 1.04912174, "balance_loss_mlp": 1.028391, "epoch": 0.16389598677288442, "flos": 19165991788800.0, "grad_norm": 2.27810298992254, "language_loss": 0.88491893, "learning_rate": 3.814863144409855e-06, "loss": 0.90650856, "num_input_tokens_seen": 58980355, "step": 2726, "time_per_iteration": 2.7967143058776855 }, { "auxiliary_loss_clip": 0.01157668, "auxiliary_loss_mlp": 0.0105068, "balance_loss_clip": 1.06062055, "balance_loss_mlp": 1.03099847, "epoch": 0.16395611002555238, "flos": 21507008785920.0, "grad_norm": 2.0584475237926303, "language_loss": 0.7469939, "learning_rate": 3.814699458247963e-06, "loss": 0.7690773, "num_input_tokens_seen": 58999505, "step": 2727, "time_per_iteration": 2.6818623542785645 }, { "auxiliary_loss_clip": 0.01150971, "auxiliary_loss_mlp": 0.01052077, "balance_loss_clip": 1.0570507, "balance_loss_mlp": 1.03527999, "epoch": 0.16401623327822035, "flos": 21470918595840.0, "grad_norm": 1.6112579442237729, "language_loss": 0.83097756, "learning_rate": 3.8145357032724855e-06, "loss": 0.85300803, "num_input_tokens_seen": 59017930, "step": 2728, "time_per_iteration": 2.675360918045044 }, { "auxiliary_loss_clip": 0.01156153, "auxiliary_loss_mlp": 0.01045609, "balance_loss_clip": 1.05826735, "balance_loss_mlp": 1.02602315, "epoch": 0.1640763565308883, "flos": 13626232928640.0, "grad_norm": 2.5738755626941106, "language_loss": 0.84892929, "learning_rate": 3.814371879489633e-06, "loss": 0.87094688, "num_input_tokens_seen": 59035130, "step": 2729, "time_per_iteration": 2.7004599571228027 }, { "auxiliary_loss_clip": 0.01167293, "auxiliary_loss_mlp": 0.01048461, "balance_loss_clip": 1.0591594, "balance_loss_mlp": 1.03053224, "epoch": 0.16413647978355628, "flos": 15451464579840.0, "grad_norm": 1.9897225699042427, "language_loss": 0.72895479, "learning_rate": 3.814207986905616e-06, "loss": 0.75111228, "num_input_tokens_seen": 59053080, "step": 2730, "time_per_iteration": 2.593179702758789 }, { "auxiliary_loss_clip": 0.01142509, "auxiliary_loss_mlp": 0.01050071, "balance_loss_clip": 1.05208349, "balance_loss_mlp": 1.02908981, "epoch": 0.16419660303622427, "flos": 45878682015360.0, "grad_norm": 1.6754501336017709, "language_loss": 0.74384654, "learning_rate": 3.814044025526651e-06, "loss": 0.76577234, "num_input_tokens_seen": 59075610, "step": 2731, "time_per_iteration": 2.8702962398529053 }, { "auxiliary_loss_clip": 0.01122791, "auxiliary_loss_mlp": 0.01047176, "balance_loss_clip": 1.05006754, "balance_loss_mlp": 1.02650499, "epoch": 0.16425672628889224, "flos": 18952826526720.0, "grad_norm": 2.031351475505915, "language_loss": 0.79190683, "learning_rate": 3.8138799953589548e-06, "loss": 0.8136065, "num_input_tokens_seen": 59094555, "step": 2732, "time_per_iteration": 2.734529972076416 }, { "auxiliary_loss_clip": 0.01141118, "auxiliary_loss_mlp": 0.01047385, "balance_loss_clip": 1.05340672, "balance_loss_mlp": 1.02796555, "epoch": 0.1643168495415602, "flos": 24312996362880.0, "grad_norm": 2.250003976384769, "language_loss": 0.69526887, "learning_rate": 3.8137158964087473e-06, "loss": 0.71715385, "num_input_tokens_seen": 59113515, "step": 2733, "time_per_iteration": 2.672377109527588 }, { "auxiliary_loss_clip": 0.01143332, "auxiliary_loss_mlp": 0.01053232, "balance_loss_clip": 1.05603123, "balance_loss_mlp": 1.0325135, "epoch": 0.16437697279422817, "flos": 26428421992320.0, "grad_norm": 2.000873580428856, "language_loss": 0.80976766, "learning_rate": 3.8135517286822508e-06, "loss": 0.83173329, "num_input_tokens_seen": 59133275, "step": 2734, "time_per_iteration": 2.710293769836426 }, { "auxiliary_loss_clip": 0.01135758, "auxiliary_loss_mlp": 0.01056722, "balance_loss_clip": 1.05488348, "balance_loss_mlp": 1.03470409, "epoch": 0.16443709604689613, "flos": 34532239351680.0, "grad_norm": 2.100664117201308, "language_loss": 0.81810421, "learning_rate": 3.8133874921856914e-06, "loss": 0.840029, "num_input_tokens_seen": 59154095, "step": 2735, "time_per_iteration": 2.8074140548706055 }, { "auxiliary_loss_clip": 0.01070875, "auxiliary_loss_mlp": 0.01044313, "balance_loss_clip": 1.04323888, "balance_loss_mlp": 1.02508426, "epoch": 0.1644972192995641, "flos": 23258048895360.0, "grad_norm": 2.405088987017839, "language_loss": 0.78515649, "learning_rate": 3.813223186925296e-06, "loss": 0.80630839, "num_input_tokens_seen": 59173795, "step": 2736, "time_per_iteration": 2.839087963104248 }, { "auxiliary_loss_clip": 0.01147998, "auxiliary_loss_mlp": 0.01054659, "balance_loss_clip": 1.05859447, "balance_loss_mlp": 1.03513288, "epoch": 0.1645573425522321, "flos": 26979543342720.0, "grad_norm": 1.9462182296456145, "language_loss": 0.81052899, "learning_rate": 3.8130588129072964e-06, "loss": 0.83255553, "num_input_tokens_seen": 59191610, "step": 2737, "time_per_iteration": 2.7328996658325195 }, { "auxiliary_loss_clip": 0.01150424, "auxiliary_loss_mlp": 0.01052207, "balance_loss_clip": 1.0559026, "balance_loss_mlp": 1.03065443, "epoch": 0.16461746580490005, "flos": 28731768600960.0, "grad_norm": 1.8596348168124566, "language_loss": 0.87449318, "learning_rate": 3.8128943701379246e-06, "loss": 0.89651948, "num_input_tokens_seen": 59213000, "step": 2738, "time_per_iteration": 2.7345526218414307 }, { "auxiliary_loss_clip": 0.01139154, "auxiliary_loss_mlp": 0.0106055, "balance_loss_clip": 1.05534518, "balance_loss_mlp": 1.04079759, "epoch": 0.16467758905756802, "flos": 24930156867840.0, "grad_norm": 1.728421510231393, "language_loss": 0.71997833, "learning_rate": 3.8127298586234167e-06, "loss": 0.74197543, "num_input_tokens_seen": 59232340, "step": 2739, "time_per_iteration": 2.7091422080993652 }, { "auxiliary_loss_clip": 0.01154419, "auxiliary_loss_mlp": 0.0105106, "balance_loss_clip": 1.05673754, "balance_loss_mlp": 1.0312835, "epoch": 0.16473771231023598, "flos": 24826519152000.0, "grad_norm": 1.8559436932352185, "language_loss": 0.81645715, "learning_rate": 3.8125652783700104e-06, "loss": 0.83851194, "num_input_tokens_seen": 59253950, "step": 2740, "time_per_iteration": 2.712658166885376 }, { "auxiliary_loss_clip": 0.01114061, "auxiliary_loss_mlp": 0.01068725, "balance_loss_clip": 1.04991829, "balance_loss_mlp": 1.04307163, "epoch": 0.16479783556290395, "flos": 39896072375040.0, "grad_norm": 2.0528021789830837, "language_loss": 0.69467485, "learning_rate": 3.8124006293839475e-06, "loss": 0.71650267, "num_input_tokens_seen": 59275545, "step": 2741, "time_per_iteration": 2.8629493713378906 }, { "auxiliary_loss_clip": 0.01167543, "auxiliary_loss_mlp": 0.01048721, "balance_loss_clip": 1.05907226, "balance_loss_mlp": 1.02906334, "epoch": 0.16485795881557191, "flos": 19897061299200.0, "grad_norm": 1.7765193730452222, "language_loss": 0.79811072, "learning_rate": 3.812235911671472e-06, "loss": 0.8202734, "num_input_tokens_seen": 59293480, "step": 2742, "time_per_iteration": 2.626775026321411 }, { "auxiliary_loss_clip": 0.01141681, "auxiliary_loss_mlp": 0.01055663, "balance_loss_clip": 1.05664062, "balance_loss_mlp": 1.03477716, "epoch": 0.16491808206823988, "flos": 20556129997440.0, "grad_norm": 1.91797408289014, "language_loss": 0.8499459, "learning_rate": 3.8120711252388274e-06, "loss": 0.87191939, "num_input_tokens_seen": 59313435, "step": 2743, "time_per_iteration": 2.8218302726745605 }, { "auxiliary_loss_clip": 0.01162447, "auxiliary_loss_mlp": 0.01051969, "balance_loss_clip": 1.05743837, "balance_loss_mlp": 1.03196514, "epoch": 0.16497820532090787, "flos": 23800802376960.0, "grad_norm": 1.4425200129075006, "language_loss": 0.85558498, "learning_rate": 3.811906270092265e-06, "loss": 0.87772918, "num_input_tokens_seen": 59331535, "step": 2744, "time_per_iteration": 4.206263542175293 }, { "auxiliary_loss_clip": 0.01131671, "auxiliary_loss_mlp": 0.0104676, "balance_loss_clip": 1.05206287, "balance_loss_mlp": 1.02812767, "epoch": 0.16503832857357584, "flos": 25482642935040.0, "grad_norm": 1.6285200980820358, "language_loss": 0.82770813, "learning_rate": 3.811741346238036e-06, "loss": 0.84949243, "num_input_tokens_seen": 59350680, "step": 2745, "time_per_iteration": 4.331594467163086 }, { "auxiliary_loss_clip": 0.011344, "auxiliary_loss_mlp": 0.01057242, "balance_loss_clip": 1.05874014, "balance_loss_mlp": 1.03825223, "epoch": 0.1650984518262438, "flos": 17676058619520.0, "grad_norm": 6.766690288332402, "language_loss": 0.76811314, "learning_rate": 3.8115763536823923e-06, "loss": 0.79002959, "num_input_tokens_seen": 59367020, "step": 2746, "time_per_iteration": 4.225586414337158 }, { "auxiliary_loss_clip": 0.01164296, "auxiliary_loss_mlp": 0.01055636, "balance_loss_clip": 1.05781221, "balance_loss_mlp": 1.03533494, "epoch": 0.16515857507891177, "flos": 18698327688960.0, "grad_norm": 1.9760186874049024, "language_loss": 0.80818808, "learning_rate": 3.811411292431592e-06, "loss": 0.83038735, "num_input_tokens_seen": 59386075, "step": 2747, "time_per_iteration": 2.6862480640411377 }, { "auxiliary_loss_clip": 0.01157975, "auxiliary_loss_mlp": 0.0104673, "balance_loss_clip": 1.05990267, "balance_loss_mlp": 1.02664328, "epoch": 0.16521869833157973, "flos": 15010481306880.0, "grad_norm": 2.0608482379031337, "language_loss": 0.69433749, "learning_rate": 3.8112461624918945e-06, "loss": 0.71638453, "num_input_tokens_seen": 59402690, "step": 2748, "time_per_iteration": 2.6520986557006836 }, { "auxiliary_loss_clip": 0.01169692, "auxiliary_loss_mlp": 0.00778195, "balance_loss_clip": 1.06237423, "balance_loss_mlp": 1.00173104, "epoch": 0.1652788215842477, "flos": 22121152548480.0, "grad_norm": 2.259215537482641, "language_loss": 0.88012803, "learning_rate": 3.811080963869561e-06, "loss": 0.89960694, "num_input_tokens_seen": 59421130, "step": 2749, "time_per_iteration": 4.260679244995117 }, { "auxiliary_loss_clip": 0.01154179, "auxiliary_loss_mlp": 0.01045617, "balance_loss_clip": 1.05586052, "balance_loss_mlp": 1.02542281, "epoch": 0.16533894483691566, "flos": 18333080242560.0, "grad_norm": 2.0880864906339864, "language_loss": 0.79240286, "learning_rate": 3.8109156965708557e-06, "loss": 0.81440079, "num_input_tokens_seen": 59438970, "step": 2750, "time_per_iteration": 2.6335251331329346 }, { "auxiliary_loss_clip": 0.01153343, "auxiliary_loss_mlp": 0.0104591, "balance_loss_clip": 1.0579437, "balance_loss_mlp": 1.02602625, "epoch": 0.16539906808958366, "flos": 22382115834240.0, "grad_norm": 1.6952801391084946, "language_loss": 0.94854712, "learning_rate": 3.8107503606020455e-06, "loss": 0.97053963, "num_input_tokens_seen": 59458510, "step": 2751, "time_per_iteration": 2.697174310684204 }, { "auxiliary_loss_clip": 0.0106803, "auxiliary_loss_mlp": 0.0105236, "balance_loss_clip": 1.04625726, "balance_loss_mlp": 1.03247619, "epoch": 0.16545919134225162, "flos": 22711093522560.0, "grad_norm": 2.614588592950962, "language_loss": 0.71231711, "learning_rate": 3.8105849559693997e-06, "loss": 0.73352098, "num_input_tokens_seen": 59477110, "step": 2752, "time_per_iteration": 2.7780745029449463 }, { "auxiliary_loss_clip": 0.01090521, "auxiliary_loss_mlp": 0.01022104, "balance_loss_clip": 1.05741131, "balance_loss_mlp": 1.01941013, "epoch": 0.1655193145949196, "flos": 67802974076160.0, "grad_norm": 0.7721529651221379, "language_loss": 0.54058975, "learning_rate": 3.810419482679192e-06, "loss": 0.56171602, "num_input_tokens_seen": 59541155, "step": 2753, "time_per_iteration": 3.3371469974517822 }, { "auxiliary_loss_clip": 0.01163808, "auxiliary_loss_mlp": 0.00778536, "balance_loss_clip": 1.05587018, "balance_loss_mlp": 1.00172091, "epoch": 0.16557943784758755, "flos": 24280389792000.0, "grad_norm": 1.6411537728312637, "language_loss": 0.75436741, "learning_rate": 3.8102539407376954e-06, "loss": 0.7737909, "num_input_tokens_seen": 59561155, "step": 2754, "time_per_iteration": 2.6382133960723877 }, { "auxiliary_loss_clip": 0.01139421, "auxiliary_loss_mlp": 0.01060584, "balance_loss_clip": 1.05406713, "balance_loss_mlp": 1.03768396, "epoch": 0.16563956110025552, "flos": 20083617561600.0, "grad_norm": 2.4067479946694137, "language_loss": 0.86654639, "learning_rate": 3.810088330151188e-06, "loss": 0.88854647, "num_input_tokens_seen": 59580460, "step": 2755, "time_per_iteration": 2.6590075492858887 }, { "auxiliary_loss_clip": 0.01122817, "auxiliary_loss_mlp": 0.01053169, "balance_loss_clip": 1.04948378, "balance_loss_mlp": 1.03293943, "epoch": 0.16569968435292348, "flos": 28034454896640.0, "grad_norm": 1.7268487777137649, "language_loss": 0.73350251, "learning_rate": 3.80992265092595e-06, "loss": 0.75526237, "num_input_tokens_seen": 59600025, "step": 2756, "time_per_iteration": 2.771820545196533 }, { "auxiliary_loss_clip": 0.01128662, "auxiliary_loss_mlp": 0.01049666, "balance_loss_clip": 1.05550277, "balance_loss_mlp": 1.02969813, "epoch": 0.16575980760559147, "flos": 26250233598720.0, "grad_norm": 1.5540667033085804, "language_loss": 0.75308084, "learning_rate": 3.8097569030682636e-06, "loss": 0.77486414, "num_input_tokens_seen": 59620600, "step": 2757, "time_per_iteration": 2.8106157779693604 }, { "auxiliary_loss_clip": 0.01143608, "auxiliary_loss_mlp": 0.01054064, "balance_loss_clip": 1.057634, "balance_loss_mlp": 1.03390563, "epoch": 0.16581993085825944, "flos": 26943955943040.0, "grad_norm": 1.8675154897424497, "language_loss": 0.84604371, "learning_rate": 3.8095910865844137e-06, "loss": 0.86802036, "num_input_tokens_seen": 59641385, "step": 2758, "time_per_iteration": 2.8663368225097656 }, { "auxiliary_loss_clip": 0.01168186, "auxiliary_loss_mlp": 0.01058337, "balance_loss_clip": 1.06166434, "balance_loss_mlp": 1.03952527, "epoch": 0.1658800541109274, "flos": 21653632103040.0, "grad_norm": 2.0824774555850243, "language_loss": 0.78848934, "learning_rate": 3.809425201480689e-06, "loss": 0.81075454, "num_input_tokens_seen": 59659865, "step": 2759, "time_per_iteration": 2.655371904373169 }, { "auxiliary_loss_clip": 0.01098973, "auxiliary_loss_mlp": 0.0104879, "balance_loss_clip": 1.0491066, "balance_loss_mlp": 1.02846527, "epoch": 0.16594017736359537, "flos": 16435488643200.0, "grad_norm": 2.4005603702739613, "language_loss": 0.75130272, "learning_rate": 3.8092592477633793e-06, "loss": 0.77278036, "num_input_tokens_seen": 59678780, "step": 2760, "time_per_iteration": 2.767866611480713 }, { "auxiliary_loss_clip": 0.01117278, "auxiliary_loss_mlp": 0.0104823, "balance_loss_clip": 1.05129814, "balance_loss_mlp": 1.02867997, "epoch": 0.16600030061626334, "flos": 22637297030400.0, "grad_norm": 1.5792623632565632, "language_loss": 0.73425764, "learning_rate": 3.8090932254387774e-06, "loss": 0.75591272, "num_input_tokens_seen": 59698795, "step": 2761, "time_per_iteration": 2.762836456298828 }, { "auxiliary_loss_clip": 0.0113507, "auxiliary_loss_mlp": 0.01050415, "balance_loss_clip": 1.05250192, "balance_loss_mlp": 1.03018475, "epoch": 0.1660604238689313, "flos": 26396569607040.0, "grad_norm": 2.9515424803015033, "language_loss": 0.88832974, "learning_rate": 3.8089271345131788e-06, "loss": 0.91018462, "num_input_tokens_seen": 59718795, "step": 2762, "time_per_iteration": 2.766324281692505 }, { "auxiliary_loss_clip": 0.01115163, "auxiliary_loss_mlp": 0.01050144, "balance_loss_clip": 1.05208707, "balance_loss_mlp": 1.03080845, "epoch": 0.16612054712159927, "flos": 23039999383680.0, "grad_norm": 1.84507980271118, "language_loss": 0.87992418, "learning_rate": 3.8087609749928822e-06, "loss": 0.90157735, "num_input_tokens_seen": 59737555, "step": 2763, "time_per_iteration": 2.7734055519104004 }, { "auxiliary_loss_clip": 0.01086152, "auxiliary_loss_mlp": 0.01013622, "balance_loss_clip": 1.0448606, "balance_loss_mlp": 1.01065338, "epoch": 0.16618067037426726, "flos": 59241225202560.0, "grad_norm": 0.7790832079967882, "language_loss": 0.59799927, "learning_rate": 3.8085947468841885e-06, "loss": 0.61899698, "num_input_tokens_seen": 59800915, "step": 2764, "time_per_iteration": 3.1728692054748535 }, { "auxiliary_loss_clip": 0.01152232, "auxiliary_loss_mlp": 0.01053607, "balance_loss_clip": 1.05467176, "balance_loss_mlp": 1.03254318, "epoch": 0.16624079362693522, "flos": 27198813916800.0, "grad_norm": 1.7436496772383425, "language_loss": 0.82260036, "learning_rate": 3.808428450193401e-06, "loss": 0.84465873, "num_input_tokens_seen": 59822910, "step": 2765, "time_per_iteration": 2.72440767288208 }, { "auxiliary_loss_clip": 0.01171844, "auxiliary_loss_mlp": 0.01049085, "balance_loss_clip": 1.05882454, "balance_loss_mlp": 1.02746069, "epoch": 0.1663009168796032, "flos": 10925068216320.0, "grad_norm": 2.128015994498251, "language_loss": 0.69980019, "learning_rate": 3.8082620849268244e-06, "loss": 0.72200948, "num_input_tokens_seen": 59838805, "step": 2766, "time_per_iteration": 2.5810647010803223 }, { "auxiliary_loss_clip": 0.0115036, "auxiliary_loss_mlp": 0.01047665, "balance_loss_clip": 1.05772817, "balance_loss_mlp": 1.02792454, "epoch": 0.16636104013227115, "flos": 17894431353600.0, "grad_norm": 2.107381123394178, "language_loss": 0.8845337, "learning_rate": 3.808095651090769e-06, "loss": 0.90651393, "num_input_tokens_seen": 59855345, "step": 2767, "time_per_iteration": 2.659240245819092 }, { "auxiliary_loss_clip": 0.01077283, "auxiliary_loss_mlp": 0.01002999, "balance_loss_clip": 1.046556, "balance_loss_mlp": 1.00020981, "epoch": 0.16642116338493912, "flos": 66726050463360.0, "grad_norm": 0.6403612433239105, "language_loss": 0.5289067, "learning_rate": 3.8079291486915447e-06, "loss": 0.54970956, "num_input_tokens_seen": 59917710, "step": 2768, "time_per_iteration": 3.28488826751709 }, { "auxiliary_loss_clip": 0.01137637, "auxiliary_loss_mlp": 0.01051692, "balance_loss_clip": 1.05451822, "balance_loss_mlp": 1.03034163, "epoch": 0.16648128663760708, "flos": 19026048401280.0, "grad_norm": 2.4342686570828267, "language_loss": 0.84962058, "learning_rate": 3.8077625777354667e-06, "loss": 0.87151396, "num_input_tokens_seen": 59935105, "step": 2769, "time_per_iteration": 2.753257989883423 }, { "auxiliary_loss_clip": 0.01068987, "auxiliary_loss_mlp": 0.0100573, "balance_loss_clip": 1.04678345, "balance_loss_mlp": 1.00316668, "epoch": 0.16654140989027508, "flos": 70134976759680.0, "grad_norm": 0.8107434108728753, "language_loss": 0.57455683, "learning_rate": 3.80759593822885e-06, "loss": 0.59530401, "num_input_tokens_seen": 59984085, "step": 2770, "time_per_iteration": 3.2202906608581543 }, { "auxiliary_loss_clip": 0.01054548, "auxiliary_loss_mlp": 0.01003676, "balance_loss_clip": 1.04637623, "balance_loss_mlp": 1.00086308, "epoch": 0.16660153314294304, "flos": 70272406195200.0, "grad_norm": 0.8940719168038874, "language_loss": 0.56241393, "learning_rate": 3.807429230178015e-06, "loss": 0.58299619, "num_input_tokens_seen": 60043470, "step": 2771, "time_per_iteration": 3.3302085399627686 }, { "auxiliary_loss_clip": 0.01110714, "auxiliary_loss_mlp": 0.01053994, "balance_loss_clip": 1.04819679, "balance_loss_mlp": 1.03316772, "epoch": 0.166661656395611, "flos": 23075048079360.0, "grad_norm": 2.9137693497887778, "language_loss": 0.70419657, "learning_rate": 3.8072624535892817e-06, "loss": 0.72584367, "num_input_tokens_seen": 60063045, "step": 2772, "time_per_iteration": 2.845414161682129 }, { "auxiliary_loss_clip": 0.0114592, "auxiliary_loss_mlp": 0.01049708, "balance_loss_clip": 1.05082583, "balance_loss_mlp": 1.02923954, "epoch": 0.16672177964827897, "flos": 28366341586560.0, "grad_norm": 2.20945076195277, "language_loss": 0.86324167, "learning_rate": 3.807095608468975e-06, "loss": 0.88519788, "num_input_tokens_seen": 60081945, "step": 2773, "time_per_iteration": 2.669412851333618 }, { "auxiliary_loss_clip": 0.01095425, "auxiliary_loss_mlp": 0.01049097, "balance_loss_clip": 1.04436934, "balance_loss_mlp": 1.0300827, "epoch": 0.16678190290094694, "flos": 19091010147840.0, "grad_norm": 2.0211952616678937, "language_loss": 0.82141376, "learning_rate": 3.8069286948234224e-06, "loss": 0.84285897, "num_input_tokens_seen": 60096820, "step": 2774, "time_per_iteration": 2.7111308574676514 }, { "auxiliary_loss_clip": 0.01123493, "auxiliary_loss_mlp": 0.01045144, "balance_loss_clip": 1.05252421, "balance_loss_mlp": 1.02446127, "epoch": 0.1668420261536149, "flos": 21799106184960.0, "grad_norm": 3.3781068524499, "language_loss": 0.8298822, "learning_rate": 3.806761712658952e-06, "loss": 0.85156858, "num_input_tokens_seen": 60116140, "step": 2775, "time_per_iteration": 2.7367632389068604 }, { "auxiliary_loss_clip": 0.01150495, "auxiliary_loss_mlp": 0.01051475, "balance_loss_clip": 1.05761933, "balance_loss_mlp": 1.03264022, "epoch": 0.16690214940628287, "flos": 19062533640960.0, "grad_norm": 1.8115651629444076, "language_loss": 0.80919641, "learning_rate": 3.806594661981897e-06, "loss": 0.8312161, "num_input_tokens_seen": 60134235, "step": 2776, "time_per_iteration": 2.651723623275757 }, { "auxiliary_loss_clip": 0.0113775, "auxiliary_loss_mlp": 0.01054199, "balance_loss_clip": 1.05518723, "balance_loss_mlp": 1.0346483, "epoch": 0.16696227265895086, "flos": 18588548747520.0, "grad_norm": 2.7510345221850336, "language_loss": 0.80203485, "learning_rate": 3.8064275427985906e-06, "loss": 0.82395434, "num_input_tokens_seen": 60153275, "step": 2777, "time_per_iteration": 2.6380929946899414 }, { "auxiliary_loss_clip": 0.01147967, "auxiliary_loss_mlp": 0.01045166, "balance_loss_clip": 1.05270481, "balance_loss_mlp": 1.02640271, "epoch": 0.16702239591161883, "flos": 23294139085440.0, "grad_norm": 1.6179722336290305, "language_loss": 0.85384095, "learning_rate": 3.806260355115371e-06, "loss": 0.87577224, "num_input_tokens_seen": 60173215, "step": 2778, "time_per_iteration": 2.754652500152588 }, { "auxiliary_loss_clip": 0.01136802, "auxiliary_loss_mlp": 0.01040643, "balance_loss_clip": 1.0531714, "balance_loss_mlp": 1.02148652, "epoch": 0.1670825191642868, "flos": 24425648392320.0, "grad_norm": 3.2091470007324414, "language_loss": 0.74180603, "learning_rate": 3.8060930989385778e-06, "loss": 0.76358056, "num_input_tokens_seen": 60190515, "step": 2779, "time_per_iteration": 2.777193784713745 }, { "auxiliary_loss_clip": 0.01112683, "auxiliary_loss_mlp": 0.00777451, "balance_loss_clip": 1.04981184, "balance_loss_mlp": 1.0015173, "epoch": 0.16714264241695476, "flos": 26797512193920.0, "grad_norm": 2.127789274190337, "language_loss": 0.6557346, "learning_rate": 3.805925774274554e-06, "loss": 0.67463589, "num_input_tokens_seen": 60211655, "step": 2780, "time_per_iteration": 2.896976947784424 }, { "auxiliary_loss_clip": 0.01120921, "auxiliary_loss_mlp": 0.01045506, "balance_loss_clip": 1.04843462, "balance_loss_mlp": 1.02547836, "epoch": 0.16720276566962272, "flos": 21835304115840.0, "grad_norm": 2.46647860258999, "language_loss": 0.78422606, "learning_rate": 3.805758381129643e-06, "loss": 0.80589032, "num_input_tokens_seen": 60230860, "step": 2781, "time_per_iteration": 2.725782632827759 }, { "auxiliary_loss_clip": 0.01094692, "auxiliary_loss_mlp": 0.01050104, "balance_loss_clip": 1.04439843, "balance_loss_mlp": 1.03056526, "epoch": 0.1672628889222907, "flos": 21470415805440.0, "grad_norm": 26.23767952829368, "language_loss": 0.75119764, "learning_rate": 3.805590919510193e-06, "loss": 0.77264553, "num_input_tokens_seen": 60250535, "step": 2782, "time_per_iteration": 2.7064197063446045 }, { "auxiliary_loss_clip": 0.01129162, "auxiliary_loss_mlp": 0.01047612, "balance_loss_clip": 1.05152631, "balance_loss_mlp": 1.02764392, "epoch": 0.16732301217495865, "flos": 30774008269440.0, "grad_norm": 2.116531296279042, "language_loss": 0.67398441, "learning_rate": 3.8054233894225547e-06, "loss": 0.69575214, "num_input_tokens_seen": 60269530, "step": 2783, "time_per_iteration": 2.7901556491851807 }, { "auxiliary_loss_clip": 0.01158882, "auxiliary_loss_mlp": 0.0105166, "balance_loss_clip": 1.05460215, "balance_loss_mlp": 1.03271747, "epoch": 0.16738313542762664, "flos": 23474625949440.0, "grad_norm": 1.7768362036873409, "language_loss": 0.69919086, "learning_rate": 3.805255790873081e-06, "loss": 0.72129631, "num_input_tokens_seen": 60289900, "step": 2784, "time_per_iteration": 5.714844226837158 }, { "auxiliary_loss_clip": 0.01137618, "auxiliary_loss_mlp": 0.01056022, "balance_loss_clip": 1.05217624, "balance_loss_mlp": 1.03539932, "epoch": 0.1674432586802946, "flos": 29789086366080.0, "grad_norm": 4.741795209709136, "language_loss": 0.60970068, "learning_rate": 3.805088123868126e-06, "loss": 0.6316371, "num_input_tokens_seen": 60310025, "step": 2785, "time_per_iteration": 4.219547510147095 }, { "auxiliary_loss_clip": 0.01057886, "auxiliary_loss_mlp": 0.0100398, "balance_loss_clip": 1.03758883, "balance_loss_mlp": 1.00141752, "epoch": 0.16750338193296258, "flos": 66136073575680.0, "grad_norm": 0.773077721474628, "language_loss": 0.58780885, "learning_rate": 3.8049203884140492e-06, "loss": 0.60842752, "num_input_tokens_seen": 60377800, "step": 2786, "time_per_iteration": 3.2306320667266846 }, { "auxiliary_loss_clip": 0.0113927, "auxiliary_loss_mlp": 0.01044966, "balance_loss_clip": 1.0496738, "balance_loss_mlp": 1.02589226, "epoch": 0.16756350518563054, "flos": 25696777864320.0, "grad_norm": 1.7333132735183339, "language_loss": 0.76308596, "learning_rate": 3.80475258451721e-06, "loss": 0.78492826, "num_input_tokens_seen": 60398215, "step": 2787, "time_per_iteration": 2.6434125900268555 }, { "auxiliary_loss_clip": 0.01146924, "auxiliary_loss_mlp": 0.01043386, "balance_loss_clip": 1.0529089, "balance_loss_mlp": 1.02544546, "epoch": 0.1676236284382985, "flos": 23836102467840.0, "grad_norm": 1.7210472408736244, "language_loss": 0.7717936, "learning_rate": 3.804584712183972e-06, "loss": 0.79369676, "num_input_tokens_seen": 60416910, "step": 2788, "time_per_iteration": 4.359618425369263 }, { "auxiliary_loss_clip": 0.01054629, "auxiliary_loss_mlp": 0.00999991, "balance_loss_clip": 1.03482509, "balance_loss_mlp": 0.99746382, "epoch": 0.16768375169096647, "flos": 59874902985600.0, "grad_norm": 0.8596744797543817, "language_loss": 0.59331679, "learning_rate": 3.8044167714207013e-06, "loss": 0.61386299, "num_input_tokens_seen": 60468660, "step": 2789, "time_per_iteration": 3.0742650032043457 }, { "auxiliary_loss_clip": 0.01148272, "auxiliary_loss_mlp": 0.01053856, "balance_loss_clip": 1.05450928, "balance_loss_mlp": 1.03428209, "epoch": 0.16774387494363446, "flos": 38435657207040.0, "grad_norm": 1.689036486923415, "language_loss": 0.7012763, "learning_rate": 3.804248762233765e-06, "loss": 0.7232976, "num_input_tokens_seen": 60492370, "step": 2790, "time_per_iteration": 2.872232437133789 }, { "auxiliary_loss_clip": 0.0112492, "auxiliary_loss_mlp": 0.01051622, "balance_loss_clip": 1.0497216, "balance_loss_mlp": 1.0334661, "epoch": 0.16780399819630243, "flos": 22637620252800.0, "grad_norm": 1.864386369112868, "language_loss": 0.79464513, "learning_rate": 3.8040806846295356e-06, "loss": 0.81641054, "num_input_tokens_seen": 60512655, "step": 2791, "time_per_iteration": 2.7180140018463135 }, { "auxiliary_loss_clip": 0.01122456, "auxiliary_loss_mlp": 0.01050939, "balance_loss_clip": 1.04977369, "balance_loss_mlp": 1.03106701, "epoch": 0.1678641214489704, "flos": 32891516887680.0, "grad_norm": 1.705849915566178, "language_loss": 0.71547955, "learning_rate": 3.8039125386143853e-06, "loss": 0.73721349, "num_input_tokens_seen": 60533090, "step": 2792, "time_per_iteration": 2.9221818447113037 }, { "auxiliary_loss_clip": 0.01131469, "auxiliary_loss_mlp": 0.01044061, "balance_loss_clip": 1.05479562, "balance_loss_mlp": 1.02551246, "epoch": 0.16792424470163836, "flos": 19974916028160.0, "grad_norm": 1.9301593564774673, "language_loss": 0.71581644, "learning_rate": 3.803744324194691e-06, "loss": 0.73757172, "num_input_tokens_seen": 60553190, "step": 2793, "time_per_iteration": 2.75104022026062 }, { "auxiliary_loss_clip": 0.01143072, "auxiliary_loss_mlp": 0.01053231, "balance_loss_clip": 1.05276942, "balance_loss_mlp": 1.03452659, "epoch": 0.16798436795430632, "flos": 19719878486400.0, "grad_norm": 2.3859650274226833, "language_loss": 0.7717455, "learning_rate": 3.803576041376831e-06, "loss": 0.79370856, "num_input_tokens_seen": 60571995, "step": 2794, "time_per_iteration": 2.6007745265960693 }, { "auxiliary_loss_clip": 0.01137828, "auxiliary_loss_mlp": 0.0104987, "balance_loss_clip": 1.05250025, "balance_loss_mlp": 1.03010476, "epoch": 0.1680444912069743, "flos": 28104839596800.0, "grad_norm": 2.7692472240964747, "language_loss": 0.71609265, "learning_rate": 3.803407690167187e-06, "loss": 0.73796958, "num_input_tokens_seen": 60591275, "step": 2795, "time_per_iteration": 2.693826198577881 }, { "auxiliary_loss_clip": 0.01131865, "auxiliary_loss_mlp": 0.01041012, "balance_loss_clip": 1.04973865, "balance_loss_mlp": 1.02302384, "epoch": 0.16810461445964225, "flos": 18075205526400.0, "grad_norm": 1.990096863808903, "language_loss": 0.84230494, "learning_rate": 3.803239270572142e-06, "loss": 0.8640337, "num_input_tokens_seen": 60609235, "step": 2796, "time_per_iteration": 2.697253465652466 }, { "auxiliary_loss_clip": 0.01101634, "auxiliary_loss_mlp": 0.01045196, "balance_loss_clip": 1.04877055, "balance_loss_mlp": 1.0262773, "epoch": 0.16816473771231025, "flos": 23878657105920.0, "grad_norm": 1.9272276676322646, "language_loss": 0.81609607, "learning_rate": 3.8030707825980838e-06, "loss": 0.83756441, "num_input_tokens_seen": 60629880, "step": 2797, "time_per_iteration": 2.8784244060516357 }, { "auxiliary_loss_clip": 0.0114057, "auxiliary_loss_mlp": 0.01041282, "balance_loss_clip": 1.05136061, "balance_loss_mlp": 1.02448523, "epoch": 0.1682248609649782, "flos": 22783597125120.0, "grad_norm": 1.7015769336052518, "language_loss": 0.74811113, "learning_rate": 3.802902226251401e-06, "loss": 0.76992965, "num_input_tokens_seen": 60651175, "step": 2798, "time_per_iteration": 2.700727939605713 }, { "auxiliary_loss_clip": 0.01161342, "auxiliary_loss_mlp": 0.01048462, "balance_loss_clip": 1.05728281, "balance_loss_mlp": 1.03075945, "epoch": 0.16828498421764618, "flos": 20705123612160.0, "grad_norm": 1.5964091182578661, "language_loss": 0.79693568, "learning_rate": 3.8027336015384845e-06, "loss": 0.81903368, "num_input_tokens_seen": 60670210, "step": 2799, "time_per_iteration": 2.6582021713256836 }, { "auxiliary_loss_clip": 0.01077177, "auxiliary_loss_mlp": 0.01045216, "balance_loss_clip": 1.04514158, "balance_loss_mlp": 1.02374637, "epoch": 0.16834510747031414, "flos": 29420606695680.0, "grad_norm": 4.227726163531211, "language_loss": 0.70963746, "learning_rate": 3.8025649084657296e-06, "loss": 0.73086143, "num_input_tokens_seen": 60690895, "step": 2800, "time_per_iteration": 2.8856699466705322 }, { "auxiliary_loss_clip": 0.01108822, "auxiliary_loss_mlp": 0.00777078, "balance_loss_clip": 1.04776788, "balance_loss_mlp": 1.00161195, "epoch": 0.1684052307229821, "flos": 18145374744960.0, "grad_norm": 1.9902029671619985, "language_loss": 0.83663505, "learning_rate": 3.8023961470395326e-06, "loss": 0.85549408, "num_input_tokens_seen": 60708280, "step": 2801, "time_per_iteration": 2.6917035579681396 }, { "auxiliary_loss_clip": 0.01128148, "auxiliary_loss_mlp": 0.01049324, "balance_loss_clip": 1.05011535, "balance_loss_mlp": 1.03084683, "epoch": 0.16846535397565007, "flos": 16574929240320.0, "grad_norm": 2.4052305427948735, "language_loss": 0.82509923, "learning_rate": 3.8022273172662933e-06, "loss": 0.84687394, "num_input_tokens_seen": 60724150, "step": 2802, "time_per_iteration": 2.882611036300659 }, { "auxiliary_loss_clip": 0.01150156, "auxiliary_loss_mlp": 0.01048717, "balance_loss_clip": 1.05517435, "balance_loss_mlp": 1.02885723, "epoch": 0.16852547722831807, "flos": 30408868563840.0, "grad_norm": 3.107584498439891, "language_loss": 0.80643189, "learning_rate": 3.802058419152413e-06, "loss": 0.8284207, "num_input_tokens_seen": 60746485, "step": 2803, "time_per_iteration": 2.7886922359466553 }, { "auxiliary_loss_clip": 0.01148107, "auxiliary_loss_mlp": 0.01047852, "balance_loss_clip": 1.0556829, "balance_loss_mlp": 1.02918339, "epoch": 0.16858560048098603, "flos": 33507420416640.0, "grad_norm": 2.2127389669880713, "language_loss": 0.76168799, "learning_rate": 3.801889452704297e-06, "loss": 0.7836476, "num_input_tokens_seen": 60762875, "step": 2804, "time_per_iteration": 2.7588601112365723 }, { "auxiliary_loss_clip": 0.01045171, "auxiliary_loss_mlp": 0.01013955, "balance_loss_clip": 1.03581083, "balance_loss_mlp": 1.01078367, "epoch": 0.168645723733654, "flos": 67370502326400.0, "grad_norm": 0.8536034833258724, "language_loss": 0.55464876, "learning_rate": 3.8017204179283526e-06, "loss": 0.57524002, "num_input_tokens_seen": 60825510, "step": 2805, "time_per_iteration": 3.2089412212371826 }, { "auxiliary_loss_clip": 0.01138275, "auxiliary_loss_mlp": 0.0103974, "balance_loss_clip": 1.05013156, "balance_loss_mlp": 1.02239537, "epoch": 0.16870584698632196, "flos": 21324618501120.0, "grad_norm": 2.2836767274778427, "language_loss": 0.73090243, "learning_rate": 3.8015513148309892e-06, "loss": 0.75268269, "num_input_tokens_seen": 60844440, "step": 2806, "time_per_iteration": 2.643596649169922 }, { "auxiliary_loss_clip": 0.01117063, "auxiliary_loss_mlp": 0.01045402, "balance_loss_clip": 1.05330753, "balance_loss_mlp": 1.02766335, "epoch": 0.16876597023898993, "flos": 20740746925440.0, "grad_norm": 1.8406859431587912, "language_loss": 0.69773197, "learning_rate": 3.80138214341862e-06, "loss": 0.71935666, "num_input_tokens_seen": 60863210, "step": 2807, "time_per_iteration": 2.6946568489074707 }, { "auxiliary_loss_clip": 0.01130702, "auxiliary_loss_mlp": 0.01047199, "balance_loss_clip": 1.04842246, "balance_loss_mlp": 1.02794707, "epoch": 0.1688260934916579, "flos": 20303498666880.0, "grad_norm": 3.042021842274248, "language_loss": 0.70280695, "learning_rate": 3.8012129036976587e-06, "loss": 0.72458601, "num_input_tokens_seen": 60882510, "step": 2808, "time_per_iteration": 2.6656088829040527 }, { "auxiliary_loss_clip": 0.01119025, "auxiliary_loss_mlp": 0.01041739, "balance_loss_clip": 1.05019665, "balance_loss_mlp": 1.02164018, "epoch": 0.16888621674432586, "flos": 20340702178560.0, "grad_norm": 2.0835789337145965, "language_loss": 0.79903001, "learning_rate": 3.8010435956745236e-06, "loss": 0.8206377, "num_input_tokens_seen": 60901105, "step": 2809, "time_per_iteration": 2.7665679454803467 }, { "auxiliary_loss_clip": 0.01155146, "auxiliary_loss_mlp": 0.01042018, "balance_loss_clip": 1.0557605, "balance_loss_mlp": 1.02252758, "epoch": 0.16894633999699385, "flos": 16244802316800.0, "grad_norm": 2.0672093223845245, "language_loss": 0.88076419, "learning_rate": 3.8008742193556358e-06, "loss": 0.90273583, "num_input_tokens_seen": 60915340, "step": 2810, "time_per_iteration": 2.6186363697052 }, { "auxiliary_loss_clip": 0.01149997, "auxiliary_loss_mlp": 0.0104631, "balance_loss_clip": 1.05503082, "balance_loss_mlp": 1.02715337, "epoch": 0.16900646324966181, "flos": 19610171372160.0, "grad_norm": 1.8921026809528976, "language_loss": 0.92376304, "learning_rate": 3.800704774747416e-06, "loss": 0.9457261, "num_input_tokens_seen": 60933735, "step": 2811, "time_per_iteration": 2.6567442417144775 }, { "auxiliary_loss_clip": 0.01140053, "auxiliary_loss_mlp": 0.01049063, "balance_loss_clip": 1.05383325, "balance_loss_mlp": 1.03039432, "epoch": 0.16906658650232978, "flos": 22018089450240.0, "grad_norm": 2.116573413654177, "language_loss": 0.78582352, "learning_rate": 3.800535261856291e-06, "loss": 0.8077147, "num_input_tokens_seen": 60953105, "step": 2812, "time_per_iteration": 2.6796023845672607 }, { "auxiliary_loss_clip": 0.01147895, "auxiliary_loss_mlp": 0.01043917, "balance_loss_clip": 1.05772316, "balance_loss_mlp": 1.02653646, "epoch": 0.16912670975499774, "flos": 11763690024960.0, "grad_norm": 2.5483899062625093, "language_loss": 0.75195068, "learning_rate": 3.8003656806886887e-06, "loss": 0.7738688, "num_input_tokens_seen": 60969150, "step": 2813, "time_per_iteration": 2.621772050857544 }, { "auxiliary_loss_clip": 0.01136313, "auxiliary_loss_mlp": 0.01045037, "balance_loss_clip": 1.05311871, "balance_loss_mlp": 1.02599943, "epoch": 0.1691868330076657, "flos": 17161386595200.0, "grad_norm": 3.0041182480764554, "language_loss": 0.69118392, "learning_rate": 3.8001960312510396e-06, "loss": 0.7129975, "num_input_tokens_seen": 60982825, "step": 2814, "time_per_iteration": 2.837264060974121 }, { "auxiliary_loss_clip": 0.01163835, "auxiliary_loss_mlp": 0.01039837, "balance_loss_clip": 1.05900145, "balance_loss_mlp": 1.02134776, "epoch": 0.16924695626033368, "flos": 22416553998720.0, "grad_norm": 3.1079956206415833, "language_loss": 0.61439502, "learning_rate": 3.800026313549776e-06, "loss": 0.63643175, "num_input_tokens_seen": 61000875, "step": 2815, "time_per_iteration": 2.6967194080352783 }, { "auxiliary_loss_clip": 0.01129827, "auxiliary_loss_mlp": 0.01042692, "balance_loss_clip": 1.05139673, "balance_loss_mlp": 1.02382088, "epoch": 0.16930707951300164, "flos": 25739655724800.0, "grad_norm": 1.7930623183302479, "language_loss": 0.82490849, "learning_rate": 3.7998565275913342e-06, "loss": 0.84663367, "num_input_tokens_seen": 61021940, "step": 2816, "time_per_iteration": 2.7227163314819336 }, { "auxiliary_loss_clip": 0.01133129, "auxiliary_loss_mlp": 0.01047914, "balance_loss_clip": 1.05375743, "balance_loss_mlp": 1.02853012, "epoch": 0.16936720276566963, "flos": 22747040058240.0, "grad_norm": 3.083808689594852, "language_loss": 0.87322289, "learning_rate": 3.799686673382153e-06, "loss": 0.89503324, "num_input_tokens_seen": 61040285, "step": 2817, "time_per_iteration": 2.733180522918701 }, { "auxiliary_loss_clip": 0.01141455, "auxiliary_loss_mlp": 0.01052753, "balance_loss_clip": 1.05800366, "balance_loss_mlp": 1.03352427, "epoch": 0.1694273260183376, "flos": 19573973441280.0, "grad_norm": 1.8594303503608436, "language_loss": 0.81247765, "learning_rate": 3.799516750928672e-06, "loss": 0.83441973, "num_input_tokens_seen": 61059020, "step": 2818, "time_per_iteration": 2.7384097576141357 }, { "auxiliary_loss_clip": 0.01160132, "auxiliary_loss_mlp": 0.01044196, "balance_loss_clip": 1.05699944, "balance_loss_mlp": 1.02496791, "epoch": 0.16948744927100556, "flos": 12457843332480.0, "grad_norm": 2.739998367204505, "language_loss": 0.80788404, "learning_rate": 3.799346760237336e-06, "loss": 0.82992733, "num_input_tokens_seen": 61074245, "step": 2819, "time_per_iteration": 2.609870672225952 }, { "auxiliary_loss_clip": 0.01069019, "auxiliary_loss_mlp": 0.01015301, "balance_loss_clip": 1.0485003, "balance_loss_mlp": 1.0125947, "epoch": 0.16954757252367353, "flos": 71291694435840.0, "grad_norm": 0.9309223426502673, "language_loss": 0.61031163, "learning_rate": 3.7991767013145902e-06, "loss": 0.63115478, "num_input_tokens_seen": 61127080, "step": 2820, "time_per_iteration": 3.161051034927368 }, { "auxiliary_loss_clip": 0.01125604, "auxiliary_loss_mlp": 0.0105036, "balance_loss_clip": 1.05106986, "balance_loss_mlp": 1.03207326, "epoch": 0.1696076957763415, "flos": 29606516513280.0, "grad_norm": 1.8682266790688726, "language_loss": 0.78265435, "learning_rate": 3.7990065741668844e-06, "loss": 0.80441403, "num_input_tokens_seen": 61146955, "step": 2821, "time_per_iteration": 2.838730573654175 }, { "auxiliary_loss_clip": 0.0113863, "auxiliary_loss_mlp": 0.01055528, "balance_loss_clip": 1.05282724, "balance_loss_mlp": 1.03494084, "epoch": 0.16966781902900946, "flos": 24388588535040.0, "grad_norm": 2.1667405259997516, "language_loss": 0.78521514, "learning_rate": 3.7988363788006685e-06, "loss": 0.80715668, "num_input_tokens_seen": 61166605, "step": 2822, "time_per_iteration": 2.783385753631592 }, { "auxiliary_loss_clip": 0.01143597, "auxiliary_loss_mlp": 0.00777154, "balance_loss_clip": 1.05367076, "balance_loss_mlp": 1.00129986, "epoch": 0.16972794228167745, "flos": 23038814234880.0, "grad_norm": 1.8038457392731222, "language_loss": 0.74939907, "learning_rate": 3.7986661152223967e-06, "loss": 0.76860654, "num_input_tokens_seen": 61186535, "step": 2823, "time_per_iteration": 4.329328298568726 }, { "auxiliary_loss_clip": 0.01129469, "auxiliary_loss_mlp": 0.0105385, "balance_loss_clip": 1.05166912, "balance_loss_mlp": 1.03496754, "epoch": 0.16978806553434542, "flos": 35228691129600.0, "grad_norm": 3.336653609493179, "language_loss": 0.60266119, "learning_rate": 3.7984957834385257e-06, "loss": 0.62449437, "num_input_tokens_seen": 61208965, "step": 2824, "time_per_iteration": 5.892346620559692 }, { "auxiliary_loss_clip": 0.01138249, "auxiliary_loss_mlp": 0.01042322, "balance_loss_clip": 1.05565047, "balance_loss_mlp": 1.02287912, "epoch": 0.16984818878701338, "flos": 32014290936960.0, "grad_norm": 2.152838804074104, "language_loss": 0.73322558, "learning_rate": 3.7983253834555144e-06, "loss": 0.75503135, "num_input_tokens_seen": 61230670, "step": 2825, "time_per_iteration": 2.834482431411743 }, { "auxiliary_loss_clip": 0.01161467, "auxiliary_loss_mlp": 0.01047701, "balance_loss_clip": 1.05502653, "balance_loss_mlp": 1.02762675, "epoch": 0.16990831203968135, "flos": 22818609907200.0, "grad_norm": 2.05671259677731, "language_loss": 0.85638934, "learning_rate": 3.7981549152798245e-06, "loss": 0.87848103, "num_input_tokens_seen": 61249510, "step": 2826, "time_per_iteration": 2.6443135738372803 }, { "auxiliary_loss_clip": 0.01139368, "auxiliary_loss_mlp": 0.01047749, "balance_loss_clip": 1.05266595, "balance_loss_mlp": 1.02856779, "epoch": 0.1699684352923493, "flos": 23039604334080.0, "grad_norm": 1.9562557148441426, "language_loss": 0.82465482, "learning_rate": 3.7979843789179196e-06, "loss": 0.84652597, "num_input_tokens_seen": 61269440, "step": 2827, "time_per_iteration": 2.7683157920837402 }, { "auxiliary_loss_clip": 0.01131885, "auxiliary_loss_mlp": 0.0104561, "balance_loss_clip": 1.05320346, "balance_loss_mlp": 1.02536786, "epoch": 0.17002855854501728, "flos": 21434110133760.0, "grad_norm": 1.7386401818136152, "language_loss": 0.73704529, "learning_rate": 3.797813774376267e-06, "loss": 0.75882024, "num_input_tokens_seen": 61288195, "step": 2828, "time_per_iteration": 4.465311288833618 }, { "auxiliary_loss_clip": 0.01061458, "auxiliary_loss_mlp": 0.01009538, "balance_loss_clip": 1.04764342, "balance_loss_mlp": 1.00620067, "epoch": 0.17008868179768524, "flos": 71453509205760.0, "grad_norm": 0.7670168832041738, "language_loss": 0.56426483, "learning_rate": 3.797643101661336e-06, "loss": 0.58497471, "num_input_tokens_seen": 61350850, "step": 2829, "time_per_iteration": 3.3114631175994873 }, { "auxiliary_loss_clip": 0.01111753, "auxiliary_loss_mlp": 0.01051557, "balance_loss_clip": 1.04527223, "balance_loss_mlp": 1.03088641, "epoch": 0.17014880505035324, "flos": 24900315644160.0, "grad_norm": 1.7961285206560338, "language_loss": 0.83465374, "learning_rate": 3.7974723607795983e-06, "loss": 0.85628688, "num_input_tokens_seen": 61370765, "step": 2830, "time_per_iteration": 2.795253038406372 }, { "auxiliary_loss_clip": 0.01121533, "auxiliary_loss_mlp": 0.0104408, "balance_loss_clip": 1.04901659, "balance_loss_mlp": 1.02442193, "epoch": 0.1702089283030212, "flos": 29862415981440.0, "grad_norm": 2.4873654173451727, "language_loss": 0.78360993, "learning_rate": 3.797301551737529e-06, "loss": 0.80526608, "num_input_tokens_seen": 61388935, "step": 2831, "time_per_iteration": 2.7864232063293457 }, { "auxiliary_loss_clip": 0.01123612, "auxiliary_loss_mlp": 0.01051154, "balance_loss_clip": 1.05275893, "balance_loss_mlp": 1.0311985, "epoch": 0.17026905155568917, "flos": 17744180762880.0, "grad_norm": 2.532473263441992, "language_loss": 0.79668158, "learning_rate": 3.7971306745416044e-06, "loss": 0.81842923, "num_input_tokens_seen": 61407350, "step": 2832, "time_per_iteration": 2.842217206954956 }, { "auxiliary_loss_clip": 0.01127135, "auxiliary_loss_mlp": 0.01048966, "balance_loss_clip": 1.05029321, "balance_loss_mlp": 1.02984488, "epoch": 0.17032917480835713, "flos": 23148665003520.0, "grad_norm": 1.8387196201649116, "language_loss": 0.88638175, "learning_rate": 3.7969597291983046e-06, "loss": 0.90814275, "num_input_tokens_seen": 61429010, "step": 2833, "time_per_iteration": 2.75942325592041 }, { "auxiliary_loss_clip": 0.01158799, "auxiliary_loss_mlp": 0.01046883, "balance_loss_clip": 1.05633831, "balance_loss_mlp": 1.02842951, "epoch": 0.1703892980610251, "flos": 39202565512320.0, "grad_norm": 2.49094605220443, "language_loss": 0.71924698, "learning_rate": 3.7967887157141115e-06, "loss": 0.74130386, "num_input_tokens_seen": 61450040, "step": 2834, "time_per_iteration": 2.9035184383392334 }, { "auxiliary_loss_clip": 0.01119873, "auxiliary_loss_mlp": 0.01052215, "balance_loss_clip": 1.05165124, "balance_loss_mlp": 1.03428626, "epoch": 0.17044942131369306, "flos": 23039101543680.0, "grad_norm": 1.9093816511111852, "language_loss": 0.86831236, "learning_rate": 3.7966176340955106e-06, "loss": 0.89003325, "num_input_tokens_seen": 61468585, "step": 2835, "time_per_iteration": 2.7627484798431396 }, { "auxiliary_loss_clip": 0.01149332, "auxiliary_loss_mlp": 0.01049844, "balance_loss_clip": 1.0536654, "balance_loss_mlp": 1.02887547, "epoch": 0.17050954456636103, "flos": 17054983532160.0, "grad_norm": 2.1227367002258153, "language_loss": 0.74483943, "learning_rate": 3.796446484348989e-06, "loss": 0.76683116, "num_input_tokens_seen": 61486330, "step": 2836, "time_per_iteration": 2.6748619079589844 }, { "auxiliary_loss_clip": 0.01102249, "auxiliary_loss_mlp": 0.01049533, "balance_loss_clip": 1.04775679, "balance_loss_mlp": 1.02790809, "epoch": 0.17056966781902902, "flos": 16836969934080.0, "grad_norm": 2.1718385109372824, "language_loss": 0.79959226, "learning_rate": 3.796275266481036e-06, "loss": 0.82111007, "num_input_tokens_seen": 61503950, "step": 2837, "time_per_iteration": 2.757340908050537 }, { "auxiliary_loss_clip": 0.01144378, "auxiliary_loss_mlp": 0.01044803, "balance_loss_clip": 1.05493581, "balance_loss_mlp": 1.02644491, "epoch": 0.17062979107169698, "flos": 17712543859200.0, "grad_norm": 1.6825251002952497, "language_loss": 0.83258498, "learning_rate": 3.7961039804981456e-06, "loss": 0.85447681, "num_input_tokens_seen": 61523550, "step": 2838, "time_per_iteration": 2.705357551574707 }, { "auxiliary_loss_clip": 0.0110604, "auxiliary_loss_mlp": 0.01044889, "balance_loss_clip": 1.05217135, "balance_loss_mlp": 1.02685261, "epoch": 0.17068991432436495, "flos": 22525040050560.0, "grad_norm": 1.7789799751303759, "language_loss": 0.93788463, "learning_rate": 3.795932626406812e-06, "loss": 0.95939398, "num_input_tokens_seen": 61542720, "step": 2839, "time_per_iteration": 2.7881791591644287 }, { "auxiliary_loss_clip": 0.01126465, "auxiliary_loss_mlp": 0.01045617, "balance_loss_clip": 1.05183244, "balance_loss_mlp": 1.0250175, "epoch": 0.17075003757703291, "flos": 25882939077120.0, "grad_norm": 2.3337760403585435, "language_loss": 0.83974946, "learning_rate": 3.7957612042135336e-06, "loss": 0.86147022, "num_input_tokens_seen": 61563040, "step": 2840, "time_per_iteration": 2.7564892768859863 }, { "auxiliary_loss_clip": 0.01151834, "auxiliary_loss_mlp": 0.01044417, "balance_loss_clip": 1.05555129, "balance_loss_mlp": 1.02449679, "epoch": 0.17081016082970088, "flos": 20120713332480.0, "grad_norm": 1.9037435592597944, "language_loss": 0.76307738, "learning_rate": 3.79558971392481e-06, "loss": 0.7850399, "num_input_tokens_seen": 61581890, "step": 2841, "time_per_iteration": 2.695525646209717 }, { "auxiliary_loss_clip": 0.01136217, "auxiliary_loss_mlp": 0.01045847, "balance_loss_clip": 1.0527097, "balance_loss_mlp": 1.02744126, "epoch": 0.17087028408236885, "flos": 24936477661440.0, "grad_norm": 1.7844240011089845, "language_loss": 0.77076876, "learning_rate": 3.7954181555471443e-06, "loss": 0.79258937, "num_input_tokens_seen": 61602095, "step": 2842, "time_per_iteration": 2.773792266845703 }, { "auxiliary_loss_clip": 0.01155915, "auxiliary_loss_mlp": 0.01043896, "balance_loss_clip": 1.05616069, "balance_loss_mlp": 1.02503705, "epoch": 0.17093040733503684, "flos": 19057864872960.0, "grad_norm": 1.8430349199993477, "language_loss": 0.85694385, "learning_rate": 3.795246529087043e-06, "loss": 0.87894201, "num_input_tokens_seen": 61620400, "step": 2843, "time_per_iteration": 2.5860671997070312 }, { "auxiliary_loss_clip": 0.01154742, "auxiliary_loss_mlp": 0.01044059, "balance_loss_clip": 1.05549574, "balance_loss_mlp": 1.02608204, "epoch": 0.1709905305877048, "flos": 13078954333440.0, "grad_norm": 2.0353470349004485, "language_loss": 0.68646181, "learning_rate": 3.7950748345510126e-06, "loss": 0.70844984, "num_input_tokens_seen": 61637680, "step": 2844, "time_per_iteration": 2.5961523056030273 }, { "auxiliary_loss_clip": 0.01133396, "auxiliary_loss_mlp": 0.00778162, "balance_loss_clip": 1.05117011, "balance_loss_mlp": 1.00112617, "epoch": 0.17105065384037277, "flos": 19209336526080.0, "grad_norm": 2.027694794878894, "language_loss": 0.78771943, "learning_rate": 3.7949030719455646e-06, "loss": 0.806835, "num_input_tokens_seen": 61655630, "step": 2845, "time_per_iteration": 2.720193386077881 }, { "auxiliary_loss_clip": 0.01145033, "auxiliary_loss_mlp": 0.01047407, "balance_loss_clip": 1.05443549, "balance_loss_mlp": 1.02914453, "epoch": 0.17111077709304073, "flos": 18515183218560.0, "grad_norm": 2.2586144454646306, "language_loss": 0.7811147, "learning_rate": 3.7947312412772127e-06, "loss": 0.80303913, "num_input_tokens_seen": 61673475, "step": 2846, "time_per_iteration": 2.691033363342285 }, { "auxiliary_loss_clip": 0.01143809, "auxiliary_loss_mlp": 0.0104645, "balance_loss_clip": 1.05425262, "balance_loss_mlp": 1.02865243, "epoch": 0.1711709003457087, "flos": 25082670015360.0, "grad_norm": 2.2208975060456426, "language_loss": 0.79762948, "learning_rate": 3.794559342552472e-06, "loss": 0.8195321, "num_input_tokens_seen": 61693370, "step": 2847, "time_per_iteration": 2.7504522800445557 }, { "auxiliary_loss_clip": 0.01142651, "auxiliary_loss_mlp": 0.01045695, "balance_loss_clip": 1.05101562, "balance_loss_mlp": 1.02668071, "epoch": 0.17123102359837666, "flos": 17566387418880.0, "grad_norm": 2.4457083156230017, "language_loss": 0.8665086, "learning_rate": 3.7943873757778614e-06, "loss": 0.88839209, "num_input_tokens_seen": 61710820, "step": 2848, "time_per_iteration": 2.642946720123291 }, { "auxiliary_loss_clip": 0.0111167, "auxiliary_loss_mlp": 0.01044479, "balance_loss_clip": 1.04839015, "balance_loss_mlp": 1.02559662, "epoch": 0.17129114685104463, "flos": 26173635845760.0, "grad_norm": 3.6033710399461856, "language_loss": 0.75238276, "learning_rate": 3.794215340959902e-06, "loss": 0.77394426, "num_input_tokens_seen": 61729855, "step": 2849, "time_per_iteration": 2.7511017322540283 }, { "auxiliary_loss_clip": 0.0103263, "auxiliary_loss_mlp": 0.01006833, "balance_loss_clip": 1.02775574, "balance_loss_mlp": 1.00413883, "epoch": 0.17135127010371262, "flos": 69269710037760.0, "grad_norm": 0.7881928427119427, "language_loss": 0.57514679, "learning_rate": 3.7940432381051163e-06, "loss": 0.59554148, "num_input_tokens_seen": 61790290, "step": 2850, "time_per_iteration": 3.234609603881836 }, { "auxiliary_loss_clip": 0.01115021, "auxiliary_loss_mlp": 0.01044381, "balance_loss_clip": 1.05049884, "balance_loss_mlp": 1.02661848, "epoch": 0.1714113933563806, "flos": 23550110380800.0, "grad_norm": 2.962731712990184, "language_loss": 0.81328994, "learning_rate": 3.793871067220031e-06, "loss": 0.83488399, "num_input_tokens_seen": 61809265, "step": 2851, "time_per_iteration": 2.78957200050354 }, { "auxiliary_loss_clip": 0.01114419, "auxiliary_loss_mlp": 0.01043587, "balance_loss_clip": 1.05193233, "balance_loss_mlp": 1.02592039, "epoch": 0.17147151660904855, "flos": 21142443697920.0, "grad_norm": 2.049906502724323, "language_loss": 0.93085313, "learning_rate": 3.7936988283111764e-06, "loss": 0.95243311, "num_input_tokens_seen": 61828980, "step": 2852, "time_per_iteration": 2.8247029781341553 }, { "auxiliary_loss_clip": 0.01123258, "auxiliary_loss_mlp": 0.01048953, "balance_loss_clip": 1.04961288, "balance_loss_mlp": 1.03045225, "epoch": 0.17153163986171652, "flos": 18624890332800.0, "grad_norm": 1.8770741979814063, "language_loss": 0.69465554, "learning_rate": 3.7935265213850817e-06, "loss": 0.71637762, "num_input_tokens_seen": 61847915, "step": 2853, "time_per_iteration": 2.814162492752075 }, { "auxiliary_loss_clip": 0.01120856, "auxiliary_loss_mlp": 0.0104692, "balance_loss_clip": 1.05593121, "balance_loss_mlp": 1.02899122, "epoch": 0.17159176311438448, "flos": 18223265387520.0, "grad_norm": 2.5884803351111705, "language_loss": 0.66611075, "learning_rate": 3.7933541464482815e-06, "loss": 0.68778855, "num_input_tokens_seen": 61865570, "step": 2854, "time_per_iteration": 2.7968995571136475 }, { "auxiliary_loss_clip": 0.01120742, "auxiliary_loss_mlp": 0.01052217, "balance_loss_clip": 1.04853106, "balance_loss_mlp": 1.0349679, "epoch": 0.17165188636705245, "flos": 20738987159040.0, "grad_norm": 1.705510390491261, "language_loss": 0.8929621, "learning_rate": 3.7931817035073124e-06, "loss": 0.91469175, "num_input_tokens_seen": 61883340, "step": 2855, "time_per_iteration": 2.7045016288757324 }, { "auxiliary_loss_clip": 0.01157319, "auxiliary_loss_mlp": 0.01043813, "balance_loss_clip": 1.05505848, "balance_loss_mlp": 1.02662265, "epoch": 0.17171200961972044, "flos": 24899884680960.0, "grad_norm": 2.117219134143716, "language_loss": 0.83963835, "learning_rate": 3.7930091925687134e-06, "loss": 0.86164963, "num_input_tokens_seen": 61900610, "step": 2856, "time_per_iteration": 2.7349936962127686 }, { "auxiliary_loss_clip": 0.01150108, "auxiliary_loss_mlp": 0.0104615, "balance_loss_clip": 1.05812418, "balance_loss_mlp": 1.02783966, "epoch": 0.1717721328723884, "flos": 20157234485760.0, "grad_norm": 2.234025867710235, "language_loss": 0.86309886, "learning_rate": 3.792836613639026e-06, "loss": 0.88506144, "num_input_tokens_seen": 61916795, "step": 2857, "time_per_iteration": 2.749356746673584 }, { "auxiliary_loss_clip": 0.01144467, "auxiliary_loss_mlp": 0.0105057, "balance_loss_clip": 1.05469525, "balance_loss_mlp": 1.0324626, "epoch": 0.17183225612505637, "flos": 23361650697600.0, "grad_norm": 2.069122070501307, "language_loss": 0.78334701, "learning_rate": 3.7926639667247947e-06, "loss": 0.80529737, "num_input_tokens_seen": 61936665, "step": 2858, "time_per_iteration": 2.6673583984375 }, { "auxiliary_loss_clip": 0.01147374, "auxiliary_loss_mlp": 0.0105371, "balance_loss_clip": 1.05591416, "balance_loss_mlp": 1.03263378, "epoch": 0.17189237937772434, "flos": 18114240631680.0, "grad_norm": 2.1629422323642453, "language_loss": 0.77565676, "learning_rate": 3.7924912518325663e-06, "loss": 0.79766762, "num_input_tokens_seen": 61954415, "step": 2859, "time_per_iteration": 2.646648645401001 }, { "auxiliary_loss_clip": 0.0110879, "auxiliary_loss_mlp": 0.01047481, "balance_loss_clip": 1.05317724, "balance_loss_mlp": 1.02887201, "epoch": 0.1719525026303923, "flos": 23258408031360.0, "grad_norm": 2.088627069497316, "language_loss": 0.77088714, "learning_rate": 3.7923184689688902e-06, "loss": 0.79244983, "num_input_tokens_seen": 61973940, "step": 2860, "time_per_iteration": 2.7671573162078857 }, { "auxiliary_loss_clip": 0.01145562, "auxiliary_loss_mlp": 0.01042048, "balance_loss_clip": 1.05316472, "balance_loss_mlp": 1.02416611, "epoch": 0.17201262588306027, "flos": 20810413353600.0, "grad_norm": 2.1608688480628304, "language_loss": 0.81384242, "learning_rate": 3.792145618140317e-06, "loss": 0.83571851, "num_input_tokens_seen": 61991845, "step": 2861, "time_per_iteration": 2.6492061614990234 }, { "auxiliary_loss_clip": 0.011306, "auxiliary_loss_mlp": 0.01051558, "balance_loss_clip": 1.05280077, "balance_loss_mlp": 1.0335927, "epoch": 0.17207274913572823, "flos": 20375858615040.0, "grad_norm": 2.0128324416816192, "language_loss": 0.85691392, "learning_rate": 3.7919726993534038e-06, "loss": 0.87873554, "num_input_tokens_seen": 62009395, "step": 2862, "time_per_iteration": 4.290126323699951 }, { "auxiliary_loss_clip": 0.01116765, "auxiliary_loss_mlp": 0.01043444, "balance_loss_clip": 1.05126834, "balance_loss_mlp": 1.02655208, "epoch": 0.17213287238839622, "flos": 26797727675520.0, "grad_norm": 3.7047120479299993, "language_loss": 0.78047049, "learning_rate": 3.7917997126147054e-06, "loss": 0.80207253, "num_input_tokens_seen": 62029005, "step": 2863, "time_per_iteration": 4.275500774383545 }, { "auxiliary_loss_clip": 0.01122315, "auxiliary_loss_mlp": 0.00776596, "balance_loss_clip": 1.05132961, "balance_loss_mlp": 1.00090909, "epoch": 0.1721929956410642, "flos": 26030819370240.0, "grad_norm": 1.7350128683820358, "language_loss": 0.72135127, "learning_rate": 3.7916266579307823e-06, "loss": 0.74034035, "num_input_tokens_seen": 62048730, "step": 2864, "time_per_iteration": 4.414710998535156 }, { "auxiliary_loss_clip": 0.01121488, "auxiliary_loss_mlp": 0.01049611, "balance_loss_clip": 1.05114079, "balance_loss_mlp": 1.03099, "epoch": 0.17225311889373215, "flos": 22273091078400.0, "grad_norm": 1.9270646210248614, "language_loss": 0.73002023, "learning_rate": 3.7914535353081973e-06, "loss": 0.75173128, "num_input_tokens_seen": 62069000, "step": 2865, "time_per_iteration": 2.7463715076446533 }, { "auxiliary_loss_clip": 0.01145037, "auxiliary_loss_mlp": 0.0077644, "balance_loss_clip": 1.05669165, "balance_loss_mlp": 1.00120521, "epoch": 0.17231324214640012, "flos": 21287774125440.0, "grad_norm": 2.669585642962841, "language_loss": 0.78357804, "learning_rate": 3.7912803447535145e-06, "loss": 0.80279285, "num_input_tokens_seen": 62086750, "step": 2866, "time_per_iteration": 2.785146713256836 }, { "auxiliary_loss_clip": 0.01157272, "auxiliary_loss_mlp": 0.01044358, "balance_loss_clip": 1.05600274, "balance_loss_mlp": 1.02536821, "epoch": 0.17237336539906808, "flos": 19680735640320.0, "grad_norm": 2.551277931358127, "language_loss": 0.79755104, "learning_rate": 3.7911070862733016e-06, "loss": 0.81956732, "num_input_tokens_seen": 62106240, "step": 2867, "time_per_iteration": 4.3145318031311035 }, { "auxiliary_loss_clip": 0.01132297, "auxiliary_loss_mlp": 0.01041396, "balance_loss_clip": 1.0529356, "balance_loss_mlp": 1.02274013, "epoch": 0.17243348865173605, "flos": 17529650784000.0, "grad_norm": 1.8689780270661371, "language_loss": 0.79206991, "learning_rate": 3.7909337598741276e-06, "loss": 0.81380683, "num_input_tokens_seen": 62124895, "step": 2868, "time_per_iteration": 2.7683827877044678 }, { "auxiliary_loss_clip": 0.01111702, "auxiliary_loss_mlp": 0.01041717, "balance_loss_clip": 1.05331647, "balance_loss_mlp": 1.02427697, "epoch": 0.17249361190440402, "flos": 18259858368000.0, "grad_norm": 2.0344588273772923, "language_loss": 0.84221756, "learning_rate": 3.7907603655625674e-06, "loss": 0.86375177, "num_input_tokens_seen": 62143510, "step": 2869, "time_per_iteration": 2.729156970977783 }, { "auxiliary_loss_clip": 0.01132999, "auxiliary_loss_mlp": 0.01048405, "balance_loss_clip": 1.0535363, "balance_loss_mlp": 1.02955842, "epoch": 0.172553735157072, "flos": 21174367910400.0, "grad_norm": 1.8935704627114847, "language_loss": 0.77299273, "learning_rate": 3.7905869033451932e-06, "loss": 0.79480684, "num_input_tokens_seen": 62162285, "step": 2870, "time_per_iteration": 2.752739191055298 }, { "auxiliary_loss_clip": 0.0115398, "auxiliary_loss_mlp": 0.01037809, "balance_loss_clip": 1.05671024, "balance_loss_mlp": 1.02110744, "epoch": 0.17261385840973997, "flos": 22273270646400.0, "grad_norm": 2.0115587398764396, "language_loss": 0.77409238, "learning_rate": 3.7904133732285857e-06, "loss": 0.79601026, "num_input_tokens_seen": 62180970, "step": 2871, "time_per_iteration": 2.660627603530884 }, { "auxiliary_loss_clip": 0.01132474, "auxiliary_loss_mlp": 0.01041073, "balance_loss_clip": 1.05313993, "balance_loss_mlp": 1.0222379, "epoch": 0.17267398166240794, "flos": 27922233830400.0, "grad_norm": 2.203011669690562, "language_loss": 0.74197829, "learning_rate": 3.7902397752193228e-06, "loss": 0.76371384, "num_input_tokens_seen": 62198965, "step": 2872, "time_per_iteration": 2.6959900856018066 }, { "auxiliary_loss_clip": 0.01150773, "auxiliary_loss_mlp": 0.01041508, "balance_loss_clip": 1.05359554, "balance_loss_mlp": 1.02362645, "epoch": 0.1727341049150759, "flos": 21945118970880.0, "grad_norm": 1.7914171074077658, "language_loss": 0.82336062, "learning_rate": 3.790066109323988e-06, "loss": 0.84528345, "num_input_tokens_seen": 62219890, "step": 2873, "time_per_iteration": 2.603564977645874 }, { "auxiliary_loss_clip": 0.01108819, "auxiliary_loss_mlp": 0.01044995, "balance_loss_clip": 1.04744792, "balance_loss_mlp": 1.02522969, "epoch": 0.17279422816774387, "flos": 18107883924480.0, "grad_norm": 3.7341652608759297, "language_loss": 0.75355422, "learning_rate": 3.7898923755491678e-06, "loss": 0.77509236, "num_input_tokens_seen": 62237140, "step": 2874, "time_per_iteration": 2.8438260555267334 }, { "auxiliary_loss_clip": 0.01159322, "auxiliary_loss_mlp": 0.01044415, "balance_loss_clip": 1.05658269, "balance_loss_mlp": 1.02404249, "epoch": 0.17285435142041183, "flos": 21835447770240.0, "grad_norm": 2.7053876793207037, "language_loss": 0.80239916, "learning_rate": 3.7897185739014487e-06, "loss": 0.82443655, "num_input_tokens_seen": 62255405, "step": 2875, "time_per_iteration": 2.625183343887329 }, { "auxiliary_loss_clip": 0.01135727, "auxiliary_loss_mlp": 0.0105273, "balance_loss_clip": 1.0535475, "balance_loss_mlp": 1.03297722, "epoch": 0.17291447467307983, "flos": 18368452160640.0, "grad_norm": 3.840653645811056, "language_loss": 0.87621164, "learning_rate": 3.7895447043874217e-06, "loss": 0.8980962, "num_input_tokens_seen": 62271280, "step": 2876, "time_per_iteration": 2.6782751083374023 }, { "auxiliary_loss_clip": 0.01136898, "auxiliary_loss_mlp": 0.01044228, "balance_loss_clip": 1.05730534, "balance_loss_mlp": 1.02559566, "epoch": 0.1729745979257478, "flos": 18624638937600.0, "grad_norm": 1.8931416121171032, "language_loss": 0.84386718, "learning_rate": 3.789370767013681e-06, "loss": 0.86567843, "num_input_tokens_seen": 62289140, "step": 2877, "time_per_iteration": 2.681131362915039 }, { "auxiliary_loss_clip": 0.01120759, "auxiliary_loss_mlp": 0.01043962, "balance_loss_clip": 1.05222571, "balance_loss_mlp": 1.02499604, "epoch": 0.17303472117841576, "flos": 22998234844800.0, "grad_norm": 2.106635210245156, "language_loss": 0.79660022, "learning_rate": 3.7891967617868204e-06, "loss": 0.81824744, "num_input_tokens_seen": 62307490, "step": 2878, "time_per_iteration": 2.8118834495544434 }, { "auxiliary_loss_clip": 0.01136112, "auxiliary_loss_mlp": 0.01047222, "balance_loss_clip": 1.05593777, "balance_loss_mlp": 1.02953172, "epoch": 0.17309484443108372, "flos": 25664386775040.0, "grad_norm": 1.9675557254753375, "language_loss": 0.70236337, "learning_rate": 3.78902268871344e-06, "loss": 0.72419673, "num_input_tokens_seen": 62328570, "step": 2879, "time_per_iteration": 2.7998502254486084 }, { "auxiliary_loss_clip": 0.01130517, "auxiliary_loss_mlp": 0.01051722, "balance_loss_clip": 1.05183411, "balance_loss_mlp": 1.03337598, "epoch": 0.1731549676837517, "flos": 13552903313280.0, "grad_norm": 2.0545155253910163, "language_loss": 0.82884222, "learning_rate": 3.78884854780014e-06, "loss": 0.85066462, "num_input_tokens_seen": 62345735, "step": 2880, "time_per_iteration": 2.6707684993743896 }, { "auxiliary_loss_clip": 0.01110706, "auxiliary_loss_mlp": 0.01054327, "balance_loss_clip": 1.05214918, "balance_loss_mlp": 1.03303647, "epoch": 0.17321509093641965, "flos": 22857070394880.0, "grad_norm": 1.9029231217608267, "language_loss": 0.80879176, "learning_rate": 3.7886743390535236e-06, "loss": 0.83044201, "num_input_tokens_seen": 62365525, "step": 2881, "time_per_iteration": 2.7851576805114746 }, { "auxiliary_loss_clip": 0.01135983, "auxiliary_loss_mlp": 0.01046895, "balance_loss_clip": 1.05544055, "balance_loss_mlp": 1.02921653, "epoch": 0.17327521418908762, "flos": 24352785653760.0, "grad_norm": 2.753231520615002, "language_loss": 0.77268815, "learning_rate": 3.788500062480197e-06, "loss": 0.79451692, "num_input_tokens_seen": 62385160, "step": 2882, "time_per_iteration": 2.7785212993621826 }, { "auxiliary_loss_clip": 0.01124099, "auxiliary_loss_mlp": 0.01047516, "balance_loss_clip": 1.0633558, "balance_loss_mlp": 1.02947998, "epoch": 0.1733353374417556, "flos": 33105651816960.0, "grad_norm": 2.096311926604511, "language_loss": 0.76714236, "learning_rate": 3.788325718086769e-06, "loss": 0.78885853, "num_input_tokens_seen": 62405280, "step": 2883, "time_per_iteration": 2.838848352432251 }, { "auxiliary_loss_clip": 0.01110924, "auxiliary_loss_mlp": 0.0104619, "balance_loss_clip": 1.04929209, "balance_loss_mlp": 1.02821302, "epoch": 0.17339546069442358, "flos": 24388947671040.0, "grad_norm": 2.1194201700326873, "language_loss": 0.8555252, "learning_rate": 3.7881513058798503e-06, "loss": 0.87709635, "num_input_tokens_seen": 62423665, "step": 2884, "time_per_iteration": 2.829376220703125 }, { "auxiliary_loss_clip": 0.01133962, "auxiliary_loss_mlp": 0.00775817, "balance_loss_clip": 1.05472779, "balance_loss_mlp": 1.00088096, "epoch": 0.17345558394709154, "flos": 27454174680960.0, "grad_norm": 1.7131036779262108, "language_loss": 0.74756771, "learning_rate": 3.787976825866055e-06, "loss": 0.76666546, "num_input_tokens_seen": 62445170, "step": 2885, "time_per_iteration": 2.8710989952087402 }, { "auxiliary_loss_clip": 0.01128977, "auxiliary_loss_mlp": 0.01044901, "balance_loss_clip": 1.05498922, "balance_loss_mlp": 1.0280925, "epoch": 0.1735157071997595, "flos": 24682158391680.0, "grad_norm": 2.374438581614022, "language_loss": 0.7107017, "learning_rate": 3.7878022780519998e-06, "loss": 0.73244053, "num_input_tokens_seen": 62466135, "step": 2886, "time_per_iteration": 2.726621150970459 }, { "auxiliary_loss_clip": 0.01142411, "auxiliary_loss_mlp": 0.01041857, "balance_loss_clip": 1.05233932, "balance_loss_mlp": 1.02408338, "epoch": 0.17357583045242747, "flos": 21688932193920.0, "grad_norm": 2.0566537172661747, "language_loss": 0.69906294, "learning_rate": 3.7876276624443024e-06, "loss": 0.72090566, "num_input_tokens_seen": 62483910, "step": 2887, "time_per_iteration": 2.7066688537597656 }, { "auxiliary_loss_clip": 0.01116425, "auxiliary_loss_mlp": 0.01045383, "balance_loss_clip": 1.05328536, "balance_loss_mlp": 1.02728677, "epoch": 0.17363595370509544, "flos": 15375728753280.0, "grad_norm": 2.038016964464323, "language_loss": 0.85257947, "learning_rate": 3.787452979049585e-06, "loss": 0.87419748, "num_input_tokens_seen": 62501530, "step": 2888, "time_per_iteration": 2.7514970302581787 }, { "auxiliary_loss_clip": 0.01095063, "auxiliary_loss_mlp": 0.01049413, "balance_loss_clip": 1.05020595, "balance_loss_mlp": 1.02822983, "epoch": 0.1736960769577634, "flos": 23440941970560.0, "grad_norm": 2.196318077733749, "language_loss": 0.78491282, "learning_rate": 3.7872782278744718e-06, "loss": 0.80635762, "num_input_tokens_seen": 62521295, "step": 2889, "time_per_iteration": 2.8221559524536133 }, { "auxiliary_loss_clip": 0.01112139, "auxiliary_loss_mlp": 0.0077601, "balance_loss_clip": 1.05236733, "balance_loss_mlp": 1.00114667, "epoch": 0.1737562002104314, "flos": 18587830475520.0, "grad_norm": 2.333227367674716, "language_loss": 0.84076989, "learning_rate": 3.7871034089255883e-06, "loss": 0.85965133, "num_input_tokens_seen": 62539615, "step": 2890, "time_per_iteration": 2.7213382720947266 }, { "auxiliary_loss_clip": 0.01142218, "auxiliary_loss_mlp": 0.01054918, "balance_loss_clip": 1.05530691, "balance_loss_mlp": 1.03752589, "epoch": 0.17381632346309936, "flos": 15998060816640.0, "grad_norm": 2.7278091568285596, "language_loss": 0.82205319, "learning_rate": 3.7869285222095653e-06, "loss": 0.84402454, "num_input_tokens_seen": 62556820, "step": 2891, "time_per_iteration": 2.625162363052368 }, { "auxiliary_loss_clip": 0.01097361, "auxiliary_loss_mlp": 0.01050012, "balance_loss_clip": 1.04281187, "balance_loss_mlp": 1.02876878, "epoch": 0.17387644671576732, "flos": 13369830670080.0, "grad_norm": 1.9017653264876209, "language_loss": 0.81200826, "learning_rate": 3.7867535677330334e-06, "loss": 0.83348203, "num_input_tokens_seen": 62572450, "step": 2892, "time_per_iteration": 2.7682459354400635 }, { "auxiliary_loss_clip": 0.01148834, "auxiliary_loss_mlp": 0.0105551, "balance_loss_clip": 1.05707812, "balance_loss_mlp": 1.03631687, "epoch": 0.1739365699684353, "flos": 26615516958720.0, "grad_norm": 2.0056711213447436, "language_loss": 0.73950225, "learning_rate": 3.786578545502627e-06, "loss": 0.76154572, "num_input_tokens_seen": 62592580, "step": 2893, "time_per_iteration": 2.8463022708892822 }, { "auxiliary_loss_clip": 0.01132474, "auxiliary_loss_mlp": 0.01043509, "balance_loss_clip": 1.05198765, "balance_loss_mlp": 1.02443516, "epoch": 0.17399669322110325, "flos": 23367971491200.0, "grad_norm": 4.010773627073901, "language_loss": 0.82507658, "learning_rate": 3.7864034555249828e-06, "loss": 0.84683645, "num_input_tokens_seen": 62611220, "step": 2894, "time_per_iteration": 2.719564914703369 }, { "auxiliary_loss_clip": 0.01113951, "auxiliary_loss_mlp": 0.01046249, "balance_loss_clip": 1.0506922, "balance_loss_mlp": 1.02463603, "epoch": 0.17405681647377122, "flos": 22054107813120.0, "grad_norm": 2.3322053123967574, "language_loss": 0.73826683, "learning_rate": 3.786228297806741e-06, "loss": 0.7598688, "num_input_tokens_seen": 62629185, "step": 2895, "time_per_iteration": 2.743992805480957 }, { "auxiliary_loss_clip": 0.01037578, "auxiliary_loss_mlp": 0.01011099, "balance_loss_clip": 1.0404408, "balance_loss_mlp": 1.00788069, "epoch": 0.1741169397264392, "flos": 61457559114240.0, "grad_norm": 0.8765647158253519, "language_loss": 0.62754023, "learning_rate": 3.7860530723545435e-06, "loss": 0.64802706, "num_input_tokens_seen": 62691895, "step": 2896, "time_per_iteration": 3.345099687576294 }, { "auxiliary_loss_clip": 0.0113101, "auxiliary_loss_mlp": 0.00776588, "balance_loss_clip": 1.05246758, "balance_loss_mlp": 1.00102258, "epoch": 0.17417706297910718, "flos": 27017680608000.0, "grad_norm": 1.7338863964520728, "language_loss": 0.75822324, "learning_rate": 3.785877779175034e-06, "loss": 0.77729923, "num_input_tokens_seen": 62713790, "step": 2897, "time_per_iteration": 2.772292137145996 }, { "auxiliary_loss_clip": 0.01141357, "auxiliary_loss_mlp": 0.01042983, "balance_loss_clip": 1.0545547, "balance_loss_mlp": 1.02512598, "epoch": 0.17423718623177514, "flos": 33508856960640.0, "grad_norm": 1.944569306659421, "language_loss": 0.6883949, "learning_rate": 3.7857024182748606e-06, "loss": 0.71023834, "num_input_tokens_seen": 62736285, "step": 2898, "time_per_iteration": 2.7278554439544678 }, { "auxiliary_loss_clip": 0.01128715, "auxiliary_loss_mlp": 0.01044216, "balance_loss_clip": 1.05251193, "balance_loss_mlp": 1.02504694, "epoch": 0.1742973094844431, "flos": 27198634348800.0, "grad_norm": 2.99011081330885, "language_loss": 0.76445562, "learning_rate": 3.7855269896606717e-06, "loss": 0.78618491, "num_input_tokens_seen": 62756240, "step": 2899, "time_per_iteration": 2.8052010536193848 }, { "auxiliary_loss_clip": 0.01095069, "auxiliary_loss_mlp": 0.01045896, "balance_loss_clip": 1.04680347, "balance_loss_mlp": 1.02632213, "epoch": 0.17435743273711107, "flos": 22710734386560.0, "grad_norm": 3.2965812335226357, "language_loss": 0.72860038, "learning_rate": 3.785351493339121e-06, "loss": 0.75001007, "num_input_tokens_seen": 62775910, "step": 2900, "time_per_iteration": 2.868218421936035 }, { "auxiliary_loss_clip": 0.01110522, "auxiliary_loss_mlp": 0.00776698, "balance_loss_clip": 1.05202782, "balance_loss_mlp": 1.000983, "epoch": 0.17441755598977904, "flos": 41646466039680.0, "grad_norm": 1.5488662608930523, "language_loss": 0.69946706, "learning_rate": 3.785175929316863e-06, "loss": 0.71833932, "num_input_tokens_seen": 62799385, "step": 2901, "time_per_iteration": 4.407040596008301 }, { "auxiliary_loss_clip": 0.01129098, "auxiliary_loss_mlp": 0.01045525, "balance_loss_clip": 1.05246592, "balance_loss_mlp": 1.02764344, "epoch": 0.174477679242447, "flos": 26287077974400.0, "grad_norm": 2.1785959913748965, "language_loss": 0.76588804, "learning_rate": 3.7850002976005543e-06, "loss": 0.78763425, "num_input_tokens_seen": 62819380, "step": 2902, "time_per_iteration": 4.2244462966918945 }, { "auxiliary_loss_clip": 0.01145685, "auxiliary_loss_mlp": 0.0104382, "balance_loss_clip": 1.0531354, "balance_loss_mlp": 1.02567625, "epoch": 0.174537802495115, "flos": 17858412990720.0, "grad_norm": 2.2508699895191073, "language_loss": 0.81588745, "learning_rate": 3.7848245981968558e-06, "loss": 0.83778256, "num_input_tokens_seen": 62836205, "step": 2903, "time_per_iteration": 4.132925271987915 }, { "auxiliary_loss_clip": 0.01126443, "auxiliary_loss_mlp": 0.0103942, "balance_loss_clip": 1.05449986, "balance_loss_mlp": 1.02135992, "epoch": 0.17459792574778296, "flos": 16940715390720.0, "grad_norm": 2.4085694554154187, "language_loss": 0.73316491, "learning_rate": 3.784648831112429e-06, "loss": 0.75482351, "num_input_tokens_seen": 62854045, "step": 2904, "time_per_iteration": 2.7033374309539795 }, { "auxiliary_loss_clip": 0.01105192, "auxiliary_loss_mlp": 0.0104577, "balance_loss_clip": 1.05250716, "balance_loss_mlp": 1.02822256, "epoch": 0.17465804900045093, "flos": 25520026014720.0, "grad_norm": 1.8783326609306377, "language_loss": 0.64233291, "learning_rate": 3.7844729963539406e-06, "loss": 0.66384256, "num_input_tokens_seen": 62873075, "step": 2905, "time_per_iteration": 2.8325791358947754 }, { "auxiliary_loss_clip": 0.01135256, "auxiliary_loss_mlp": 0.01053006, "balance_loss_clip": 1.05869055, "balance_loss_mlp": 1.03370619, "epoch": 0.1747181722531189, "flos": 24129708238080.0, "grad_norm": 2.820817719352069, "language_loss": 0.79504299, "learning_rate": 3.7842970939280566e-06, "loss": 0.81692564, "num_input_tokens_seen": 62892675, "step": 2906, "time_per_iteration": 4.491498231887817 }, { "auxiliary_loss_clip": 0.01146195, "auxiliary_loss_mlp": 0.01050729, "balance_loss_clip": 1.05623174, "balance_loss_mlp": 1.03258538, "epoch": 0.17477829550578686, "flos": 17748813617280.0, "grad_norm": 2.262709441571415, "language_loss": 0.81318873, "learning_rate": 3.784121123841449e-06, "loss": 0.83515799, "num_input_tokens_seen": 62910675, "step": 2907, "time_per_iteration": 2.6855854988098145 }, { "auxiliary_loss_clip": 0.01143202, "auxiliary_loss_mlp": 0.01043315, "balance_loss_clip": 1.05374384, "balance_loss_mlp": 1.0253861, "epoch": 0.17483841875845482, "flos": 15377344865280.0, "grad_norm": 2.068635027461873, "language_loss": 0.81342787, "learning_rate": 3.7839450861007886e-06, "loss": 0.83529305, "num_input_tokens_seen": 62928130, "step": 2908, "time_per_iteration": 2.6449570655822754 }, { "auxiliary_loss_clip": 0.01127136, "auxiliary_loss_mlp": 0.01050925, "balance_loss_clip": 1.05178046, "balance_loss_mlp": 1.03163743, "epoch": 0.17489854201112282, "flos": 17163254102400.0, "grad_norm": 3.147433356867123, "language_loss": 0.80020624, "learning_rate": 3.7837689807127518e-06, "loss": 0.82198691, "num_input_tokens_seen": 62944290, "step": 2909, "time_per_iteration": 2.6820569038391113 }, { "auxiliary_loss_clip": 0.0109059, "auxiliary_loss_mlp": 0.01052625, "balance_loss_clip": 1.05020881, "balance_loss_mlp": 1.0310595, "epoch": 0.17495866526379078, "flos": 19755286318080.0, "grad_norm": 1.6978440546881337, "language_loss": 0.76742244, "learning_rate": 3.783592807684017e-06, "loss": 0.7888546, "num_input_tokens_seen": 62963505, "step": 2910, "time_per_iteration": 2.6980416774749756 }, { "auxiliary_loss_clip": 0.01158552, "auxiliary_loss_mlp": 0.01049407, "balance_loss_clip": 1.05618358, "balance_loss_mlp": 1.03059566, "epoch": 0.17501878851645875, "flos": 28511133310080.0, "grad_norm": 1.9812610358315632, "language_loss": 0.8698765, "learning_rate": 3.7834165670212645e-06, "loss": 0.89195609, "num_input_tokens_seen": 62985020, "step": 2911, "time_per_iteration": 2.692662477493286 }, { "auxiliary_loss_clip": 0.01154744, "auxiliary_loss_mlp": 0.00777232, "balance_loss_clip": 1.05323184, "balance_loss_mlp": 1.00110698, "epoch": 0.1750789117691267, "flos": 17931203902080.0, "grad_norm": 3.030740090796483, "language_loss": 0.89883876, "learning_rate": 3.7832402587311764e-06, "loss": 0.91815847, "num_input_tokens_seen": 63001745, "step": 2912, "time_per_iteration": 2.600738763809204 }, { "auxiliary_loss_clip": 0.01146165, "auxiliary_loss_mlp": 0.01045616, "balance_loss_clip": 1.0538094, "balance_loss_mlp": 1.02655411, "epoch": 0.17513903502179468, "flos": 18259427404800.0, "grad_norm": 2.03479884577424, "language_loss": 0.72818935, "learning_rate": 3.783063882820439e-06, "loss": 0.75010711, "num_input_tokens_seen": 63019750, "step": 2913, "time_per_iteration": 2.623342275619507 }, { "auxiliary_loss_clip": 0.01140074, "auxiliary_loss_mlp": 0.01043928, "balance_loss_clip": 1.05781865, "balance_loss_mlp": 1.02557003, "epoch": 0.17519915827446264, "flos": 20704728562560.0, "grad_norm": 2.137073079496124, "language_loss": 0.6891731, "learning_rate": 3.782887439295741e-06, "loss": 0.71101314, "num_input_tokens_seen": 63039500, "step": 2914, "time_per_iteration": 2.7065770626068115 }, { "auxiliary_loss_clip": 0.01142434, "auxiliary_loss_mlp": 0.01045043, "balance_loss_clip": 1.05532789, "balance_loss_mlp": 1.02649403, "epoch": 0.1752592815271306, "flos": 20523415685760.0, "grad_norm": 2.051329837479214, "language_loss": 0.93125081, "learning_rate": 3.782710928163772e-06, "loss": 0.9531256, "num_input_tokens_seen": 63059785, "step": 2915, "time_per_iteration": 2.659029245376587 }, { "auxiliary_loss_clip": 0.01114731, "auxiliary_loss_mlp": 0.01040999, "balance_loss_clip": 1.04957223, "balance_loss_mlp": 1.02243853, "epoch": 0.1753194047797986, "flos": 21799178012160.0, "grad_norm": 1.604344576738792, "language_loss": 0.81092978, "learning_rate": 3.782534349431226e-06, "loss": 0.83248705, "num_input_tokens_seen": 63079385, "step": 2916, "time_per_iteration": 2.7099549770355225 }, { "auxiliary_loss_clip": 0.0114211, "auxiliary_loss_mlp": 0.01046221, "balance_loss_clip": 1.05090034, "balance_loss_mlp": 1.02780342, "epoch": 0.17537952803246656, "flos": 20668351063680.0, "grad_norm": 3.7582760939418716, "language_loss": 0.73829222, "learning_rate": 3.782357703104799e-06, "loss": 0.76017547, "num_input_tokens_seen": 63098970, "step": 2917, "time_per_iteration": 2.666717767715454 }, { "auxiliary_loss_clip": 0.01133449, "auxiliary_loss_mlp": 0.01047353, "balance_loss_clip": 1.05319786, "balance_loss_mlp": 1.02821994, "epoch": 0.17543965128513453, "flos": 23295072839040.0, "grad_norm": 1.813699779869167, "language_loss": 0.76739681, "learning_rate": 3.7821809891911897e-06, "loss": 0.78920484, "num_input_tokens_seen": 63118750, "step": 2918, "time_per_iteration": 2.647634744644165 }, { "auxiliary_loss_clip": 0.01093958, "auxiliary_loss_mlp": 0.01045643, "balance_loss_clip": 1.0476644, "balance_loss_mlp": 1.02425694, "epoch": 0.1754997745378025, "flos": 29095615416960.0, "grad_norm": 2.436739755969174, "language_loss": 0.73624814, "learning_rate": 3.782004207697098e-06, "loss": 0.75764406, "num_input_tokens_seen": 63136865, "step": 2919, "time_per_iteration": 2.7904632091522217 }, { "auxiliary_loss_clip": 0.0112465, "auxiliary_loss_mlp": 0.01046524, "balance_loss_clip": 1.04938293, "balance_loss_mlp": 1.02805829, "epoch": 0.17555989779047046, "flos": 30371844620160.0, "grad_norm": 2.5113730227003814, "language_loss": 0.74840331, "learning_rate": 3.781827358629228e-06, "loss": 0.77011508, "num_input_tokens_seen": 63158325, "step": 2920, "time_per_iteration": 2.727890968322754 }, { "auxiliary_loss_clip": 0.01117257, "auxiliary_loss_mlp": 0.01042893, "balance_loss_clip": 1.0462867, "balance_loss_mlp": 1.02371216, "epoch": 0.17562002104313842, "flos": 23287746464640.0, "grad_norm": 3.6617213109535536, "language_loss": 0.79731411, "learning_rate": 3.7816504419942873e-06, "loss": 0.81891561, "num_input_tokens_seen": 63173115, "step": 2921, "time_per_iteration": 2.753817558288574 }, { "auxiliary_loss_clip": 0.01121718, "auxiliary_loss_mlp": 0.01046234, "balance_loss_clip": 1.05232286, "balance_loss_mlp": 1.02679133, "epoch": 0.1756801442958064, "flos": 24790500789120.0, "grad_norm": 2.6301689129577546, "language_loss": 0.87826073, "learning_rate": 3.7814734577989823e-06, "loss": 0.89994025, "num_input_tokens_seen": 63192880, "step": 2922, "time_per_iteration": 2.7411837577819824 }, { "auxiliary_loss_clip": 0.01144004, "auxiliary_loss_mlp": 0.01047403, "balance_loss_clip": 1.05196273, "balance_loss_mlp": 1.02778149, "epoch": 0.17574026754847438, "flos": 25771651764480.0, "grad_norm": 4.4893841411537085, "language_loss": 0.62347209, "learning_rate": 3.7812964060500253e-06, "loss": 0.64538622, "num_input_tokens_seen": 63214395, "step": 2923, "time_per_iteration": 2.7666683197021484 }, { "auxiliary_loss_clip": 0.01134872, "auxiliary_loss_mlp": 0.01048692, "balance_loss_clip": 1.05887377, "balance_loss_mlp": 1.02847457, "epoch": 0.17580039080114235, "flos": 17456608477440.0, "grad_norm": 2.8552131957437914, "language_loss": 0.80392253, "learning_rate": 3.78111928675413e-06, "loss": 0.82575822, "num_input_tokens_seen": 63231020, "step": 2924, "time_per_iteration": 2.729403257369995 }, { "auxiliary_loss_clip": 0.01132783, "auxiliary_loss_mlp": 0.01051456, "balance_loss_clip": 1.05193377, "balance_loss_mlp": 1.03082108, "epoch": 0.1758605140538103, "flos": 14864648088960.0, "grad_norm": 5.080042666316876, "language_loss": 0.71374178, "learning_rate": 3.7809420999180126e-06, "loss": 0.73558426, "num_input_tokens_seen": 63246245, "step": 2925, "time_per_iteration": 2.9538233280181885 }, { "auxiliary_loss_clip": 0.01117196, "auxiliary_loss_mlp": 0.01045706, "balance_loss_clip": 1.05052948, "balance_loss_mlp": 1.02744341, "epoch": 0.17592063730647828, "flos": 23004268329600.0, "grad_norm": 1.6620026542608322, "language_loss": 0.71931666, "learning_rate": 3.7807648455483934e-06, "loss": 0.74094564, "num_input_tokens_seen": 63267790, "step": 2926, "time_per_iteration": 2.7738964557647705 }, { "auxiliary_loss_clip": 0.01105944, "auxiliary_loss_mlp": 0.01045732, "balance_loss_clip": 1.04915071, "balance_loss_mlp": 1.02253425, "epoch": 0.17598076055914624, "flos": 20741501111040.0, "grad_norm": 2.6318732447225837, "language_loss": 0.84724289, "learning_rate": 3.7805875236519918e-06, "loss": 0.86875963, "num_input_tokens_seen": 63286830, "step": 2927, "time_per_iteration": 2.704437494277954 }, { "auxiliary_loss_clip": 0.01100437, "auxiliary_loss_mlp": 0.01046684, "balance_loss_clip": 1.05039644, "balance_loss_mlp": 1.02887452, "epoch": 0.1760408838118142, "flos": 34092441227520.0, "grad_norm": 1.9547597089289632, "language_loss": 0.72147644, "learning_rate": 3.7804101342355336e-06, "loss": 0.74294758, "num_input_tokens_seen": 63308870, "step": 2928, "time_per_iteration": 2.793802261352539 }, { "auxiliary_loss_clip": 0.01120251, "auxiliary_loss_mlp": 0.01045623, "balance_loss_clip": 1.0516876, "balance_loss_mlp": 1.02679992, "epoch": 0.1761010070644822, "flos": 24168384207360.0, "grad_norm": 1.8474008440192304, "language_loss": 0.83097279, "learning_rate": 3.780232677305744e-06, "loss": 0.85263157, "num_input_tokens_seen": 63329005, "step": 2929, "time_per_iteration": 2.733339786529541 }, { "auxiliary_loss_clip": 0.01124127, "auxiliary_loss_mlp": 0.01042521, "balance_loss_clip": 1.04853475, "balance_loss_mlp": 1.02479422, "epoch": 0.17616113031715017, "flos": 26576697335040.0, "grad_norm": 2.4427170552109163, "language_loss": 0.79211783, "learning_rate": 3.7800551528693535e-06, "loss": 0.81378424, "num_input_tokens_seen": 63349390, "step": 2930, "time_per_iteration": 2.748080015182495 }, { "auxiliary_loss_clip": 0.01160654, "auxiliary_loss_mlp": 0.01047281, "balance_loss_clip": 1.05925918, "balance_loss_mlp": 1.02758813, "epoch": 0.17622125356981813, "flos": 25666685245440.0, "grad_norm": 2.504124366499191, "language_loss": 0.76502466, "learning_rate": 3.7798775609330927e-06, "loss": 0.78710401, "num_input_tokens_seen": 63368835, "step": 2931, "time_per_iteration": 2.6691603660583496 }, { "auxiliary_loss_clip": 0.01076453, "auxiliary_loss_mlp": 0.01043586, "balance_loss_clip": 1.04577017, "balance_loss_mlp": 1.02478647, "epoch": 0.1762813768224861, "flos": 16508530949760.0, "grad_norm": 2.941321746162514, "language_loss": 0.76070881, "learning_rate": 3.779699901503696e-06, "loss": 0.78190923, "num_input_tokens_seen": 63385220, "step": 2932, "time_per_iteration": 2.809630870819092 }, { "auxiliary_loss_clip": 0.01148627, "auxiliary_loss_mlp": 0.01043149, "balance_loss_clip": 1.05284405, "balance_loss_mlp": 1.0229789, "epoch": 0.17634150007515406, "flos": 11211850402560.0, "grad_norm": 5.168612276821382, "language_loss": 0.90027422, "learning_rate": 3.7795221745879016e-06, "loss": 0.92219198, "num_input_tokens_seen": 63400865, "step": 2933, "time_per_iteration": 2.6665337085723877 }, { "auxiliary_loss_clip": 0.01154114, "auxiliary_loss_mlp": 0.01055985, "balance_loss_clip": 1.05539656, "balance_loss_mlp": 1.03766203, "epoch": 0.17640162332782203, "flos": 23659925235840.0, "grad_norm": 2.009210784374188, "language_loss": 0.88323247, "learning_rate": 3.779344380192448e-06, "loss": 0.90533352, "num_input_tokens_seen": 63421390, "step": 2934, "time_per_iteration": 2.6649580001831055 }, { "auxiliary_loss_clip": 0.01128495, "auxiliary_loss_mlp": 0.01048067, "balance_loss_clip": 1.05581188, "balance_loss_mlp": 1.03028131, "epoch": 0.17646174658049, "flos": 53796984606720.0, "grad_norm": 1.6302121247923247, "language_loss": 0.70403945, "learning_rate": 3.779166518324077e-06, "loss": 0.72580504, "num_input_tokens_seen": 63444715, "step": 2935, "time_per_iteration": 3.006019115447998 }, { "auxiliary_loss_clip": 0.01126189, "auxiliary_loss_mlp": 0.01040034, "balance_loss_clip": 1.05360174, "balance_loss_mlp": 1.02135396, "epoch": 0.17652186983315798, "flos": 24243868638720.0, "grad_norm": 2.5931578566124807, "language_loss": 0.69721985, "learning_rate": 3.7789885889895325e-06, "loss": 0.71888208, "num_input_tokens_seen": 63465525, "step": 2936, "time_per_iteration": 2.7517428398132324 }, { "auxiliary_loss_clip": 0.01105644, "auxiliary_loss_mlp": 0.01045896, "balance_loss_clip": 1.05023837, "balance_loss_mlp": 1.02737129, "epoch": 0.17658199308582595, "flos": 27454282421760.0, "grad_norm": 1.9170676229980566, "language_loss": 0.71288073, "learning_rate": 3.7788105921955634e-06, "loss": 0.73439616, "num_input_tokens_seen": 63485815, "step": 2937, "time_per_iteration": 2.837181329727173 }, { "auxiliary_loss_clip": 0.01141008, "auxiliary_loss_mlp": 0.01046843, "balance_loss_clip": 1.05945122, "balance_loss_mlp": 1.02674472, "epoch": 0.17664211633849392, "flos": 22418672901120.0, "grad_norm": 2.267148270780071, "language_loss": 0.75439745, "learning_rate": 3.7786325279489184e-06, "loss": 0.77627593, "num_input_tokens_seen": 63503905, "step": 2938, "time_per_iteration": 2.883162021636963 }, { "auxiliary_loss_clip": 0.01147345, "auxiliary_loss_mlp": 0.01043976, "balance_loss_clip": 1.05576169, "balance_loss_mlp": 1.02553487, "epoch": 0.17670223959116188, "flos": 24715124098560.0, "grad_norm": 2.921726967662053, "language_loss": 0.71015209, "learning_rate": 3.7784543962563495e-06, "loss": 0.73206532, "num_input_tokens_seen": 63521985, "step": 2939, "time_per_iteration": 2.6938419342041016 }, { "auxiliary_loss_clip": 0.01160437, "auxiliary_loss_mlp": 0.01046921, "balance_loss_clip": 1.05818558, "balance_loss_mlp": 1.02794337, "epoch": 0.17676236284382985, "flos": 22527051212160.0, "grad_norm": 3.114901170192376, "language_loss": 0.73513985, "learning_rate": 3.7782761971246115e-06, "loss": 0.75721341, "num_input_tokens_seen": 63539830, "step": 2940, "time_per_iteration": 4.145469665527344 }, { "auxiliary_loss_clip": 0.0112582, "auxiliary_loss_mlp": 0.01046611, "balance_loss_clip": 1.05631542, "balance_loss_mlp": 1.02731109, "epoch": 0.1768224860964978, "flos": 12385160161920.0, "grad_norm": 3.071469776016301, "language_loss": 0.85375023, "learning_rate": 3.7780979305604616e-06, "loss": 0.87547457, "num_input_tokens_seen": 63555495, "step": 2941, "time_per_iteration": 4.279599666595459 }, { "auxiliary_loss_clip": 0.01161068, "auxiliary_loss_mlp": 0.01045254, "balance_loss_clip": 1.05717027, "balance_loss_mlp": 1.0257628, "epoch": 0.1768826093491658, "flos": 24353360271360.0, "grad_norm": 2.434766510066968, "language_loss": 0.76885259, "learning_rate": 3.7779195965706607e-06, "loss": 0.79091585, "num_input_tokens_seen": 63575290, "step": 2942, "time_per_iteration": 4.2280871868133545 }, { "auxiliary_loss_clip": 0.01106234, "auxiliary_loss_mlp": 0.00780676, "balance_loss_clip": 1.04992843, "balance_loss_mlp": 1.00087166, "epoch": 0.17694273260183377, "flos": 23587062497280.0, "grad_norm": 3.301743041114179, "language_loss": 0.8024286, "learning_rate": 3.77774119516197e-06, "loss": 0.82129776, "num_input_tokens_seen": 63594670, "step": 2943, "time_per_iteration": 2.8921029567718506 }, { "auxiliary_loss_clip": 0.01132848, "auxiliary_loss_mlp": 0.01052225, "balance_loss_clip": 1.05352235, "balance_loss_mlp": 1.03124392, "epoch": 0.17700285585450173, "flos": 26760991040640.0, "grad_norm": 5.7613375603973465, "language_loss": 0.80809408, "learning_rate": 3.777562726341155e-06, "loss": 0.82994485, "num_input_tokens_seen": 63614780, "step": 2944, "time_per_iteration": 2.692831039428711 }, { "auxiliary_loss_clip": 0.01161854, "auxiliary_loss_mlp": 0.01056825, "balance_loss_clip": 1.05807233, "balance_loss_mlp": 1.03796625, "epoch": 0.1770629791071697, "flos": 42776323320960.0, "grad_norm": 2.4257754996125227, "language_loss": 0.73812854, "learning_rate": 3.7773841901149835e-06, "loss": 0.7603153, "num_input_tokens_seen": 63637190, "step": 2945, "time_per_iteration": 2.782910108566284 }, { "auxiliary_loss_clip": 0.011481, "auxiliary_loss_mlp": 0.01047361, "balance_loss_clip": 1.05756998, "balance_loss_mlp": 1.02862108, "epoch": 0.17712310235983766, "flos": 17345572560000.0, "grad_norm": 2.8106797532110637, "language_loss": 0.7793628, "learning_rate": 3.7772055864902256e-06, "loss": 0.80131739, "num_input_tokens_seen": 63652140, "step": 2946, "time_per_iteration": 4.278741121292114 }, { "auxiliary_loss_clip": 0.01109059, "auxiliary_loss_mlp": 0.01052842, "balance_loss_clip": 1.04997015, "balance_loss_mlp": 1.03341079, "epoch": 0.17718322561250563, "flos": 23878477537920.0, "grad_norm": 2.172386857191393, "language_loss": 0.76068008, "learning_rate": 3.7770269154736535e-06, "loss": 0.7822991, "num_input_tokens_seen": 63671700, "step": 2947, "time_per_iteration": 2.7949914932250977 }, { "auxiliary_loss_clip": 0.0114934, "auxiliary_loss_mlp": 0.01044342, "balance_loss_clip": 1.05480659, "balance_loss_mlp": 1.025388, "epoch": 0.1772433488651736, "flos": 36466352104320.0, "grad_norm": 2.6793588646204745, "language_loss": 0.72557831, "learning_rate": 3.7768481770720424e-06, "loss": 0.74751514, "num_input_tokens_seen": 63691685, "step": 2948, "time_per_iteration": 2.901662826538086 }, { "auxiliary_loss_clip": 0.01151572, "auxiliary_loss_mlp": 0.01050692, "balance_loss_clip": 1.05921662, "balance_loss_mlp": 1.03236949, "epoch": 0.1773034721178416, "flos": 26684716510080.0, "grad_norm": 1.8296543316983853, "language_loss": 0.81782824, "learning_rate": 3.776669371292171e-06, "loss": 0.8398509, "num_input_tokens_seen": 63711720, "step": 2949, "time_per_iteration": 2.7284891605377197 }, { "auxiliary_loss_clip": 0.01080853, "auxiliary_loss_mlp": 0.0100651, "balance_loss_clip": 1.04975748, "balance_loss_mlp": 1.00226629, "epoch": 0.17736359537050955, "flos": 57117467617920.0, "grad_norm": 0.768126622018234, "language_loss": 0.64989161, "learning_rate": 3.7764904981408186e-06, "loss": 0.67076528, "num_input_tokens_seen": 63776280, "step": 2950, "time_per_iteration": 3.2761552333831787 }, { "auxiliary_loss_clip": 0.01121454, "auxiliary_loss_mlp": 0.01045861, "balance_loss_clip": 1.05373287, "balance_loss_mlp": 1.02743077, "epoch": 0.17742371862317752, "flos": 27198203385600.0, "grad_norm": 2.9882590699755927, "language_loss": 0.83619881, "learning_rate": 3.7763115576247686e-06, "loss": 0.85787189, "num_input_tokens_seen": 63797535, "step": 2951, "time_per_iteration": 2.7637627124786377 }, { "auxiliary_loss_clip": 0.01125929, "auxiliary_loss_mlp": 0.01046039, "balance_loss_clip": 1.05109882, "balance_loss_mlp": 1.02682269, "epoch": 0.17748384187584548, "flos": 20959694277120.0, "grad_norm": 2.3133151959471796, "language_loss": 0.80395055, "learning_rate": 3.776132549750806e-06, "loss": 0.82567012, "num_input_tokens_seen": 63817045, "step": 2952, "time_per_iteration": 2.7605957984924316 }, { "auxiliary_loss_clip": 0.01162679, "auxiliary_loss_mlp": 0.01044862, "balance_loss_clip": 1.05858529, "balance_loss_mlp": 1.02513337, "epoch": 0.17754396512851345, "flos": 25009986844800.0, "grad_norm": 2.8185319653472116, "language_loss": 0.79273909, "learning_rate": 3.7759534745257194e-06, "loss": 0.81481451, "num_input_tokens_seen": 63837665, "step": 2953, "time_per_iteration": 2.798912525177002 }, { "auxiliary_loss_clip": 0.0112399, "auxiliary_loss_mlp": 0.01043314, "balance_loss_clip": 1.05482125, "balance_loss_mlp": 1.02470589, "epoch": 0.1776040883811814, "flos": 32051566275840.0, "grad_norm": 2.017710353628998, "language_loss": 0.87963271, "learning_rate": 3.7757743319562994e-06, "loss": 0.90130568, "num_input_tokens_seen": 63858455, "step": 2954, "time_per_iteration": 2.838931083679199 }, { "auxiliary_loss_clip": 0.01144028, "auxiliary_loss_mlp": 0.01052958, "balance_loss_clip": 1.06043494, "balance_loss_mlp": 1.03296697, "epoch": 0.17766421163384938, "flos": 21574125348480.0, "grad_norm": 1.9130853947826985, "language_loss": 0.85313326, "learning_rate": 3.7755951220493386e-06, "loss": 0.87510312, "num_input_tokens_seen": 63876935, "step": 2955, "time_per_iteration": 2.7965714931488037 }, { "auxiliary_loss_clip": 0.01127677, "auxiliary_loss_mlp": 0.01047004, "balance_loss_clip": 1.05093336, "balance_loss_mlp": 1.02660692, "epoch": 0.17772433488651737, "flos": 22419319345920.0, "grad_norm": 18.24238703278013, "language_loss": 0.71152055, "learning_rate": 3.7754158448116327e-06, "loss": 0.73326737, "num_input_tokens_seen": 63896815, "step": 2956, "time_per_iteration": 2.8358442783355713 }, { "auxiliary_loss_clip": 0.01150063, "auxiliary_loss_mlp": 0.010506, "balance_loss_clip": 1.05813813, "balance_loss_mlp": 1.03156281, "epoch": 0.17778445813918534, "flos": 25629445820160.0, "grad_norm": 2.981126112172262, "language_loss": 0.82881534, "learning_rate": 3.7752365002499795e-06, "loss": 0.85082197, "num_input_tokens_seen": 63916140, "step": 2957, "time_per_iteration": 2.7034976482391357 }, { "auxiliary_loss_clip": 0.01100452, "auxiliary_loss_mlp": 0.01047239, "balance_loss_clip": 1.04976833, "balance_loss_mlp": 1.02789164, "epoch": 0.1778445813918533, "flos": 25628871202560.0, "grad_norm": 2.7180995933425622, "language_loss": 0.75164193, "learning_rate": 3.7750570883711807e-06, "loss": 0.77311885, "num_input_tokens_seen": 63935220, "step": 2958, "time_per_iteration": 2.8312718868255615 }, { "auxiliary_loss_clip": 0.01146025, "auxiliary_loss_mlp": 0.01043359, "balance_loss_clip": 1.06117964, "balance_loss_mlp": 1.02502513, "epoch": 0.17790470464452127, "flos": 22345522853760.0, "grad_norm": 9.439636088267013, "language_loss": 0.80363399, "learning_rate": 3.7748776091820397e-06, "loss": 0.82552785, "num_input_tokens_seen": 63954550, "step": 2959, "time_per_iteration": 2.722102642059326 }, { "auxiliary_loss_clip": 0.01164621, "auxiliary_loss_mlp": 0.01049069, "balance_loss_clip": 1.05812871, "balance_loss_mlp": 1.02938771, "epoch": 0.17796482789718923, "flos": 18765875214720.0, "grad_norm": 2.62580469975692, "language_loss": 0.51511085, "learning_rate": 3.774698062689362e-06, "loss": 0.53724772, "num_input_tokens_seen": 63972425, "step": 2960, "time_per_iteration": 2.6222047805786133 }, { "auxiliary_loss_clip": 0.01111843, "auxiliary_loss_mlp": 0.01052801, "balance_loss_clip": 1.05275989, "balance_loss_mlp": 1.03228474, "epoch": 0.1780249511498572, "flos": 23440941970560.0, "grad_norm": 1.7626913000215665, "language_loss": 0.88908094, "learning_rate": 3.7745184488999548e-06, "loss": 0.91072738, "num_input_tokens_seen": 63992165, "step": 2961, "time_per_iteration": 2.8088786602020264 }, { "auxiliary_loss_clip": 0.01116231, "auxiliary_loss_mlp": 0.01054867, "balance_loss_clip": 1.05181062, "balance_loss_mlp": 1.03385067, "epoch": 0.1780850744025252, "flos": 23367468700800.0, "grad_norm": 1.716412227369414, "language_loss": 0.79170465, "learning_rate": 3.774338767820631e-06, "loss": 0.81341565, "num_input_tokens_seen": 64013470, "step": 2962, "time_per_iteration": 2.7546913623809814 }, { "auxiliary_loss_clip": 0.01145526, "auxiliary_loss_mlp": 0.01052794, "balance_loss_clip": 1.05649889, "balance_loss_mlp": 1.03104997, "epoch": 0.17814519765519315, "flos": 13771994319360.0, "grad_norm": 2.3241756501763446, "language_loss": 0.74910223, "learning_rate": 3.774159019458203e-06, "loss": 0.77108544, "num_input_tokens_seen": 64030975, "step": 2963, "time_per_iteration": 2.680356979370117 }, { "auxiliary_loss_clip": 0.01140656, "auxiliary_loss_mlp": 0.01043225, "balance_loss_clip": 1.05769885, "balance_loss_mlp": 1.02347231, "epoch": 0.17820532090786112, "flos": 21976396738560.0, "grad_norm": 1.747536927551571, "language_loss": 0.78837025, "learning_rate": 3.7739792038194877e-06, "loss": 0.81020904, "num_input_tokens_seen": 64050075, "step": 2964, "time_per_iteration": 2.748398780822754 }, { "auxiliary_loss_clip": 0.01151685, "auxiliary_loss_mlp": 0.00776982, "balance_loss_clip": 1.05950594, "balance_loss_mlp": 1.00098181, "epoch": 0.17826544416052909, "flos": 24790752184320.0, "grad_norm": 3.046027397796258, "language_loss": 0.81160808, "learning_rate": 3.7737993209113027e-06, "loss": 0.83089471, "num_input_tokens_seen": 64071920, "step": 2965, "time_per_iteration": 2.8090012073516846 }, { "auxiliary_loss_clip": 0.01151658, "auxiliary_loss_mlp": 0.01047086, "balance_loss_clip": 1.06002402, "balance_loss_mlp": 1.02916884, "epoch": 0.17832556741319705, "flos": 13879582531200.0, "grad_norm": 2.554359630612449, "language_loss": 0.95307338, "learning_rate": 3.7736193707404698e-06, "loss": 0.97506082, "num_input_tokens_seen": 64086835, "step": 2966, "time_per_iteration": 2.7159550189971924 }, { "auxiliary_loss_clip": 0.01112928, "auxiliary_loss_mlp": 0.00777395, "balance_loss_clip": 1.05336046, "balance_loss_mlp": 1.00083637, "epoch": 0.17838569066586502, "flos": 36641703323520.0, "grad_norm": 7.5683867487642065, "language_loss": 0.72833109, "learning_rate": 3.7734393533138127e-06, "loss": 0.74723434, "num_input_tokens_seen": 64107360, "step": 2967, "time_per_iteration": 2.9540669918060303 }, { "auxiliary_loss_clip": 0.01129124, "auxiliary_loss_mlp": 0.01046817, "balance_loss_clip": 1.05574143, "balance_loss_mlp": 1.02775562, "epoch": 0.17844581391853298, "flos": 18727271072640.0, "grad_norm": 2.1617023205672523, "language_loss": 0.76897681, "learning_rate": 3.773259268638157e-06, "loss": 0.7907362, "num_input_tokens_seen": 64124690, "step": 2968, "time_per_iteration": 2.752717971801758 }, { "auxiliary_loss_clip": 0.01085006, "auxiliary_loss_mlp": 0.01044958, "balance_loss_clip": 1.04640651, "balance_loss_mlp": 1.02559829, "epoch": 0.17850593717120097, "flos": 27378259286400.0, "grad_norm": 2.039560504387258, "language_loss": 0.75839806, "learning_rate": 3.7730791167203333e-06, "loss": 0.77969772, "num_input_tokens_seen": 64146315, "step": 2969, "time_per_iteration": 2.9161994457244873 }, { "auxiliary_loss_clip": 0.01075271, "auxiliary_loss_mlp": 0.01013071, "balance_loss_clip": 1.06177902, "balance_loss_mlp": 1.00932813, "epoch": 0.17856606042386894, "flos": 66996025084800.0, "grad_norm": 0.8520394227890811, "language_loss": 0.69012916, "learning_rate": 3.772898897567171e-06, "loss": 0.7110126, "num_input_tokens_seen": 64210875, "step": 2970, "time_per_iteration": 3.3269262313842773 }, { "auxiliary_loss_clip": 0.011313, "auxiliary_loss_mlp": 0.01044166, "balance_loss_clip": 1.05561864, "balance_loss_mlp": 1.02493763, "epoch": 0.1786261836765369, "flos": 36977001805440.0, "grad_norm": 1.9951166568015506, "language_loss": 0.67617297, "learning_rate": 3.772718611185505e-06, "loss": 0.69792765, "num_input_tokens_seen": 64230740, "step": 2971, "time_per_iteration": 2.8691961765289307 }, { "auxiliary_loss_clip": 0.01110831, "auxiliary_loss_mlp": 0.01052779, "balance_loss_clip": 1.05309939, "balance_loss_mlp": 1.03266823, "epoch": 0.17868630692920487, "flos": 24825441744000.0, "grad_norm": 1.5664358375440484, "language_loss": 0.8971802, "learning_rate": 3.7725382575821717e-06, "loss": 0.91881633, "num_input_tokens_seen": 64252300, "step": 2972, "time_per_iteration": 2.893923759460449 }, { "auxiliary_loss_clip": 0.01124705, "auxiliary_loss_mlp": 0.01055871, "balance_loss_clip": 1.05635929, "balance_loss_mlp": 1.03466403, "epoch": 0.17874643018187283, "flos": 16981977139200.0, "grad_norm": 2.4611679901229153, "language_loss": 0.88593906, "learning_rate": 3.77235783676401e-06, "loss": 0.90774482, "num_input_tokens_seen": 64270105, "step": 2973, "time_per_iteration": 2.7340333461761475 }, { "auxiliary_loss_clip": 0.01164127, "auxiliary_loss_mlp": 0.01047073, "balance_loss_clip": 1.06285155, "balance_loss_mlp": 1.0283215, "epoch": 0.1788065534345408, "flos": 21032233793280.0, "grad_norm": 3.4039298885336557, "language_loss": 0.7668556, "learning_rate": 3.7721773487378615e-06, "loss": 0.78896761, "num_input_tokens_seen": 64287250, "step": 2974, "time_per_iteration": 2.632495403289795 }, { "auxiliary_loss_clip": 0.0114187, "auxiliary_loss_mlp": 0.01053, "balance_loss_clip": 1.06101942, "balance_loss_mlp": 1.03390288, "epoch": 0.17886667668720876, "flos": 23987717775360.0, "grad_norm": 2.484949778027245, "language_loss": 0.74701655, "learning_rate": 3.7719967935105705e-06, "loss": 0.76896524, "num_input_tokens_seen": 64307140, "step": 2975, "time_per_iteration": 2.704012870788574 }, { "auxiliary_loss_clip": 0.01149026, "auxiliary_loss_mlp": 0.01048788, "balance_loss_clip": 1.05678535, "balance_loss_mlp": 1.03004813, "epoch": 0.17892679993987676, "flos": 25739476156800.0, "grad_norm": 1.518747487377626, "language_loss": 0.73032069, "learning_rate": 3.7718161710889833e-06, "loss": 0.75229883, "num_input_tokens_seen": 64328760, "step": 2976, "time_per_iteration": 2.7357017993927 }, { "auxiliary_loss_clip": 0.01150398, "auxiliary_loss_mlp": 0.01038685, "balance_loss_clip": 1.06239033, "balance_loss_mlp": 1.0229373, "epoch": 0.17898692319254472, "flos": 25699686865920.0, "grad_norm": 1.4579507247258654, "language_loss": 0.770594, "learning_rate": 3.7716354814799495e-06, "loss": 0.79248488, "num_input_tokens_seen": 64348800, "step": 2977, "time_per_iteration": 2.727318286895752 }, { "auxiliary_loss_clip": 0.01131521, "auxiliary_loss_mlp": 0.01045834, "balance_loss_clip": 1.06618452, "balance_loss_mlp": 1.02841735, "epoch": 0.1790470464452127, "flos": 19317786664320.0, "grad_norm": 2.7286854986191282, "language_loss": 0.80235189, "learning_rate": 3.7714547246903203e-06, "loss": 0.82412547, "num_input_tokens_seen": 64367955, "step": 2978, "time_per_iteration": 2.8178791999816895 }, { "auxiliary_loss_clip": 0.0114307, "auxiliary_loss_mlp": 0.01052978, "balance_loss_clip": 1.05818772, "balance_loss_mlp": 1.03330874, "epoch": 0.17910716969788065, "flos": 30044267562240.0, "grad_norm": 1.4967765935497133, "language_loss": 0.76192784, "learning_rate": 3.7712739007269508e-06, "loss": 0.7838884, "num_input_tokens_seen": 64389805, "step": 2979, "time_per_iteration": 4.241487741470337 }, { "auxiliary_loss_clip": 0.01122958, "auxiliary_loss_mlp": 0.0104457, "balance_loss_clip": 1.0590893, "balance_loss_mlp": 1.02660525, "epoch": 0.17916729295054862, "flos": 19427709260160.0, "grad_norm": 1.9491816848203256, "language_loss": 0.68945503, "learning_rate": 3.7710930095966976e-06, "loss": 0.71113026, "num_input_tokens_seen": 64408220, "step": 2980, "time_per_iteration": 2.6817352771759033 }, { "auxiliary_loss_clip": 0.01152986, "auxiliary_loss_mlp": 0.0104519, "balance_loss_clip": 1.0588038, "balance_loss_mlp": 1.02497244, "epoch": 0.17922741620321658, "flos": 14611549881600.0, "grad_norm": 1.9134992191513662, "language_loss": 0.70793843, "learning_rate": 3.7709120513064196e-06, "loss": 0.72992027, "num_input_tokens_seen": 64426380, "step": 2981, "time_per_iteration": 4.310532331466675 }, { "auxiliary_loss_clip": 0.01137747, "auxiliary_loss_mlp": 0.01056086, "balance_loss_clip": 1.06083858, "balance_loss_mlp": 1.03686976, "epoch": 0.17928753945588458, "flos": 17165301177600.0, "grad_norm": 2.529665562311581, "language_loss": 0.8190546, "learning_rate": 3.7707310258629796e-06, "loss": 0.84099293, "num_input_tokens_seen": 64444355, "step": 2982, "time_per_iteration": 2.710726261138916 }, { "auxiliary_loss_clip": 0.01162978, "auxiliary_loss_mlp": 0.01041014, "balance_loss_clip": 1.06181359, "balance_loss_mlp": 1.02306128, "epoch": 0.17934766270855254, "flos": 31395622060800.0, "grad_norm": 1.6440716861921114, "language_loss": 0.83123535, "learning_rate": 3.7705499332732413e-06, "loss": 0.85327524, "num_input_tokens_seen": 64467800, "step": 2983, "time_per_iteration": 2.700378656387329 }, { "auxiliary_loss_clip": 0.01153001, "auxiliary_loss_mlp": 0.01048341, "balance_loss_clip": 1.05694914, "balance_loss_mlp": 1.02932739, "epoch": 0.1794077859612205, "flos": 20814184281600.0, "grad_norm": 1.6703280507743268, "language_loss": 0.85149562, "learning_rate": 3.7703687735440718e-06, "loss": 0.87350899, "num_input_tokens_seen": 64487230, "step": 2984, "time_per_iteration": 2.6529407501220703 }, { "auxiliary_loss_clip": 0.01126981, "auxiliary_loss_mlp": 0.01043442, "balance_loss_clip": 1.05520201, "balance_loss_mlp": 1.02424896, "epoch": 0.17946790921388847, "flos": 28986447006720.0, "grad_norm": 2.4609160562432053, "language_loss": 0.8935222, "learning_rate": 3.7701875466823416e-06, "loss": 0.9152264, "num_input_tokens_seen": 64509165, "step": 2985, "time_per_iteration": 4.528426170349121 }, { "auxiliary_loss_clip": 0.01160091, "auxiliary_loss_mlp": 0.01040749, "balance_loss_clip": 1.06142831, "balance_loss_mlp": 1.02434587, "epoch": 0.17952803246655644, "flos": 20737406960640.0, "grad_norm": 2.095497349072142, "language_loss": 0.69538593, "learning_rate": 3.770006252694922e-06, "loss": 0.71739429, "num_input_tokens_seen": 64527940, "step": 2986, "time_per_iteration": 2.6890172958374023 }, { "auxiliary_loss_clip": 0.01158556, "auxiliary_loss_mlp": 0.00776, "balance_loss_clip": 1.05752599, "balance_loss_mlp": 1.00081134, "epoch": 0.1795881557192244, "flos": 28255988027520.0, "grad_norm": 2.4599229747435123, "language_loss": 0.77855188, "learning_rate": 3.769824891588688e-06, "loss": 0.79789746, "num_input_tokens_seen": 64545230, "step": 2987, "time_per_iteration": 2.650761842727661 }, { "auxiliary_loss_clip": 0.0116216, "auxiliary_loss_mlp": 0.01043775, "balance_loss_clip": 1.05775642, "balance_loss_mlp": 1.02441502, "epoch": 0.17964827897189237, "flos": 18552027594240.0, "grad_norm": 2.0190394876224467, "language_loss": 0.77958816, "learning_rate": 3.7696434633705164e-06, "loss": 0.80164748, "num_input_tokens_seen": 64563820, "step": 2988, "time_per_iteration": 2.6151437759399414 }, { "auxiliary_loss_clip": 0.01059513, "auxiliary_loss_mlp": 0.00756906, "balance_loss_clip": 1.07071137, "balance_loss_mlp": 1.00131369, "epoch": 0.17970840222456036, "flos": 58165088711040.0, "grad_norm": 0.7650122273387262, "language_loss": 0.62709254, "learning_rate": 3.7694619680472875e-06, "loss": 0.64525676, "num_input_tokens_seen": 64621315, "step": 2989, "time_per_iteration": 3.1990275382995605 }, { "auxiliary_loss_clip": 0.01137168, "auxiliary_loss_mlp": 0.01038826, "balance_loss_clip": 1.05553865, "balance_loss_mlp": 1.02128983, "epoch": 0.17976852547722832, "flos": 20300805146880.0, "grad_norm": 2.3566032567209483, "language_loss": 0.71070904, "learning_rate": 3.7692804056258837e-06, "loss": 0.73246896, "num_input_tokens_seen": 64639885, "step": 2990, "time_per_iteration": 2.7275335788726807 }, { "auxiliary_loss_clip": 0.01135847, "auxiliary_loss_mlp": 0.01044966, "balance_loss_clip": 1.05398035, "balance_loss_mlp": 1.02639365, "epoch": 0.1798286487298963, "flos": 39669367685760.0, "grad_norm": 1.8035266350414116, "language_loss": 0.68888462, "learning_rate": 3.7690987761131893e-06, "loss": 0.7106927, "num_input_tokens_seen": 64661220, "step": 2991, "time_per_iteration": 2.8237311840057373 }, { "auxiliary_loss_clip": 0.01104375, "auxiliary_loss_mlp": 0.01046061, "balance_loss_clip": 1.05156851, "balance_loss_mlp": 1.02663028, "epoch": 0.17988877198256426, "flos": 25520313323520.0, "grad_norm": 1.6063564491400402, "language_loss": 0.82933879, "learning_rate": 3.7689170795160924e-06, "loss": 0.85084313, "num_input_tokens_seen": 64682530, "step": 2992, "time_per_iteration": 2.8303778171539307 }, { "auxiliary_loss_clip": 0.01140805, "auxiliary_loss_mlp": 0.01035603, "balance_loss_clip": 1.05302262, "balance_loss_mlp": 1.0187583, "epoch": 0.17994889523523222, "flos": 18807496099200.0, "grad_norm": 2.076285453641059, "language_loss": 0.82228035, "learning_rate": 3.7687353158414822e-06, "loss": 0.84404445, "num_input_tokens_seen": 64701025, "step": 2993, "time_per_iteration": 2.710369110107422 }, { "auxiliary_loss_clip": 0.01135151, "auxiliary_loss_mlp": 0.01040493, "balance_loss_clip": 1.05135202, "balance_loss_mlp": 1.02236176, "epoch": 0.18000901848790019, "flos": 21104450087040.0, "grad_norm": 1.7027458997386926, "language_loss": 0.78129464, "learning_rate": 3.7685534850962517e-06, "loss": 0.80305111, "num_input_tokens_seen": 64719570, "step": 2994, "time_per_iteration": 2.6666738986968994 }, { "auxiliary_loss_clip": 0.01158877, "auxiliary_loss_mlp": 0.01045455, "balance_loss_clip": 1.05657315, "balance_loss_mlp": 1.02819359, "epoch": 0.18006914174056818, "flos": 19646441130240.0, "grad_norm": 2.4198973911698434, "language_loss": 0.81139499, "learning_rate": 3.768371587287296e-06, "loss": 0.83343828, "num_input_tokens_seen": 64738110, "step": 2995, "time_per_iteration": 2.699521541595459 }, { "auxiliary_loss_clip": 0.01142902, "auxiliary_loss_mlp": 0.01047606, "balance_loss_clip": 1.05350447, "balance_loss_mlp": 1.0310601, "epoch": 0.18012926499323614, "flos": 19499889640320.0, "grad_norm": 1.8607496799697536, "language_loss": 0.84162772, "learning_rate": 3.768189622421512e-06, "loss": 0.86353278, "num_input_tokens_seen": 64756345, "step": 2996, "time_per_iteration": 2.696723461151123 }, { "auxiliary_loss_clip": 0.01127214, "auxiliary_loss_mlp": 0.01039953, "balance_loss_clip": 1.06094205, "balance_loss_mlp": 1.02273917, "epoch": 0.1801893882459041, "flos": 19464553635840.0, "grad_norm": 2.1291201116421283, "language_loss": 0.88189137, "learning_rate": 3.7680075905058006e-06, "loss": 0.90356302, "num_input_tokens_seen": 64776375, "step": 2997, "time_per_iteration": 2.785522699356079 }, { "auxiliary_loss_clip": 0.01134376, "auxiliary_loss_mlp": 0.01045962, "balance_loss_clip": 1.04949927, "balance_loss_mlp": 1.02753246, "epoch": 0.18024951149857207, "flos": 26870590414080.0, "grad_norm": 1.7579499924576911, "language_loss": 0.85068727, "learning_rate": 3.7678254915470643e-06, "loss": 0.87249064, "num_input_tokens_seen": 64796210, "step": 2998, "time_per_iteration": 2.6912384033203125 }, { "auxiliary_loss_clip": 0.01159537, "auxiliary_loss_mlp": 0.01044427, "balance_loss_clip": 1.06019807, "balance_loss_mlp": 1.02641416, "epoch": 0.18030963475124004, "flos": 30226621933440.0, "grad_norm": 1.8075624565441775, "language_loss": 0.84176779, "learning_rate": 3.7676433255522084e-06, "loss": 0.86380744, "num_input_tokens_seen": 64818590, "step": 2999, "time_per_iteration": 2.722447395324707 }, { "auxiliary_loss_clip": 0.01143605, "auxiliary_loss_mlp": 0.01047321, "balance_loss_clip": 1.05324686, "balance_loss_mlp": 1.02870023, "epoch": 0.180369758003908, "flos": 22307493329280.0, "grad_norm": 1.8789697336390492, "language_loss": 0.75206578, "learning_rate": 3.76746109252814e-06, "loss": 0.77397501, "num_input_tokens_seen": 64838350, "step": 3000, "time_per_iteration": 2.669875144958496 }, { "auxiliary_loss_clip": 0.01130052, "auxiliary_loss_mlp": 0.00775745, "balance_loss_clip": 1.0526886, "balance_loss_mlp": 1.00060582, "epoch": 0.18042988125657597, "flos": 23732033788800.0, "grad_norm": 2.1714361871851704, "language_loss": 0.71088028, "learning_rate": 3.76727879248177e-06, "loss": 0.72993821, "num_input_tokens_seen": 64858065, "step": 3001, "time_per_iteration": 2.7207603454589844 }, { "auxiliary_loss_clip": 0.01150091, "auxiliary_loss_mlp": 0.01044695, "balance_loss_clip": 1.05701649, "balance_loss_mlp": 1.02605033, "epoch": 0.18049000450924396, "flos": 24093582134400.0, "grad_norm": 2.218812983953599, "language_loss": 0.8849982, "learning_rate": 3.767096425420011e-06, "loss": 0.90694606, "num_input_tokens_seen": 64877305, "step": 3002, "time_per_iteration": 2.6577625274658203 }, { "auxiliary_loss_clip": 0.01157827, "auxiliary_loss_mlp": 0.01048268, "balance_loss_clip": 1.05624068, "balance_loss_mlp": 1.03076851, "epoch": 0.18055012776191193, "flos": 22163168482560.0, "grad_norm": 1.6287780165264572, "language_loss": 0.80328667, "learning_rate": 3.7669139913497788e-06, "loss": 0.8253476, "num_input_tokens_seen": 64896955, "step": 3003, "time_per_iteration": 2.6274783611297607 }, { "auxiliary_loss_clip": 0.01158367, "auxiliary_loss_mlp": 0.01043654, "balance_loss_clip": 1.05622995, "balance_loss_mlp": 1.02596307, "epoch": 0.1806102510145799, "flos": 28913512440960.0, "grad_norm": 2.3308952017896956, "language_loss": 0.67250973, "learning_rate": 3.7667314902779907e-06, "loss": 0.69452989, "num_input_tokens_seen": 64917080, "step": 3004, "time_per_iteration": 2.6652631759643555 }, { "auxiliary_loss_clip": 0.01147517, "auxiliary_loss_mlp": 0.01054518, "balance_loss_clip": 1.05606318, "balance_loss_mlp": 1.03528929, "epoch": 0.18067037426724786, "flos": 19025689265280.0, "grad_norm": 2.592432277036083, "language_loss": 0.85111535, "learning_rate": 3.7665489222115677e-06, "loss": 0.87313569, "num_input_tokens_seen": 64935215, "step": 3005, "time_per_iteration": 2.654977560043335 }, { "auxiliary_loss_clip": 0.0114499, "auxiliary_loss_mlp": 0.01041993, "balance_loss_clip": 1.05690646, "balance_loss_mlp": 1.02489829, "epoch": 0.18073049751991582, "flos": 27453635976960.0, "grad_norm": 1.5217876402754629, "language_loss": 0.83215338, "learning_rate": 3.766366287157432e-06, "loss": 0.85402322, "num_input_tokens_seen": 64956275, "step": 3006, "time_per_iteration": 2.7118306159973145 }, { "auxiliary_loss_clip": 0.01127168, "auxiliary_loss_mlp": 0.01050084, "balance_loss_clip": 1.05063033, "balance_loss_mlp": 1.03105807, "epoch": 0.1807906207725838, "flos": 28729039167360.0, "grad_norm": 1.6327495611050657, "language_loss": 0.77377248, "learning_rate": 3.7661835851225103e-06, "loss": 0.79554498, "num_input_tokens_seen": 64979390, "step": 3007, "time_per_iteration": 2.7996537685394287 }, { "auxiliary_loss_clip": 0.01070026, "auxiliary_loss_mlp": 0.01030441, "balance_loss_clip": 1.04936945, "balance_loss_mlp": 1.02712655, "epoch": 0.18085074402525175, "flos": 64466515468800.0, "grad_norm": 0.801982400183398, "language_loss": 0.56987137, "learning_rate": 3.7660008161137294e-06, "loss": 0.5908761, "num_input_tokens_seen": 65043135, "step": 3008, "time_per_iteration": 3.4269092082977295 }, { "auxiliary_loss_clip": 0.01130838, "auxiliary_loss_mlp": 0.01047085, "balance_loss_clip": 1.05308366, "balance_loss_mlp": 1.02686691, "epoch": 0.18091086727791975, "flos": 23476960333440.0, "grad_norm": 1.8424126412451678, "language_loss": 0.67248082, "learning_rate": 3.765817980138021e-06, "loss": 0.69426012, "num_input_tokens_seen": 65062845, "step": 3009, "time_per_iteration": 2.7875866889953613 }, { "auxiliary_loss_clip": 0.01161719, "auxiliary_loss_mlp": 0.01044187, "balance_loss_clip": 1.0595516, "balance_loss_mlp": 1.02673507, "epoch": 0.1809709905305877, "flos": 24170467196160.0, "grad_norm": 2.4429360498363986, "language_loss": 0.75690198, "learning_rate": 3.7656350772023177e-06, "loss": 0.778961, "num_input_tokens_seen": 65082110, "step": 3010, "time_per_iteration": 2.6060268878936768 }, { "auxiliary_loss_clip": 0.01127916, "auxiliary_loss_mlp": 0.01037817, "balance_loss_clip": 1.05715132, "balance_loss_mlp": 1.02063942, "epoch": 0.18103111378325568, "flos": 21650902669440.0, "grad_norm": 1.6324915654296899, "language_loss": 0.67356348, "learning_rate": 3.7654521073135553e-06, "loss": 0.69522083, "num_input_tokens_seen": 65101985, "step": 3011, "time_per_iteration": 2.763596534729004 }, { "auxiliary_loss_clip": 0.01105034, "auxiliary_loss_mlp": 0.00777475, "balance_loss_clip": 1.04540467, "balance_loss_mlp": 1.00078559, "epoch": 0.18109123703592364, "flos": 53686918356480.0, "grad_norm": 1.551526807882757, "language_loss": 0.71288514, "learning_rate": 3.7652690704786723e-06, "loss": 0.73171026, "num_input_tokens_seen": 65129295, "step": 3012, "time_per_iteration": 3.037775993347168 }, { "auxiliary_loss_clip": 0.01132189, "auxiliary_loss_mlp": 0.01052085, "balance_loss_clip": 1.05564284, "balance_loss_mlp": 1.03348863, "epoch": 0.1811513602885916, "flos": 35845564325760.0, "grad_norm": 2.095737131475866, "language_loss": 0.62309992, "learning_rate": 3.765085966704609e-06, "loss": 0.64494264, "num_input_tokens_seen": 65150625, "step": 3013, "time_per_iteration": 2.7692227363586426 }, { "auxiliary_loss_clip": 0.01131323, "auxiliary_loss_mlp": 0.0105253, "balance_loss_clip": 1.05343401, "balance_loss_mlp": 1.03486276, "epoch": 0.18121148354125957, "flos": 23732572492800.0, "grad_norm": 1.6679267545988328, "language_loss": 0.76147234, "learning_rate": 3.764902795998309e-06, "loss": 0.78331089, "num_input_tokens_seen": 65170880, "step": 3014, "time_per_iteration": 2.7296786308288574 }, { "auxiliary_loss_clip": 0.01163543, "auxiliary_loss_mlp": 0.01050053, "balance_loss_clip": 1.05964816, "balance_loss_mlp": 1.02987087, "epoch": 0.18127160679392756, "flos": 28728320895360.0, "grad_norm": 2.1234423596691796, "language_loss": 0.66310829, "learning_rate": 3.7647195583667184e-06, "loss": 0.6852442, "num_input_tokens_seen": 65192530, "step": 3015, "time_per_iteration": 2.7575571537017822 }, { "auxiliary_loss_clip": 0.0113004, "auxiliary_loss_mlp": 0.00776613, "balance_loss_clip": 1.05429327, "balance_loss_mlp": 1.00067461, "epoch": 0.18133173004659553, "flos": 20485062938880.0, "grad_norm": 1.7837261279259933, "language_loss": 0.78152305, "learning_rate": 3.764536253816785e-06, "loss": 0.80058956, "num_input_tokens_seen": 65211675, "step": 3016, "time_per_iteration": 2.6718828678131104 }, { "auxiliary_loss_clip": 0.01145073, "auxiliary_loss_mlp": 0.01049504, "balance_loss_clip": 1.05684161, "balance_loss_mlp": 1.03068125, "epoch": 0.1813918532992635, "flos": 22852078404480.0, "grad_norm": 1.7248072345223011, "language_loss": 0.8351965, "learning_rate": 3.7643528823554602e-06, "loss": 0.85714233, "num_input_tokens_seen": 65231185, "step": 3017, "time_per_iteration": 2.6879045963287354 }, { "auxiliary_loss_clip": 0.0114091, "auxiliary_loss_mlp": 0.01042994, "balance_loss_clip": 1.05404854, "balance_loss_mlp": 1.02539897, "epoch": 0.18145197655193146, "flos": 36065122208640.0, "grad_norm": 2.2664795482488924, "language_loss": 0.6769017, "learning_rate": 3.764169443989697e-06, "loss": 0.69874066, "num_input_tokens_seen": 65251645, "step": 3018, "time_per_iteration": 4.31333327293396 }, { "auxiliary_loss_clip": 0.01147629, "auxiliary_loss_mlp": 0.00776661, "balance_loss_clip": 1.05706179, "balance_loss_mlp": 1.00074184, "epoch": 0.18151209980459942, "flos": 24023951619840.0, "grad_norm": 1.8935259017451227, "language_loss": 0.76396847, "learning_rate": 3.7639859387264518e-06, "loss": 0.78321135, "num_input_tokens_seen": 65271125, "step": 3019, "time_per_iteration": 2.7667160034179688 }, { "auxiliary_loss_clip": 0.01121465, "auxiliary_loss_mlp": 0.01046742, "balance_loss_clip": 1.05550635, "balance_loss_mlp": 1.02722728, "epoch": 0.1815722230572674, "flos": 23951627585280.0, "grad_norm": 2.042490471678265, "language_loss": 0.81550395, "learning_rate": 3.7638023665726834e-06, "loss": 0.83718598, "num_input_tokens_seen": 65290600, "step": 3020, "time_per_iteration": 4.3900346755981445 }, { "auxiliary_loss_clip": 0.01136424, "auxiliary_loss_mlp": 0.01046217, "balance_loss_clip": 1.05758023, "balance_loss_mlp": 1.02567708, "epoch": 0.18163234630993536, "flos": 24386469632640.0, "grad_norm": 1.9628186536024828, "language_loss": 0.7757082, "learning_rate": 3.763618727535352e-06, "loss": 0.79753458, "num_input_tokens_seen": 65311040, "step": 3021, "time_per_iteration": 4.3029396533966064 }, { "auxiliary_loss_clip": 0.01143245, "auxiliary_loss_mlp": 0.01047278, "balance_loss_clip": 1.05453348, "balance_loss_mlp": 1.02907431, "epoch": 0.18169246956260335, "flos": 24681332378880.0, "grad_norm": 1.725306643191844, "language_loss": 0.84863859, "learning_rate": 3.763435021621422e-06, "loss": 0.87054378, "num_input_tokens_seen": 65332115, "step": 3022, "time_per_iteration": 2.7353312969207764 }, { "auxiliary_loss_clip": 0.01132435, "auxiliary_loss_mlp": 0.01042747, "balance_loss_clip": 1.05769348, "balance_loss_mlp": 1.0235188, "epoch": 0.1817525928152713, "flos": 24243294021120.0, "grad_norm": 2.230341519134859, "language_loss": 0.69367266, "learning_rate": 3.763251248837859e-06, "loss": 0.71542448, "num_input_tokens_seen": 65352210, "step": 3023, "time_per_iteration": 2.775200605392456 }, { "auxiliary_loss_clip": 0.01127605, "auxiliary_loss_mlp": 0.01043947, "balance_loss_clip": 1.04900002, "balance_loss_mlp": 1.02556491, "epoch": 0.18181271606793928, "flos": 16472081623680.0, "grad_norm": 2.150764188548567, "language_loss": 0.74107385, "learning_rate": 3.7630674091916317e-06, "loss": 0.76278937, "num_input_tokens_seen": 65370600, "step": 3024, "time_per_iteration": 2.7364041805267334 }, { "auxiliary_loss_clip": 0.01145205, "auxiliary_loss_mlp": 0.01046837, "balance_loss_clip": 1.05719447, "balance_loss_mlp": 1.02900314, "epoch": 0.18187283932060724, "flos": 18581042805120.0, "grad_norm": 2.148591016046099, "language_loss": 0.8835662, "learning_rate": 3.7628835026897123e-06, "loss": 0.90548658, "num_input_tokens_seen": 65387270, "step": 3025, "time_per_iteration": 4.274658679962158 }, { "auxiliary_loss_clip": 0.01133667, "auxiliary_loss_mlp": 0.01050575, "balance_loss_clip": 1.05470932, "balance_loss_mlp": 1.03137028, "epoch": 0.1819329625732752, "flos": 20266833859200.0, "grad_norm": 3.6399614210311206, "language_loss": 0.79041791, "learning_rate": 3.7626995293390735e-06, "loss": 0.81226033, "num_input_tokens_seen": 65406550, "step": 3026, "time_per_iteration": 2.7589778900146484 }, { "auxiliary_loss_clip": 0.01132736, "auxiliary_loss_mlp": 0.01055367, "balance_loss_clip": 1.05774415, "balance_loss_mlp": 1.03679442, "epoch": 0.18199308582594317, "flos": 25915186512000.0, "grad_norm": 1.6980721374313217, "language_loss": 0.759978, "learning_rate": 3.762515489146692e-06, "loss": 0.78185904, "num_input_tokens_seen": 65425955, "step": 3027, "time_per_iteration": 2.7347826957702637 }, { "auxiliary_loss_clip": 0.01163558, "auxiliary_loss_mlp": 0.01053369, "balance_loss_clip": 1.05835891, "balance_loss_mlp": 1.03378284, "epoch": 0.18205320907861114, "flos": 15377524433280.0, "grad_norm": 2.2893837743041368, "language_loss": 0.85592651, "learning_rate": 3.762331382119546e-06, "loss": 0.87809575, "num_input_tokens_seen": 65442820, "step": 3028, "time_per_iteration": 2.598905563354492 }, { "auxiliary_loss_clip": 0.01156921, "auxiliary_loss_mlp": 0.0104449, "balance_loss_clip": 1.0578618, "balance_loss_mlp": 1.0260129, "epoch": 0.18211333233127913, "flos": 25624310175360.0, "grad_norm": 1.8897570500397638, "language_loss": 0.82807779, "learning_rate": 3.7621472082646183e-06, "loss": 0.85009193, "num_input_tokens_seen": 65461825, "step": 3029, "time_per_iteration": 2.677332639694214 }, { "auxiliary_loss_clip": 0.01114993, "auxiliary_loss_mlp": 0.01050232, "balance_loss_clip": 1.05223596, "balance_loss_mlp": 1.02931094, "epoch": 0.1821734555839471, "flos": 14976007228800.0, "grad_norm": 10.840079090220346, "language_loss": 0.78091359, "learning_rate": 3.761962967588891e-06, "loss": 0.80256593, "num_input_tokens_seen": 65479480, "step": 3030, "time_per_iteration": 2.6865499019622803 }, { "auxiliary_loss_clip": 0.01139676, "auxiliary_loss_mlp": 0.01043273, "balance_loss_clip": 1.05401075, "balance_loss_mlp": 1.0240562, "epoch": 0.18223357883661506, "flos": 20194007034240.0, "grad_norm": 2.05958060196279, "language_loss": 0.85162055, "learning_rate": 3.761778660099352e-06, "loss": 0.87345004, "num_input_tokens_seen": 65497775, "step": 3031, "time_per_iteration": 2.6336488723754883 }, { "auxiliary_loss_clip": 0.01116657, "auxiliary_loss_mlp": 0.00776186, "balance_loss_clip": 1.0497843, "balance_loss_mlp": 1.00052071, "epoch": 0.18229370208928303, "flos": 15231978524160.0, "grad_norm": 1.83501853384953, "language_loss": 0.79992211, "learning_rate": 3.76159428580299e-06, "loss": 0.81885058, "num_input_tokens_seen": 65516505, "step": 3032, "time_per_iteration": 2.6879780292510986 }, { "auxiliary_loss_clip": 0.01166412, "auxiliary_loss_mlp": 0.01048902, "balance_loss_clip": 1.06163025, "balance_loss_mlp": 1.03038836, "epoch": 0.182353825341951, "flos": 23840483927040.0, "grad_norm": 1.8132660189598853, "language_loss": 0.81316388, "learning_rate": 3.761409844706795e-06, "loss": 0.83531702, "num_input_tokens_seen": 65536160, "step": 3033, "time_per_iteration": 2.628100872039795 }, { "auxiliary_loss_clip": 0.01048591, "auxiliary_loss_mlp": 0.0100128, "balance_loss_clip": 1.05392861, "balance_loss_mlp": 0.99850291, "epoch": 0.18241394859461896, "flos": 61190957393280.0, "grad_norm": 0.8825814513625035, "language_loss": 0.63439631, "learning_rate": 3.7612253368177625e-06, "loss": 0.65489495, "num_input_tokens_seen": 65589375, "step": 3034, "time_per_iteration": 3.2329187393188477 }, { "auxiliary_loss_clip": 0.0112853, "auxiliary_loss_mlp": 0.01041043, "balance_loss_clip": 1.05698252, "balance_loss_mlp": 1.02384114, "epoch": 0.18247407184728695, "flos": 18471694826880.0, "grad_norm": 3.107937736318082, "language_loss": 0.79893476, "learning_rate": 3.7610407621428893e-06, "loss": 0.82063049, "num_input_tokens_seen": 65606720, "step": 3035, "time_per_iteration": 2.7644357681274414 }, { "auxiliary_loss_clip": 0.01134115, "auxiliary_loss_mlp": 0.01046396, "balance_loss_clip": 1.05675578, "balance_loss_mlp": 1.02906322, "epoch": 0.18253419509995492, "flos": 21795191602560.0, "grad_norm": 1.870086430131469, "language_loss": 0.85076666, "learning_rate": 3.7608561206891735e-06, "loss": 0.87257177, "num_input_tokens_seen": 65625495, "step": 3036, "time_per_iteration": 2.7102303504943848 }, { "auxiliary_loss_clip": 0.01140083, "auxiliary_loss_mlp": 0.01039078, "balance_loss_clip": 1.05572963, "balance_loss_mlp": 1.02192414, "epoch": 0.18259431835262288, "flos": 20149764456960.0, "grad_norm": 2.1821496235124727, "language_loss": 0.80254716, "learning_rate": 3.760671412463617e-06, "loss": 0.82433879, "num_input_tokens_seen": 65643515, "step": 3037, "time_per_iteration": 2.6703832149505615 }, { "auxiliary_loss_clip": 0.01139652, "auxiliary_loss_mlp": 0.00776941, "balance_loss_clip": 1.05986989, "balance_loss_mlp": 1.00062871, "epoch": 0.18265444160529085, "flos": 16981653916800.0, "grad_norm": 3.0764011293768023, "language_loss": 0.7950514, "learning_rate": 3.7604866374732246e-06, "loss": 0.81421733, "num_input_tokens_seen": 65658155, "step": 3038, "time_per_iteration": 2.7410895824432373 }, { "auxiliary_loss_clip": 0.01125628, "auxiliary_loss_mlp": 0.01044597, "balance_loss_clip": 1.05254972, "balance_loss_mlp": 1.02551126, "epoch": 0.1827145648579588, "flos": 34423250509440.0, "grad_norm": 1.9524772610579864, "language_loss": 0.67722493, "learning_rate": 3.7603017957250023e-06, "loss": 0.69892722, "num_input_tokens_seen": 65679310, "step": 3039, "time_per_iteration": 2.756833076477051 }, { "auxiliary_loss_clip": 0.0113051, "auxiliary_loss_mlp": 0.01051065, "balance_loss_clip": 1.053087, "balance_loss_mlp": 1.03304029, "epoch": 0.18277468811062678, "flos": 53287017264000.0, "grad_norm": 1.8757227718998248, "language_loss": 0.73394251, "learning_rate": 3.7601168872259593e-06, "loss": 0.75575823, "num_input_tokens_seen": 65705235, "step": 3040, "time_per_iteration": 3.026679039001465 }, { "auxiliary_loss_clip": 0.01143558, "auxiliary_loss_mlp": 0.01042261, "balance_loss_clip": 1.05585194, "balance_loss_mlp": 1.02373624, "epoch": 0.18283481136329474, "flos": 31650659602560.0, "grad_norm": 2.017308993436446, "language_loss": 0.60348576, "learning_rate": 3.7599319119831075e-06, "loss": 0.62534392, "num_input_tokens_seen": 65727575, "step": 3041, "time_per_iteration": 2.738554000854492 }, { "auxiliary_loss_clip": 0.01116972, "auxiliary_loss_mlp": 0.01053827, "balance_loss_clip": 1.05058599, "balance_loss_mlp": 1.03544497, "epoch": 0.18289493461596273, "flos": 53137664513280.0, "grad_norm": 2.3558133433802104, "language_loss": 0.59825706, "learning_rate": 3.7597468700034616e-06, "loss": 0.61996508, "num_input_tokens_seen": 65751370, "step": 3042, "time_per_iteration": 3.0009193420410156 }, { "auxiliary_loss_clip": 0.0112422, "auxiliary_loss_mlp": 0.01046569, "balance_loss_clip": 1.05319464, "balance_loss_mlp": 1.02917695, "epoch": 0.1829550578686307, "flos": 25589369220480.0, "grad_norm": 1.5313119565207096, "language_loss": 0.8757726, "learning_rate": 3.7595617612940374e-06, "loss": 0.89748049, "num_input_tokens_seen": 65771040, "step": 3043, "time_per_iteration": 2.7406487464904785 }, { "auxiliary_loss_clip": 0.01056788, "auxiliary_loss_mlp": 0.01056357, "balance_loss_clip": 1.04592645, "balance_loss_mlp": 1.03712869, "epoch": 0.18301518112129866, "flos": 22601422321920.0, "grad_norm": 2.144378235575635, "language_loss": 0.70980251, "learning_rate": 3.7593765858618552e-06, "loss": 0.73093396, "num_input_tokens_seen": 65789345, "step": 3044, "time_per_iteration": 2.785931348800659 }, { "auxiliary_loss_clip": 0.01105073, "auxiliary_loss_mlp": 0.01059118, "balance_loss_clip": 1.05111921, "balance_loss_mlp": 1.0381608, "epoch": 0.18307530437396663, "flos": 34020799551360.0, "grad_norm": 3.097061979225562, "language_loss": 0.64460731, "learning_rate": 3.7591913437139365e-06, "loss": 0.66624922, "num_input_tokens_seen": 65810990, "step": 3045, "time_per_iteration": 2.8085720539093018 }, { "auxiliary_loss_clip": 0.01155246, "auxiliary_loss_mlp": 0.01044973, "balance_loss_clip": 1.05604315, "balance_loss_mlp": 1.02780676, "epoch": 0.1831354276266346, "flos": 21279765392640.0, "grad_norm": 11.455833434854163, "language_loss": 0.78461385, "learning_rate": 3.7590060348573066e-06, "loss": 0.80661607, "num_input_tokens_seen": 65827230, "step": 3046, "time_per_iteration": 2.603299140930176 }, { "auxiliary_loss_clip": 0.01118725, "auxiliary_loss_mlp": 0.01042864, "balance_loss_clip": 1.04837, "balance_loss_mlp": 1.0240643, "epoch": 0.18319555087930256, "flos": 21032952065280.0, "grad_norm": 1.9889932097770582, "language_loss": 0.78733194, "learning_rate": 3.7588206592989903e-06, "loss": 0.8089478, "num_input_tokens_seen": 65845900, "step": 3047, "time_per_iteration": 2.7109453678131104 }, { "auxiliary_loss_clip": 0.01144516, "auxiliary_loss_mlp": 0.01042422, "balance_loss_clip": 1.05723858, "balance_loss_mlp": 1.0254705, "epoch": 0.18325567413197055, "flos": 34382958428160.0, "grad_norm": 1.5191744259185578, "language_loss": 0.80704039, "learning_rate": 3.7586352170460194e-06, "loss": 0.82890975, "num_input_tokens_seen": 65868730, "step": 3048, "time_per_iteration": 2.7485053539276123 }, { "auxiliary_loss_clip": 0.01139433, "auxiliary_loss_mlp": 0.01046004, "balance_loss_clip": 1.05405188, "balance_loss_mlp": 1.02552414, "epoch": 0.18331579738463852, "flos": 20558464381440.0, "grad_norm": 2.1437824577601354, "language_loss": 0.86579728, "learning_rate": 3.758449708105424e-06, "loss": 0.88765168, "num_input_tokens_seen": 65888420, "step": 3049, "time_per_iteration": 2.6876962184906006 }, { "auxiliary_loss_clip": 0.01143881, "auxiliary_loss_mlp": 0.01045208, "balance_loss_clip": 1.05379057, "balance_loss_mlp": 1.02544308, "epoch": 0.18337592063730648, "flos": 19607872901760.0, "grad_norm": 2.616661567020713, "language_loss": 0.77827966, "learning_rate": 3.75826413248424e-06, "loss": 0.80017054, "num_input_tokens_seen": 65905840, "step": 3050, "time_per_iteration": 2.5814058780670166 }, { "auxiliary_loss_clip": 0.01126116, "auxiliary_loss_mlp": 0.01041302, "balance_loss_clip": 1.04954183, "balance_loss_mlp": 1.0238502, "epoch": 0.18343604388997445, "flos": 20850885002880.0, "grad_norm": 2.3686375880611656, "language_loss": 0.99064422, "learning_rate": 3.7580784901895035e-06, "loss": 1.01231837, "num_input_tokens_seen": 65922845, "step": 3051, "time_per_iteration": 2.701848268508911 }, { "auxiliary_loss_clip": 0.01125492, "auxiliary_loss_mlp": 0.010397, "balance_loss_clip": 1.05189931, "balance_loss_mlp": 1.02078128, "epoch": 0.1834961671426424, "flos": 24394370624640.0, "grad_norm": 2.0338529701436237, "language_loss": 0.8607648, "learning_rate": 3.7578927812282542e-06, "loss": 0.88241673, "num_input_tokens_seen": 65945555, "step": 3052, "time_per_iteration": 2.7252042293548584 }, { "auxiliary_loss_clip": 0.01152967, "auxiliary_loss_mlp": 0.01044648, "balance_loss_clip": 1.05449986, "balance_loss_mlp": 1.02737474, "epoch": 0.18355629039531038, "flos": 21251612108160.0, "grad_norm": 1.8649432496703628, "language_loss": 0.73393309, "learning_rate": 3.7577070056075356e-06, "loss": 0.7559092, "num_input_tokens_seen": 65963965, "step": 3053, "time_per_iteration": 2.6331369876861572 }, { "auxiliary_loss_clip": 0.01158728, "auxiliary_loss_mlp": 0.01044052, "balance_loss_clip": 1.05783379, "balance_loss_mlp": 1.02565801, "epoch": 0.18361641364797834, "flos": 28656499651200.0, "grad_norm": 1.5358769917973574, "language_loss": 0.61891186, "learning_rate": 3.7575211633343902e-06, "loss": 0.64093965, "num_input_tokens_seen": 65985965, "step": 3054, "time_per_iteration": 2.6792421340942383 }, { "auxiliary_loss_clip": 0.01108826, "auxiliary_loss_mlp": 0.01042654, "balance_loss_clip": 1.05558836, "balance_loss_mlp": 1.02502322, "epoch": 0.18367653690064634, "flos": 20918827578240.0, "grad_norm": 2.2474279661883667, "language_loss": 0.78218341, "learning_rate": 3.7573352544158663e-06, "loss": 0.80369824, "num_input_tokens_seen": 66005645, "step": 3055, "time_per_iteration": 2.778691053390503 }, { "auxiliary_loss_clip": 0.01096638, "auxiliary_loss_mlp": 0.01050677, "balance_loss_clip": 1.05003095, "balance_loss_mlp": 1.03211594, "epoch": 0.1837366601533143, "flos": 28765596234240.0, "grad_norm": 1.8043720478204575, "language_loss": 0.7022509, "learning_rate": 3.757149278859014e-06, "loss": 0.72372401, "num_input_tokens_seen": 66025675, "step": 3056, "time_per_iteration": 2.794254779815674 }, { "auxiliary_loss_clip": 0.01140367, "auxiliary_loss_mlp": 0.01038358, "balance_loss_clip": 1.05211461, "balance_loss_mlp": 1.02181149, "epoch": 0.18379678340598227, "flos": 21251432540160.0, "grad_norm": 1.8709784760841586, "language_loss": 0.80357504, "learning_rate": 3.7569632366708842e-06, "loss": 0.82536227, "num_input_tokens_seen": 66046125, "step": 3057, "time_per_iteration": 2.644728899002075 }, { "auxiliary_loss_clip": 0.01150041, "auxiliary_loss_mlp": 0.01043781, "balance_loss_clip": 1.05482352, "balance_loss_mlp": 1.02332497, "epoch": 0.18385690665865023, "flos": 20449619193600.0, "grad_norm": 7.225766788646501, "language_loss": 0.82570755, "learning_rate": 3.756777127858533e-06, "loss": 0.84764576, "num_input_tokens_seen": 66064375, "step": 3058, "time_per_iteration": 4.136845588684082 }, { "auxiliary_loss_clip": 0.01119139, "auxiliary_loss_mlp": 0.00776668, "balance_loss_clip": 1.04992914, "balance_loss_mlp": 1.00066566, "epoch": 0.1839170299113182, "flos": 26140562398080.0, "grad_norm": 2.277694088171661, "language_loss": 0.85071868, "learning_rate": 3.756590952429017e-06, "loss": 0.86967677, "num_input_tokens_seen": 66084590, "step": 3059, "time_per_iteration": 2.745020866394043 }, { "auxiliary_loss_clip": 0.01151831, "auxiliary_loss_mlp": 0.00775088, "balance_loss_clip": 1.05359423, "balance_loss_mlp": 1.00077426, "epoch": 0.18397715316398616, "flos": 31758032332800.0, "grad_norm": 2.3540516696336216, "language_loss": 0.72983348, "learning_rate": 3.756404710389396e-06, "loss": 0.74910271, "num_input_tokens_seen": 66107105, "step": 3060, "time_per_iteration": 5.792214393615723 }, { "auxiliary_loss_clip": 0.01149482, "auxiliary_loss_mlp": 0.01041417, "balance_loss_clip": 1.05812132, "balance_loss_mlp": 1.02266574, "epoch": 0.18403727641665413, "flos": 24611989173120.0, "grad_norm": 1.5810457302838978, "language_loss": 0.73126459, "learning_rate": 3.7562184017467323e-06, "loss": 0.75317359, "num_input_tokens_seen": 66129295, "step": 3061, "time_per_iteration": 2.754167318344116 }, { "auxiliary_loss_clip": 0.01138281, "auxiliary_loss_mlp": 0.01043599, "balance_loss_clip": 1.05435956, "balance_loss_mlp": 1.02379823, "epoch": 0.18409739966932212, "flos": 23439900476160.0, "grad_norm": 1.8413104246803462, "language_loss": 0.81937188, "learning_rate": 3.7560320265080906e-06, "loss": 0.8411907, "num_input_tokens_seen": 66146910, "step": 3062, "time_per_iteration": 2.7545394897460938 }, { "auxiliary_loss_clip": 0.01144664, "auxiliary_loss_mlp": 0.01040639, "balance_loss_clip": 1.05668104, "balance_loss_mlp": 1.02259111, "epoch": 0.18415752292199009, "flos": 21872112577920.0, "grad_norm": 2.011374259171591, "language_loss": 0.72994816, "learning_rate": 3.7558455846805383e-06, "loss": 0.75180125, "num_input_tokens_seen": 66165370, "step": 3063, "time_per_iteration": 2.738293170928955 }, { "auxiliary_loss_clip": 0.01133824, "auxiliary_loss_mlp": 0.01040987, "balance_loss_clip": 1.05164194, "balance_loss_mlp": 1.02490544, "epoch": 0.18421764617465805, "flos": 25410678036480.0, "grad_norm": 2.2975785147287953, "language_loss": 0.65614092, "learning_rate": 3.7556590762711463e-06, "loss": 0.67788899, "num_input_tokens_seen": 66186210, "step": 3064, "time_per_iteration": 4.404583930969238 }, { "auxiliary_loss_clip": 0.01141547, "auxiliary_loss_mlp": 0.01042996, "balance_loss_clip": 1.05395937, "balance_loss_mlp": 1.02498376, "epoch": 0.18427776942732602, "flos": 27198131558400.0, "grad_norm": 2.1874829734431898, "language_loss": 0.68347883, "learning_rate": 3.7554725012869853e-06, "loss": 0.70532429, "num_input_tokens_seen": 66204800, "step": 3065, "time_per_iteration": 2.7149577140808105 }, { "auxiliary_loss_clip": 0.01136969, "auxiliary_loss_mlp": 0.01045319, "balance_loss_clip": 1.05518305, "balance_loss_mlp": 1.02674615, "epoch": 0.18433789267999398, "flos": 27852351920640.0, "grad_norm": 2.2758854533642925, "language_loss": 0.73142231, "learning_rate": 3.7552858597351318e-06, "loss": 0.75324523, "num_input_tokens_seen": 66222195, "step": 3066, "time_per_iteration": 2.672675609588623 }, { "auxiliary_loss_clip": 0.01125186, "auxiliary_loss_mlp": 0.01043389, "balance_loss_clip": 1.04947495, "balance_loss_mlp": 1.0256983, "epoch": 0.18439801593266195, "flos": 17856940533120.0, "grad_norm": 2.1067167513095444, "language_loss": 0.82191038, "learning_rate": 3.7550991516226622e-06, "loss": 0.8435961, "num_input_tokens_seen": 66239505, "step": 3067, "time_per_iteration": 2.697768211364746 }, { "auxiliary_loss_clip": 0.01082345, "auxiliary_loss_mlp": 0.00756782, "balance_loss_clip": 1.04466891, "balance_loss_mlp": 1.00113225, "epoch": 0.18445813918532994, "flos": 56389522590720.0, "grad_norm": 0.7960107429271657, "language_loss": 0.59750569, "learning_rate": 3.754912376956657e-06, "loss": 0.61589694, "num_input_tokens_seen": 66295695, "step": 3068, "time_per_iteration": 3.0305213928222656 }, { "auxiliary_loss_clip": 0.01127048, "auxiliary_loss_mlp": 0.01041294, "balance_loss_clip": 1.05452299, "balance_loss_mlp": 1.02356791, "epoch": 0.1845182624379979, "flos": 20957180325120.0, "grad_norm": 3.7299324256794244, "language_loss": 0.76434112, "learning_rate": 3.7547255357441987e-06, "loss": 0.78602457, "num_input_tokens_seen": 66315315, "step": 3069, "time_per_iteration": 2.6757962703704834 }, { "auxiliary_loss_clip": 0.01146412, "auxiliary_loss_mlp": 0.010456, "balance_loss_clip": 1.05468106, "balance_loss_mlp": 1.02798057, "epoch": 0.18457838569066587, "flos": 20485170679680.0, "grad_norm": 1.9225240149566294, "language_loss": 0.8491416, "learning_rate": 3.7545386279923718e-06, "loss": 0.87106168, "num_input_tokens_seen": 66333675, "step": 3070, "time_per_iteration": 2.617023229598999 }, { "auxiliary_loss_clip": 0.01127789, "auxiliary_loss_mlp": 0.01043452, "balance_loss_clip": 1.0553112, "balance_loss_mlp": 1.02510571, "epoch": 0.18463850894333383, "flos": 25010022758400.0, "grad_norm": 6.700503585098448, "language_loss": 0.77807182, "learning_rate": 3.754351653708265e-06, "loss": 0.79978424, "num_input_tokens_seen": 66354075, "step": 3071, "time_per_iteration": 2.847329616546631 }, { "auxiliary_loss_clip": 0.01109458, "auxiliary_loss_mlp": 0.01049978, "balance_loss_clip": 1.05054557, "balance_loss_mlp": 1.03154778, "epoch": 0.1846986321960018, "flos": 16800628348800.0, "grad_norm": 2.0836336776071565, "language_loss": 0.77414191, "learning_rate": 3.7541646128989674e-06, "loss": 0.79573631, "num_input_tokens_seen": 66372520, "step": 3072, "time_per_iteration": 2.780921220779419 }, { "auxiliary_loss_clip": 0.01138997, "auxiliary_loss_mlp": 0.01043594, "balance_loss_clip": 1.05106127, "balance_loss_mlp": 1.02465141, "epoch": 0.18475875544866976, "flos": 20814327936000.0, "grad_norm": 4.959080593148226, "language_loss": 0.86546457, "learning_rate": 3.7539775055715715e-06, "loss": 0.88729048, "num_input_tokens_seen": 66390745, "step": 3073, "time_per_iteration": 2.631913661956787 }, { "auxiliary_loss_clip": 0.01158717, "auxiliary_loss_mlp": 0.0104013, "balance_loss_clip": 1.05862749, "balance_loss_mlp": 1.02366686, "epoch": 0.18481887870133773, "flos": 22601422321920.0, "grad_norm": 2.162700927804164, "language_loss": 0.91831195, "learning_rate": 3.7537903317331732e-06, "loss": 0.94030046, "num_input_tokens_seen": 66410525, "step": 3074, "time_per_iteration": 2.6152567863464355 }, { "auxiliary_loss_clip": 0.01104968, "auxiliary_loss_mlp": 0.01047718, "balance_loss_clip": 1.04757643, "balance_loss_mlp": 1.02763104, "epoch": 0.18487900195400572, "flos": 29458815788160.0, "grad_norm": 1.9967983521568784, "language_loss": 0.64783108, "learning_rate": 3.75360309139087e-06, "loss": 0.66935796, "num_input_tokens_seen": 66432535, "step": 3075, "time_per_iteration": 2.763559103012085 }, { "auxiliary_loss_clip": 0.01135247, "auxiliary_loss_mlp": 0.01046601, "balance_loss_clip": 1.05689573, "balance_loss_mlp": 1.02913702, "epoch": 0.1849391252066737, "flos": 20628777254400.0, "grad_norm": 1.8996898495981898, "language_loss": 0.72803432, "learning_rate": 3.753415784551761e-06, "loss": 0.74985278, "num_input_tokens_seen": 66450620, "step": 3076, "time_per_iteration": 2.76629376411438 }, { "auxiliary_loss_clip": 0.01124833, "auxiliary_loss_mlp": 0.01042344, "balance_loss_clip": 1.0584389, "balance_loss_mlp": 1.0249157, "epoch": 0.18499924845934165, "flos": 14428549065600.0, "grad_norm": 2.4862024108169556, "language_loss": 0.80772626, "learning_rate": 3.7532284112229507e-06, "loss": 0.82939804, "num_input_tokens_seen": 66467865, "step": 3077, "time_per_iteration": 2.7296142578125 }, { "auxiliary_loss_clip": 0.01128471, "auxiliary_loss_mlp": 0.01041495, "balance_loss_clip": 1.05401397, "balance_loss_mlp": 1.02428079, "epoch": 0.18505937171200962, "flos": 23727652329600.0, "grad_norm": 1.8214336253769514, "language_loss": 0.78693211, "learning_rate": 3.7530409714115424e-06, "loss": 0.80863178, "num_input_tokens_seen": 66486245, "step": 3078, "time_per_iteration": 2.715838670730591 }, { "auxiliary_loss_clip": 0.01154963, "auxiliary_loss_mlp": 0.01043373, "balance_loss_clip": 1.05546641, "balance_loss_mlp": 1.02655268, "epoch": 0.18511949496467758, "flos": 25957489754880.0, "grad_norm": 1.7455066055145632, "language_loss": 0.77326959, "learning_rate": 3.7528534651246453e-06, "loss": 0.79525292, "num_input_tokens_seen": 66506510, "step": 3079, "time_per_iteration": 2.674128770828247 }, { "auxiliary_loss_clip": 0.01119079, "auxiliary_loss_mlp": 0.01041512, "balance_loss_clip": 1.04717147, "balance_loss_mlp": 1.02328515, "epoch": 0.18517961821734555, "flos": 42413553912960.0, "grad_norm": 1.885086933557342, "language_loss": 0.82143807, "learning_rate": 3.752665892369369e-06, "loss": 0.84304404, "num_input_tokens_seen": 66530960, "step": 3080, "time_per_iteration": 2.906940460205078 }, { "auxiliary_loss_clip": 0.01123637, "auxiliary_loss_mlp": 0.01044031, "balance_loss_clip": 1.05894399, "balance_loss_mlp": 1.02563691, "epoch": 0.18523974147001354, "flos": 24097568544000.0, "grad_norm": 2.065822240576764, "language_loss": 0.73973286, "learning_rate": 3.7524782531528266e-06, "loss": 0.76140958, "num_input_tokens_seen": 66550275, "step": 3081, "time_per_iteration": 2.7960739135742188 }, { "auxiliary_loss_clip": 0.01126977, "auxiliary_loss_mlp": 0.01051674, "balance_loss_clip": 1.05360913, "balance_loss_mlp": 1.03286242, "epoch": 0.1852998647226815, "flos": 27375278457600.0, "grad_norm": 1.9854893879184425, "language_loss": 0.71991849, "learning_rate": 3.7522905474821334e-06, "loss": 0.74170506, "num_input_tokens_seen": 66569040, "step": 3082, "time_per_iteration": 2.6965079307556152 }, { "auxiliary_loss_clip": 0.01124933, "auxiliary_loss_mlp": 0.01046296, "balance_loss_clip": 1.05649543, "balance_loss_mlp": 1.02694798, "epoch": 0.18535998797534947, "flos": 18332757020160.0, "grad_norm": 2.0424653419479886, "language_loss": 0.69580144, "learning_rate": 3.752102775364407e-06, "loss": 0.71751374, "num_input_tokens_seen": 66587775, "step": 3083, "time_per_iteration": 2.727252721786499 }, { "auxiliary_loss_clip": 0.01122388, "auxiliary_loss_mlp": 0.01046999, "balance_loss_clip": 1.05204451, "balance_loss_mlp": 1.02964258, "epoch": 0.18542011122801744, "flos": 37845859887360.0, "grad_norm": 2.185713468975319, "language_loss": 0.68965334, "learning_rate": 3.751914936806767e-06, "loss": 0.71134722, "num_input_tokens_seen": 66610800, "step": 3084, "time_per_iteration": 2.95849871635437 }, { "auxiliary_loss_clip": 0.01155184, "auxiliary_loss_mlp": 0.01043029, "balance_loss_clip": 1.05578482, "balance_loss_mlp": 1.0257436, "epoch": 0.1854802344806854, "flos": 25186128163200.0, "grad_norm": 1.6859724806626923, "language_loss": 0.77390355, "learning_rate": 3.7517270318163377e-06, "loss": 0.79588568, "num_input_tokens_seen": 66630960, "step": 3085, "time_per_iteration": 2.68961501121521 }, { "auxiliary_loss_clip": 0.01152089, "auxiliary_loss_mlp": 0.01049004, "balance_loss_clip": 1.05316019, "balance_loss_mlp": 1.03142118, "epoch": 0.18554035773335337, "flos": 26684788337280.0, "grad_norm": 1.993169596996871, "language_loss": 0.73752379, "learning_rate": 3.751539060400244e-06, "loss": 0.75953472, "num_input_tokens_seen": 66650585, "step": 3086, "time_per_iteration": 2.652475595474243 }, { "auxiliary_loss_clip": 0.01142754, "auxiliary_loss_mlp": 0.01049865, "balance_loss_clip": 1.05530787, "balance_loss_mlp": 1.03134012, "epoch": 0.18560048098602133, "flos": 22346887570560.0, "grad_norm": 7.927127736744579, "language_loss": 0.69762361, "learning_rate": 3.7513510225656132e-06, "loss": 0.71954978, "num_input_tokens_seen": 66670045, "step": 3087, "time_per_iteration": 2.668849229812622 }, { "auxiliary_loss_clip": 0.01119022, "auxiliary_loss_mlp": 0.01055302, "balance_loss_clip": 1.05543649, "balance_loss_mlp": 1.03546548, "epoch": 0.18566060423868933, "flos": 17748526308480.0, "grad_norm": 2.1117122734340263, "language_loss": 0.72513628, "learning_rate": 3.7511629183195764e-06, "loss": 0.74687952, "num_input_tokens_seen": 66688790, "step": 3088, "time_per_iteration": 2.7150719165802 }, { "auxiliary_loss_clip": 0.0112638, "auxiliary_loss_mlp": 0.01044188, "balance_loss_clip": 1.04933047, "balance_loss_mlp": 1.02616334, "epoch": 0.1857207274913573, "flos": 24677274142080.0, "grad_norm": 2.112009927874319, "language_loss": 0.91859758, "learning_rate": 3.7509747476692663e-06, "loss": 0.94030321, "num_input_tokens_seen": 66708090, "step": 3089, "time_per_iteration": 2.7239248752593994 }, { "auxiliary_loss_clip": 0.01104754, "auxiliary_loss_mlp": 0.01046981, "balance_loss_clip": 1.0494597, "balance_loss_mlp": 1.02919531, "epoch": 0.18578085074402526, "flos": 28147825198080.0, "grad_norm": 2.490831087537115, "language_loss": 0.57275403, "learning_rate": 3.7507865106218176e-06, "loss": 0.59427136, "num_input_tokens_seen": 66727320, "step": 3090, "time_per_iteration": 2.8263309001922607 }, { "auxiliary_loss_clip": 0.01125877, "auxiliary_loss_mlp": 0.0104478, "balance_loss_clip": 1.04981184, "balance_loss_mlp": 1.02636242, "epoch": 0.18584097399669322, "flos": 23951878980480.0, "grad_norm": 1.7797305478565062, "language_loss": 0.81704801, "learning_rate": 3.7505982071843695e-06, "loss": 0.83875453, "num_input_tokens_seen": 66747505, "step": 3091, "time_per_iteration": 2.697525978088379 }, { "auxiliary_loss_clip": 0.01101743, "auxiliary_loss_mlp": 0.01050837, "balance_loss_clip": 1.04999971, "balance_loss_mlp": 1.03277707, "epoch": 0.18590109724936119, "flos": 17201678676480.0, "grad_norm": 2.0826959244757832, "language_loss": 0.83704746, "learning_rate": 3.7504098373640617e-06, "loss": 0.8585732, "num_input_tokens_seen": 66766425, "step": 3092, "time_per_iteration": 2.8379435539245605 }, { "auxiliary_loss_clip": 0.01136846, "auxiliary_loss_mlp": 0.01048758, "balance_loss_clip": 1.05389428, "balance_loss_mlp": 1.03036356, "epoch": 0.18596122050202915, "flos": 17234644383360.0, "grad_norm": 5.439917179387958, "language_loss": 0.93443698, "learning_rate": 3.750221401168038e-06, "loss": 0.95629299, "num_input_tokens_seen": 66781130, "step": 3093, "time_per_iteration": 2.8053483963012695 }, { "auxiliary_loss_clip": 0.01130362, "auxiliary_loss_mlp": 0.01042367, "balance_loss_clip": 1.05440521, "balance_loss_mlp": 1.02464092, "epoch": 0.18602134375469712, "flos": 19020733188480.0, "grad_norm": 1.7318887555782294, "language_loss": 0.77516603, "learning_rate": 3.750032898603443e-06, "loss": 0.7968933, "num_input_tokens_seen": 66797535, "step": 3094, "time_per_iteration": 2.7402310371398926 }, { "auxiliary_loss_clip": 0.0109741, "auxiliary_loss_mlp": 0.01049219, "balance_loss_clip": 1.0519228, "balance_loss_mlp": 1.0323391, "epoch": 0.1860814670073651, "flos": 50950094417280.0, "grad_norm": 1.7033453736007413, "language_loss": 0.69854707, "learning_rate": 3.749844329677425e-06, "loss": 0.72001338, "num_input_tokens_seen": 66821720, "step": 3095, "time_per_iteration": 3.133192777633667 }, { "auxiliary_loss_clip": 0.01113224, "auxiliary_loss_mlp": 0.010546, "balance_loss_clip": 1.0511899, "balance_loss_mlp": 1.03415525, "epoch": 0.18614159026003307, "flos": 19390972625280.0, "grad_norm": 2.2828801406167307, "language_loss": 0.81214821, "learning_rate": 3.749655694397135e-06, "loss": 0.83382642, "num_input_tokens_seen": 66839060, "step": 3096, "time_per_iteration": 2.7599101066589355 }, { "auxiliary_loss_clip": 0.01147399, "auxiliary_loss_mlp": 0.0104683, "balance_loss_clip": 1.05678356, "balance_loss_mlp": 1.02810192, "epoch": 0.18620171351270104, "flos": 21798782962560.0, "grad_norm": 2.430947734084612, "language_loss": 0.75326216, "learning_rate": 3.7494669927697255e-06, "loss": 0.77520448, "num_input_tokens_seen": 66857760, "step": 3097, "time_per_iteration": 4.255983114242554 }, { "auxiliary_loss_clip": 0.01133757, "auxiliary_loss_mlp": 0.01050365, "balance_loss_clip": 1.05756521, "balance_loss_mlp": 1.03228104, "epoch": 0.186261836765369, "flos": 16362877299840.0, "grad_norm": 2.553895603581972, "language_loss": 0.66602015, "learning_rate": 3.749278224802352e-06, "loss": 0.68786132, "num_input_tokens_seen": 66876460, "step": 3098, "time_per_iteration": 2.723567247390747 }, { "auxiliary_loss_clip": 0.01163461, "auxiliary_loss_mlp": 0.01052357, "balance_loss_clip": 1.05991709, "balance_loss_mlp": 1.03212702, "epoch": 0.18632196001803697, "flos": 23370054480000.0, "grad_norm": 1.6168121451860142, "language_loss": 0.69838905, "learning_rate": 3.7490893905021733e-06, "loss": 0.7205472, "num_input_tokens_seen": 66897960, "step": 3099, "time_per_iteration": 5.687380075454712 }, { "auxiliary_loss_clip": 0.01148363, "auxiliary_loss_mlp": 0.01051556, "balance_loss_clip": 1.05713868, "balance_loss_mlp": 1.03243458, "epoch": 0.18638208327070493, "flos": 22492002516480.0, "grad_norm": 1.7060244708994476, "language_loss": 0.71840072, "learning_rate": 3.7489004898763494e-06, "loss": 0.74039996, "num_input_tokens_seen": 66917675, "step": 3100, "time_per_iteration": 2.6711015701293945 }, { "auxiliary_loss_clip": 0.01138377, "auxiliary_loss_mlp": 0.01050667, "balance_loss_clip": 1.05749035, "balance_loss_mlp": 1.03133154, "epoch": 0.18644220652337293, "flos": 29165245931520.0, "grad_norm": 1.9639279354826686, "language_loss": 0.80343997, "learning_rate": 3.7487115229320444e-06, "loss": 0.82533038, "num_input_tokens_seen": 66936000, "step": 3101, "time_per_iteration": 2.6996583938598633 }, { "auxiliary_loss_clip": 0.01112778, "auxiliary_loss_mlp": 0.01042097, "balance_loss_clip": 1.05307627, "balance_loss_mlp": 1.02478826, "epoch": 0.1865023297760409, "flos": 24243796811520.0, "grad_norm": 1.8804860702941575, "language_loss": 0.77053607, "learning_rate": 3.7485224896764222e-06, "loss": 0.79208481, "num_input_tokens_seen": 66955700, "step": 3102, "time_per_iteration": 2.726146936416626 }, { "auxiliary_loss_clip": 0.01150817, "auxiliary_loss_mlp": 0.01039303, "balance_loss_clip": 1.057688, "balance_loss_mlp": 1.0213027, "epoch": 0.18656245302870886, "flos": 19128716449920.0, "grad_norm": 2.314682178811096, "language_loss": 0.76689744, "learning_rate": 3.7483333901166525e-06, "loss": 0.78879869, "num_input_tokens_seen": 66972815, "step": 3103, "time_per_iteration": 4.374122619628906 }, { "auxiliary_loss_clip": 0.01132531, "auxiliary_loss_mlp": 0.0104481, "balance_loss_clip": 1.05477643, "balance_loss_mlp": 1.02671361, "epoch": 0.18662257628137682, "flos": 17786088956160.0, "grad_norm": 1.6956506235876265, "language_loss": 0.79252636, "learning_rate": 3.7481442242599054e-06, "loss": 0.8142997, "num_input_tokens_seen": 66992280, "step": 3104, "time_per_iteration": 2.695012092590332 }, { "auxiliary_loss_clip": 0.01106786, "auxiliary_loss_mlp": 0.01050273, "balance_loss_clip": 1.05117702, "balance_loss_mlp": 1.03096056, "epoch": 0.1866826995340448, "flos": 24024382583040.0, "grad_norm": 2.065624302338532, "language_loss": 0.8496474, "learning_rate": 3.747954992113354e-06, "loss": 0.87121809, "num_input_tokens_seen": 67012220, "step": 3105, "time_per_iteration": 2.761521816253662 }, { "auxiliary_loss_clip": 0.0112324, "auxiliary_loss_mlp": 0.01043689, "balance_loss_clip": 1.05166531, "balance_loss_mlp": 1.02407932, "epoch": 0.18674282278671275, "flos": 26141244756480.0, "grad_norm": 1.8352441384571676, "language_loss": 0.86880243, "learning_rate": 3.7477656936841742e-06, "loss": 0.8904717, "num_input_tokens_seen": 67032030, "step": 3106, "time_per_iteration": 2.785738706588745 }, { "auxiliary_loss_clip": 0.01150222, "auxiliary_loss_mlp": 0.01040973, "balance_loss_clip": 1.0566026, "balance_loss_mlp": 1.02281737, "epoch": 0.18680294603938072, "flos": 19201938324480.0, "grad_norm": 2.128833658771433, "language_loss": 0.78226906, "learning_rate": 3.7475763289795445e-06, "loss": 0.80418098, "num_input_tokens_seen": 67048920, "step": 3107, "time_per_iteration": 2.693995237350464 }, { "auxiliary_loss_clip": 0.01153763, "auxiliary_loss_mlp": 0.01053056, "balance_loss_clip": 1.05873394, "balance_loss_mlp": 1.03341043, "epoch": 0.1868630692920487, "flos": 28544889116160.0, "grad_norm": 3.0927798335187506, "language_loss": 0.74159014, "learning_rate": 3.7473868980066446e-06, "loss": 0.7636584, "num_input_tokens_seen": 67068645, "step": 3108, "time_per_iteration": 2.795715570449829 }, { "auxiliary_loss_clip": 0.01107582, "auxiliary_loss_mlp": 0.01042714, "balance_loss_clip": 1.05207491, "balance_loss_mlp": 1.02451098, "epoch": 0.18692319254471668, "flos": 17238020261760.0, "grad_norm": 1.6837485322309411, "language_loss": 0.74348569, "learning_rate": 3.747197400772658e-06, "loss": 0.76498872, "num_input_tokens_seen": 67087075, "step": 3109, "time_per_iteration": 2.7627830505371094 }, { "auxiliary_loss_clip": 0.01145572, "auxiliary_loss_mlp": 0.01044117, "balance_loss_clip": 1.05631042, "balance_loss_mlp": 1.02526462, "epoch": 0.18698331579738464, "flos": 23185186156800.0, "grad_norm": 1.499459601293056, "language_loss": 0.84250218, "learning_rate": 3.747007837284772e-06, "loss": 0.86439908, "num_input_tokens_seen": 67108040, "step": 3110, "time_per_iteration": 2.7665328979492188 }, { "auxiliary_loss_clip": 0.01147578, "auxiliary_loss_mlp": 0.01042389, "balance_loss_clip": 1.05929494, "balance_loss_mlp": 1.02381575, "epoch": 0.1870434390500526, "flos": 25516721963520.0, "grad_norm": 1.9108380391903876, "language_loss": 0.84738445, "learning_rate": 3.7468182075501737e-06, "loss": 0.86928415, "num_input_tokens_seen": 67127605, "step": 3111, "time_per_iteration": 2.729233741760254 }, { "auxiliary_loss_clip": 0.01128, "auxiliary_loss_mlp": 0.01044544, "balance_loss_clip": 1.05348754, "balance_loss_mlp": 1.02635229, "epoch": 0.18710356230272057, "flos": 19500823393920.0, "grad_norm": 1.8704338434966796, "language_loss": 0.76875687, "learning_rate": 3.7466285115760536e-06, "loss": 0.79048228, "num_input_tokens_seen": 67145785, "step": 3112, "time_per_iteration": 2.7392494678497314 }, { "auxiliary_loss_clip": 0.0114846, "auxiliary_loss_mlp": 0.0104709, "balance_loss_clip": 1.05636978, "balance_loss_mlp": 1.02913654, "epoch": 0.18716368555538854, "flos": 26760847386240.0, "grad_norm": 1.8996972204761096, "language_loss": 0.64466536, "learning_rate": 3.7464387493696046e-06, "loss": 0.66662085, "num_input_tokens_seen": 67165930, "step": 3113, "time_per_iteration": 2.7393765449523926 }, { "auxiliary_loss_clip": 0.01153807, "auxiliary_loss_mlp": 0.01048748, "balance_loss_clip": 1.05685568, "balance_loss_mlp": 1.02900672, "epoch": 0.1872238088080565, "flos": 25189827264000.0, "grad_norm": 6.483287708452815, "language_loss": 0.817972, "learning_rate": 3.746248920938024e-06, "loss": 0.83999759, "num_input_tokens_seen": 67185830, "step": 3114, "time_per_iteration": 2.740229368209839 }, { "auxiliary_loss_clip": 0.01104278, "auxiliary_loss_mlp": 0.01050738, "balance_loss_clip": 1.04921412, "balance_loss_mlp": 1.03024614, "epoch": 0.1872839320607245, "flos": 24134305178880.0, "grad_norm": 2.3064843449079175, "language_loss": 0.57413173, "learning_rate": 3.74605902628851e-06, "loss": 0.59568191, "num_input_tokens_seen": 67206930, "step": 3115, "time_per_iteration": 2.811549663543701 }, { "auxiliary_loss_clip": 0.01123025, "auxiliary_loss_mlp": 0.01052226, "balance_loss_clip": 1.05446446, "balance_loss_mlp": 1.03241396, "epoch": 0.18734405531339246, "flos": 21173793292800.0, "grad_norm": 2.577640519639585, "language_loss": 0.70842528, "learning_rate": 3.745869065428261e-06, "loss": 0.73017788, "num_input_tokens_seen": 67226290, "step": 3116, "time_per_iteration": 2.8053951263427734 }, { "auxiliary_loss_clip": 0.0115042, "auxiliary_loss_mlp": 0.01035569, "balance_loss_clip": 1.05196476, "balance_loss_mlp": 1.01787841, "epoch": 0.18740417856606043, "flos": 17237697039360.0, "grad_norm": 3.010261965906642, "language_loss": 0.78994375, "learning_rate": 3.7456790383644833e-06, "loss": 0.81180358, "num_input_tokens_seen": 67244410, "step": 3117, "time_per_iteration": 2.819415330886841 }, { "auxiliary_loss_clip": 0.01132901, "auxiliary_loss_mlp": 0.01049724, "balance_loss_clip": 1.05260777, "balance_loss_mlp": 1.03047204, "epoch": 0.1874643018187284, "flos": 32558049999360.0, "grad_norm": 2.2828109389679865, "language_loss": 0.83903432, "learning_rate": 3.745488945104381e-06, "loss": 0.86086059, "num_input_tokens_seen": 67264470, "step": 3118, "time_per_iteration": 2.783804416656494 }, { "auxiliary_loss_clip": 0.01144867, "auxiliary_loss_mlp": 0.0104452, "balance_loss_clip": 1.05412436, "balance_loss_mlp": 1.02688873, "epoch": 0.18752442507139636, "flos": 23258156636160.0, "grad_norm": 3.566737352043019, "language_loss": 0.76283264, "learning_rate": 3.7452987856551636e-06, "loss": 0.78472656, "num_input_tokens_seen": 67284315, "step": 3119, "time_per_iteration": 2.6872506141662598 }, { "auxiliary_loss_clip": 0.01156835, "auxiliary_loss_mlp": 0.01046653, "balance_loss_clip": 1.05519438, "balance_loss_mlp": 1.02899814, "epoch": 0.18758454832406432, "flos": 21760933006080.0, "grad_norm": 1.7224942549361077, "language_loss": 0.82017547, "learning_rate": 3.7451085600240406e-06, "loss": 0.84221041, "num_input_tokens_seen": 67302780, "step": 3120, "time_per_iteration": 2.637505292892456 }, { "auxiliary_loss_clip": 0.0113033, "auxiliary_loss_mlp": 0.01035538, "balance_loss_clip": 1.05060756, "balance_loss_mlp": 1.01828837, "epoch": 0.1876446715767323, "flos": 29570210841600.0, "grad_norm": 2.5027223446471982, "language_loss": 0.84992659, "learning_rate": 3.7449182682182263e-06, "loss": 0.87158525, "num_input_tokens_seen": 67323405, "step": 3121, "time_per_iteration": 2.788353681564331 }, { "auxiliary_loss_clip": 0.01096681, "auxiliary_loss_mlp": 0.0104429, "balance_loss_clip": 1.045645, "balance_loss_mlp": 1.02599168, "epoch": 0.18770479482940028, "flos": 30339992234880.0, "grad_norm": 2.1738591443482362, "language_loss": 0.70032287, "learning_rate": 3.744727910244937e-06, "loss": 0.72173256, "num_input_tokens_seen": 67345800, "step": 3122, "time_per_iteration": 3.0225250720977783 }, { "auxiliary_loss_clip": 0.01153439, "auxiliary_loss_mlp": 0.01042355, "balance_loss_clip": 1.05445123, "balance_loss_mlp": 1.02288795, "epoch": 0.18776491808206824, "flos": 14465357527680.0, "grad_norm": 4.839579375412361, "language_loss": 0.70661515, "learning_rate": 3.7445374861113905e-06, "loss": 0.72857308, "num_input_tokens_seen": 67363575, "step": 3123, "time_per_iteration": 2.779904365539551 }, { "auxiliary_loss_clip": 0.01142265, "auxiliary_loss_mlp": 0.01041425, "balance_loss_clip": 1.05286181, "balance_loss_mlp": 1.02454507, "epoch": 0.1878250413347362, "flos": 24498547044480.0, "grad_norm": 2.057520579072589, "language_loss": 0.74103826, "learning_rate": 3.7443469958248066e-06, "loss": 0.76287514, "num_input_tokens_seen": 67381765, "step": 3124, "time_per_iteration": 2.6336071491241455 }, { "auxiliary_loss_clip": 0.01157579, "auxiliary_loss_mlp": 0.01052509, "balance_loss_clip": 1.05653572, "balance_loss_mlp": 1.03333998, "epoch": 0.18788516458740417, "flos": 39786185692800.0, "grad_norm": 3.0670363966795096, "language_loss": 0.80654436, "learning_rate": 3.7441564393924106e-06, "loss": 0.82864523, "num_input_tokens_seen": 67405000, "step": 3125, "time_per_iteration": 2.7224199771881104 }, { "auxiliary_loss_clip": 0.01046615, "auxiliary_loss_mlp": 0.01006504, "balance_loss_clip": 1.04444218, "balance_loss_mlp": 1.00435853, "epoch": 0.18794528784007214, "flos": 64699250664960.0, "grad_norm": 0.9424570711133922, "language_loss": 0.63647306, "learning_rate": 3.7439658168214273e-06, "loss": 0.65700436, "num_input_tokens_seen": 67467140, "step": 3126, "time_per_iteration": 3.313321113586426 }, { "auxiliary_loss_clip": 0.01128308, "auxiliary_loss_mlp": 0.01040458, "balance_loss_clip": 1.05377257, "balance_loss_mlp": 1.02236164, "epoch": 0.1880054110927401, "flos": 28622061486720.0, "grad_norm": 1.8734163453478039, "language_loss": 0.81308508, "learning_rate": 3.7437751281190857e-06, "loss": 0.83477271, "num_input_tokens_seen": 67487980, "step": 3127, "time_per_iteration": 2.7137866020202637 }, { "auxiliary_loss_clip": 0.01088267, "auxiliary_loss_mlp": 0.0101138, "balance_loss_clip": 1.04814553, "balance_loss_mlp": 1.00912714, "epoch": 0.1880655343454081, "flos": 64488958490880.0, "grad_norm": 0.7699217277386954, "language_loss": 0.61922526, "learning_rate": 3.7435843732926164e-06, "loss": 0.64022171, "num_input_tokens_seen": 67552500, "step": 3128, "time_per_iteration": 3.264270782470703 }, { "auxiliary_loss_clip": 0.01108205, "auxiliary_loss_mlp": 0.01049422, "balance_loss_clip": 1.04763842, "balance_loss_mlp": 1.02907288, "epoch": 0.18812565759807606, "flos": 32124464928000.0, "grad_norm": 2.4867495334212175, "language_loss": 0.70985162, "learning_rate": 3.7433935523492536e-06, "loss": 0.73142785, "num_input_tokens_seen": 67573295, "step": 3129, "time_per_iteration": 2.79929256439209 }, { "auxiliary_loss_clip": 0.01158485, "auxiliary_loss_mlp": 0.01050611, "balance_loss_clip": 1.05767536, "balance_loss_mlp": 1.03109634, "epoch": 0.18818578085074403, "flos": 20624539449600.0, "grad_norm": 2.4831518001798676, "language_loss": 0.85035253, "learning_rate": 3.7432026652962314e-06, "loss": 0.87244344, "num_input_tokens_seen": 67590010, "step": 3130, "time_per_iteration": 2.60624361038208 }, { "auxiliary_loss_clip": 0.01107202, "auxiliary_loss_mlp": 0.01049966, "balance_loss_clip": 1.04649067, "balance_loss_mlp": 1.03023696, "epoch": 0.188245904103412, "flos": 28840506048000.0, "grad_norm": 9.096753382647533, "language_loss": 0.7643525, "learning_rate": 3.7430117121407897e-06, "loss": 0.7859242, "num_input_tokens_seen": 67611110, "step": 3131, "time_per_iteration": 2.759230136871338 }, { "auxiliary_loss_clip": 0.0112329, "auxiliary_loss_mlp": 0.01049221, "balance_loss_clip": 1.05344164, "balance_loss_mlp": 1.03014708, "epoch": 0.18830602735607996, "flos": 29420319386880.0, "grad_norm": 2.109252219381847, "language_loss": 0.80713749, "learning_rate": 3.74282069289017e-06, "loss": 0.82886261, "num_input_tokens_seen": 67631990, "step": 3132, "time_per_iteration": 2.773817777633667 }, { "auxiliary_loss_clip": 0.01093588, "auxiliary_loss_mlp": 0.00779094, "balance_loss_clip": 1.04652429, "balance_loss_mlp": 1.00091529, "epoch": 0.18836615060874792, "flos": 28872933050880.0, "grad_norm": 2.092242478448591, "language_loss": 0.79653811, "learning_rate": 3.742629607551614e-06, "loss": 0.81526494, "num_input_tokens_seen": 67650490, "step": 3133, "time_per_iteration": 2.7873754501342773 }, { "auxiliary_loss_clip": 0.01119878, "auxiliary_loss_mlp": 0.01059381, "balance_loss_clip": 1.05341148, "balance_loss_mlp": 1.03921056, "epoch": 0.18842627386141592, "flos": 22601673717120.0, "grad_norm": 1.9069857551930867, "language_loss": 0.83001804, "learning_rate": 3.7424384561323698e-06, "loss": 0.85181063, "num_input_tokens_seen": 67668860, "step": 3134, "time_per_iteration": 2.9284298419952393 }, { "auxiliary_loss_clip": 0.01131578, "auxiliary_loss_mlp": 0.01046681, "balance_loss_clip": 1.05168402, "balance_loss_mlp": 1.02802503, "epoch": 0.18848639711408388, "flos": 24573600512640.0, "grad_norm": 2.0376543711114152, "language_loss": 0.82859468, "learning_rate": 3.742247238639684e-06, "loss": 0.85037726, "num_input_tokens_seen": 67690220, "step": 3135, "time_per_iteration": 2.8006811141967773 }, { "auxiliary_loss_clip": 0.01143148, "auxiliary_loss_mlp": 0.01050197, "balance_loss_clip": 1.05505157, "balance_loss_mlp": 1.03146911, "epoch": 0.18854652036675185, "flos": 34166920078080.0, "grad_norm": 1.9728388324049713, "language_loss": 0.78658557, "learning_rate": 3.7420559550808083e-06, "loss": 0.80851901, "num_input_tokens_seen": 67709820, "step": 3136, "time_per_iteration": 4.256143569946289 }, { "auxiliary_loss_clip": 0.01135545, "auxiliary_loss_mlp": 0.01048618, "balance_loss_clip": 1.05388892, "balance_loss_mlp": 1.03006911, "epoch": 0.1886066436194198, "flos": 24200236592640.0, "grad_norm": 1.7483697887361769, "language_loss": 0.80820233, "learning_rate": 3.741864605462996e-06, "loss": 0.83004391, "num_input_tokens_seen": 67729490, "step": 3137, "time_per_iteration": 2.7538130283355713 }, { "auxiliary_loss_clip": 0.01159054, "auxiliary_loss_mlp": 0.01048373, "balance_loss_clip": 1.05827475, "balance_loss_mlp": 1.03107548, "epoch": 0.18866676687208778, "flos": 21251109317760.0, "grad_norm": 1.9799764624272802, "language_loss": 0.81274408, "learning_rate": 3.741673189793504e-06, "loss": 0.83481836, "num_input_tokens_seen": 67749665, "step": 3138, "time_per_iteration": 4.143909931182861 }, { "auxiliary_loss_clip": 0.01150082, "auxiliary_loss_mlp": 0.01056444, "balance_loss_clip": 1.05626798, "balance_loss_mlp": 1.03713167, "epoch": 0.18872689012475574, "flos": 37308673013760.0, "grad_norm": 2.326218248348143, "language_loss": 0.63655496, "learning_rate": 3.7414817080795896e-06, "loss": 0.65862024, "num_input_tokens_seen": 67776230, "step": 3139, "time_per_iteration": 4.30991268157959 }, { "auxiliary_loss_clip": 0.0115289, "auxiliary_loss_mlp": 0.01043021, "balance_loss_clip": 1.05286491, "balance_loss_mlp": 1.02356625, "epoch": 0.1887870133774237, "flos": 21652303299840.0, "grad_norm": 2.1185902638296525, "language_loss": 0.7148211, "learning_rate": 3.741290160328514e-06, "loss": 0.73678017, "num_input_tokens_seen": 67795080, "step": 3140, "time_per_iteration": 2.6880578994750977 }, { "auxiliary_loss_clip": 0.01154738, "auxiliary_loss_mlp": 0.01043099, "balance_loss_clip": 1.05349982, "balance_loss_mlp": 1.02382278, "epoch": 0.1888471366300917, "flos": 15924659374080.0, "grad_norm": 2.6250212982316574, "language_loss": 0.87069929, "learning_rate": 3.7410985465475412e-06, "loss": 0.89267766, "num_input_tokens_seen": 67813110, "step": 3141, "time_per_iteration": 2.6677181720733643 }, { "auxiliary_loss_clip": 0.01130655, "auxiliary_loss_mlp": 0.01052882, "balance_loss_clip": 1.0507834, "balance_loss_mlp": 1.03243756, "epoch": 0.18890725988275966, "flos": 18551955767040.0, "grad_norm": 1.873404502116747, "language_loss": 0.7744689, "learning_rate": 3.7409068667439378e-06, "loss": 0.79630429, "num_input_tokens_seen": 67831070, "step": 3142, "time_per_iteration": 2.63077449798584 }, { "auxiliary_loss_clip": 0.01128192, "auxiliary_loss_mlp": 0.01038074, "balance_loss_clip": 1.05298221, "balance_loss_mlp": 1.02132463, "epoch": 0.18896738313542763, "flos": 28840865184000.0, "grad_norm": 1.6611052928231447, "language_loss": 0.78867507, "learning_rate": 3.740715120924971e-06, "loss": 0.81033778, "num_input_tokens_seen": 67852170, "step": 3143, "time_per_iteration": 4.417406797409058 }, { "auxiliary_loss_clip": 0.0111986, "auxiliary_loss_mlp": 0.01048019, "balance_loss_clip": 1.05024099, "balance_loss_mlp": 1.02821851, "epoch": 0.1890275063880956, "flos": 22412747157120.0, "grad_norm": 2.855732191409361, "language_loss": 0.71476078, "learning_rate": 3.740523309097912e-06, "loss": 0.73643959, "num_input_tokens_seen": 67869945, "step": 3144, "time_per_iteration": 2.8104894161224365 }, { "auxiliary_loss_clip": 0.01125398, "auxiliary_loss_mlp": 0.01044816, "balance_loss_clip": 1.05102479, "balance_loss_mlp": 1.02492023, "epoch": 0.18908762964076356, "flos": 24243904552320.0, "grad_norm": 2.5973078221757144, "language_loss": 0.73390597, "learning_rate": 3.7403314312700356e-06, "loss": 0.75560808, "num_input_tokens_seen": 67890240, "step": 3145, "time_per_iteration": 2.715609312057495 }, { "auxiliary_loss_clip": 0.01110308, "auxiliary_loss_mlp": 0.01042542, "balance_loss_clip": 1.04543984, "balance_loss_mlp": 1.02446938, "epoch": 0.18914775289343153, "flos": 16982910892800.0, "grad_norm": 2.915733862437625, "language_loss": 0.76263785, "learning_rate": 3.740139487448616e-06, "loss": 0.78416634, "num_input_tokens_seen": 67907825, "step": 3146, "time_per_iteration": 2.777221202850342 }, { "auxiliary_loss_clip": 0.01092807, "auxiliary_loss_mlp": 0.01049336, "balance_loss_clip": 1.04319823, "balance_loss_mlp": 1.02829611, "epoch": 0.1892078761460995, "flos": 21543781334400.0, "grad_norm": 1.988128972125699, "language_loss": 0.7837925, "learning_rate": 3.7399474776409326e-06, "loss": 0.80521393, "num_input_tokens_seen": 67926670, "step": 3147, "time_per_iteration": 2.8039205074310303 }, { "auxiliary_loss_clip": 0.01143577, "auxiliary_loss_mlp": 0.01042953, "balance_loss_clip": 1.0548687, "balance_loss_mlp": 1.02454758, "epoch": 0.18926799939876748, "flos": 23001538896000.0, "grad_norm": 3.932544798883504, "language_loss": 0.67477876, "learning_rate": 3.739755401854267e-06, "loss": 0.69664401, "num_input_tokens_seen": 67943645, "step": 3148, "time_per_iteration": 2.7273359298706055 }, { "auxiliary_loss_clip": 0.01112331, "auxiliary_loss_mlp": 0.01039139, "balance_loss_clip": 1.04617155, "balance_loss_mlp": 1.02014899, "epoch": 0.18932812265143545, "flos": 22273019251200.0, "grad_norm": 2.9848849244070315, "language_loss": 0.76207471, "learning_rate": 3.739563260095902e-06, "loss": 0.78358936, "num_input_tokens_seen": 67962345, "step": 3149, "time_per_iteration": 2.8031978607177734 }, { "auxiliary_loss_clip": 0.01130375, "auxiliary_loss_mlp": 0.01045773, "balance_loss_clip": 1.05438852, "balance_loss_mlp": 1.02797484, "epoch": 0.1893882459041034, "flos": 18624423456000.0, "grad_norm": 2.3661599820320136, "language_loss": 0.80378366, "learning_rate": 3.7393710523731245e-06, "loss": 0.82554519, "num_input_tokens_seen": 67979760, "step": 3150, "time_per_iteration": 2.7836129665374756 }, { "auxiliary_loss_clip": 0.01137112, "auxiliary_loss_mlp": 0.0104876, "balance_loss_clip": 1.0528239, "balance_loss_mlp": 1.03019929, "epoch": 0.18944836915677138, "flos": 22892981016960.0, "grad_norm": 2.0711129864945956, "language_loss": 0.85251844, "learning_rate": 3.7391787786932215e-06, "loss": 0.87437713, "num_input_tokens_seen": 67996895, "step": 3151, "time_per_iteration": 2.7782201766967773 }, { "auxiliary_loss_clip": 0.01121267, "auxiliary_loss_mlp": 0.01046776, "balance_loss_clip": 1.05223882, "balance_loss_mlp": 1.02839363, "epoch": 0.18950849240943934, "flos": 26796542526720.0, "grad_norm": 2.1337439707996673, "language_loss": 0.74114192, "learning_rate": 3.7389864390634857e-06, "loss": 0.76282233, "num_input_tokens_seen": 68018365, "step": 3152, "time_per_iteration": 2.8767755031585693 }, { "auxiliary_loss_clip": 0.01120312, "auxiliary_loss_mlp": 0.0104438, "balance_loss_clip": 1.05119991, "balance_loss_mlp": 1.02463925, "epoch": 0.1895686156621073, "flos": 24971239048320.0, "grad_norm": 1.9471461777193173, "language_loss": 0.75520492, "learning_rate": 3.738794033491209e-06, "loss": 0.77685189, "num_input_tokens_seen": 68037985, "step": 3153, "time_per_iteration": 2.7722980976104736 }, { "auxiliary_loss_clip": 0.01158287, "auxiliary_loss_mlp": 0.01049678, "balance_loss_clip": 1.0559293, "balance_loss_mlp": 1.03102183, "epoch": 0.1896287389147753, "flos": 21944544353280.0, "grad_norm": 2.099749434473157, "language_loss": 0.79984629, "learning_rate": 3.7386015619836887e-06, "loss": 0.82192594, "num_input_tokens_seen": 68057975, "step": 3154, "time_per_iteration": 2.6530587673187256 }, { "auxiliary_loss_clip": 0.01117992, "auxiliary_loss_mlp": 0.01056707, "balance_loss_clip": 1.04851115, "balance_loss_mlp": 1.03536844, "epoch": 0.18968886216744327, "flos": 18179058723840.0, "grad_norm": 3.210440214164498, "language_loss": 0.73046303, "learning_rate": 3.738409024548223e-06, "loss": 0.75220996, "num_input_tokens_seen": 68074175, "step": 3155, "time_per_iteration": 2.729832410812378 }, { "auxiliary_loss_clip": 0.01126019, "auxiliary_loss_mlp": 0.01045659, "balance_loss_clip": 1.05104291, "balance_loss_mlp": 1.02626419, "epoch": 0.18974898542011123, "flos": 20412487509120.0, "grad_norm": 1.8299076145086866, "language_loss": 0.73869717, "learning_rate": 3.7382164211921136e-06, "loss": 0.76041389, "num_input_tokens_seen": 68095230, "step": 3156, "time_per_iteration": 2.6747231483459473 }, { "auxiliary_loss_clip": 0.01156549, "auxiliary_loss_mlp": 0.0104418, "balance_loss_clip": 1.05489409, "balance_loss_mlp": 1.02645326, "epoch": 0.1898091086727792, "flos": 23985024255360.0, "grad_norm": 1.9629652277148564, "language_loss": 0.68053937, "learning_rate": 3.7380237519226623e-06, "loss": 0.70254672, "num_input_tokens_seen": 68113805, "step": 3157, "time_per_iteration": 2.7092478275299072 }, { "auxiliary_loss_clip": 0.01114914, "auxiliary_loss_mlp": 0.01044181, "balance_loss_clip": 1.04805827, "balance_loss_mlp": 1.02533436, "epoch": 0.18986923192544716, "flos": 27637067756160.0, "grad_norm": 1.7829025355963362, "language_loss": 0.79893303, "learning_rate": 3.737831016747176e-06, "loss": 0.82052404, "num_input_tokens_seen": 68133190, "step": 3158, "time_per_iteration": 2.7921364307403564 }, { "auxiliary_loss_clip": 0.01163231, "auxiliary_loss_mlp": 0.01049502, "balance_loss_clip": 1.05787683, "balance_loss_mlp": 1.02923679, "epoch": 0.18992935517811513, "flos": 25484151306240.0, "grad_norm": 1.856283461980025, "language_loss": 0.72348613, "learning_rate": 3.737638215672964e-06, "loss": 0.74561346, "num_input_tokens_seen": 68152330, "step": 3159, "time_per_iteration": 2.6111273765563965 }, { "auxiliary_loss_clip": 0.01149613, "auxiliary_loss_mlp": 0.01053808, "balance_loss_clip": 1.05840325, "balance_loss_mlp": 1.03386414, "epoch": 0.1899894784307831, "flos": 17420805596160.0, "grad_norm": 2.2573250756933647, "language_loss": 0.84977192, "learning_rate": 3.7374453487073366e-06, "loss": 0.87180614, "num_input_tokens_seen": 68170185, "step": 3160, "time_per_iteration": 2.659259796142578 }, { "auxiliary_loss_clip": 0.01129342, "auxiliary_loss_mlp": 0.01049909, "balance_loss_clip": 1.05297387, "balance_loss_mlp": 1.03289795, "epoch": 0.19004960168345109, "flos": 27492240119040.0, "grad_norm": 2.752358611011079, "language_loss": 0.73407793, "learning_rate": 3.7372524158576074e-06, "loss": 0.7558704, "num_input_tokens_seen": 68191665, "step": 3161, "time_per_iteration": 2.784040689468384 }, { "auxiliary_loss_clip": 0.01139858, "auxiliary_loss_mlp": 0.0105519, "balance_loss_clip": 1.05456805, "balance_loss_mlp": 1.03476942, "epoch": 0.19010972493611905, "flos": 38654676385920.0, "grad_norm": 1.6629026055958476, "language_loss": 0.8115741, "learning_rate": 3.7370594171310926e-06, "loss": 0.83352458, "num_input_tokens_seen": 68214635, "step": 3162, "time_per_iteration": 2.9375386238098145 }, { "auxiliary_loss_clip": 0.01157449, "auxiliary_loss_mlp": 0.01040035, "balance_loss_clip": 1.05625844, "balance_loss_mlp": 1.02062798, "epoch": 0.19016984818878702, "flos": 19244744357760.0, "grad_norm": 2.448016750033594, "language_loss": 0.75615001, "learning_rate": 3.73686635253511e-06, "loss": 0.77812481, "num_input_tokens_seen": 68232150, "step": 3163, "time_per_iteration": 2.7344541549682617 }, { "auxiliary_loss_clip": 0.0110099, "auxiliary_loss_mlp": 0.01050093, "balance_loss_clip": 1.050578, "balance_loss_mlp": 1.02880192, "epoch": 0.19022997144145498, "flos": 37596891744000.0, "grad_norm": 2.2644227245470514, "language_loss": 0.74093997, "learning_rate": 3.736673222076982e-06, "loss": 0.76245081, "num_input_tokens_seen": 68253370, "step": 3164, "time_per_iteration": 2.9165730476379395 }, { "auxiliary_loss_clip": 0.01141317, "auxiliary_loss_mlp": 0.01038043, "balance_loss_clip": 1.05518687, "balance_loss_mlp": 1.0195303, "epoch": 0.19029009469412295, "flos": 61530921665280.0, "grad_norm": 1.5484522746055986, "language_loss": 0.66844344, "learning_rate": 3.7364800257640313e-06, "loss": 0.69023699, "num_input_tokens_seen": 68278895, "step": 3165, "time_per_iteration": 3.006096124649048 }, { "auxiliary_loss_clip": 0.01146225, "auxiliary_loss_mlp": 0.0104856, "balance_loss_clip": 1.05512285, "balance_loss_mlp": 1.02848506, "epoch": 0.1903502179467909, "flos": 13954851480960.0, "grad_norm": 2.8598536292657144, "language_loss": 0.74239767, "learning_rate": 3.7362867636035835e-06, "loss": 0.76434553, "num_input_tokens_seen": 68294880, "step": 3166, "time_per_iteration": 2.678844928741455 }, { "auxiliary_loss_clip": 0.01050093, "auxiliary_loss_mlp": 0.01014959, "balance_loss_clip": 1.04342103, "balance_loss_mlp": 1.01201403, "epoch": 0.1904103411994589, "flos": 66899641916160.0, "grad_norm": 0.7754190343967906, "language_loss": 0.50311053, "learning_rate": 3.736093435602968e-06, "loss": 0.52376103, "num_input_tokens_seen": 68359665, "step": 3167, "time_per_iteration": 3.277529239654541 }, { "auxiliary_loss_clip": 0.01138483, "auxiliary_loss_mlp": 0.01051348, "balance_loss_clip": 1.05485487, "balance_loss_mlp": 1.03293037, "epoch": 0.19047046445212687, "flos": 21908741472000.0, "grad_norm": 2.3487387451986192, "language_loss": 0.74504036, "learning_rate": 3.7359000417695156e-06, "loss": 0.76693863, "num_input_tokens_seen": 68378950, "step": 3168, "time_per_iteration": 2.690995216369629 }, { "auxiliary_loss_clip": 0.01040165, "auxiliary_loss_mlp": 0.01023518, "balance_loss_clip": 1.03869283, "balance_loss_mlp": 1.02085996, "epoch": 0.19053058770479483, "flos": 59255156701440.0, "grad_norm": 0.8605055473788603, "language_loss": 0.60079956, "learning_rate": 3.73570658211056e-06, "loss": 0.62143636, "num_input_tokens_seen": 68434235, "step": 3169, "time_per_iteration": 3.2108101844787598 }, { "auxiliary_loss_clip": 0.01103792, "auxiliary_loss_mlp": 0.01056606, "balance_loss_clip": 1.05267787, "balance_loss_mlp": 1.03741288, "epoch": 0.1905907109574628, "flos": 23951304362880.0, "grad_norm": 1.5575975614891868, "language_loss": 0.78179795, "learning_rate": 3.735513056633436e-06, "loss": 0.80340189, "num_input_tokens_seen": 68453830, "step": 3170, "time_per_iteration": 2.832043409347534 }, { "auxiliary_loss_clip": 0.01142047, "auxiliary_loss_mlp": 0.01045041, "balance_loss_clip": 1.05325115, "balance_loss_mlp": 1.02605128, "epoch": 0.19065083421013077, "flos": 20812316774400.0, "grad_norm": 1.7671932984988854, "language_loss": 0.78177166, "learning_rate": 3.7353194653454834e-06, "loss": 0.80364257, "num_input_tokens_seen": 68473005, "step": 3171, "time_per_iteration": 2.7823612689971924 }, { "auxiliary_loss_clip": 0.01158227, "auxiliary_loss_mlp": 0.01047345, "balance_loss_clip": 1.05499291, "balance_loss_mlp": 1.0285697, "epoch": 0.19071095746279873, "flos": 31284981192960.0, "grad_norm": 2.1976685633770905, "language_loss": 0.77953529, "learning_rate": 3.7351258082540426e-06, "loss": 0.80159104, "num_input_tokens_seen": 68493470, "step": 3172, "time_per_iteration": 2.746279001235962 }, { "auxiliary_loss_clip": 0.01145112, "auxiliary_loss_mlp": 0.01055334, "balance_loss_clip": 1.05438328, "balance_loss_mlp": 1.03703523, "epoch": 0.1907710807154667, "flos": 14356117290240.0, "grad_norm": 1.5258786569967644, "language_loss": 0.80223799, "learning_rate": 3.7349320853664576e-06, "loss": 0.82424247, "num_input_tokens_seen": 68511290, "step": 3173, "time_per_iteration": 2.7396810054779053 }, { "auxiliary_loss_clip": 0.01113266, "auxiliary_loss_mlp": 0.00778142, "balance_loss_clip": 1.04967713, "balance_loss_mlp": 1.00094676, "epoch": 0.1908312039681347, "flos": 26907039740160.0, "grad_norm": 1.5341307852526682, "language_loss": 0.78495061, "learning_rate": 3.7347382966900735e-06, "loss": 0.80386466, "num_input_tokens_seen": 68532575, "step": 3174, "time_per_iteration": 2.8579304218292236 }, { "auxiliary_loss_clip": 0.01106714, "auxiliary_loss_mlp": 0.01047557, "balance_loss_clip": 1.04928994, "balance_loss_mlp": 1.02838778, "epoch": 0.19089132722080265, "flos": 14494695960960.0, "grad_norm": 1.8075853216546063, "language_loss": 0.81067109, "learning_rate": 3.7345444422322395e-06, "loss": 0.83221382, "num_input_tokens_seen": 68548760, "step": 3175, "time_per_iteration": 2.718254804611206 }, { "auxiliary_loss_clip": 0.01080497, "auxiliary_loss_mlp": 0.01053652, "balance_loss_clip": 1.04361629, "balance_loss_mlp": 1.0342685, "epoch": 0.19095145047347062, "flos": 13952876232960.0, "grad_norm": 2.2545261224105873, "language_loss": 0.85529047, "learning_rate": 3.7343505220003067e-06, "loss": 0.87663192, "num_input_tokens_seen": 68563100, "step": 3176, "time_per_iteration": 4.2962729930877686 }, { "auxiliary_loss_clip": 0.0113361, "auxiliary_loss_mlp": 0.01059849, "balance_loss_clip": 1.05418086, "balance_loss_mlp": 1.03928506, "epoch": 0.19101157372613858, "flos": 25301832848640.0, "grad_norm": 2.0896270593066832, "language_loss": 0.813025, "learning_rate": 3.7341565360016285e-06, "loss": 0.83495957, "num_input_tokens_seen": 68581650, "step": 3177, "time_per_iteration": 2.815127372741699 }, { "auxiliary_loss_clip": 0.01122377, "auxiliary_loss_mlp": 0.01044946, "balance_loss_clip": 1.0482533, "balance_loss_mlp": 1.0265398, "epoch": 0.19107169697880655, "flos": 20558212986240.0, "grad_norm": 2.67963335978105, "language_loss": 0.7530241, "learning_rate": 3.73396248424356e-06, "loss": 0.7746973, "num_input_tokens_seen": 68600360, "step": 3178, "time_per_iteration": 4.351228475570679 }, { "auxiliary_loss_clip": 0.01146729, "auxiliary_loss_mlp": 0.01042476, "balance_loss_clip": 1.05574143, "balance_loss_mlp": 1.02458286, "epoch": 0.19113182023147451, "flos": 22163204396160.0, "grad_norm": 4.753014277211421, "language_loss": 0.81381619, "learning_rate": 3.7337683667334606e-06, "loss": 0.83570826, "num_input_tokens_seen": 68617885, "step": 3179, "time_per_iteration": 4.259284019470215 }, { "auxiliary_loss_clip": 0.01147837, "auxiliary_loss_mlp": 0.01048144, "balance_loss_clip": 1.05645823, "balance_loss_mlp": 1.0291661, "epoch": 0.19119194348414248, "flos": 18581796990720.0, "grad_norm": 2.753081884541086, "language_loss": 0.79384613, "learning_rate": 3.733574183478691e-06, "loss": 0.81580591, "num_input_tokens_seen": 68634550, "step": 3180, "time_per_iteration": 2.6609203815460205 }, { "auxiliary_loss_clip": 0.01129361, "auxiliary_loss_mlp": 0.0105402, "balance_loss_clip": 1.05249727, "balance_loss_mlp": 1.03445804, "epoch": 0.19125206673681047, "flos": 19026623018880.0, "grad_norm": 2.660238694189741, "language_loss": 0.79517245, "learning_rate": 3.733379934486615e-06, "loss": 0.81700623, "num_input_tokens_seen": 68651895, "step": 3181, "time_per_iteration": 2.6877176761627197 }, { "auxiliary_loss_clip": 0.0114301, "auxiliary_loss_mlp": 0.01053621, "balance_loss_clip": 1.05339336, "balance_loss_mlp": 1.03527462, "epoch": 0.19131218998947844, "flos": 21690153256320.0, "grad_norm": 2.2179888965480243, "language_loss": 0.74570775, "learning_rate": 3.7331856197645973e-06, "loss": 0.76767409, "num_input_tokens_seen": 68671500, "step": 3182, "time_per_iteration": 4.2829508781433105 }, { "auxiliary_loss_clip": 0.01128679, "auxiliary_loss_mlp": 0.01044063, "balance_loss_clip": 1.05578041, "balance_loss_mlp": 1.02575254, "epoch": 0.1913723132421464, "flos": 18442500048000.0, "grad_norm": 1.7534728284311585, "language_loss": 0.64618582, "learning_rate": 3.7329912393200084e-06, "loss": 0.66791326, "num_input_tokens_seen": 68690570, "step": 3183, "time_per_iteration": 2.7652854919433594 }, { "auxiliary_loss_clip": 0.01132257, "auxiliary_loss_mlp": 0.01050867, "balance_loss_clip": 1.0512805, "balance_loss_mlp": 1.0311259, "epoch": 0.19143243649481437, "flos": 27160102033920.0, "grad_norm": 1.555926798692704, "language_loss": 0.73459226, "learning_rate": 3.7327967931602173e-06, "loss": 0.75642347, "num_input_tokens_seen": 68709735, "step": 3184, "time_per_iteration": 2.6929056644439697 }, { "auxiliary_loss_clip": 0.01122578, "auxiliary_loss_mlp": 0.01054123, "balance_loss_clip": 1.05015373, "balance_loss_mlp": 1.03347623, "epoch": 0.19149255974748233, "flos": 21718952985600.0, "grad_norm": 2.0989643169058514, "language_loss": 0.87983418, "learning_rate": 3.732602281292598e-06, "loss": 0.9016012, "num_input_tokens_seen": 68727565, "step": 3185, "time_per_iteration": 2.6859230995178223 }, { "auxiliary_loss_clip": 0.01153787, "auxiliary_loss_mlp": 0.01044436, "balance_loss_clip": 1.05334914, "balance_loss_mlp": 1.02505302, "epoch": 0.1915526830001503, "flos": 22963293889920.0, "grad_norm": 2.4520480945942587, "language_loss": 0.73240852, "learning_rate": 3.7324077037245267e-06, "loss": 0.75439072, "num_input_tokens_seen": 68748110, "step": 3186, "time_per_iteration": 2.6398978233337402 }, { "auxiliary_loss_clip": 0.01132874, "auxiliary_loss_mlp": 0.01044989, "balance_loss_clip": 1.05609488, "balance_loss_mlp": 1.02379346, "epoch": 0.1916128062528183, "flos": 26140741966080.0, "grad_norm": 2.739457234253781, "language_loss": 0.83550584, "learning_rate": 3.7322130604633825e-06, "loss": 0.85728443, "num_input_tokens_seen": 68769765, "step": 3187, "time_per_iteration": 2.7476372718811035 }, { "auxiliary_loss_clip": 0.01076264, "auxiliary_loss_mlp": 0.01021317, "balance_loss_clip": 1.04604995, "balance_loss_mlp": 1.01892138, "epoch": 0.19167292950548626, "flos": 54925767457920.0, "grad_norm": 0.8659386797819415, "language_loss": 0.55824959, "learning_rate": 3.732018351516544e-06, "loss": 0.57922542, "num_input_tokens_seen": 68826815, "step": 3188, "time_per_iteration": 3.2144031524658203 }, { "auxiliary_loss_clip": 0.01139007, "auxiliary_loss_mlp": 0.01054399, "balance_loss_clip": 1.054564, "balance_loss_mlp": 1.03537333, "epoch": 0.19173305275815422, "flos": 29935601942400.0, "grad_norm": 2.2897904709915573, "language_loss": 0.69839454, "learning_rate": 3.731823576891397e-06, "loss": 0.72032857, "num_input_tokens_seen": 68847585, "step": 3189, "time_per_iteration": 2.7998950481414795 }, { "auxiliary_loss_clip": 0.01118438, "auxiliary_loss_mlp": 0.01038566, "balance_loss_clip": 1.04930174, "balance_loss_mlp": 1.02116132, "epoch": 0.1917931760108222, "flos": 24752471264640.0, "grad_norm": 2.362312815249866, "language_loss": 0.74320328, "learning_rate": 3.7316287365953266e-06, "loss": 0.76477331, "num_input_tokens_seen": 68866620, "step": 3190, "time_per_iteration": 2.7386670112609863 }, { "auxiliary_loss_clip": 0.01111071, "auxiliary_loss_mlp": 0.0106718, "balance_loss_clip": 1.04946983, "balance_loss_mlp": 1.04702199, "epoch": 0.19185329926349015, "flos": 18843550375680.0, "grad_norm": 3.545467698458187, "language_loss": 0.8444041, "learning_rate": 3.73143383063572e-06, "loss": 0.8661865, "num_input_tokens_seen": 68885515, "step": 3191, "time_per_iteration": 2.7025794982910156 }, { "auxiliary_loss_clip": 0.01127894, "auxiliary_loss_mlp": 0.01039849, "balance_loss_clip": 1.05251908, "balance_loss_mlp": 1.02231336, "epoch": 0.19191342251615812, "flos": 22086858038400.0, "grad_norm": 2.0663841109071526, "language_loss": 0.89985192, "learning_rate": 3.73123885901997e-06, "loss": 0.92152941, "num_input_tokens_seen": 68903225, "step": 3192, "time_per_iteration": 2.802852153778076 }, { "auxiliary_loss_clip": 0.01130336, "auxiliary_loss_mlp": 0.01054766, "balance_loss_clip": 1.05716372, "balance_loss_mlp": 1.03509688, "epoch": 0.19197354576882608, "flos": 22199115018240.0, "grad_norm": 2.3467564445058775, "language_loss": 0.75159264, "learning_rate": 3.7310438217554687e-06, "loss": 0.77344358, "num_input_tokens_seen": 68922860, "step": 3193, "time_per_iteration": 2.7680914402008057 }, { "auxiliary_loss_clip": 0.01128303, "auxiliary_loss_mlp": 0.00777332, "balance_loss_clip": 1.05222785, "balance_loss_mlp": 1.00071752, "epoch": 0.19203366902149407, "flos": 24896185580160.0, "grad_norm": 2.078743387775855, "language_loss": 0.75189757, "learning_rate": 3.730848718849612e-06, "loss": 0.77095383, "num_input_tokens_seen": 68943000, "step": 3194, "time_per_iteration": 2.7537553310394287 }, { "auxiliary_loss_clip": 0.01068142, "auxiliary_loss_mlp": 0.01004387, "balance_loss_clip": 1.03910232, "balance_loss_mlp": 1.00182378, "epoch": 0.19209379227416204, "flos": 68416722789120.0, "grad_norm": 0.7955224937316553, "language_loss": 0.68507159, "learning_rate": 3.7306535503097985e-06, "loss": 0.70579696, "num_input_tokens_seen": 69000255, "step": 3195, "time_per_iteration": 3.117191791534424 }, { "auxiliary_loss_clip": 0.01116081, "auxiliary_loss_mlp": 0.01052392, "balance_loss_clip": 1.05205238, "balance_loss_mlp": 1.0320189, "epoch": 0.19215391552683, "flos": 22055185221120.0, "grad_norm": 2.6559439291645757, "language_loss": 0.73141015, "learning_rate": 3.730458316143429e-06, "loss": 0.75309479, "num_input_tokens_seen": 69019665, "step": 3196, "time_per_iteration": 2.7234303951263428 }, { "auxiliary_loss_clip": 0.01139018, "auxiliary_loss_mlp": 0.01044947, "balance_loss_clip": 1.06151462, "balance_loss_mlp": 1.02596927, "epoch": 0.19221403877949797, "flos": 20302959962880.0, "grad_norm": 3.0997718824135734, "language_loss": 0.83654135, "learning_rate": 3.7302630163579068e-06, "loss": 0.85838103, "num_input_tokens_seen": 69039055, "step": 3197, "time_per_iteration": 2.72575306892395 }, { "auxiliary_loss_clip": 0.01086216, "auxiliary_loss_mlp": 0.01055059, "balance_loss_clip": 1.04615641, "balance_loss_mlp": 1.03320754, "epoch": 0.19227416203216594, "flos": 23185329811200.0, "grad_norm": 2.2465298420006383, "language_loss": 0.80656433, "learning_rate": 3.7300676509606373e-06, "loss": 0.82797706, "num_input_tokens_seen": 69056370, "step": 3198, "time_per_iteration": 2.741678237915039 }, { "auxiliary_loss_clip": 0.01135487, "auxiliary_loss_mlp": 0.01056572, "balance_loss_clip": 1.05502987, "balance_loss_mlp": 1.03655636, "epoch": 0.1923342852848339, "flos": 25776607841280.0, "grad_norm": 1.9205907836873994, "language_loss": 0.78993976, "learning_rate": 3.729872219959029e-06, "loss": 0.81186032, "num_input_tokens_seen": 69075915, "step": 3199, "time_per_iteration": 2.7821297645568848 }, { "auxiliary_loss_clip": 0.01116808, "auxiliary_loss_mlp": 0.01056964, "balance_loss_clip": 1.05010581, "balance_loss_mlp": 1.036412, "epoch": 0.19239440853750187, "flos": 17128349061120.0, "grad_norm": 3.662083840248298, "language_loss": 0.83574522, "learning_rate": 3.7296767233604934e-06, "loss": 0.85748297, "num_input_tokens_seen": 69094145, "step": 3200, "time_per_iteration": 2.7095022201538086 }, { "auxiliary_loss_clip": 0.01159025, "auxiliary_loss_mlp": 0.01048823, "balance_loss_clip": 1.05997193, "balance_loss_mlp": 1.03060746, "epoch": 0.19245453179016986, "flos": 16435093593600.0, "grad_norm": 1.9278966392289572, "language_loss": 0.79092836, "learning_rate": 3.729481161172443e-06, "loss": 0.81300688, "num_input_tokens_seen": 69111110, "step": 3201, "time_per_iteration": 2.684979200363159 }, { "auxiliary_loss_clip": 0.01103349, "auxiliary_loss_mlp": 0.01053366, "balance_loss_clip": 1.04825675, "balance_loss_mlp": 1.03418541, "epoch": 0.19251465504283782, "flos": 20230276792320.0, "grad_norm": 2.4062417134527645, "language_loss": 0.69276404, "learning_rate": 3.7292855334022927e-06, "loss": 0.71433127, "num_input_tokens_seen": 69130280, "step": 3202, "time_per_iteration": 2.8284943103790283 }, { "auxiliary_loss_clip": 0.01132334, "auxiliary_loss_mlp": 0.01041011, "balance_loss_clip": 1.05389905, "balance_loss_mlp": 1.02256894, "epoch": 0.1925747782955058, "flos": 19464374067840.0, "grad_norm": 1.9491265782204168, "language_loss": 0.91396749, "learning_rate": 3.7290898400574627e-06, "loss": 0.93570089, "num_input_tokens_seen": 69149570, "step": 3203, "time_per_iteration": 2.802433729171753 }, { "auxiliary_loss_clip": 0.0114953, "auxiliary_loss_mlp": 0.01049732, "balance_loss_clip": 1.05674863, "balance_loss_mlp": 1.02959776, "epoch": 0.19263490154817375, "flos": 17785586165760.0, "grad_norm": 5.05881669068558, "language_loss": 0.81689429, "learning_rate": 3.7288940811453725e-06, "loss": 0.83888692, "num_input_tokens_seen": 69168190, "step": 3204, "time_per_iteration": 2.671285629272461 }, { "auxiliary_loss_clip": 0.01116988, "auxiliary_loss_mlp": 0.01048941, "balance_loss_clip": 1.04950142, "balance_loss_mlp": 1.0298202, "epoch": 0.19269502480084172, "flos": 17457075354240.0, "grad_norm": 2.296941025186916, "language_loss": 0.76167846, "learning_rate": 3.7286982566734454e-06, "loss": 0.78333771, "num_input_tokens_seen": 69186950, "step": 3205, "time_per_iteration": 2.8654470443725586 }, { "auxiliary_loss_clip": 0.01140852, "auxiliary_loss_mlp": 0.01046651, "balance_loss_clip": 1.05839586, "balance_loss_mlp": 1.02749407, "epoch": 0.19275514805350968, "flos": 21506901045120.0, "grad_norm": 3.761768843322395, "language_loss": 0.83394569, "learning_rate": 3.728502366649107e-06, "loss": 0.85582072, "num_input_tokens_seen": 69204850, "step": 3206, "time_per_iteration": 2.8610613346099854 }, { "auxiliary_loss_clip": 0.0105715, "auxiliary_loss_mlp": 0.01004055, "balance_loss_clip": 1.03779244, "balance_loss_mlp": 1.00174224, "epoch": 0.19281527130617768, "flos": 47695979738880.0, "grad_norm": 0.8644529519848262, "language_loss": 0.60561717, "learning_rate": 3.728306411079786e-06, "loss": 0.62622917, "num_input_tokens_seen": 69259200, "step": 3207, "time_per_iteration": 3.126537322998047 }, { "auxiliary_loss_clip": 0.01120285, "auxiliary_loss_mlp": 0.01045527, "balance_loss_clip": 1.05201781, "balance_loss_mlp": 1.02678764, "epoch": 0.19287539455884564, "flos": 11801252672640.0, "grad_norm": 2.296187182186814, "language_loss": 0.75463599, "learning_rate": 3.7281103899729125e-06, "loss": 0.77629405, "num_input_tokens_seen": 69275835, "step": 3208, "time_per_iteration": 2.6978750228881836 }, { "auxiliary_loss_clip": 0.01150534, "auxiliary_loss_mlp": 0.00777875, "balance_loss_clip": 1.05520236, "balance_loss_mlp": 1.00063884, "epoch": 0.1929355178115136, "flos": 20631434860800.0, "grad_norm": 1.9483983315924505, "language_loss": 0.60869855, "learning_rate": 3.7279143033359195e-06, "loss": 0.62798262, "num_input_tokens_seen": 69294810, "step": 3209, "time_per_iteration": 2.699798107147217 }, { "auxiliary_loss_clip": 0.01158758, "auxiliary_loss_mlp": 0.01053815, "balance_loss_clip": 1.05472994, "balance_loss_mlp": 1.03261995, "epoch": 0.19299564106418157, "flos": 40807916058240.0, "grad_norm": 1.9992177661428934, "language_loss": 0.80025005, "learning_rate": 3.727718151176243e-06, "loss": 0.82237577, "num_input_tokens_seen": 69316065, "step": 3210, "time_per_iteration": 2.832665205001831 }, { "auxiliary_loss_clip": 0.01118997, "auxiliary_loss_mlp": 0.01047494, "balance_loss_clip": 1.05044246, "balance_loss_mlp": 1.02920699, "epoch": 0.19305576431684954, "flos": 11361418634880.0, "grad_norm": 2.515510367397107, "language_loss": 0.82571948, "learning_rate": 3.7275219335013217e-06, "loss": 0.84738445, "num_input_tokens_seen": 69332900, "step": 3211, "time_per_iteration": 2.7664191722869873 }, { "auxiliary_loss_clip": 0.01073663, "auxiliary_loss_mlp": 0.01002544, "balance_loss_clip": 1.03501034, "balance_loss_mlp": 1.00021982, "epoch": 0.1931158875695175, "flos": 54511895975040.0, "grad_norm": 0.9633495631759209, "language_loss": 0.63641912, "learning_rate": 3.7273256503185953e-06, "loss": 0.6571812, "num_input_tokens_seen": 69382535, "step": 3212, "time_per_iteration": 2.974940299987793 }, { "auxiliary_loss_clip": 0.01131742, "auxiliary_loss_mlp": 0.01044059, "balance_loss_clip": 1.05586314, "balance_loss_mlp": 1.02565336, "epoch": 0.19317601082218547, "flos": 19828436365440.0, "grad_norm": 1.7209148950717332, "language_loss": 0.76375663, "learning_rate": 3.7271293016355074e-06, "loss": 0.78551459, "num_input_tokens_seen": 69400600, "step": 3213, "time_per_iteration": 2.7898454666137695 }, { "auxiliary_loss_clip": 0.01123196, "auxiliary_loss_mlp": 0.0105066, "balance_loss_clip": 1.05261111, "balance_loss_mlp": 1.03116894, "epoch": 0.19323613407485346, "flos": 13152068467200.0, "grad_norm": 2.349758973823363, "language_loss": 0.70871878, "learning_rate": 3.726932887459503e-06, "loss": 0.73045731, "num_input_tokens_seen": 69417350, "step": 3214, "time_per_iteration": 2.8155152797698975 }, { "auxiliary_loss_clip": 0.01155585, "auxiliary_loss_mlp": 0.01047831, "balance_loss_clip": 1.05412841, "balance_loss_mlp": 1.02807808, "epoch": 0.19329625732752143, "flos": 14027247342720.0, "grad_norm": 2.190607045917922, "language_loss": 0.75067955, "learning_rate": 3.72673640779803e-06, "loss": 0.77271378, "num_input_tokens_seen": 69431845, "step": 3215, "time_per_iteration": 4.111938238143921 }, { "auxiliary_loss_clip": 0.01112217, "auxiliary_loss_mlp": 0.01049964, "balance_loss_clip": 1.04928339, "balance_loss_mlp": 1.0323447, "epoch": 0.1933563805801894, "flos": 23441732069760.0, "grad_norm": 1.7842520268521305, "language_loss": 0.88426638, "learning_rate": 3.72653986265854e-06, "loss": 0.9058882, "num_input_tokens_seen": 69453275, "step": 3216, "time_per_iteration": 2.7699615955352783 }, { "auxiliary_loss_clip": 0.01153806, "auxiliary_loss_mlp": 0.01052131, "balance_loss_clip": 1.05435801, "balance_loss_mlp": 1.03442836, "epoch": 0.19341650383285736, "flos": 20485314334080.0, "grad_norm": 1.6996051239972392, "language_loss": 0.7974773, "learning_rate": 3.726343252048485e-06, "loss": 0.81953669, "num_input_tokens_seen": 69471830, "step": 3217, "time_per_iteration": 2.6788718700408936 }, { "auxiliary_loss_clip": 0.01143281, "auxiliary_loss_mlp": 0.0104914, "balance_loss_clip": 1.05695105, "balance_loss_mlp": 1.02864754, "epoch": 0.19347662708552532, "flos": 17858484817920.0, "grad_norm": 4.708784796317305, "language_loss": 0.6161437, "learning_rate": 3.7261465759753206e-06, "loss": 0.6380679, "num_input_tokens_seen": 69489320, "step": 3218, "time_per_iteration": 4.352849960327148 }, { "auxiliary_loss_clip": 0.01157355, "auxiliary_loss_mlp": 0.01047211, "balance_loss_clip": 1.05723107, "balance_loss_mlp": 1.02873373, "epoch": 0.1935367503381933, "flos": 18187247024640.0, "grad_norm": 1.9724785552136583, "language_loss": 0.80345452, "learning_rate": 3.7259498344465053e-06, "loss": 0.82550013, "num_input_tokens_seen": 69506665, "step": 3219, "time_per_iteration": 4.1739161014556885 }, { "auxiliary_loss_clip": 0.01104687, "auxiliary_loss_mlp": 0.01047672, "balance_loss_clip": 1.05145359, "balance_loss_mlp": 1.02819324, "epoch": 0.19359687359086128, "flos": 15957122290560.0, "grad_norm": 2.7508533279024077, "language_loss": 0.85693008, "learning_rate": 3.7257530274694993e-06, "loss": 0.87845367, "num_input_tokens_seen": 69523835, "step": 3220, "time_per_iteration": 2.777284622192383 }, { "auxiliary_loss_clip": 0.01149581, "auxiliary_loss_mlp": 0.01041747, "balance_loss_clip": 1.05441856, "balance_loss_mlp": 1.02511764, "epoch": 0.19365699684352924, "flos": 21215198695680.0, "grad_norm": 2.05545450883527, "language_loss": 0.84637755, "learning_rate": 3.725556155051766e-06, "loss": 0.86829084, "num_input_tokens_seen": 69542620, "step": 3221, "time_per_iteration": 4.224115371704102 }, { "auxiliary_loss_clip": 0.01143661, "auxiliary_loss_mlp": 0.01044558, "balance_loss_clip": 1.05466259, "balance_loss_mlp": 1.02730846, "epoch": 0.1937171200961972, "flos": 17311098481920.0, "grad_norm": 2.658004231066563, "language_loss": 0.86087942, "learning_rate": 3.7253592172007702e-06, "loss": 0.8827616, "num_input_tokens_seen": 69561130, "step": 3222, "time_per_iteration": 2.6400530338287354 }, { "auxiliary_loss_clip": 0.01069453, "auxiliary_loss_mlp": 0.01045281, "balance_loss_clip": 1.04206085, "balance_loss_mlp": 1.02599275, "epoch": 0.19377724334886517, "flos": 22635968227200.0, "grad_norm": 1.8604116943694204, "language_loss": 0.78510809, "learning_rate": 3.72516221392398e-06, "loss": 0.8062554, "num_input_tokens_seen": 69580425, "step": 3223, "time_per_iteration": 2.9685652256011963 }, { "auxiliary_loss_clip": 0.01146062, "auxiliary_loss_mlp": 0.01046815, "balance_loss_clip": 1.05697751, "balance_loss_mlp": 1.02819431, "epoch": 0.19383736660153314, "flos": 15077813351040.0, "grad_norm": 1.8958208586464897, "language_loss": 0.75391948, "learning_rate": 3.7249651452288653e-06, "loss": 0.77584827, "num_input_tokens_seen": 69597085, "step": 3224, "time_per_iteration": 2.665294885635376 }, { "auxiliary_loss_clip": 0.01102293, "auxiliary_loss_mlp": 0.01050181, "balance_loss_clip": 1.04728186, "balance_loss_mlp": 1.02927208, "epoch": 0.1938974898542011, "flos": 47119934350080.0, "grad_norm": 3.358076005999295, "language_loss": 0.71180636, "learning_rate": 3.7247680111229e-06, "loss": 0.73333108, "num_input_tokens_seen": 69618885, "step": 3225, "time_per_iteration": 2.997511863708496 }, { "auxiliary_loss_clip": 0.0112035, "auxiliary_loss_mlp": 0.01053167, "balance_loss_clip": 1.0519309, "balance_loss_mlp": 1.03480864, "epoch": 0.19395761310686907, "flos": 25812554376960.0, "grad_norm": 2.42331686427639, "language_loss": 0.69379079, "learning_rate": 3.7245708116135585e-06, "loss": 0.71552593, "num_input_tokens_seen": 69638200, "step": 3226, "time_per_iteration": 2.746338129043579 }, { "auxiliary_loss_clip": 0.01126783, "auxiliary_loss_mlp": 0.01042276, "balance_loss_clip": 1.05692983, "balance_loss_mlp": 1.02264214, "epoch": 0.19401773635953706, "flos": 23039604334080.0, "grad_norm": 2.1006513764454864, "language_loss": 0.76236808, "learning_rate": 3.7243735467083193e-06, "loss": 0.78405869, "num_input_tokens_seen": 69657550, "step": 3227, "time_per_iteration": 2.760087728500366 }, { "auxiliary_loss_clip": 0.01117794, "auxiliary_loss_mlp": 0.010438, "balance_loss_clip": 1.05304587, "balance_loss_mlp": 1.0256561, "epoch": 0.19407785961220503, "flos": 15920780705280.0, "grad_norm": 2.8268368707906397, "language_loss": 0.69577461, "learning_rate": 3.724176216414662e-06, "loss": 0.71739054, "num_input_tokens_seen": 69675005, "step": 3228, "time_per_iteration": 2.6779348850250244 }, { "auxiliary_loss_clip": 0.01148199, "auxiliary_loss_mlp": 0.01042315, "balance_loss_clip": 1.05775642, "balance_loss_mlp": 1.02445757, "epoch": 0.194137982864873, "flos": 25921722787200.0, "grad_norm": 1.7694943420266864, "language_loss": 0.74160898, "learning_rate": 3.72397882074007e-06, "loss": 0.76351416, "num_input_tokens_seen": 69696455, "step": 3229, "time_per_iteration": 2.7229623794555664 }, { "auxiliary_loss_clip": 0.01119678, "auxiliary_loss_mlp": 0.01044155, "balance_loss_clip": 1.05435359, "balance_loss_mlp": 1.0262022, "epoch": 0.19419810611754096, "flos": 13261344618240.0, "grad_norm": 1.9766126324167548, "language_loss": 0.65722096, "learning_rate": 3.7237813596920285e-06, "loss": 0.67885935, "num_input_tokens_seen": 69714245, "step": 3230, "time_per_iteration": 2.740324020385742 }, { "auxiliary_loss_clip": 0.01124671, "auxiliary_loss_mlp": 0.00776003, "balance_loss_clip": 1.05223823, "balance_loss_mlp": 1.00081468, "epoch": 0.19425822937020892, "flos": 15705568368000.0, "grad_norm": 1.9307338208311895, "language_loss": 0.82042694, "learning_rate": 3.7235838332780254e-06, "loss": 0.83943367, "num_input_tokens_seen": 69731515, "step": 3231, "time_per_iteration": 2.7453513145446777 }, { "auxiliary_loss_clip": 0.0113141, "auxiliary_loss_mlp": 0.01042332, "balance_loss_clip": 1.05393946, "balance_loss_mlp": 1.02220988, "epoch": 0.1943183526228769, "flos": 23105392093440.0, "grad_norm": 10.866686758212083, "language_loss": 0.87038374, "learning_rate": 3.72338624150555e-06, "loss": 0.89212114, "num_input_tokens_seen": 69748885, "step": 3232, "time_per_iteration": 2.7575178146362305 }, { "auxiliary_loss_clip": 0.01100451, "auxiliary_loss_mlp": 0.01050878, "balance_loss_clip": 1.05029583, "balance_loss_mlp": 1.03102958, "epoch": 0.19437847587554485, "flos": 24712610146560.0, "grad_norm": 2.531838729905544, "language_loss": 0.85189134, "learning_rate": 3.723188584382096e-06, "loss": 0.87340462, "num_input_tokens_seen": 69767540, "step": 3233, "time_per_iteration": 2.8617444038391113 }, { "auxiliary_loss_clip": 0.01149478, "auxiliary_loss_mlp": 0.01054519, "balance_loss_clip": 1.0574832, "balance_loss_mlp": 1.0357672, "epoch": 0.19443859912821285, "flos": 23116130259840.0, "grad_norm": 1.7408859410354203, "language_loss": 0.89099532, "learning_rate": 3.722990861915158e-06, "loss": 0.91303527, "num_input_tokens_seen": 69789340, "step": 3234, "time_per_iteration": 2.7648239135742188 }, { "auxiliary_loss_clip": 0.01135157, "auxiliary_loss_mlp": 0.01044708, "balance_loss_clip": 1.05003643, "balance_loss_mlp": 1.02544403, "epoch": 0.1944987223808808, "flos": 15084385539840.0, "grad_norm": 2.4074482975555926, "language_loss": 0.78673434, "learning_rate": 3.722793074112234e-06, "loss": 0.80853301, "num_input_tokens_seen": 69806470, "step": 3235, "time_per_iteration": 2.76930832862854 }, { "auxiliary_loss_clip": 0.01136497, "auxiliary_loss_mlp": 0.01046749, "balance_loss_clip": 1.0580672, "balance_loss_mlp": 1.0293448, "epoch": 0.19455884563354878, "flos": 17126876603520.0, "grad_norm": 2.2511193258734354, "language_loss": 0.79391634, "learning_rate": 3.7225952209808233e-06, "loss": 0.81574875, "num_input_tokens_seen": 69822655, "step": 3236, "time_per_iteration": 2.7060179710388184 }, { "auxiliary_loss_clip": 0.01156991, "auxiliary_loss_mlp": 0.01044638, "balance_loss_clip": 1.05862045, "balance_loss_mlp": 1.02482522, "epoch": 0.19461896888621674, "flos": 20193396503040.0, "grad_norm": 2.1553329609131713, "language_loss": 0.76224017, "learning_rate": 3.72239730252843e-06, "loss": 0.78425646, "num_input_tokens_seen": 69841895, "step": 3237, "time_per_iteration": 2.642235040664673 }, { "auxiliary_loss_clip": 0.01158804, "auxiliary_loss_mlp": 0.01051059, "balance_loss_clip": 1.05648041, "balance_loss_mlp": 1.03289127, "epoch": 0.1946790921388847, "flos": 25301365971840.0, "grad_norm": 1.5204653275468003, "language_loss": 0.74828202, "learning_rate": 3.7221993187625583e-06, "loss": 0.77038062, "num_input_tokens_seen": 69862220, "step": 3238, "time_per_iteration": 2.6618688106536865 }, { "auxiliary_loss_clip": 0.01108331, "auxiliary_loss_mlp": 0.01046572, "balance_loss_clip": 1.04992437, "balance_loss_mlp": 1.02791595, "epoch": 0.19473921539155267, "flos": 20193396503040.0, "grad_norm": 3.1324225641798518, "language_loss": 0.734164, "learning_rate": 3.7220012696907155e-06, "loss": 0.75571299, "num_input_tokens_seen": 69881830, "step": 3239, "time_per_iteration": 2.7637152671813965 }, { "auxiliary_loss_clip": 0.01132567, "auxiliary_loss_mlp": 0.01047988, "balance_loss_clip": 1.05458641, "balance_loss_mlp": 1.02947509, "epoch": 0.19479933864422067, "flos": 20887549810560.0, "grad_norm": 2.155392951393246, "language_loss": 0.73291272, "learning_rate": 3.721803155320412e-06, "loss": 0.7547183, "num_input_tokens_seen": 69900515, "step": 3240, "time_per_iteration": 2.6980888843536377 }, { "auxiliary_loss_clip": 0.01131601, "auxiliary_loss_mlp": 0.0103943, "balance_loss_clip": 1.05846488, "balance_loss_mlp": 1.02208555, "epoch": 0.19485946189688863, "flos": 23295072839040.0, "grad_norm": 5.847648280625993, "language_loss": 0.65809447, "learning_rate": 3.7216049756591606e-06, "loss": 0.6798048, "num_input_tokens_seen": 69920060, "step": 3241, "time_per_iteration": 2.659707546234131 }, { "auxiliary_loss_clip": 0.01128971, "auxiliary_loss_mlp": 0.01048707, "balance_loss_clip": 1.05226684, "balance_loss_mlp": 1.03039646, "epoch": 0.1949195851495566, "flos": 23295036925440.0, "grad_norm": 1.4408225707306088, "language_loss": 0.82747853, "learning_rate": 3.7214067307144754e-06, "loss": 0.84925532, "num_input_tokens_seen": 69939820, "step": 3242, "time_per_iteration": 2.7137632369995117 }, { "auxiliary_loss_clip": 0.01077632, "auxiliary_loss_mlp": 0.01014225, "balance_loss_clip": 1.04083347, "balance_loss_mlp": 1.01131678, "epoch": 0.19497970840222456, "flos": 64962871557120.0, "grad_norm": 0.853263603243422, "language_loss": 0.57500821, "learning_rate": 3.721208420493875e-06, "loss": 0.59592682, "num_input_tokens_seen": 70002145, "step": 3243, "time_per_iteration": 3.1446309089660645 }, { "auxiliary_loss_clip": 0.01138548, "auxiliary_loss_mlp": 0.01050428, "balance_loss_clip": 1.05331421, "balance_loss_mlp": 1.02988815, "epoch": 0.19503983165489253, "flos": 19644717277440.0, "grad_norm": 7.2345723863132, "language_loss": 0.83789021, "learning_rate": 3.7210100450048784e-06, "loss": 0.85977995, "num_input_tokens_seen": 70020510, "step": 3244, "time_per_iteration": 2.6194229125976562 }, { "auxiliary_loss_clip": 0.01143261, "auxiliary_loss_mlp": 0.01046223, "balance_loss_clip": 1.05732584, "balance_loss_mlp": 1.02869976, "epoch": 0.1950999549075605, "flos": 21141976821120.0, "grad_norm": 2.0710390949438837, "language_loss": 0.7739507, "learning_rate": 3.7208116042550088e-06, "loss": 0.79584551, "num_input_tokens_seen": 70040760, "step": 3245, "time_per_iteration": 2.6684374809265137 }, { "auxiliary_loss_clip": 0.01142874, "auxiliary_loss_mlp": 0.01043114, "balance_loss_clip": 1.05566645, "balance_loss_mlp": 1.02431464, "epoch": 0.19516007816022846, "flos": 20884820376960.0, "grad_norm": 2.1010289547443133, "language_loss": 0.83988321, "learning_rate": 3.7206130982517906e-06, "loss": 0.86174309, "num_input_tokens_seen": 70058720, "step": 3246, "time_per_iteration": 2.6595354080200195 }, { "auxiliary_loss_clip": 0.0114599, "auxiliary_loss_mlp": 0.00776442, "balance_loss_clip": 1.05517101, "balance_loss_mlp": 1.00080454, "epoch": 0.19522020141289645, "flos": 16910515031040.0, "grad_norm": 3.3581015873305438, "language_loss": 0.76840878, "learning_rate": 3.7204145270027514e-06, "loss": 0.78763306, "num_input_tokens_seen": 70076470, "step": 3247, "time_per_iteration": 2.7777793407440186 }, { "auxiliary_loss_clip": 0.01121778, "auxiliary_loss_mlp": 0.01043977, "balance_loss_clip": 1.05689096, "balance_loss_mlp": 1.02651262, "epoch": 0.19528032466556441, "flos": 26724829023360.0, "grad_norm": 1.8981807103962522, "language_loss": 0.75459039, "learning_rate": 3.720215890515421e-06, "loss": 0.77624786, "num_input_tokens_seen": 70096220, "step": 3248, "time_per_iteration": 2.8088901042938232 }, { "auxiliary_loss_clip": 0.01156017, "auxiliary_loss_mlp": 0.01048303, "balance_loss_clip": 1.05548215, "balance_loss_mlp": 1.03008783, "epoch": 0.19534044791823238, "flos": 21032808410880.0, "grad_norm": 2.7209722336942135, "language_loss": 0.77774823, "learning_rate": 3.7200171887973316e-06, "loss": 0.79979146, "num_input_tokens_seen": 70114800, "step": 3249, "time_per_iteration": 2.610877752304077 }, { "auxiliary_loss_clip": 0.01148434, "auxiliary_loss_mlp": 0.01050332, "balance_loss_clip": 1.05689144, "balance_loss_mlp": 1.03299928, "epoch": 0.19540057117090034, "flos": 22344050396160.0, "grad_norm": 1.5551573885822045, "language_loss": 0.73118901, "learning_rate": 3.7198184218560176e-06, "loss": 0.75317669, "num_input_tokens_seen": 70134930, "step": 3250, "time_per_iteration": 2.5901567935943604 }, { "auxiliary_loss_clip": 0.01101628, "auxiliary_loss_mlp": 0.01046467, "balance_loss_clip": 1.05080378, "balance_loss_mlp": 1.02876413, "epoch": 0.1954606944235683, "flos": 20301631159680.0, "grad_norm": 2.030501302548557, "language_loss": 0.79203367, "learning_rate": 3.719619589699017e-06, "loss": 0.81351459, "num_input_tokens_seen": 70152045, "step": 3251, "time_per_iteration": 2.6619749069213867 }, { "auxiliary_loss_clip": 0.0115825, "auxiliary_loss_mlp": 0.01044132, "balance_loss_clip": 1.05741858, "balance_loss_mlp": 1.02606022, "epoch": 0.19552081767623627, "flos": 17346865449600.0, "grad_norm": 7.451515078679223, "language_loss": 0.83871722, "learning_rate": 3.7194206923338695e-06, "loss": 0.86074108, "num_input_tokens_seen": 70169240, "step": 3252, "time_per_iteration": 2.5029656887054443 }, { "auxiliary_loss_clip": 0.01142752, "auxiliary_loss_mlp": 0.01057294, "balance_loss_clip": 1.05278862, "balance_loss_mlp": 1.03518057, "epoch": 0.19558094092890424, "flos": 31977626129280.0, "grad_norm": 1.7140417843701068, "language_loss": 0.73995864, "learning_rate": 3.719221729768117e-06, "loss": 0.76195908, "num_input_tokens_seen": 70192690, "step": 3253, "time_per_iteration": 2.609117269515991 }, { "auxiliary_loss_clip": 0.01102675, "auxiliary_loss_mlp": 0.01046707, "balance_loss_clip": 1.04759037, "balance_loss_mlp": 1.02782381, "epoch": 0.19564106418157223, "flos": 22268889187200.0, "grad_norm": 2.1302159220485675, "language_loss": 0.76167047, "learning_rate": 3.7190227020093037e-06, "loss": 0.78316426, "num_input_tokens_seen": 70209685, "step": 3254, "time_per_iteration": 4.174965858459473 }, { "auxiliary_loss_clip": 0.01043127, "auxiliary_loss_mlp": 0.01006966, "balance_loss_clip": 1.04737842, "balance_loss_mlp": 1.0036757, "epoch": 0.1957011874342402, "flos": 54364554385920.0, "grad_norm": 0.84452007287803, "language_loss": 0.55275303, "learning_rate": 3.7188236090649774e-06, "loss": 0.57325399, "num_input_tokens_seen": 70265050, "step": 3255, "time_per_iteration": 3.2241716384887695 }, { "auxiliary_loss_clip": 0.01133721, "auxiliary_loss_mlp": 0.01041696, "balance_loss_clip": 1.0557251, "balance_loss_mlp": 1.02349281, "epoch": 0.19576131068690816, "flos": 16506699356160.0, "grad_norm": 2.6103802859468392, "language_loss": 0.70870697, "learning_rate": 3.718624450942688e-06, "loss": 0.73046112, "num_input_tokens_seen": 70281830, "step": 3256, "time_per_iteration": 2.641296148300171 }, { "auxiliary_loss_clip": 0.01152768, "auxiliary_loss_mlp": 0.01042867, "balance_loss_clip": 1.0544858, "balance_loss_mlp": 1.02523613, "epoch": 0.19582143393957613, "flos": 14719676797440.0, "grad_norm": 2.649319646209249, "language_loss": 0.80722409, "learning_rate": 3.718425227649987e-06, "loss": 0.82918048, "num_input_tokens_seen": 70297420, "step": 3257, "time_per_iteration": 4.258259057998657 }, { "auxiliary_loss_clip": 0.01106644, "auxiliary_loss_mlp": 0.01043385, "balance_loss_clip": 1.05470431, "balance_loss_mlp": 1.02601588, "epoch": 0.1958815571922441, "flos": 24425504737920.0, "grad_norm": 6.015808523610408, "language_loss": 0.75124931, "learning_rate": 3.7182259391944292e-06, "loss": 0.77274966, "num_input_tokens_seen": 70319210, "step": 3258, "time_per_iteration": 4.386433362960815 }, { "auxiliary_loss_clip": 0.01082287, "auxiliary_loss_mlp": 0.01044148, "balance_loss_clip": 1.04533339, "balance_loss_mlp": 1.0237875, "epoch": 0.19594168044491206, "flos": 24900279730560.0, "grad_norm": 1.8034996675319444, "language_loss": 0.73872411, "learning_rate": 3.7180265855835714e-06, "loss": 0.75998843, "num_input_tokens_seen": 70339045, "step": 3259, "time_per_iteration": 2.815469264984131 }, { "auxiliary_loss_clip": 0.01131793, "auxiliary_loss_mlp": 0.01043364, "balance_loss_clip": 1.05167735, "balance_loss_mlp": 1.02392125, "epoch": 0.19600180369758005, "flos": 12057008486400.0, "grad_norm": 2.2096667980592, "language_loss": 0.77053022, "learning_rate": 3.7178271668249735e-06, "loss": 0.79228187, "num_input_tokens_seen": 70356505, "step": 3260, "time_per_iteration": 4.2817702293396 }, { "auxiliary_loss_clip": 0.01148118, "auxiliary_loss_mlp": 0.01043761, "balance_loss_clip": 1.0551343, "balance_loss_mlp": 1.0248661, "epoch": 0.19606192695024802, "flos": 20850202644480.0, "grad_norm": 5.605178759176999, "language_loss": 0.82261205, "learning_rate": 3.7176276829261975e-06, "loss": 0.84453082, "num_input_tokens_seen": 70375410, "step": 3261, "time_per_iteration": 2.673092842102051 }, { "auxiliary_loss_clip": 0.01121379, "auxiliary_loss_mlp": 0.01044043, "balance_loss_clip": 1.0550617, "balance_loss_mlp": 1.02488637, "epoch": 0.19612205020291598, "flos": 28475509996800.0, "grad_norm": 1.8492209450679535, "language_loss": 0.76671481, "learning_rate": 3.717428133894807e-06, "loss": 0.78836906, "num_input_tokens_seen": 70396315, "step": 3262, "time_per_iteration": 2.803938150405884 }, { "auxiliary_loss_clip": 0.01148893, "auxiliary_loss_mlp": 0.01047259, "balance_loss_clip": 1.05960584, "balance_loss_mlp": 1.02950907, "epoch": 0.19618217345558395, "flos": 25556618995200.0, "grad_norm": 1.7278621785184562, "language_loss": 0.8668195, "learning_rate": 3.71722851973837e-06, "loss": 0.88878107, "num_input_tokens_seen": 70417945, "step": 3263, "time_per_iteration": 2.6677918434143066 }, { "auxiliary_loss_clip": 0.0113123, "auxiliary_loss_mlp": 0.01042546, "balance_loss_clip": 1.05328059, "balance_loss_mlp": 1.02505815, "epoch": 0.1962422967082519, "flos": 25264413855360.0, "grad_norm": 3.447639973868791, "language_loss": 0.73775035, "learning_rate": 3.717028840464455e-06, "loss": 0.75948811, "num_input_tokens_seen": 70438690, "step": 3264, "time_per_iteration": 2.6973094940185547 }, { "auxiliary_loss_clip": 0.01144053, "auxiliary_loss_mlp": 0.01049918, "balance_loss_clip": 1.05736756, "balance_loss_mlp": 1.03223944, "epoch": 0.19630241996091988, "flos": 18807352444800.0, "grad_norm": 2.4424358562200927, "language_loss": 0.78513813, "learning_rate": 3.7168290960806344e-06, "loss": 0.80707777, "num_input_tokens_seen": 70455385, "step": 3265, "time_per_iteration": 2.625739336013794 }, { "auxiliary_loss_clip": 0.01031434, "auxiliary_loss_mlp": 0.01002481, "balance_loss_clip": 1.03386986, "balance_loss_mlp": 0.99983466, "epoch": 0.19636254321358784, "flos": 62321137896960.0, "grad_norm": 0.7932330660809486, "language_loss": 0.53389955, "learning_rate": 3.716629286594483e-06, "loss": 0.55423868, "num_input_tokens_seen": 70514280, "step": 3266, "time_per_iteration": 3.2586586475372314 }, { "auxiliary_loss_clip": 0.01124628, "auxiliary_loss_mlp": 0.00776501, "balance_loss_clip": 1.04957044, "balance_loss_mlp": 1.00080895, "epoch": 0.19642266646625584, "flos": 21069329564160.0, "grad_norm": 2.0008611208986133, "language_loss": 0.80109024, "learning_rate": 3.7164294120135767e-06, "loss": 0.8201015, "num_input_tokens_seen": 70531800, "step": 3267, "time_per_iteration": 2.678537368774414 }, { "auxiliary_loss_clip": 0.01130982, "auxiliary_loss_mlp": 0.01043983, "balance_loss_clip": 1.05263019, "balance_loss_mlp": 1.02660179, "epoch": 0.1964827897189238, "flos": 14538651229440.0, "grad_norm": 1.9909459598185588, "language_loss": 0.86758262, "learning_rate": 3.7162294723454953e-06, "loss": 0.88933229, "num_input_tokens_seen": 70550615, "step": 3268, "time_per_iteration": 2.6949849128723145 }, { "auxiliary_loss_clip": 0.01099432, "auxiliary_loss_mlp": 0.01041621, "balance_loss_clip": 1.04954004, "balance_loss_mlp": 1.02408528, "epoch": 0.19654291297159177, "flos": 19244636616960.0, "grad_norm": 2.2632495429204127, "language_loss": 0.68785441, "learning_rate": 3.7160294675978197e-06, "loss": 0.70926493, "num_input_tokens_seen": 70568690, "step": 3269, "time_per_iteration": 2.770078182220459 }, { "auxiliary_loss_clip": 0.01116538, "auxiliary_loss_mlp": 0.01052319, "balance_loss_clip": 1.05113554, "balance_loss_mlp": 1.03330541, "epoch": 0.19660303622425973, "flos": 25775710001280.0, "grad_norm": 7.1863103423452355, "language_loss": 0.80241841, "learning_rate": 3.715829397778135e-06, "loss": 0.82410699, "num_input_tokens_seen": 70588665, "step": 3270, "time_per_iteration": 2.7294864654541016 }, { "auxiliary_loss_clip": 0.01139501, "auxiliary_loss_mlp": 0.01045694, "balance_loss_clip": 1.05189824, "balance_loss_mlp": 1.02833724, "epoch": 0.1966631594769277, "flos": 20595093275520.0, "grad_norm": 1.9668649321541274, "language_loss": 0.83912349, "learning_rate": 3.715629262894028e-06, "loss": 0.86097538, "num_input_tokens_seen": 70606900, "step": 3271, "time_per_iteration": 2.640235662460327 }, { "auxiliary_loss_clip": 0.01139368, "auxiliary_loss_mlp": 0.01051303, "balance_loss_clip": 1.05468225, "balance_loss_mlp": 1.0332067, "epoch": 0.19672328272959566, "flos": 23623188600960.0, "grad_norm": 1.9968416702279483, "language_loss": 0.79902714, "learning_rate": 3.715429062953087e-06, "loss": 0.82093388, "num_input_tokens_seen": 70625955, "step": 3272, "time_per_iteration": 2.636629343032837 }, { "auxiliary_loss_clip": 0.01124328, "auxiliary_loss_mlp": 0.01058493, "balance_loss_clip": 1.05192566, "balance_loss_mlp": 1.03715479, "epoch": 0.19678340598226365, "flos": 23110922787840.0, "grad_norm": 1.7302013075823783, "language_loss": 0.80942369, "learning_rate": 3.7152287979629043e-06, "loss": 0.83125186, "num_input_tokens_seen": 70646090, "step": 3273, "time_per_iteration": 2.6967809200286865 }, { "auxiliary_loss_clip": 0.01144024, "auxiliary_loss_mlp": 0.01054564, "balance_loss_clip": 1.05456042, "balance_loss_mlp": 1.03655195, "epoch": 0.19684352923493162, "flos": 24534852716160.0, "grad_norm": 2.225126358921887, "language_loss": 0.77984649, "learning_rate": 3.7150284679310735e-06, "loss": 0.80183232, "num_input_tokens_seen": 70666065, "step": 3274, "time_per_iteration": 2.6808643341064453 }, { "auxiliary_loss_clip": 0.01141267, "auxiliary_loss_mlp": 0.01046445, "balance_loss_clip": 1.05480242, "balance_loss_mlp": 1.02840877, "epoch": 0.19690365248759958, "flos": 21796448578560.0, "grad_norm": 2.318697297640889, "language_loss": 0.81433225, "learning_rate": 3.7148280728651914e-06, "loss": 0.8362093, "num_input_tokens_seen": 70681580, "step": 3275, "time_per_iteration": 2.672672986984253 }, { "auxiliary_loss_clip": 0.01115756, "auxiliary_loss_mlp": 0.01045314, "balance_loss_clip": 1.05148947, "balance_loss_mlp": 1.02686024, "epoch": 0.19696377574026755, "flos": 19056643810560.0, "grad_norm": 2.4665004531377166, "language_loss": 0.80909657, "learning_rate": 3.7146276127728563e-06, "loss": 0.83070731, "num_input_tokens_seen": 70697745, "step": 3276, "time_per_iteration": 2.726970672607422 }, { "auxiliary_loss_clip": 0.01142619, "auxiliary_loss_mlp": 0.01043042, "balance_loss_clip": 1.05443609, "balance_loss_mlp": 1.02491045, "epoch": 0.19702389899293551, "flos": 22820656982400.0, "grad_norm": 2.17541075016206, "language_loss": 0.89113599, "learning_rate": 3.7144270876616713e-06, "loss": 0.9129926, "num_input_tokens_seen": 70715110, "step": 3277, "time_per_iteration": 2.6738827228546143 }, { "auxiliary_loss_clip": 0.01103709, "auxiliary_loss_mlp": 0.01048433, "balance_loss_clip": 1.04638815, "balance_loss_mlp": 1.02864444, "epoch": 0.19708402224560348, "flos": 22894237992960.0, "grad_norm": 2.640727897616601, "language_loss": 0.62070847, "learning_rate": 3.714226497539239e-06, "loss": 0.64222991, "num_input_tokens_seen": 70734715, "step": 3278, "time_per_iteration": 2.7382938861846924 }, { "auxiliary_loss_clip": 0.01115303, "auxiliary_loss_mlp": 0.0105759, "balance_loss_clip": 1.05033016, "balance_loss_mlp": 1.03793263, "epoch": 0.19714414549827144, "flos": 25662519267840.0, "grad_norm": 1.930104581155035, "language_loss": 0.73606467, "learning_rate": 3.714025842413166e-06, "loss": 0.75779366, "num_input_tokens_seen": 70752650, "step": 3279, "time_per_iteration": 2.8123648166656494 }, { "auxiliary_loss_clip": 0.0114648, "auxiliary_loss_mlp": 0.01042853, "balance_loss_clip": 1.05422091, "balance_loss_mlp": 1.02567458, "epoch": 0.19720426875093944, "flos": 23915824704000.0, "grad_norm": 1.7034036878345749, "language_loss": 0.82685816, "learning_rate": 3.713825122291061e-06, "loss": 0.84875143, "num_input_tokens_seen": 70772365, "step": 3280, "time_per_iteration": 2.7000861167907715 }, { "auxiliary_loss_clip": 0.01106655, "auxiliary_loss_mlp": 0.01048884, "balance_loss_clip": 1.04887283, "balance_loss_mlp": 1.03071654, "epoch": 0.1972643920036074, "flos": 13881952828800.0, "grad_norm": 2.435959864664923, "language_loss": 0.78173983, "learning_rate": 3.713624337180536e-06, "loss": 0.80329525, "num_input_tokens_seen": 70790340, "step": 3281, "time_per_iteration": 2.7017247676849365 }, { "auxiliary_loss_clip": 0.01125353, "auxiliary_loss_mlp": 0.0104135, "balance_loss_clip": 1.05461836, "balance_loss_mlp": 1.02519727, "epoch": 0.19732451525627537, "flos": 19863592801920.0, "grad_norm": 1.7390973872526612, "language_loss": 0.79777479, "learning_rate": 3.7134234870892045e-06, "loss": 0.8194418, "num_input_tokens_seen": 70809295, "step": 3282, "time_per_iteration": 2.7064146995544434 }, { "auxiliary_loss_clip": 0.01112073, "auxiliary_loss_mlp": 0.01043047, "balance_loss_clip": 1.05485284, "balance_loss_mlp": 1.02538049, "epoch": 0.19738463850894333, "flos": 24973429777920.0, "grad_norm": 2.512566515566025, "language_loss": 0.7192747, "learning_rate": 3.7132225720246826e-06, "loss": 0.74082589, "num_input_tokens_seen": 70828765, "step": 3283, "time_per_iteration": 2.775297164916992 }, { "auxiliary_loss_clip": 0.01137498, "auxiliary_loss_mlp": 0.01043438, "balance_loss_clip": 1.05320621, "balance_loss_mlp": 1.02665281, "epoch": 0.1974447617616113, "flos": 18368883123840.0, "grad_norm": 1.8864815757917637, "language_loss": 0.78981179, "learning_rate": 3.7130215919945886e-06, "loss": 0.81162113, "num_input_tokens_seen": 70846805, "step": 3284, "time_per_iteration": 2.6344916820526123 }, { "auxiliary_loss_clip": 0.01126512, "auxiliary_loss_mlp": 0.00776821, "balance_loss_clip": 1.05065584, "balance_loss_mlp": 1.00114048, "epoch": 0.19750488501427926, "flos": 22892945103360.0, "grad_norm": 2.1903874509936982, "language_loss": 0.86317503, "learning_rate": 3.7128205470065445e-06, "loss": 0.88220835, "num_input_tokens_seen": 70863805, "step": 3285, "time_per_iteration": 2.725186586380005 }, { "auxiliary_loss_clip": 0.01115791, "auxiliary_loss_mlp": 0.01044707, "balance_loss_clip": 1.05167055, "balance_loss_mlp": 1.02658761, "epoch": 0.19756500826694723, "flos": 21871502046720.0, "grad_norm": 2.208260347555195, "language_loss": 0.88770825, "learning_rate": 3.712619437068174e-06, "loss": 0.90931326, "num_input_tokens_seen": 70882660, "step": 3286, "time_per_iteration": 2.6819698810577393 }, { "auxiliary_loss_clip": 0.01118742, "auxiliary_loss_mlp": 0.01052526, "balance_loss_clip": 1.05227792, "balance_loss_mlp": 1.03016233, "epoch": 0.19762513151961522, "flos": 15158972131200.0, "grad_norm": 2.0768117117784874, "language_loss": 0.77941382, "learning_rate": 3.712418262187102e-06, "loss": 0.80112648, "num_input_tokens_seen": 70898765, "step": 3287, "time_per_iteration": 2.641193389892578 }, { "auxiliary_loss_clip": 0.01127955, "auxiliary_loss_mlp": 0.01047337, "balance_loss_clip": 1.0526104, "balance_loss_mlp": 1.02849019, "epoch": 0.1976852547722832, "flos": 16979175878400.0, "grad_norm": 2.061421898899755, "language_loss": 0.80853081, "learning_rate": 3.7122170223709584e-06, "loss": 0.83028376, "num_input_tokens_seen": 70916370, "step": 3288, "time_per_iteration": 2.625068426132202 }, { "auxiliary_loss_clip": 0.01132408, "auxiliary_loss_mlp": 0.01048194, "balance_loss_clip": 1.05143857, "balance_loss_mlp": 1.03045535, "epoch": 0.19774537802495115, "flos": 20302924049280.0, "grad_norm": 2.345717890688315, "language_loss": 0.7317158, "learning_rate": 3.712015717627374e-06, "loss": 0.75352174, "num_input_tokens_seen": 70934870, "step": 3289, "time_per_iteration": 2.6319406032562256 }, { "auxiliary_loss_clip": 0.01133413, "auxiliary_loss_mlp": 0.01045224, "balance_loss_clip": 1.05575252, "balance_loss_mlp": 1.02678204, "epoch": 0.19780550127761912, "flos": 27235478724480.0, "grad_norm": 1.9087552003653308, "language_loss": 0.79608113, "learning_rate": 3.7118143479639813e-06, "loss": 0.81786746, "num_input_tokens_seen": 70955140, "step": 3290, "time_per_iteration": 2.706570863723755 }, { "auxiliary_loss_clip": 0.01049926, "auxiliary_loss_mlp": 0.0101105, "balance_loss_clip": 1.0327636, "balance_loss_mlp": 1.00853467, "epoch": 0.19786562453028708, "flos": 63550972684800.0, "grad_norm": 0.8952067644857119, "language_loss": 0.60318571, "learning_rate": 3.711612913388418e-06, "loss": 0.62379545, "num_input_tokens_seen": 71012005, "step": 3291, "time_per_iteration": 3.2849009037017822 }, { "auxiliary_loss_clip": 0.01158891, "auxiliary_loss_mlp": 0.01040785, "balance_loss_clip": 1.05417156, "balance_loss_mlp": 1.02088892, "epoch": 0.19792574778295505, "flos": 26286647011200.0, "grad_norm": 1.932789926440358, "language_loss": 0.81595641, "learning_rate": 3.7114114139083204e-06, "loss": 0.83795315, "num_input_tokens_seen": 71031140, "step": 3292, "time_per_iteration": 2.6751551628112793 }, { "auxiliary_loss_clip": 0.01119797, "auxiliary_loss_mlp": 0.00778082, "balance_loss_clip": 1.05296063, "balance_loss_mlp": 1.00086236, "epoch": 0.19798587103562304, "flos": 19938107566080.0, "grad_norm": 2.409042629875397, "language_loss": 0.81013, "learning_rate": 3.7112098495313313e-06, "loss": 0.82910883, "num_input_tokens_seen": 71050250, "step": 3293, "time_per_iteration": 4.3039703369140625 }, { "auxiliary_loss_clip": 0.01137316, "auxiliary_loss_mlp": 0.01052434, "balance_loss_clip": 1.05370128, "balance_loss_mlp": 1.03277683, "epoch": 0.198045994288291, "flos": 20120282369280.0, "grad_norm": 1.8764131105986912, "language_loss": 0.61480314, "learning_rate": 3.711008220265093e-06, "loss": 0.63670063, "num_input_tokens_seen": 71068665, "step": 3294, "time_per_iteration": 2.671241044998169 }, { "auxiliary_loss_clip": 0.01132208, "auxiliary_loss_mlp": 0.01039978, "balance_loss_clip": 1.05456376, "balance_loss_mlp": 1.02201271, "epoch": 0.19810611754095897, "flos": 17967653228160.0, "grad_norm": 2.0334748560156393, "language_loss": 0.87313825, "learning_rate": 3.710806526117251e-06, "loss": 0.89486015, "num_input_tokens_seen": 71085320, "step": 3295, "time_per_iteration": 2.659680128097534 }, { "auxiliary_loss_clip": 0.01113106, "auxiliary_loss_mlp": 0.01050184, "balance_loss_clip": 1.05079484, "balance_loss_mlp": 1.03256536, "epoch": 0.19816624079362694, "flos": 15084996071040.0, "grad_norm": 2.5215255479345067, "language_loss": 0.80839241, "learning_rate": 3.7106047670954544e-06, "loss": 0.83002532, "num_input_tokens_seen": 71102020, "step": 3296, "time_per_iteration": 4.299339294433594 }, { "auxiliary_loss_clip": 0.01123906, "auxiliary_loss_mlp": 0.01045438, "balance_loss_clip": 1.05233586, "balance_loss_mlp": 1.02522039, "epoch": 0.1982263640462949, "flos": 24900315644160.0, "grad_norm": 2.528943220563754, "language_loss": 0.68126047, "learning_rate": 3.710402943207354e-06, "loss": 0.70295388, "num_input_tokens_seen": 71123390, "step": 3297, "time_per_iteration": 4.258284091949463 }, { "auxiliary_loss_clip": 0.01153129, "auxiliary_loss_mlp": 0.01037574, "balance_loss_clip": 1.05660713, "balance_loss_mlp": 1.02031219, "epoch": 0.19828648729896287, "flos": 20376181837440.0, "grad_norm": 1.9083451106828888, "language_loss": 0.81310993, "learning_rate": 3.7102010544606016e-06, "loss": 0.83501697, "num_input_tokens_seen": 71141800, "step": 3298, "time_per_iteration": 2.6156656742095947 }, { "auxiliary_loss_clip": 0.01137409, "auxiliary_loss_mlp": 0.01042227, "balance_loss_clip": 1.0573976, "balance_loss_mlp": 1.02159238, "epoch": 0.19834661055163083, "flos": 18880035615360.0, "grad_norm": 1.8996943203321497, "language_loss": 0.85154539, "learning_rate": 3.7099991008628544e-06, "loss": 0.87334174, "num_input_tokens_seen": 71159505, "step": 3299, "time_per_iteration": 2.6749041080474854 }, { "auxiliary_loss_clip": 0.01036953, "auxiliary_loss_mlp": 0.01013935, "balance_loss_clip": 1.02875936, "balance_loss_mlp": 1.01106215, "epoch": 0.19840673380429882, "flos": 60259184640000.0, "grad_norm": 0.82907550606663, "language_loss": 0.53206414, "learning_rate": 3.7097970824217706e-06, "loss": 0.55257303, "num_input_tokens_seen": 71223265, "step": 3300, "time_per_iteration": 4.83857798576355 }, { "auxiliary_loss_clip": 0.01105122, "auxiliary_loss_mlp": 0.01064471, "balance_loss_clip": 1.04748702, "balance_loss_mlp": 1.0410459, "epoch": 0.1984668570569668, "flos": 19902017376000.0, "grad_norm": 316.1702389408657, "language_loss": 0.73014295, "learning_rate": 3.7095949991450093e-06, "loss": 0.75183886, "num_input_tokens_seen": 71242385, "step": 3301, "time_per_iteration": 2.700654983520508 }, { "auxiliary_loss_clip": 0.01118926, "auxiliary_loss_mlp": 0.01044315, "balance_loss_clip": 1.05295372, "balance_loss_mlp": 1.02619529, "epoch": 0.19852698030963475, "flos": 15630766295040.0, "grad_norm": 2.410718710355122, "language_loss": 0.88264418, "learning_rate": 3.709392851040235e-06, "loss": 0.90427655, "num_input_tokens_seen": 71258990, "step": 3302, "time_per_iteration": 2.7190146446228027 }, { "auxiliary_loss_clip": 0.01118067, "auxiliary_loss_mlp": 0.01045078, "balance_loss_clip": 1.05155802, "balance_loss_mlp": 1.02661204, "epoch": 0.19858710356230272, "flos": 43143007311360.0, "grad_norm": 2.210364764996701, "language_loss": 0.73592931, "learning_rate": 3.709190638115111e-06, "loss": 0.75756073, "num_input_tokens_seen": 71282770, "step": 3303, "time_per_iteration": 2.9379186630249023 }, { "auxiliary_loss_clip": 0.01143275, "auxiliary_loss_mlp": 0.01048515, "balance_loss_clip": 1.05491257, "balance_loss_mlp": 1.03002524, "epoch": 0.19864722681497068, "flos": 35144084643840.0, "grad_norm": 1.9482807590384623, "language_loss": 0.75103521, "learning_rate": 3.7089883603773084e-06, "loss": 0.77295315, "num_input_tokens_seen": 71301410, "step": 3304, "time_per_iteration": 2.743474245071411 }, { "auxiliary_loss_clip": 0.01133571, "auxiliary_loss_mlp": 0.01034983, "balance_loss_clip": 1.05309725, "balance_loss_mlp": 1.01710188, "epoch": 0.19870735006763865, "flos": 19426200888960.0, "grad_norm": 1.8722016114425952, "language_loss": 0.8628391, "learning_rate": 3.7087860178344955e-06, "loss": 0.8845247, "num_input_tokens_seen": 71319670, "step": 3305, "time_per_iteration": 2.7129390239715576 }, { "auxiliary_loss_clip": 0.01128329, "auxiliary_loss_mlp": 0.01044081, "balance_loss_clip": 1.04770195, "balance_loss_mlp": 1.02603281, "epoch": 0.19876747332030664, "flos": 23547380947200.0, "grad_norm": 2.9829227362861106, "language_loss": 0.68476367, "learning_rate": 3.7085836104943445e-06, "loss": 0.70648777, "num_input_tokens_seen": 71339850, "step": 3306, "time_per_iteration": 2.7083208560943604 }, { "auxiliary_loss_clip": 0.01119386, "auxiliary_loss_mlp": 0.01038782, "balance_loss_clip": 1.04822719, "balance_loss_mlp": 1.02168787, "epoch": 0.1988275965729746, "flos": 19829406032640.0, "grad_norm": 1.683647244561179, "language_loss": 0.76433122, "learning_rate": 3.7083811383645332e-06, "loss": 0.78591287, "num_input_tokens_seen": 71359795, "step": 3307, "time_per_iteration": 2.728661298751831 }, { "auxiliary_loss_clip": 0.01157548, "auxiliary_loss_mlp": 0.01044665, "balance_loss_clip": 1.05895782, "balance_loss_mlp": 1.02714145, "epoch": 0.19888771982564257, "flos": 23513625141120.0, "grad_norm": 2.438172575069382, "language_loss": 0.75991976, "learning_rate": 3.708178601452737e-06, "loss": 0.78194201, "num_input_tokens_seen": 71378885, "step": 3308, "time_per_iteration": 2.6580557823181152 }, { "auxiliary_loss_clip": 0.01107283, "auxiliary_loss_mlp": 0.01041656, "balance_loss_clip": 1.05453563, "balance_loss_mlp": 1.02307141, "epoch": 0.19894784307831054, "flos": 18150510389760.0, "grad_norm": 1.928689575161362, "language_loss": 0.76043576, "learning_rate": 3.7079759997666374e-06, "loss": 0.7819252, "num_input_tokens_seen": 71397285, "step": 3309, "time_per_iteration": 2.77226185798645 }, { "auxiliary_loss_clip": 0.0114115, "auxiliary_loss_mlp": 0.01045061, "balance_loss_clip": 1.05222607, "balance_loss_mlp": 1.02592754, "epoch": 0.1990079663309785, "flos": 24276044246400.0, "grad_norm": 75.17312936609292, "language_loss": 0.87855697, "learning_rate": 3.707773333313917e-06, "loss": 0.90041906, "num_input_tokens_seen": 71415775, "step": 3310, "time_per_iteration": 2.6789662837982178 }, { "auxiliary_loss_clip": 0.01153037, "auxiliary_loss_mlp": 0.01039864, "balance_loss_clip": 1.05415869, "balance_loss_mlp": 1.02139854, "epoch": 0.19906808958364647, "flos": 34897666366080.0, "grad_norm": 2.3155756588664342, "language_loss": 0.63650048, "learning_rate": 3.70757060210226e-06, "loss": 0.6584295, "num_input_tokens_seen": 71437315, "step": 3311, "time_per_iteration": 2.7604620456695557 }, { "auxiliary_loss_clip": 0.01115133, "auxiliary_loss_mlp": 0.01043871, "balance_loss_clip": 1.04763019, "balance_loss_mlp": 1.02501202, "epoch": 0.19912821283631443, "flos": 24024885373440.0, "grad_norm": 3.8064295514597717, "language_loss": 0.74542546, "learning_rate": 3.707367806139355e-06, "loss": 0.76701546, "num_input_tokens_seen": 71456320, "step": 3312, "time_per_iteration": 2.796475410461426 }, { "auxiliary_loss_clip": 0.01141587, "auxiliary_loss_mlp": 0.01037435, "balance_loss_clip": 1.05358124, "balance_loss_mlp": 1.02017355, "epoch": 0.19918833608898243, "flos": 19859031774720.0, "grad_norm": 2.2312990164825943, "language_loss": 0.84033173, "learning_rate": 3.7071649454328915e-06, "loss": 0.86212194, "num_input_tokens_seen": 71475360, "step": 3313, "time_per_iteration": 2.6044952869415283 }, { "auxiliary_loss_clip": 0.01146797, "auxiliary_loss_mlp": 0.01042166, "balance_loss_clip": 1.05695391, "balance_loss_mlp": 1.02422476, "epoch": 0.1992484593416504, "flos": 29095794984960.0, "grad_norm": 3.856678450124864, "language_loss": 0.810305, "learning_rate": 3.7069620199905625e-06, "loss": 0.83219463, "num_input_tokens_seen": 71496155, "step": 3314, "time_per_iteration": 2.68841814994812 }, { "auxiliary_loss_clip": 0.01112846, "auxiliary_loss_mlp": 0.01043677, "balance_loss_clip": 1.04617178, "balance_loss_mlp": 1.02643955, "epoch": 0.19930858259431836, "flos": 23295001011840.0, "grad_norm": 1.4822079401394097, "language_loss": 0.87391549, "learning_rate": 3.7067590298200627e-06, "loss": 0.89548075, "num_input_tokens_seen": 71517295, "step": 3315, "time_per_iteration": 2.720093011856079 }, { "auxiliary_loss_clip": 0.0111589, "auxiliary_loss_mlp": 0.00777002, "balance_loss_clip": 1.04992676, "balance_loss_mlp": 1.00093687, "epoch": 0.19936870584698632, "flos": 25378825651200.0, "grad_norm": 1.7805516248937883, "language_loss": 0.70957202, "learning_rate": 3.7065559749290892e-06, "loss": 0.72850096, "num_input_tokens_seen": 71540000, "step": 3316, "time_per_iteration": 2.850100517272949 }, { "auxiliary_loss_clip": 0.01019745, "auxiliary_loss_mlp": 0.01012504, "balance_loss_clip": 1.03032303, "balance_loss_mlp": 1.01003671, "epoch": 0.1994288290996543, "flos": 62168053109760.0, "grad_norm": 0.8326978726055106, "language_loss": 0.66287398, "learning_rate": 3.706352855325342e-06, "loss": 0.68319643, "num_input_tokens_seen": 71607880, "step": 3317, "time_per_iteration": 3.425114870071411 }, { "auxiliary_loss_clip": 0.01148059, "auxiliary_loss_mlp": 0.01048913, "balance_loss_clip": 1.05397809, "balance_loss_mlp": 1.02964854, "epoch": 0.19948895235232225, "flos": 19025832919680.0, "grad_norm": 2.282515690517884, "language_loss": 0.74494618, "learning_rate": 3.7061496710165233e-06, "loss": 0.76691592, "num_input_tokens_seen": 71625695, "step": 3318, "time_per_iteration": 2.6815896034240723 }, { "auxiliary_loss_clip": 0.01114942, "auxiliary_loss_mlp": 0.01044681, "balance_loss_clip": 1.04767084, "balance_loss_mlp": 1.02786088, "epoch": 0.19954907560499022, "flos": 37815803182080.0, "grad_norm": 1.8966456913695608, "language_loss": 0.78894758, "learning_rate": 3.7059464220103385e-06, "loss": 0.81054389, "num_input_tokens_seen": 71648520, "step": 3319, "time_per_iteration": 2.847911834716797 }, { "auxiliary_loss_clip": 0.01134557, "auxiliary_loss_mlp": 0.01042988, "balance_loss_clip": 1.05354095, "balance_loss_mlp": 1.02312756, "epoch": 0.1996091988576582, "flos": 49565199594240.0, "grad_norm": 2.1348540211051197, "language_loss": 0.76006937, "learning_rate": 3.7057431083144945e-06, "loss": 0.78184479, "num_input_tokens_seen": 71672185, "step": 3320, "time_per_iteration": 2.9324615001678467 }, { "auxiliary_loss_clip": 0.01120226, "auxiliary_loss_mlp": 0.01042998, "balance_loss_clip": 1.05083311, "balance_loss_mlp": 1.02496171, "epoch": 0.19966932211032618, "flos": 22635788659200.0, "grad_norm": 2.2436863685702546, "language_loss": 0.80077857, "learning_rate": 3.705539729936701e-06, "loss": 0.82241082, "num_input_tokens_seen": 71692890, "step": 3321, "time_per_iteration": 2.7534186840057373 }, { "auxiliary_loss_clip": 0.01033096, "auxiliary_loss_mlp": 0.01011167, "balance_loss_clip": 1.02391553, "balance_loss_mlp": 1.00828266, "epoch": 0.19972944536299414, "flos": 54082117745280.0, "grad_norm": 0.874673110280983, "language_loss": 0.65145189, "learning_rate": 3.7053362868846696e-06, "loss": 0.67189455, "num_input_tokens_seen": 71745815, "step": 3322, "time_per_iteration": 3.0398683547973633 }, { "auxiliary_loss_clip": 0.01039999, "auxiliary_loss_mlp": 0.01007775, "balance_loss_clip": 1.02971482, "balance_loss_mlp": 1.00479472, "epoch": 0.1997895686156621, "flos": 69355031817600.0, "grad_norm": 0.7915334307535052, "language_loss": 0.56919783, "learning_rate": 3.7051327791661153e-06, "loss": 0.58967561, "num_input_tokens_seen": 71806915, "step": 3323, "time_per_iteration": 3.2814581394195557 }, { "auxiliary_loss_clip": 0.01131487, "auxiliary_loss_mlp": 0.00776139, "balance_loss_clip": 1.05244064, "balance_loss_mlp": 1.00085235, "epoch": 0.19984969186833007, "flos": 18552063507840.0, "grad_norm": 1.8766856730809967, "language_loss": 0.80573648, "learning_rate": 3.7049292067887555e-06, "loss": 0.82481277, "num_input_tokens_seen": 71824645, "step": 3324, "time_per_iteration": 2.66456937789917 }, { "auxiliary_loss_clip": 0.01132572, "auxiliary_loss_mlp": 0.01050254, "balance_loss_clip": 1.04625165, "balance_loss_mlp": 1.03027487, "epoch": 0.19990981512099804, "flos": 26429678968320.0, "grad_norm": 2.4535669107623486, "language_loss": 0.53931105, "learning_rate": 3.7047255697603092e-06, "loss": 0.56113935, "num_input_tokens_seen": 71845125, "step": 3325, "time_per_iteration": 2.696556329727173 }, { "auxiliary_loss_clip": 0.01130165, "auxiliary_loss_mlp": 0.01050725, "balance_loss_clip": 1.05065942, "balance_loss_mlp": 1.03328443, "epoch": 0.19996993837366603, "flos": 16325997010560.0, "grad_norm": 2.1570763946475187, "language_loss": 0.86074936, "learning_rate": 3.7045218680884984e-06, "loss": 0.88255823, "num_input_tokens_seen": 71863500, "step": 3326, "time_per_iteration": 2.7167885303497314 }, { "auxiliary_loss_clip": 0.0115173, "auxiliary_loss_mlp": 0.01042065, "balance_loss_clip": 1.05427039, "balance_loss_mlp": 1.02511311, "epoch": 0.200030061626334, "flos": 20844169159680.0, "grad_norm": 2.0419576492150395, "language_loss": 0.71793801, "learning_rate": 3.7043181017810476e-06, "loss": 0.73987597, "num_input_tokens_seen": 71881845, "step": 3327, "time_per_iteration": 2.6097662448883057 }, { "auxiliary_loss_clip": 0.01131035, "auxiliary_loss_mlp": 0.01052756, "balance_loss_clip": 1.05146813, "balance_loss_mlp": 1.03290796, "epoch": 0.20009018487900196, "flos": 23762629198080.0, "grad_norm": 1.8948781463857982, "language_loss": 0.7668376, "learning_rate": 3.7041142708456833e-06, "loss": 0.78867549, "num_input_tokens_seen": 71900940, "step": 3328, "time_per_iteration": 2.6869349479675293 } ], "logging_steps": 1.0, "max_steps": 16632, "num_input_tokens_seen": 71900940, "num_train_epochs": 1, "save_steps": 3328, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.8067294895459533e+17, "train_batch_size": 5, "trial_name": null, "trial_params": null }